Track articles with malformed wikitext

This commit is contained in:
mtkennerly 2024-04-21 22:41:41 -04:00
parent f055532220
commit 2870f7686e
No known key found for this signature in database
GPG key ID: E764BE00BE6E6408
3 changed files with 23 additions and 8 deletions

View file

@@ -113351,6 +113351,7 @@ Moonlight thief:
pageId: 130474 pageId: 130474
steam: 1006830 steam: 1006830
Moonlighter: Moonlighter:
malformed: true
pageId: 61691 pageId: 61691
Moonlit Mayhem: Moonlit Mayhem:
pageId: 51342 pageId: 51342

View file

@@ -67,7 +67,7 @@ impl SteamCache {
i += 1; i += 1;
if i % SAVE_INTERVAL == 0 { if i % SAVE_INTERVAL == 0 {
self.save(); self.save();
println!("\n:: saved\n"); println!("\n:: saved ({i})\n");
} }
} }
} }

View file

@@ -12,7 +12,6 @@ use crate::{
}; };
const SAVE_INTERVAL: u32 = 100; const SAVE_INTERVAL: u32 = 100;
const NAMESPACES: &[&str] = &["Company:", "File:", "Series:", "Topic:"];
async fn make_client() -> Result<mediawiki::api::Api, Error> { async fn make_client() -> Result<mediawiki::api::Api, Error> {
mediawiki::api::Api::new("https://www.pcgamingwiki.com/w/api.php") mediawiki::api::Api::new("https://www.pcgamingwiki.com/w/api.php")
@@ -296,12 +295,17 @@ impl WikiCache {
if let Some(new_title) = latest.new_title.take() { if let Some(new_title) = latest.new_title.take() {
println!(" page {} redirected to '{}'", cached.page_id, &new_title); println!(" page {} redirected to '{}'", cached.page_id, &new_title);
for namespace in NAMESPACES { match is_game_article(&new_title).await {
if new_title.starts_with(namespace) { Ok(true) => {}
Ok(false) => {
println!(" page is no longer a game"); println!(" page is no longer a game");
self.0.remove(title); self.0.remove(title);
continue; continue;
} }
Err(e) => {
eprintln!(" unable to check if still a game: {e}");
return Err(e);
}
} }
let cached = self.0.get(&new_title).cloned().unwrap_or_default(); let cached = self.0.get(&new_title).cloned().unwrap_or_default();
@@ -327,12 +331,17 @@ impl WikiCache {
println!(" page {} renamed to '{}'", cached.page_id, &new_title); println!(" page {} renamed to '{}'", cached.page_id, &new_title);
for namespace in NAMESPACES { match is_game_article(&new_title).await {
if new_title.starts_with(namespace) { Ok(true) => {}
Ok(false) => {
println!(" page is no longer a game"); println!(" page is no longer a game");
self.0.remove(title); self.0.remove(title);
continue; continue;
} }
Err(e) => {
eprintln!(" unable to check if still a game: {e}");
return Err(e);
}
} }
let mut latest = match WikiCacheEntry::fetch_from_page(new_title.clone()).await { let mut latest = match WikiCacheEntry::fetch_from_page(new_title.clone()).await {
@@ -365,7 +374,7 @@ impl WikiCache {
i += 1; i += 1;
if i % SAVE_INTERVAL == 0 { if i % SAVE_INTERVAL == 0 {
self.save(); self.save();
println!("\n:: saved\n"); println!("\n:: saved ({i})\n");
} }
} }
@@ -384,6 +393,8 @@ pub struct WikiCacheEntry {
pub gog_side: BTreeSet<u64>, pub gog_side: BTreeSet<u64>,
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub lutris: Option<String>, pub lutris: Option<String>,
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub malformed: bool,
pub page_id: u64, pub page_id: u64,
#[serde(skip_serializing_if = "Vec::is_empty")] #[serde(skip_serializing_if = "Vec::is_empty")]
pub renamed_from: Vec<String>, pub renamed_from: Vec<String>,
@@ -434,7 +445,10 @@ impl WikiCacheEntry {
.as_str() .as_str()
.ok_or(Error::WikiData("parse.wikitext"))?; .ok_or(Error::WikiData("parse.wikitext"))?;
let wikitext = wikitext_parser::parse_wikitext(raw_wikitext, article, |e| println!(" Error: {}", e)); let wikitext = wikitext_parser::parse_wikitext(raw_wikitext, article, |e| {
out.malformed = true;
println!(" Error: {}", e);
});
for template in wikitext.list_double_brace_expressions() { for template in wikitext.list_double_brace_expressions() {
if let TextPiece::DoubleBraceExpression { tag, attributes } = &template { if let TextPiece::DoubleBraceExpression { tag, attributes } = &template {