From 2870f7686e9063a4eabd643fc9ad036ac5dab0f3 Mon Sep 17 00:00:00 2001 From: mtkennerly Date: Sun, 21 Apr 2024 22:41:41 -0400 Subject: [PATCH] Track articles with malformed wikitext --- data/wiki-game-cache.yaml | 1 + src/steam.rs | 2 +- src/wiki.rs | 28 +++++++++++++++++++++------- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/data/wiki-game-cache.yaml b/data/wiki-game-cache.yaml index 25959c58..f5b96e96 100644 --- a/data/wiki-game-cache.yaml +++ b/data/wiki-game-cache.yaml @@ -113351,6 +113351,7 @@ Moonlight thief: pageId: 130474 steam: 1006830 Moonlighter: + malformed: true pageId: 61691 Moonlit Mayhem: pageId: 51342 diff --git a/src/steam.rs b/src/steam.rs index 85d8a29e..d2aabde3 100644 --- a/src/steam.rs +++ b/src/steam.rs @@ -67,7 +67,7 @@ impl SteamCache { i += 1; if i % SAVE_INTERVAL == 0 { self.save(); - println!("\n:: saved\n"); + println!("\n:: saved ({i})\n"); } } } diff --git a/src/wiki.rs b/src/wiki.rs index 8c6fd0fa..77884679 100644 --- a/src/wiki.rs +++ b/src/wiki.rs @@ -12,7 +12,6 @@ use crate::{ }; const SAVE_INTERVAL: u32 = 100; -const NAMESPACES: &[&str] = &["Company:", "File:", "Series:", "Topic:"]; async fn make_client() -> Result { mediawiki::api::Api::new("https://www.pcgamingwiki.com/w/api.php") @@ -296,12 +295,17 @@ impl WikiCache { if let Some(new_title) = latest.new_title.take() { println!(" page {} redirected to '{}'", cached.page_id, &new_title); - for namespace in NAMESPACES { - if new_title.starts_with(namespace) { + match is_game_article(&new_title).await { + Ok(true) => {} + Ok(false) => { println!(" page is no longer a game"); self.0.remove(title); continue; } + Err(e) => { + eprintln!(" unable to check if still a game: {e}"); + return Err(e); + } } let cached = self.0.get(&new_title).cloned().unwrap_or_default(); @@ -327,12 +331,17 @@ impl WikiCache { println!(" page {} renamed to '{}'", cached.page_id, &new_title); - for namespace in NAMESPACES { - if new_title.starts_with(namespace) { + match is_game_article(&new_title).await { + Ok(true) => {} + Ok(false) => { println!(" page is no longer a game"); self.0.remove(title); continue; } + Err(e) => { + eprintln!(" unable to check if still a game: {e}"); + return Err(e); + } } let mut latest = match WikiCacheEntry::fetch_from_page(new_title.clone()).await { @@ -365,7 +374,7 @@ impl WikiCache { i += 1; if i % SAVE_INTERVAL == 0 { self.save(); - println!("\n:: saved\n"); + println!("\n:: saved ({i})\n"); } } @@ -384,6 +393,8 @@ pub struct WikiCacheEntry { pub gog_side: BTreeSet, #[serde(skip_serializing_if = "Option::is_none")] pub lutris: Option, + #[serde(skip_serializing_if = "std::ops::Not::not")] + pub malformed: bool, pub page_id: u64, #[serde(skip_serializing_if = "Vec::is_empty")] pub renamed_from: Vec, @@ -434,7 +445,10 @@ impl WikiCacheEntry { .as_str() .ok_or(Error::WikiData("parse.wikitext"))?; - let wikitext = wikitext_parser::parse_wikitext(raw_wikitext, article, |e| println!(" Error: {}", e)); + let wikitext = wikitext_parser::parse_wikitext(raw_wikitext, article, |e| { + out.malformed = true; + println!(" Error: {}", e); + }); for template in wikitext.list_double_brace_expressions() { if let TextPiece::DoubleBraceExpression { tag, attributes } = &template {