#31: Handle some tags in side ID lists

This commit is contained in:
mtkennerly 2023-12-02 14:31:22 +08:00
parent f7bd61d373
commit 3c3937a2a4
No known key found for this signature in database
GPG key ID: E764BE00BE6E6408

View file

@ -25,6 +25,22 @@ impl ResourceFile for WikiCache {
const FILE_NAME: &'static str = "data/wiki-game-cache.yaml"; const FILE_NAME: &'static str = "data/wiki-game-cache.yaml";
} }
/// The parser does not handle HTML tags, so we remove some tags that are only used for annotations.
/// Others, like `code` and `sup`, are used both for path segments and annotations,
/// so we can't assume how to replace them properly.
fn preprocess_text(raw: &str) -> String {
let mut out = raw.to_string();
static HTML_COMMENT: Lazy<Regex> = Lazy::new(|| Regex::new(r"<!--.+?-->").unwrap());
static HTML_REF: Lazy<Regex> = Lazy::new(|| Regex::new(r"<ref>.+?</ref>").unwrap());
for (pattern, replacement) in [(&HTML_COMMENT, ""), (&HTML_REF, "")] {
out = pattern.replace_all(&out, replacement).to_string();
}
out
}
async fn get_page_title(id: u64) -> Result<Option<String>, Error> { async fn get_page_title(id: u64) -> Result<Option<String>, Error> {
let wiki = make_client().await?; let wiki = make_client().await?;
let params = wiki.params_into(&[("action", "query"), ("pageids", id.to_string().as_str())]); let params = wiki.params_into(&[("action", "query"), ("pageids", id.to_string().as_str())]);
@ -347,32 +363,28 @@ impl WikiCacheEntry {
for attribute in attributes { for attribute in attributes {
match attribute.name.as_deref() { match attribute.name.as_deref() {
Some("steam appid") => { Some("steam appid") => {
if let Ok(value) = attribute.value.to_string().parse::<u32>() { if let Ok(value) = preprocess_text(&attribute.value.to_string()).parse::<u32>() {
if value > 0 { if value > 0 {
out.steam = Some(value); out.steam = Some(value);
} }
} }
} }
Some("steam appid side") => { Some("steam appid side") => {
out.steam_side = attribute out.steam_side = preprocess_text(&attribute.value.to_string())
.value
.to_string()
.split(',') .split(',')
.filter_map(|x| x.trim().parse::<u32>().ok()) .filter_map(|x| x.trim().parse::<u32>().ok())
.filter(|x| *x > 0) .filter(|x| *x > 0)
.collect(); .collect();
} }
Some("gogcom id") => { Some("gogcom id") => {
if let Ok(value) = attribute.value.to_string().parse::<u64>() { if let Ok(value) = preprocess_text(&attribute.value.to_string()).parse::<u64>() {
if value > 0 { if value > 0 {
out.gog = Some(value); out.gog = Some(value);
} }
} }
} }
Some("gogcom id side") => { Some("gogcom id side") => {
out.gog_side = attribute out.gog_side = preprocess_text(&attribute.value.to_string())
.value
.to_string()
.split(',') .split(',')
.filter_map(|x| x.trim().parse::<u64>().ok()) .filter_map(|x| x.trim().parse::<u64>().ok())
.filter(|x| *x > 0) .filter(|x| *x > 0)
@ -411,22 +423,6 @@ impl WikiCacheEntry {
Ok(out) Ok(out)
} }
/// The parser does not handle HTML tags, so we remove some tags that are only used for annotations.
/// Others, like `code` and `sup`, are used both for path segments and annotations,
/// so we can't assume how to replace them properly.
fn preprocess_template(template: &str) -> String {
let mut out = template.to_string();
static HTML_COMMENT: Lazy<Regex> = Lazy::new(|| Regex::new(r"<!--.+?-->").unwrap());
static HTML_REF: Lazy<Regex> = Lazy::new(|| Regex::new(r"<ref>.+?</ref>").unwrap());
for (pattern, replacement) in [(&HTML_COMMENT, ""), (&HTML_REF, "")] {
out = pattern.replace_all(&out, replacement).to_string();
}
out
}
pub fn parse_paths(&self, article: String) -> Vec<WikiPath> { pub fn parse_paths(&self, article: String) -> Vec<WikiPath> {
self.parse_all_paths(article) self.parse_all_paths(article)
.into_iter() .into_iter()
@ -438,7 +434,7 @@ impl WikiCacheEntry {
let mut out = vec![]; let mut out = vec![];
for raw in &self.templates { for raw in &self.templates {
let preprocessed = Self::preprocess_template(raw); let preprocessed = preprocess_text(raw);
let parsed = wikitext_parser::parse_wikitext(&preprocessed, article.clone(), |_| ()); let parsed = wikitext_parser::parse_wikitext(&preprocessed, article.clone(), |_| ());
for template in parsed.list_double_brace_expressions() { for template in parsed.list_double_brace_expressions() {
if let TextPiece::DoubleBraceExpression { tag, attributes } = &template { if let TextPiece::DoubleBraceExpression { tag, attributes } = &template {