#31: Handle some tags in side ID lists
This commit is contained in:
parent
f7bd61d373
commit
3c3937a2a4
1 changed file with 21 additions and 25 deletions
46
src/wiki.rs
46
src/wiki.rs
|
@ -25,6 +25,22 @@ impl ResourceFile for WikiCache {
|
||||||
const FILE_NAME: &'static str = "data/wiki-game-cache.yaml";
|
const FILE_NAME: &'static str = "data/wiki-game-cache.yaml";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The parser does not handle HTML tags, so we remove some tags that are only used for annotations.
|
||||||
|
/// Others, like `code` and `sup`, are used both for path segments and annotations,
|
||||||
|
/// so we can't assume how to replace them properly.
|
||||||
|
fn preprocess_text(raw: &str) -> String {
|
||||||
|
let mut out = raw.to_string();
|
||||||
|
|
||||||
|
static HTML_COMMENT: Lazy<Regex> = Lazy::new(|| Regex::new(r"<!--.+?-->").unwrap());
|
||||||
|
static HTML_REF: Lazy<Regex> = Lazy::new(|| Regex::new(r"<ref>.+?</ref>").unwrap());
|
||||||
|
|
||||||
|
for (pattern, replacement) in [(&HTML_COMMENT, ""), (&HTML_REF, "")] {
|
||||||
|
out = pattern.replace_all(&out, replacement).to_string();
|
||||||
|
}
|
||||||
|
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
async fn get_page_title(id: u64) -> Result<Option<String>, Error> {
|
async fn get_page_title(id: u64) -> Result<Option<String>, Error> {
|
||||||
let wiki = make_client().await?;
|
let wiki = make_client().await?;
|
||||||
let params = wiki.params_into(&[("action", "query"), ("pageids", id.to_string().as_str())]);
|
let params = wiki.params_into(&[("action", "query"), ("pageids", id.to_string().as_str())]);
|
||||||
|
@ -347,32 +363,28 @@ impl WikiCacheEntry {
|
||||||
for attribute in attributes {
|
for attribute in attributes {
|
||||||
match attribute.name.as_deref() {
|
match attribute.name.as_deref() {
|
||||||
Some("steam appid") => {
|
Some("steam appid") => {
|
||||||
if let Ok(value) = attribute.value.to_string().parse::<u32>() {
|
if let Ok(value) = preprocess_text(&attribute.value.to_string()).parse::<u32>() {
|
||||||
if value > 0 {
|
if value > 0 {
|
||||||
out.steam = Some(value);
|
out.steam = Some(value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Some("steam appid side") => {
|
Some("steam appid side") => {
|
||||||
out.steam_side = attribute
|
out.steam_side = preprocess_text(&attribute.value.to_string())
|
||||||
.value
|
|
||||||
.to_string()
|
|
||||||
.split(',')
|
.split(',')
|
||||||
.filter_map(|x| x.trim().parse::<u32>().ok())
|
.filter_map(|x| x.trim().parse::<u32>().ok())
|
||||||
.filter(|x| *x > 0)
|
.filter(|x| *x > 0)
|
||||||
.collect();
|
.collect();
|
||||||
}
|
}
|
||||||
Some("gogcom id") => {
|
Some("gogcom id") => {
|
||||||
if let Ok(value) = attribute.value.to_string().parse::<u64>() {
|
if let Ok(value) = preprocess_text(&attribute.value.to_string()).parse::<u64>() {
|
||||||
if value > 0 {
|
if value > 0 {
|
||||||
out.gog = Some(value);
|
out.gog = Some(value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Some("gogcom id side") => {
|
Some("gogcom id side") => {
|
||||||
out.gog_side = attribute
|
out.gog_side = preprocess_text(&attribute.value.to_string())
|
||||||
.value
|
|
||||||
.to_string()
|
|
||||||
.split(',')
|
.split(',')
|
||||||
.filter_map(|x| x.trim().parse::<u64>().ok())
|
.filter_map(|x| x.trim().parse::<u64>().ok())
|
||||||
.filter(|x| *x > 0)
|
.filter(|x| *x > 0)
|
||||||
|
@ -411,22 +423,6 @@ impl WikiCacheEntry {
|
||||||
Ok(out)
|
Ok(out)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The parser does not handle HTML tags, so we remove some tags that are only used for annotations.
|
|
||||||
/// Others, like `code` and `sup`, are used both for path segments and annotations,
|
|
||||||
/// so we can't assume how to replace them properly.
|
|
||||||
fn preprocess_template(template: &str) -> String {
|
|
||||||
let mut out = template.to_string();
|
|
||||||
|
|
||||||
static HTML_COMMENT: Lazy<Regex> = Lazy::new(|| Regex::new(r"<!--.+?-->").unwrap());
|
|
||||||
static HTML_REF: Lazy<Regex> = Lazy::new(|| Regex::new(r"<ref>.+?</ref>").unwrap());
|
|
||||||
|
|
||||||
for (pattern, replacement) in [(&HTML_COMMENT, ""), (&HTML_REF, "")] {
|
|
||||||
out = pattern.replace_all(&out, replacement).to_string();
|
|
||||||
}
|
|
||||||
|
|
||||||
out
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn parse_paths(&self, article: String) -> Vec<WikiPath> {
|
pub fn parse_paths(&self, article: String) -> Vec<WikiPath> {
|
||||||
self.parse_all_paths(article)
|
self.parse_all_paths(article)
|
||||||
.into_iter()
|
.into_iter()
|
||||||
|
@ -438,7 +434,7 @@ impl WikiCacheEntry {
|
||||||
let mut out = vec![];
|
let mut out = vec![];
|
||||||
|
|
||||||
for raw in &self.templates {
|
for raw in &self.templates {
|
||||||
let preprocessed = Self::preprocess_template(raw);
|
let preprocessed = preprocess_text(raw);
|
||||||
let parsed = wikitext_parser::parse_wikitext(&preprocessed, article.clone(), |_| ());
|
let parsed = wikitext_parser::parse_wikitext(&preprocessed, article.clone(), |_| ());
|
||||||
for template in parsed.list_double_brace_expressions() {
|
for template in parsed.list_double_brace_expressions() {
|
||||||
if let TextPiece::DoubleBraceExpression { tag, attributes } = &template {
|
if let TextPiece::DoubleBraceExpression { tag, attributes } = &template {
|
||||||
|
|
Reference in a new issue