Issue #7393: Properly sniff mislabeled feeds.

This commit is contained in:
Simon Martin 2015-08-30 00:15:23 +02:00
parent 67cbda4be3
commit 5301e59965
7 changed files with 52 additions and 6 deletions

View file

@ -125,6 +125,10 @@ impl <'a, T: Iterator<Item=&'a u8> + Clone> Matches for T {
// Side effects
// moves the iterator when match is found
fn matches(&mut self, matches: &[u8]) -> bool {
if self.clone().nth(matches.len()).is_none() {
// self has fewer than matches.len() + 1 elements left (nth is zero-based)
return false
}
let result = self.clone().zip(matches).all(|(s, m)| *s == *m);
if result {
self.nth(matches.len());
@ -381,9 +385,10 @@ where T: Iterator<Item=&'a u8> + Clone {
struct FeedsClassifier;
impl FeedsClassifier {
// Implements sniffing for mislabeled feeds (https://mimesniff.spec.whatwg.org/#sniffing-a-mislabeled-feed)
fn classify_impl(&self, data: &[u8]) -> Option<(&'static str, &'static str)> {
// can not be feed unless length is > 3
// Step 4: cannot be a feed unless length is >= 3
if data.len() < 3 {
return None;
}
@ -403,6 +408,7 @@ impl FeedsClassifier {
return None;
}
// Steps 5.2.1 to 5.2.4
match eats_until(&mut matcher, b"?", b"?>")
.chain(|| eats_until(&mut matcher, b"!--", b"-->"))
.chain(|| eats_until(&mut matcher, b"!", b">")) {
@ -411,20 +417,23 @@ impl FeedsClassifier {
Match::Start => return None
}
// Step 5.2.5
if matcher.matches(b"rss") {
return Some(("application", "rss+xml"));
}
// Step 5.2.6
if matcher.matches(b"feed") {
return Some(("application", "atom+xml"));
}
if matcher.matches(b"rdf: RDF") {
// Step 5.2.7
if matcher.matches(b"rdf:RDF") {
while matcher.next().is_some() {
match eats_until(&mut matcher,
b"http: //purl.org/rss/1.0/",
b"http: //www.w3.org/1999/02/22-rdf-syntax-ns#")
b"http://purl.org/rss/1.0/",
b"http://www.w3.org/1999/02/22-rdf-syntax-ns#")
.chain(|| eats_until(&mut matcher,
b"http: //www.w3.org/1999/02/22-rdf-syntax-ns#",
b"http: //purl.org/rss/1.0/")) {
b"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
b"http://purl.org/rss/1.0/")) {
Match::StartAndEnd => return Some(("application", "rss+xml")),
Match::DidNotMatch => {},
Match::Start => return None

View file

@ -434,7 +434,14 @@ fn test_sniff_utf_8_bom() {
/// Runs `test_sniff_full` over the XML feed fixtures: the well-formed feeds
/// are checked against `application/rss+xml`, the `rdf_rss_ko_*` fixtures
/// against `text/html`. Every call passes `Some(("text", "html"))` as the
/// last argument.
#[test]
fn test_sniff_rss_feed() {
    // RSS feeds
    let feeds = ["text/xml/feed.rss", "text/xml/rdf_rss.xml"];
    for fixture in feeds.iter() {
        test_sniff_full(&PathBuf::from(*fixture), "application", "rss+xml", Some(("text", "html")));
    }

    // Not RSS feeds
    let non_feeds = [
        "text/xml/rdf_rss_ko_1.xml",
        "text/xml/rdf_rss_ko_2.xml",
        "text/xml/rdf_rss_ko_3.xml",
        "text/xml/rdf_rss_ko_4.xml",
    ];
    for fixture in non_feeds.iter() {
        test_sniff_full(&PathBuf::from(*fixture), "text", "html", Some(("text", "html")));
    }
}
#[test]

View file

@ -0,0 +1,7 @@
<!-- Good format for a "RDF feed" -->
<?xml version="1.0"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
>
</rdf:RDF>

View file

@ -0,0 +1,7 @@
<!-- Bad format for a "RDF feed" (space between "rdf:" and "RDF") -->
<?xml version="1.0"?>
<rdf: RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
>
</rdf:RDF>

View file

@ -0,0 +1,3 @@
<!-- Bad format for a "RDF feed" (2 missing URLs) -->
<?xml version="1.0"?>
<rdf:RDF/>

View file

@ -0,0 +1,6 @@
<!-- Bad format for a "RDF feed" (one missing URL) -->
<?xml version="1.0"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
>
</rdf:RDF>

View file

@ -0,0 +1,7 @@
<!-- Bad format for a "RDF feed" (unexpected space in first URL) -->
<?xml version="1.0"?>
<rdf:RDF
xmlns:rdf="http: //www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
>
</rdf:RDF>