From 44930b0fb0b150d141652ba4e0e9d99da4599d30 Mon Sep 17 00:00:00 2001 From: Nathan Climer Date: Fri, 14 Nov 2014 08:45:32 -0500 Subject: [PATCH 1/6] Implement MIME sniffing. --- components/net/data_loader.rs | 2 +- components/net/lib.rs | 1 + components/net/mime_classifier.rs | 1418 +++++++++++++++++ components/net/sniffer_task.rs | 75 +- .../application/font-woff/test.wof | 1 + .../parsable_mime/application/ogg/small.ogg | Bin 0 -> 105243 bytes .../parsable_mime/application/pdf/test.pdf | 157 ++ .../application/postscript/test.ps | 1 + .../vnd.ms-fontobject/vnd.ms-fontobject | Bin 0 -> 36 bytes .../parsable_mime/application/x-gzip/test.gz | Bin 0 -> 1239 bytes .../application/x-rar-compressed/test.rar | Bin 0 -> 7 bytes .../parsable_mime/application/zip/test.zip | 1 + .../content/parsable_mime/audio/aiff/test.aif | Bin 0 -> 47122 bytes .../content/parsable_mime/audio/basic/test.au | Bin 0 -> 47086 bytes .../content/parsable_mime/audio/midi/test.mid | Bin 0 -> 8444 bytes .../content/parsable_mime/audio/mpeg/test.mp3 | Bin 0 -> 33621 bytes .../content/parsable_mime/audio/wave/test.wav | Bin 0 -> 47196 bytes .../content/parsable_mime/image/bmp/test.bmp | Bin 0 -> 11914 bytes tests/content/parsable_mime/image/gif/test87a | Bin 0 -> 1303 bytes .../parsable_mime/image/gif/test89a.gif | Bin 0 -> 1303 bytes .../content/parsable_mime/image/jpeg/test.jpg | Bin 0 -> 3744 bytes .../content/parsable_mime/image/png/test.png | Bin 0 -> 4293 bytes .../parsable_mime/image/webp/test.webp | Bin 0 -> 14 bytes .../parsable_mime/image/x-icon/test.ico | Bin 0 -> 12206 bytes .../image/x-icon/test_cursor.ico | Bin 0 -> 12206 bytes .../text/html/text_html_a_20.html | 3 + .../text/html/text_html_a_20_u.html | 3 + .../text/html/text_html_a_3e.html | 3 + .../text/html/text_html_a_3e_u.html | 3 + .../text/html/text_html_b_20.html | 3 + .../text/html/text_html_b_20_u.html | 3 + .../text/html/text_html_b_3e.html | 3 + .../text/html/text_html_b_3e_u.html | 3 + 
.../text/html/text_html_body_20.html | 3 + .../text/html/text_html_body_20_u.html | 3 + .../text/html/text_html_body_3e.html | 3 + .../text/html/text_html_body_3e_u.html | 3 + .../text/html/text_html_br_20.html | 3 + .../text/html/text_html_br_20_u.html | 3 + .../text/html/text_html_br_3e.html | 3 + .../text/html/text_html_br_3e_u.html | 3 + .../text/html/text_html_comment_20.html | 3 + .../text/html/text_html_comment_20_u.html | 3 + .../text/html/text_html_comment_3e.html | 3 + .../text/html/text_html_comment_3e_u.html | 3 + .../text/html/text_html_div_20.html | 3 + .../text/html/text_html_div_20_u.html | 3 + .../text/html/text_html_div_3e.html | 3 + .../text/html/text_html_div_3e_u.html | 3 + .../text/html/text_html_doctype_20.html | 3 + .../text/html/text_html_doctype_20_u.html | 3 + .../text/html/text_html_doctype_3e.html | 4 + .../text/html/text_html_doctype_3e_u.html | 4 + .../text/html/text_html_font_20.html | 3 + .../text/html/text_html_font_20_u.html | 3 + .../text/html/text_html_font_3e.html | 3 + .../text/html/text_html_font_3e_u.html | 3 + .../text/html/text_html_h1_20.html | 3 + .../text/html/text_html_h1_20_u.html | 3 + .../text/html/text_html_h1_3e.html | 3 + .../text/html/text_html_h1_3e_u.html | 3 + .../text/html/text_html_head_20.html | 3 + .../text/html/text_html_head_20_u.html | 3 + .../text/html/text_html_head_3e.html | 3 + .../text/html/text_html_head_3e_u.html | 3 + .../text/html/text_html_iframe_20.html | 3 + .../text/html/text_html_iframe_20_u.html | 3 + .../text/html/text_html_iframe_3e.html | 3 + .../text/html/text_html_iframe_3e_u.html | 3 + .../text/html/text_html_p_20.html | 3 + .../text/html/text_html_p_20_u.html | 3 + .../text/html/text_html_p_3e.html | 3 + .../text/html/text_html_p_3e_u.html | 3 + .../text/html/text_html_page_20.html | 3 + .../text/html/text_html_page_20_u.html | 3 + .../text/html/text_html_page_3e.html | 3 + .../text/html/text_html_page_3e_u.html | 3 + .../text/html/text_html_script_20.html | 3 + 
.../text/html/text_html_script_20_u.html | 3 + .../text/html/text_html_script_3e.html | 3 + .../text/html/text_html_script_3e_u.html | 3 + .../text/html/text_html_style_20.html | 3 + .../text/html/text_html_style_20_u.html | 3 + .../text/html/text_html_style_3e.html | 3 + .../text/html/text_html_style_3e_u.html | 3 + .../text/html/text_html_table_20.html | 3 + .../text/html/text_html_table_20_u.html | 3 + .../text/html/text_html_table_3e.html | 3 + .../text/html/text_html_table_3e_u.html | 3 + .../text/html/text_html_title_20.html | 3 + .../text/html/text_html_title_20_u.html | 3 + .../text/html/text_html_title_3e.html | 3 + .../text/html/text_html_title_3e_u.html | 3 + .../parsable_mime/text/plain/utf16bebom.txt | Bin 0 -> 42 bytes .../parsable_mime/text/plain/utf16lebom.txt | Bin 0 -> 40 bytes .../parsable_mime/text/plain/utf8bom.txt | 1 + .../content/parsable_mime/text/xml/feed.atom | 1 + tests/content/parsable_mime/text/xml/feed.rss | 151 ++ tests/content/parsable_mime/text/xml/test.xml | 6 + tests/content/parsable_mime/unknown/open_type | 1 + .../parsable_mime/unknown/true_type.ttf | Bin 0 -> 333616 bytes .../unknown/true_type_collection.ttc | 1 + .../content/parsable_mime/video/avi/test.avi | Bin 0 -> 675840 bytes .../content/parsable_mime/video/mp4/test.mp4 | Bin 0 -> 383631 bytes .../parsable_mime/video/webm/test.webm | Bin 0 -> 229455 bytes 105 files changed, 2021 insertions(+), 2 deletions(-) create mode 100644 components/net/mime_classifier.rs create mode 100755 tests/content/parsable_mime/application/font-woff/test.wof create mode 100644 tests/content/parsable_mime/application/ogg/small.ogg create mode 100644 tests/content/parsable_mime/application/pdf/test.pdf create mode 100755 tests/content/parsable_mime/application/postscript/test.ps create mode 100755 tests/content/parsable_mime/application/vnd.ms-fontobject/vnd.ms-fontobject create mode 100644 tests/content/parsable_mime/application/x-gzip/test.gz create mode 100755 
tests/content/parsable_mime/application/x-rar-compressed/test.rar create mode 100755 tests/content/parsable_mime/application/zip/test.zip create mode 100644 tests/content/parsable_mime/audio/aiff/test.aif create mode 100644 tests/content/parsable_mime/audio/basic/test.au create mode 100644 tests/content/parsable_mime/audio/midi/test.mid create mode 100644 tests/content/parsable_mime/audio/mpeg/test.mp3 create mode 100644 tests/content/parsable_mime/audio/wave/test.wav create mode 100644 tests/content/parsable_mime/image/bmp/test.bmp create mode 100644 tests/content/parsable_mime/image/gif/test87a create mode 100644 tests/content/parsable_mime/image/gif/test89a.gif create mode 100644 tests/content/parsable_mime/image/jpeg/test.jpg create mode 100644 tests/content/parsable_mime/image/png/test.png create mode 100755 tests/content/parsable_mime/image/webp/test.webp create mode 100644 tests/content/parsable_mime/image/x-icon/test.ico create mode 100644 tests/content/parsable_mime/image/x-icon/test_cursor.ico create mode 100644 tests/content/parsable_mime/text/html/text_html_a_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_a_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_a_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_a_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_b_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_b_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_b_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_b_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_body_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_body_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_body_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_body_3e_u.html create mode 100644 
tests/content/parsable_mime/text/html/text_html_br_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_br_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_br_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_br_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_comment_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_comment_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_comment_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_comment_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_div_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_div_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_div_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_div_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_doctype_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_doctype_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_doctype_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_doctype_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_font_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_font_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_font_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_font_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_h1_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_h1_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_h1_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_h1_3e_u.html create mode 100644 
tests/content/parsable_mime/text/html/text_html_head_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_head_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_head_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_head_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_iframe_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_iframe_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_iframe_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_iframe_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_p_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_p_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_p_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_p_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_page_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_page_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_page_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_page_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_script_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_script_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_script_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_script_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_style_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_style_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_style_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_style_3e_u.html create mode 100644 
tests/content/parsable_mime/text/html/text_html_table_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_table_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_table_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_table_3e_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_title_20.html create mode 100644 tests/content/parsable_mime/text/html/text_html_title_20_u.html create mode 100644 tests/content/parsable_mime/text/html/text_html_title_3e.html create mode 100644 tests/content/parsable_mime/text/html/text_html_title_3e_u.html create mode 100644 tests/content/parsable_mime/text/plain/utf16bebom.txt create mode 100644 tests/content/parsable_mime/text/plain/utf16lebom.txt create mode 100644 tests/content/parsable_mime/text/plain/utf8bom.txt create mode 100755 tests/content/parsable_mime/text/xml/feed.atom create mode 100644 tests/content/parsable_mime/text/xml/feed.rss create mode 100644 tests/content/parsable_mime/text/xml/test.xml create mode 100644 tests/content/parsable_mime/unknown/open_type create mode 100644 tests/content/parsable_mime/unknown/true_type.ttf create mode 100644 tests/content/parsable_mime/unknown/true_type_collection.ttc create mode 100644 tests/content/parsable_mime/video/avi/test.avi create mode 100644 tests/content/parsable_mime/video/mp4/test.mp4 create mode 100644 tests/content/parsable_mime/video/webm/test.webm diff --git a/components/net/data_loader.rs b/components/net/data_loader.rs index a758ce33550..013563bfa1e 100644 --- a/components/net/data_loader.rs +++ b/components/net/data_loader.rs @@ -96,7 +96,7 @@ fn assert_parse(url: &'static str, use sniffer_task; let (start_chan, start_port) = channel(); - let sniffer_task = sniffer_task::new_sniffer_task(); + let sniffer_task = sniffer_task::new_mock_sniffer_task(); load(LoadData::new(Url::parse(url).unwrap(), start_chan), sniffer_task); let response = start_port.recv().unwrap(); diff 
--git a/components/net/lib.rs b/components/net/lib.rs index 46b123630d8..c4461330c31 100644 --- a/components/net/lib.rs +++ b/components/net/lib.rs @@ -47,6 +47,7 @@ pub mod pub_domains; pub mod resource_task; pub mod storage_task; mod sniffer_task; +mod mime_classifier; /// An implementation of the [Fetch spec](http://fetch.spec.whatwg.org/) pub mod fetch { diff --git a/components/net/mime_classifier.rs b/components/net/mime_classifier.rs new file mode 100644 index 00000000000..e75056c980f --- /dev/null +++ b/components/net/mime_classifier.rs @@ -0,0 +1,1418 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+use std::borrow::ToOwned;
+use std::cmp::max;
+
+// Entry point for MIME sniffing. Holds one classifier per resource family;
+// `classify` decides which of them get to look at the data.
+pub struct MIMEClassifier {
+    image_classifier: GroupedClassifier,
+    audio_video_classifer: GroupedClassifier,
+    scriptable_classifier: GroupedClassifier,
+    plaintext_classifier: GroupedClassifier,
+    archive_classifer: GroupedClassifier,
+    binary_or_plaintext: BinaryOrPlaintextClassifier,
+    feeds_classifier: FeedsClassifier
+}
+
+impl MIMEClassifier {
+    // Performs MIME Type Sniffing Algorithm (section 7)
+    //
+    // Params
+    // no_sniff: when true a known supplied type is returned untouched, and
+    //           scriptable types are not sniffed for unknown ones
+    // check_for_apache_bug: when true (and sniffing is allowed) only the
+    //           text-or-binary check is run
+    // supplied_type: (type, subtype) declared for the resource, if any
+    // data: the resource bytes to inspect
+    //
+    // Return
+    // the sniffed (type, subtype); falls back to a clone of supplied_type
+    // when no classifier recognises the data
+    pub fn classify(&self,
+                    no_sniff: bool,
+                    check_for_apache_bug: bool,
+                    supplied_type: &Option<(String, String)>,
+                    data: &Vec<u8>) -> Option<(String, String)> {
+
+        match *supplied_type {
+            None => {
+                return self.sniff_unknown_type(!no_sniff, data);
+            }
+            Some((ref media_type, ref media_subtype)) => {
+                match (media_type.as_slice(), media_subtype.as_slice()) {
+                    // These three spellings all mean "the server did not know".
+                    ("unknown", "unknown") | ("application", "unknown") | ("*", "*") => {
+                        return self.sniff_unknown_type(!no_sniff, data);
+                    }
+                    _ => {
+                        if no_sniff {
+                            return supplied_type.clone();
+                        }
+                        if check_for_apache_bug {
+                            return self.sniff_text_or_data(data);
+                        }
+
+                        if MIMEClassifier::is_xml(media_type, media_subtype) {
+                            return supplied_type.clone();
+                        }
+                        //Implied in section 7.3, but flow is not clear
+                        if MIMEClassifier::is_html(media_type, media_subtype) {
+                            return self.feeds_classifier
+                                       .classify(data)
+                                       .or(supplied_type.clone());
+                        }
+
+                        if media_type.as_slice() == "image" {
+                            let tp = self.image_classifier.classify(data);
+                            if tp.is_some() {
+                                return tp;
+                            }
+                        }
+
+                        match (media_type.as_slice(), media_subtype.as_slice()) {
+                            ("audio", _) | ("video", _) | ("application", "ogg") => {
+                                let tp = self.audio_video_classifer.classify(data);
+                                if tp.is_some() {
+                                    return tp;
+                                }
+                            }
+                            _ => {}
+                        }
+                    }
+                }
+            }
+        }
+        return supplied_type.clone();
+    }
+
+    // Builds a classifier with every grouped classifier populated.
+    pub fn new()->MIMEClassifier {
+        MIMEClassifier{
+            image_classifier: GroupedClassifier::image_classifer(),
+            audio_video_classifer: GroupedClassifier::audio_video_classifer(),
+            scriptable_classifier: GroupedClassifier::scriptable_classifier(),
+            plaintext_classifier: GroupedClassifier::plaintext_classifier(),
+            archive_classifer: GroupedClassifier::archive_classifier(),
+            binary_or_plaintext: BinaryOrPlaintextClassifier,
+            feeds_classifier: FeedsClassifier
+        }
+    }
+
+    // Tries each classifier in turn and returns the first hit: scriptable
+    // (only when sniff_scriptable is set), plaintext, image, audio/video,
+    // archive, and finally the binary-or-plaintext fallback.
+    //some sort of iterator over the classifiers might be better?
+    fn sniff_unknown_type(&self, sniff_scriptable: bool, data: &Vec<u8>) ->
+      Option<(String,String)> {
+        if sniff_scriptable {
+            let tp = self.scriptable_classifier.classify(data);
+            if tp.is_some() {return tp;}
+        }
+
+        let tp = self.plaintext_classifier.classify(data);
+        if tp.is_some() {return tp;}
+
+        let tp = self.image_classifier.classify(data);
+        if tp.is_some() {return tp;}
+
+        let tp = self.audio_video_classifer.classify(data);
+        if tp.is_some() {return tp;}
+
+        let tp = self.archive_classifer.classify(data);
+        if tp.is_some() {return tp;}
+
+        self.binary_or_plaintext.classify(data)
+    }
+
+    // Distinguishes only between text and binary data.
+    fn sniff_text_or_data(&self, data: &Vec<u8>) -> Option<(String, String)> {
+        self.binary_or_plaintext.classify(data)
+    }
+
+    // True for text/xml, application/xml and any subtype ending in "+xml".
+    fn is_xml(tp: &str, sub_tp: &str) -> bool {
+        // Last four bytes of the subtype (the whole subtype when shorter).
+        let suffix = &sub_tp[(max((sub_tp.len() as int) - ("+xml".len() as int), 0i) as uint)..];
+        match (tp, sub_tp, suffix) {
+            (_, _, "+xml") | ("application", "xml",_) | ("text", "xml",_) => {true}
+            _ => {false}
+        }
+    }
+
+    // True only for text/html.
+    fn is_html(tp: &str, sub_tp: &str) -> bool {
+        tp=="text" && sub_tp=="html"
+    }
+}
+
+// Converts a static (type, subtype) pair into its owned form.
+fn as_string_option(tup: Option<(&'static str, &'static str)>) -> Option<(String,String)> {
+    tup.map(|(a, b)| (a.to_owned(), b.to_owned()))
+}
+
+//Interface used for composite types
+trait MIMEChecker {
+    // Returns the (type, subtype) recognised in data, or None.
+    fn classify(&self, data: &Vec<u8>)->Option<(String, String)>;
+}
+
+trait Matches {
+    fn matches(&mut self, matches: &[u8])->bool;
+}
+
+impl <'a, T: Iterator<Item=&'a u8> + Clone> Matches for T {
+
+    // Matching function that works on an iterator.
+    // see if the next matches.len() bytes in data_iterator equal matches
+    // move iterator and return true or just return false
+    //
+    // Params
+    // self: an iterator
+    // matches: a vector of bytes to match
+    //
+    // Return
+    // true if the next n elements of self match n elements of matches
+    // false otherwise, including when fewer than n elements remain
+    //
+    // Side effects
+    // moves the iterator past exactly the n matched elements when a match
+    // is found; leaves it untouched otherwise
+    fn matches(&mut self, matches: &[u8]) -> bool {
+        // Walk a clone so that a failed comparison does not consume input.
+        let mut probe = self.clone();
+        for expected in matches.iter() {
+            match probe.next() {
+                Some(actual) if actual == expected => {}
+                // Either a differing byte or the data ran out early.
+                _ => return false
+            }
+        }
+        // The clone now sits just past the pattern; adopt its position.
+        *self = probe;
+        true
+    }
+}
+
+// A masked byte signature together with the content type it identifies.
+struct ByteMatcher {
+    // Bytes expected at the start of the data (after ignored leading bytes).
+    pattern: &'static [u8],
+    // Applied to both data and pattern before comparing; indexed in step
+    // with pattern, so it must be at least as long.
+    mask: &'static [u8],
+    // Byte values that may be skipped before the pattern starts.
+    leading_ignore: &'static [u8],
+    content_type: (&'static str,&'static str)
+}
+
+impl ByteMatcher {
+    // Returns the index just past the matched pattern, or None when the
+    // data (after skipping leading_ignore bytes) does not match.
+    fn matches(&self, data: &Vec<u8>) -> Option<uint> {
+
+        if data.len() < self.pattern.len() {
+            return None;
+        }
+        //TODO replace with iterators if I ever figure them out...
+        let mut i = 0u;
+        // Last start offset that still leaves room for the whole pattern.
+        let max_i = data.len()-self.pattern.len();
+
+        loop {
+            if !self.leading_ignore.iter().any(|x| *x == data[i]) {
+                break;
+            }
+
+            i=i + 1;
+            if i > max_i {
+                return None;
+            }
+        }
+        for j in range(0u,self.pattern.len()) {
+            if (data[i] & self.mask[j]) != (self.pattern[j] & self.mask[j]) {
+                return None;
+            }
+            i = i + 1;
+        }
+        Some(i)
+    }
+}
+
+impl MIMEChecker for ByteMatcher {
+    fn classify(&self, data: &Vec<u8>) -> Option<(String, String)> {
+        self.matches(data).map(|_| {
+            (self.content_type.0.to_owned(), self.content_type.1.to_owned())
+        })
+    }
+}
+
+// A ByteMatcher whose pattern must be followed by a space or '>'.
+struct TagTerminatedByteMatcher {
+    matcher: ByteMatcher
+}
+
+impl MIMEChecker for TagTerminatedByteMatcher {
+    fn classify(&self, data: &Vec<u8>) -> Option<(String, String)> {
+        let pattern = self.matcher.matches(data);
+        // j is the index just past the pattern; it must exist and hold a
+        // tag-terminating byte.
+        let pattern_matches = pattern.map(|j| j < data.len() && (data[j] == b' ' || data[j] == b'>'));
+        if pattern_matches.unwrap_or(false) {
+            Some((self.matcher.content_type.0.to_owned(),
+                  self.matcher.content_type.1.to_owned()))
+        } else {
+            None
+        }
+    }
+}
+// Recognises MP4 data by inspecting its leading "ftyp" box.
+struct Mp4Matcher;
+
+impl Mp4Matcher {
+    // Returns true when data starts with an "ftyp" box whose major brand,
+    // or one of the brand slots scanned from offset 16, starts with "mp4".
+    fn matches(&self,data: &Vec<u8>) -> bool {
+        if data.len() < 12 {
+            return false;
+        }
+        // The first four bytes hold the box size as a big-endian 32-bit
+        // integer, so each byte is shifted by a whole number of bytes.
+        let box_size = ((data[0] as u32) << 24 | (data[1] as u32) << 16 |
+                        (data[2] as u32) << 8 | (data[3] as u32)) as uint;
+        if (data.len() < box_size) || (box_size % 4 != 0) {
+            return false;
+        }
+        //TODO replace with iterators
+        let ftyp = [0x66, 0x74, 0x79, 0x70];
+        let mp4 = [0x6D, 0x70, 0x34];
+
+        for i in range(4u,8u) {
+            if data[i] != ftyp[i - 4] {
+                return false;
+            }
+        }
+        // Major brand occupies bytes 8..11.
+        let mut all_match = true;
+        for i in range(8u,11u) {
+            if data[i]!=mp4[i - 8u] {
+                all_match = false;
+                break;
+            }
+        }
+        if all_match {
+            return true;
+        }
+
+        // Scan the remaining four-byte slots of the box, starting at 16.
+        let mut bytes_read = 16u;
+
+        while bytes_read < box_size {
+            all_match = true;
+            for i in range(0u,3u) {
+                if mp4[i] != data[i + bytes_read] {
+                    all_match = false;
+                    break;
+                }
+            }
+            if all_match {
+                return true;
+            }
+
+            bytes_read = bytes_read + 4;
+        }
+        false
+    }
+
+}
+impl MIMEChecker for Mp4Matcher {
+    fn classify(&self, data: &Vec<u8>) -> Option<(String, String)> {
+        if self.matches(data) {
+            Some(("video".to_owned(), "mp4".to_owned()))
+        } else {
+            None
+        }
+    }
+}
+
+// Last-resort classifier: always answers, with either text/plain or
+// application/octet-stream.
+struct BinaryOrPlaintextClassifier;
+
+impl BinaryOrPlaintextClassifier {
+    fn classify_impl(&self, data: &Vec<u8>) -> Option<(&'static str, &'static str)> {
+        // A leading FF FE, FE FF or EF BB BF marks the data as text.
+        if (data.len() >=2 &&
+            ((data[0] == 0xFFu8 && data[1] == 0xFEu8) ||
+             (data[0] == 0xFEu8 && data[1] == 0xFFu8))) ||
+           (data.len() >= 3 && data[0] == 0xEFu8 && data[1] == 0xBBu8 && data[2] == 0xBFu8)
+        {
+            Some(("text", "plain"))
+        }
+        // Any byte in 00-08, 0B, 0E-1A or 1C-1F marks the data as binary.
+        else if data.len() >= 1 && data.iter().any(|&x| x <= 0x08u8 ||
+                                                        x == 0x0Bu8 ||
+                                                        (x >= 0x0Eu8 && x <= 0x1Au8) ||
+                                                        (x >= 0x1Cu8 && x <= 0x1Fu8)) {
+            Some(("application", "octet-stream"))
+        }
+        else {
+            Some(("text", "plain"))
+        }
+    }
+}
+impl MIMEChecker for BinaryOrPlaintextClassifier {
+    fn classify(&self, data: &Vec<u8>) -> Option<(String, String)> {
+        return as_string_option(self.classify_impl(data));
+    }
+}
+// An ordered list of checkers.
+struct GroupedClassifier {
+    // NOTE(review): the element type was lost from this patch text; it is
+    // presumably the boxed MIMEChecker trait object used by the `as Box<..>`
+    // casts in the constructors - confirm the `+ Send` bound against them.
+    byte_matchers: Vec<Box<MIMEChecker + Send>>,
+}
+impl GroupedClassifier { + fn image_classifer() -> GroupedClassifier { + GroupedClassifier { + byte_matchers: vec![ + box ByteMatcher::image_x_icon() as Box, + box ByteMatcher::image_x_icon_cursor(), + box ByteMatcher::image_bmp(), + box ByteMatcher::image_gif89a(), + box ByteMatcher::image_gif87a(), + box ByteMatcher::image_webp(), + box ByteMatcher::image_png(), + box ByteMatcher::image_jpeg(), + ] + } + } + fn audio_video_classifer() -> GroupedClassifier { + GroupedClassifier{ + byte_matchers: vec![ + box ByteMatcher::video_webm() as Box, + box ByteMatcher::audio_basic(), + box ByteMatcher::audio_aiff(), + box ByteMatcher::audio_mpeg(), + box ByteMatcher::application_ogg(), + box ByteMatcher::audio_midi(), + box ByteMatcher::video_avi(), + box ByteMatcher::audio_wave(), + box Mp4Matcher + ] + } + } + fn scriptable_classifier() -> GroupedClassifier { + GroupedClassifier{ + byte_matchers: vec![ + box ByteMatcher::text_html_doctype() as Box, + box ByteMatcher::text_html_page(), + box ByteMatcher::text_html_head(), + box ByteMatcher::text_html_script(), + box ByteMatcher::text_html_iframe(), + box ByteMatcher::text_html_h1(), + box ByteMatcher::text_html_div(), + box ByteMatcher::text_html_font(), + box ByteMatcher::text_html_table(), + box ByteMatcher::text_html_a(), + box ByteMatcher::text_html_style(), + box ByteMatcher::text_html_title(), + box ByteMatcher::text_html_b(), + box ByteMatcher::text_html_body(), + box ByteMatcher::text_html_br(), + box ByteMatcher::text_html_p(), + box ByteMatcher::text_html_comment(), + box ByteMatcher::text_xml(), + box ByteMatcher::application_pdf() + ] + } + + } + fn plaintext_classifier() -> GroupedClassifier { + GroupedClassifier{ + byte_matchers: vec![ + box ByteMatcher::text_plain_utf_8_bom() as Box, + box ByteMatcher::text_plain_utf_16le_bom(), + box ByteMatcher::text_plain_utf_16be_bom(), + box ByteMatcher::application_postscript() + ] + } + } + fn archive_classifier() -> GroupedClassifier { + GroupedClassifier { + 
byte_matchers: vec![ + box ByteMatcher::application_x_gzip() as Box, + box ByteMatcher::application_zip(), + box ByteMatcher::application_x_rar_compressed() + ] + } + } + + // TODO: Use this in font context classifier + #[allow(dead_code)] + fn font_classifier() -> GroupedClassifier { + GroupedClassifier { + byte_matchers: vec![ + box ByteMatcher::application_font_woff() as Box, + box ByteMatcher::true_type_collection(), + box ByteMatcher::open_type(), + box ByteMatcher::true_type(), + box ByteMatcher::application_vnd_ms_font_object(), + ] + } + } +} +impl MIMEChecker for GroupedClassifier { + fn classify(&self,data: &Vec) -> Option<(String, String)> { + self.byte_matchers + .iter() + .filter_map(|matcher| matcher.classify(data)) + .next() + } +} + +struct FeedsClassifier; +impl FeedsClassifier { + fn classify_impl(&self,data: &Vec) -> Option<(&'static str,&'static str)> { + let length = data.len(); + let mut data_iterator = data.iter(); + + // acceptable byte sequences + let utf8_bom = &[0xEFu8, 0xBBu8, 0xBFu8]; + + // can not be feed unless length is > 3 + if length < 3 { + return None; + } + + // eat the first three bytes if they are equal to UTF-8 BOM + data_iterator.matches(utf8_bom); + + // continuously search for next "<" until end of data_iterator + // TODO: need max_bytes to prevent inadvertently examining html document + // eg. 
an html page with a feed example + while !data_iterator.find(|&data_iterator| *data_iterator == b'<').is_none() { + + if data_iterator.matches(b"?") { + // eat until ?> + while !data_iterator.matches(b"?>") { + if data_iterator.next().is_none() { + return None; + } + } + } else if data_iterator.matches(b"!--") { + // eat until --> + while !data_iterator.matches(b"-->") { + if data_iterator.next().is_none() { + return None; + } + } + } else if data_iterator.matches(b"!") { + data_iterator.find(|&data_iterator| *data_iterator == b'>'); + } else if data_iterator.matches(b"rss") { + return Some(("application", "rss+xml")); + } else if data_iterator.matches(b"feed") { + return Some(("application", "atom+xml")); + } else if data_iterator.matches(b"rdf: RDF") { + while !data_iterator.next().is_none() { + if data_iterator.matches(b"http: //purl.org/rss/1.0/") { + while !data_iterator.next().is_none() { + if data_iterator.matches(b"http: //www.w3.org/1999/02/22-rdf-syntax-ns#") { + return Some(("application", "rss+xml")); + } + } + } else if data_iterator.matches(b"http: //www.w3.org/1999/02/22-rdf-syntax-ns#") { + while !data_iterator.next().is_none() { + if data_iterator.matches(b"http: //purl.org/rss/1.0/") { + return Some(("application", "rss+xml")); + } + } + } + } + } + } + + None + } +} + +impl MIMEChecker for FeedsClassifier { + fn classify(&self,data: &Vec) -> Option<(String, String)> { + as_string_option(self.classify_impl(data)) + } +} + +//Contains hard coded byte matchers +//TODO: These should be configured and not hard coded +impl ByteMatcher { + //A Windows Icon signature + fn image_x_icon()->ByteMatcher { + ByteMatcher{ + pattern: b"\x00\x00\x01\x00", + mask: b"\xFF\xFF\xFF\xFF", + content_type: ("image", "x-icon"), + leading_ignore: &[] + } + } + //A Windows Cursor signature. 
+ fn image_x_icon_cursor()->ByteMatcher { + ByteMatcher{ + pattern: b"\x00\x00\x02\x00", + mask: b"\xFF\xFF\xFF\xFF", + content_type: ("image", "x-icon"), + leading_ignore: &[] + } + } + //The string "BM", a BMP signature. + fn image_bmp()->ByteMatcher { + ByteMatcher{ + pattern: b"BM", + mask: b"\xFF\xFF", + content_type: ("image", "bmp"), + leading_ignore: &[] + } + } + //The string "GIF89a", a GIF signature. + fn image_gif89a()->ByteMatcher { + ByteMatcher{ + pattern: b"GIF89a", + mask: b"\xFF\xFF\xFF\xFF\xFF\xFF", + content_type: ("image", "gif"), + leading_ignore: &[] + } + } + //The string "GIF87a", a GIF signature. + fn image_gif87a()->ByteMatcher { + ByteMatcher{ + pattern: b"GIF87a", + mask: b"\xFF\xFF\xFF\xFF\xFF\xFF", + content_type: ("image", "gif"), + leading_ignore: &[] + } + } + //The string "RIFF" followed by four bytes followed by the string "WEBPVP". + fn image_webp()->ByteMatcher { + ByteMatcher{ + pattern: b"RIFF\x00\x00\x00\x00WEBPVP", + mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00,\xFF\xFF\xFF\xFF\xFF\xFF", + content_type: ("image", "webp"), + leading_ignore: &[] + } + } + //An error-checking byte followed by the string "PNG" followed by CR LF SUB LF, the PNG + //signature. + fn image_png()->ByteMatcher { + ByteMatcher{ + pattern: b"\x89PNG\r\n\x1A\n", + mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", + content_type: ("image", "png"), + leading_ignore: &[] + } + } + // The JPEG Start of Image marker followed by the indicator byte of another marker. + fn image_jpeg()->ByteMatcher { + ByteMatcher{ + pattern: b"\xFF\xD8\xFF", + mask: b"\xFF\xFF\xFF", + content_type: ("image", "jpeg"), + leading_ignore: &[] + } + } + //The WebM signature. [TODO: Use more bytes?] + fn video_webm()->ByteMatcher { + ByteMatcher{ + pattern: b"\x1A\x45\xDF\xA3", + mask: b"\xFF\xFF\xFF\xFF", + content_type: ("video", "webm"), + leading_ignore: &[] + } + } + //The string ".snd", the basic audio signature. 
+ fn audio_basic()->ByteMatcher { + ByteMatcher{ + pattern: b".snd", + mask: b"\xFF\xFF\xFF\xFF", + content_type: ("audio", "basic"), + leading_ignore: &[] + } + } + //The string "FORM" followed by four bytes followed by the string "AIFF", the AIFF signature. + fn audio_aiff()->ByteMatcher { + ByteMatcher{ + pattern: b"FORM\x00\x00\x00\x00AIFF", + mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", + content_type: ("audio", "aiff"), + leading_ignore: &[] + } + } + //The string "ID3", the ID3v2-tagged MP3 signature. + fn audio_mpeg()->ByteMatcher { + ByteMatcher{ + pattern: b"ID3", + mask: b"\xFF\xFF\xFF", + content_type: ("audio", "mpeg"), + leading_ignore: &[] + } + } + //The string "OggS" followed by NUL, the Ogg container signature. + fn application_ogg()->ByteMatcher { + ByteMatcher{ + pattern: b"OggS", + mask: b"\xFF\xFF\xFF\xFF\xFF", + content_type: ("application", "ogg"), + leading_ignore: &[] + } + } + //The string "MThd" followed by four bytes representing the number 6 in 32 bits (big-endian), + //the MIDI signature. + fn audio_midi()->ByteMatcher { + ByteMatcher{ + pattern: b"MThd\x00\x00\x00\x06", + mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", + content_type: ("audio", "midi"), + leading_ignore: &[] + } + } + //The string "RIFF" followed by four bytes followed by the string "AVI ", the AVI signature. + fn video_avi()->ByteMatcher { + ByteMatcher{ + pattern: b"RIFF\x00\x00\x00\x00AVI ", + mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", + content_type: ("video", "avi"), + leading_ignore: &[] + } + } + // The string "RIFF" followed by four bytes followed by the string "WAVE", the WAVE signature. 
+ fn audio_wave()->ByteMatcher { + ByteMatcher{ + pattern: b"RIFF\x00\x00\x00\x00WAVE", + mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", + content_type: ("audio", "wave"), + leading_ignore: &[] + } + } + // doctype terminated with Tag terminating (TT) Byte + fn text_html_doctype()->TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher { + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b" diff --git a/tests/content/parsable_mime/text/html/text_html_comment_3e_u.html 
b/tests/content/parsable_mime/text/html/text_html_comment_3e_u.html new file mode 100644 index 00000000000..44a94ca5a7a --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_comment_3e_u.html @@ -0,0 +1,3 @@ + + + diff --git a/tests/content/parsable_mime/text/html/text_html_div_20.html b/tests/content/parsable_mime/text/html/text_html_div_20.html new file mode 100644 index 00000000000..2ed34363b2f --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_div_20.html @@ -0,0 +1,3 @@ + + +
diff --git a/tests/content/parsable_mime/text/html/text_html_div_3e_u.html b/tests/content/parsable_mime/text/html/text_html_div_3e_u.html new file mode 100644 index 00000000000..c117f0f4cdd --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_div_3e_u.html @@ -0,0 +1,3 @@ + + +
diff --git a/tests/content/parsable_mime/text/html/text_html_doctype_20.html b/tests/content/parsable_mime/text/html/text_html_doctype_20.html new file mode 100644 index 00000000000..dbeb5a41c2a --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_doctype_20.html @@ -0,0 +1,3 @@ + + + + diff --git a/tests/content/parsable_mime/text/html/text_html_doctype_3e_u.html b/tests/content/parsable_mime/text/html/text_html_doctype_3e_u.html new file mode 100644 index 00000000000..8b16e40458e --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_doctype_3e_u.html @@ -0,0 +1,4 @@ + + + + diff --git a/tests/content/parsable_mime/text/html/text_html_font_20.html b/tests/content/parsable_mime/text/html/text_html_font_20.html new file mode 100644 index 00000000000..a18fa850617 --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_font_20.html @@ -0,0 +1,3 @@ + + + diff --git a/tests/content/parsable_mime/text/html/text_html_font_3e_u.html b/tests/content/parsable_mime/text/html/text_html_font_3e_u.html new file mode 100644 index 00000000000..1181517947b --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_font_3e_u.html @@ -0,0 +1,3 @@ + + + diff --git a/tests/content/parsable_mime/text/html/text_html_h1_20.html b/tests/content/parsable_mime/text/html/text_html_h1_20.html new file mode 100644 index 00000000000..3ed0eb125ff --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_h1_20.html @@ -0,0 +1,3 @@ + + +

diff --git a/tests/content/parsable_mime/text/html/text_html_h1_3e_u.html b/tests/content/parsable_mime/text/html/text_html_h1_3e_u.html new file mode 100644 index 00000000000..bae85229fcf --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_h1_3e_u.html @@ -0,0 +1,3 @@ + + +

diff --git a/tests/content/parsable_mime/text/html/text_html_head_20.html b/tests/content/parsable_mime/text/html/text_html_head_20.html new file mode 100644 index 00000000000..eb322c946e0 --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_head_20.html @@ -0,0 +1,3 @@ + + + diff --git a/tests/content/parsable_mime/text/html/text_html_head_3e_u.html b/tests/content/parsable_mime/text/html/text_html_head_3e_u.html new file mode 100644 index 00000000000..8a33d623daa --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_head_3e_u.html @@ -0,0 +1,3 @@ + + + diff --git a/tests/content/parsable_mime/text/html/text_html_iframe_20.html b/tests/content/parsable_mime/text/html/text_html_iframe_20.html new file mode 100644 index 00000000000..e632915590a --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_iframe_20.html @@ -0,0 +1,3 @@ + + +