diff --git a/components/net/about_loader.rs b/components/net/about_loader.rs index 7f3b93d293e..3fd1a09c315 100644 --- a/components/net/about_loader.rs +++ b/components/net/about_loader.rs @@ -4,7 +4,8 @@ use net_traits::{LoadData, Metadata}; use net_traits::ProgressMsg::Done; -use resource_task::{TargetedLoadResponse, start_sending, ResponseSenders}; +use mime_classifier::MIMEClassifier; +use resource_task::start_sending; use file_loader; use url::Url; @@ -13,16 +14,13 @@ use util::resource_files::resources_dir_path; use std::borrow::IntoCow; use std::fs::PathExt; -use std::sync::mpsc::Sender; +use std::sync::Arc; -pub fn factory(mut load_data: LoadData, start_chan: Sender) { - let senders = ResponseSenders { - immediate_consumer: start_chan.clone(), - eventual_consumer: load_data.consumer.clone(), - }; +pub fn factory(mut load_data: LoadData, classifier: Arc) { match load_data.url.non_relative_scheme_data().unwrap() { "blank" => { - let chan = start_sending(senders, Metadata { + let start_chan = load_data.consumer; + let chan = start_sending(start_chan, Metadata { final_url: load_data.url, content_type: Some(("text".to_string(), "html".to_string())), charset: Some("utf-8".to_string()), @@ -40,10 +38,11 @@ pub fn factory(mut load_data: LoadData, start_chan: Sender load_data.url = Url::from_file_path(&*path).unwrap(); } _ => { - start_sending(senders, Metadata::default(load_data.url)) + let start_chan = load_data.consumer; + start_sending(start_chan, Metadata::default(load_data.url)) .send(Done(Err("Unknown about: URL.".to_string()))).unwrap(); return } }; - file_loader::factory(load_data, start_chan) + file_loader::factory(load_data, classifier) } diff --git a/components/net/data_loader.rs b/components/net/data_loader.rs index a758ce33550..dc1b4356390 100644 --- a/components/net/data_loader.rs +++ b/components/net/data_loader.rs @@ -4,34 +4,30 @@ use net_traits::{LoadData, Metadata}; use net_traits::ProgressMsg::{Payload, Done}; -use resource_task::{TargetedLoadResponse, start_sending, ResponseSenders}; +use mime_classifier::MIMEClassifier; +use resource_task::start_sending; use rustc_serialize::base64::FromBase64; use hyper::mime::Mime; +use std::sync::Arc; use url::{percent_decode, SchemeData}; -use std::sync::mpsc::Sender; - -pub fn factory(load_data: LoadData, start_chan: Sender) { +pub fn factory(load_data: LoadData, _classifier: Arc) { // NB: we don't spawn a new task. // Hypothesis: data URLs are too small for parallel base64 etc. to be worth it. // Should be tested at some point. // Left in separate function to allow easy moving to a task, if desired. - load(load_data, start_chan) + load(load_data) } -fn load(load_data: LoadData, start_chan: Sender) { +fn load(load_data: LoadData) { + let start_chan = load_data.consumer; let url = load_data.url; assert!(&*url.scheme == "data"); let mut metadata = Metadata::default(url.clone()); - let senders = ResponseSenders { - immediate_consumer: start_chan, - eventual_consumer: load_data.consumer, - }; - // Split out content type and data. let mut scheme_data = match url.scheme_data { SchemeData::NonRelative(scheme_data) => scheme_data, @@ -46,7 +42,7 @@ fn load(load_data: LoadData, start_chan: Sender) { } let parts: Vec<&str> = scheme_data.splitn(1, ',').collect(); if parts.len() != 2 { - start_sending(senders, metadata).send(Done(Err("invalid data uri".to_string()))).unwrap(); + start_sending(start_chan, metadata).send(Done(Err("invalid data uri".to_string()))).unwrap(); return; } @@ -64,7 +60,7 @@ fn load(load_data: LoadData, start_chan: Sender) { let content_type: Option = ct_str.parse().ok(); metadata.set_content_type(content_type.as_ref()); - let progress_chan = start_sending(senders, metadata); + let progress_chan = start_sending(start_chan, metadata); let bytes = percent_decode(parts[1].as_bytes()); if is_base64 { @@ -93,11 +89,9 @@ fn assert_parse(url: &'static str, data: Option>) { use std::sync::mpsc::channel; use url::Url; - use sniffer_task; let (start_chan, start_port) = channel(); - let sniffer_task = sniffer_task::new_sniffer_task(); - load(LoadData::new(Url::parse(url).unwrap(), start_chan), sniffer_task); + load(LoadData::new(Url::parse(url).unwrap(), start_chan)); let response = start_port.recv().unwrap(); assert_eq!(&response.metadata.content_type, &content_type); diff --git a/components/net/file_loader.rs b/components/net/file_loader.rs index 636c14f60e5..83fcc7ed59f 100644 --- a/components/net/file_loader.rs +++ b/components/net/file_loader.rs @@ -4,55 +4,78 @@ use net_traits::{LoadData, Metadata, ProgressMsg}; use net_traits::ProgressMsg::{Payload, Done}; -use resource_task::{start_sending, TargetedLoadResponse, ResponseSenders}; +use mime_classifier::MIMEClassifier; +use resource_task::{start_sending, start_sending_sniffed}; use std::borrow::ToOwned; use std::io; use std::fs::File; use std::path::PathBuf; +use std::sync::Arc; use std::sync::mpsc::Sender; use util::task::spawn_named; static READ_SIZE: uint = 8192; +enum ReadStatus { + Partial(Vec), + EOF, +} + +fn read_block(reader: &mut io::Read) -> Result { + let mut buf = vec![0; READ_SIZE]; + match reader.read(buf.as_mut_slice()) { + Ok(0) => return Ok(ReadStatus::EOF), + Ok(n) => { + buf.truncate(n); + Ok(ReadStatus::Partial(buf)) + } + Err(e) => Err(e.description().to_string()), + } +} + fn read_all(reader: &mut io::Read, progress_chan: &Sender) - -> Result<(), String> { + -> Result<(), String> { loop { - let mut buf = vec![0; READ_SIZE]; - match reader.read(buf.as_mut_slice()) { - Ok(0) => return Ok(()), - Ok(n) => { - buf.truncate(n); - progress_chan.send(Payload(buf)).unwrap(); - }, - Err(e) => return Err(e.description().to_string()), + match try!(read_block(reader)) { + ReadStatus::Partial(buf) => progress_chan.send(Payload(buf)).unwrap(), + ReadStatus::EOF => return Ok(()), } } } -pub fn factory(load_data: LoadData, start_chan: Sender) { +pub fn factory(load_data: LoadData, classifier: Arc) { let url = load_data.url; + let start_chan = load_data.consumer; assert!(&*url.scheme == "file"); - let senders = ResponseSenders { - immediate_consumer: start_chan, - eventual_consumer: load_data.consumer, - }; - let progress_chan = start_sending(senders, Metadata::default(url.clone())); spawn_named("file_loader".to_owned(), move || { + let metadata = Metadata::default(url.clone()); let file_path: Result = url.to_file_path(); match file_path { Ok(file_path) => { match File::open(&file_path) { Ok(ref mut reader) => { - let res = read_all(reader, &progress_chan); + let res = read_block(reader); + let (res, progress_chan) = match res { + Ok(ReadStatus::Partial(buf)) => { + let progress_chan = start_sending_sniffed(start_chan, metadata, + classifier, &buf); + progress_chan.send(Payload(buf)).unwrap(); + (read_all(reader, &progress_chan), progress_chan) + } + Ok(ReadStatus::EOF) | Err(_) => + (res.map(|_| ()), start_sending(start_chan, metadata)), + }; progress_chan.send(Done(res)).unwrap(); } Err(e) => { + let progress_chan = start_sending(start_chan, metadata); progress_chan.send(Done(Err(e.description().to_string()))).unwrap(); } } } Err(_) => { + let progress_chan = start_sending(start_chan, metadata); progress_chan.send(Done(Err(url.to_string()))).unwrap(); } } diff --git a/components/net/http_loader.rs b/components/net/http_loader.rs index 2dadcdfa3d9..c59066293dd 100644 --- a/components/net/http_loader.rs +++ b/components/net/http_loader.rs @@ -2,10 +2,10 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -use net_traits::{ControlMsg, CookieSource, LoadData, Metadata}; -use net_traits::ProgressMsg; +use net_traits::{ControlMsg, CookieSource, LoadData, LoadResponse, Metadata}; use net_traits::ProgressMsg::{Payload, Done}; -use resource_task::{TargetedLoadResponse, start_sending_opt, ResponseSenders}; +use mime_classifier::MIMEClassifier; +use resource_task::{start_sending_opt, start_sending_sniffed_opt}; use log; use std::collections::HashSet; @@ -21,6 +21,7 @@ use hyper::status::{StatusCode, StatusClass}; use std::error::Error; use openssl::ssl::{SslContext, SslVerifyMode}; use std::io::{self, Read, Write}; +use std::sync::Arc; use std::sync::mpsc::{Sender, channel}; use std::thunk::Invoke; use util::task::spawn_named; @@ -31,36 +32,50 @@ use url::{Url, UrlParser}; use std::borrow::ToOwned; pub fn factory(cookies_chan: Sender) - -> Box)> + Send> { - box move |(load_data, start_chan)| { - spawn_named("http_loader".to_owned(), move || load(load_data, start_chan, cookies_chan)) + -> Box)> + Send> { + box move |(load_data, classifier)| { + spawn_named("http_loader".to_owned(), move || load(load_data, classifier, cookies_chan)) } } -fn send_error(url: Url, err: String, senders: ResponseSenders) { +fn send_error(url: Url, err: String, start_chan: Sender) { let mut metadata: Metadata = Metadata::default(url); metadata.status = None; - match start_sending_opt(senders, metadata) { + match start_sending_opt(start_chan, metadata) { Ok(p) => p.send(Done(Err(err))).unwrap(), _ => {} }; } -fn load(mut load_data: LoadData, start_chan: Sender, cookies_chan: Sender) { +enum ReadResult { + Payload(Vec), + EOF, +} + +fn read_block(reader: &mut R) -> Result { + let mut buf = vec![0; 1024]; + + match reader.read(buf.as_mut_slice()) { + Ok(len) if len > 0 => { + unsafe { buf.set_len(len); } + Ok(ReadResult::Payload(buf)) + } + Ok(_) => Ok(ReadResult::EOF), + Err(_) => Err(()), + } +} + +fn load(mut load_data: LoadData, classifier: Arc, cookies_chan: Sender) { // FIXME: At the time of writing this FIXME, servo didn't have any central // location for configuration. If you're reading this and such a // repository DOES exist, please update this constant to use it. let max_redirects = 50u; let mut iters = 0u; + let start_chan = load_data.consumer; let mut url = load_data.url.clone(); let mut redirected_to = HashSet::new(); - let senders = ResponseSenders { - immediate_consumer: start_chan, - eventual_consumer: load_data.consumer - }; - // If the URL is a view-source scheme then the scheme data contains the // real URL that should be used for which the source is to be viewed. // Change our existing URL to that and keep note that we are viewing @@ -73,7 +88,7 @@ fn load(mut load_data: LoadData, start_chan: Sender, cooki "http" | "https" => {} _ => { let s = format!("The {} scheme with view-source is not supported", url.scheme); - send_error(url, s, senders); + send_error(url, s, start_chan); return; } }; @@ -84,7 +99,7 @@ fn load(mut load_data: LoadData, start_chan: Sender, cooki iters = iters + 1; if iters > max_redirects { - send_error(url, "too many redirects".to_string(), senders); + send_error(url, "too many redirects".to_string(), start_chan); return; } @@ -92,7 +107,7 @@ fn load(mut load_data: LoadData, start_chan: Sender, cooki "http" | "https" => {} _ => { let s = format!("{} request, but we don't support that scheme", url.scheme); - send_error(url, s, senders); + send_error(url, s, start_chan); return; } } @@ -125,13 +140,13 @@ reason: \"certificate verify failed\" }]"; ) => { let mut image = resources_dir_path(); image.push("badcert.html"); - let load_data = LoadData::new(Url::from_file_path(&*image).unwrap(), senders.eventual_consumer); - file_loader::factory(load_data, senders.immediate_consumer); + let load_data = LoadData::new(Url::from_file_path(&*image).unwrap(), start_chan); + file_loader::factory(load_data, classifier); return; }, Err(e) => { println!("{:?}", e); - send_error(url, e.description().to_string(), senders); + send_error(url, e.description().to_string(), start_chan); return; } }; @@ -179,13 +194,13 @@ reason: \"certificate verify failed\" }]"; let mut writer = match req.start() { Ok(w) => w, Err(e) => { - send_error(url, e.description().to_string(), senders); + send_error(url, e.description().to_string(), start_chan); return; } }; match writer.write_all(&*data) { Err(e) => { - send_error(url, e.description().to_string(), senders); + send_error(url, e.description().to_string(), start_chan); return; } _ => {} @@ -200,7 +215,7 @@ reason: \"certificate verify failed\" }]"; match req.start() { Ok(w) => w, Err(e) => { - send_error(url, e.description().to_string(), senders); + send_error(url, e.description().to_string(), start_chan); return; } } @@ -209,7 +224,7 @@ reason: \"certificate verify failed\" }]"; let mut response = match writer.send() { Ok(r) => r, Err(e) => { - send_error(url, e.description().to_string(), senders); + send_error(url, e.description().to_string(), start_chan); return; } }; @@ -240,7 +255,7 @@ reason: \"certificate verify failed\" }]"; Some(ref c) => { if c.preflight { // The preflight lied - send_error(url, "Preflight fetch inconsistent with main fetch".to_string(), senders); + send_error(url, "Preflight fetch inconsistent with main fetch".to_string(), start_chan); return; } else { // XXXManishearth There are some CORS-related steps here, @@ -252,7 +267,7 @@ reason: \"certificate verify failed\" }]"; let new_url = match UrlParser::new().base_url(&url).parse(&new_url) { Ok(u) => u, Err(e) => { - send_error(url, e.to_string(), senders); + send_error(url, e.to_string(), start_chan); return; } }; @@ -268,7 +283,7 @@ reason: \"certificate verify failed\" }]"; } if redirected_to.contains(&url) { - send_error(url, "redirect loop".to_string(), senders); + send_error(url, "redirect loop".to_string(), start_chan); return; } @@ -291,11 +306,6 @@ reason: \"certificate verify failed\" }]"; metadata.headers = Some(adjusted_headers); metadata.status = Some(response.status_raw().clone()); - let progress_chan = match start_sending_opt(senders, metadata) { - Ok(p) => p, - _ => return - }; - let mut encoding_str: Option = None; //FIXME: Implement Content-Encoding Header https://github.com/hyperium/hyper/issues/391 if let Some(encodings) = response.headers.get_raw("content-encoding") { @@ -313,14 +323,14 @@ reason: \"certificate verify failed\" }]"; Some(encoding) => { if encoding == "gzip" { let mut response_decoding = GzDecoder::new(response).unwrap(); - send_data(&mut response_decoding, progress_chan); + send_data(&mut response_decoding, start_chan, metadata, classifier); } else if encoding == "deflate" { let mut response_decoding = DeflateDecoder::new(response); - send_data(&mut response_decoding, progress_chan); + send_data(&mut response_decoding, start_chan, metadata, classifier); } }, None => { - send_data(&mut response, progress_chan); + send_data(&mut response, start_chan, metadata, classifier); } } @@ -329,25 +339,35 @@ reason: \"certificate verify failed\" }]"; } } -fn send_data(reader: &mut R, progress_chan: Sender) { - loop { - let mut buf = Vec::with_capacity(1024); +fn send_data(reader: &mut R, + start_chan: Sender, + metadata: Metadata, + classifier: Arc) { + let (progress_chan, mut chunk) = { + let buf = match read_block(reader) { + Ok(ReadResult::Payload(buf)) => buf, + _ => vec!(), + }; + let p = match start_sending_sniffed_opt(start_chan, metadata, classifier, &buf) { + Ok(p) => p, + _ => return + }; + (p, buf) + }; - unsafe { buf.set_len(1024); } - match reader.read(buf.as_mut_slice()) { - Ok(len) if len > 0 => { - unsafe { buf.set_len(len); } - if progress_chan.send(Payload(buf)).is_err() { - // The send errors when the receiver is out of scope, - // which will happen if the fetch has timed out (or has been aborted) - // so we don't need to continue with the loading of the file here. - return; - } - } - Ok(_) | Err(_) => { - let _ = progress_chan.send(Done(Ok(()))); - break; - } + loop { + if progress_chan.send(Payload(chunk)).is_err() { + // The send errors when the receiver is out of scope, + // which will happen if the fetch has timed out (or has been aborted) + // so we don't need to continue with the loading of the file here. + return; } + + chunk = match read_block(reader) { + Ok(ReadResult::Payload(buf)) => buf, + Ok(ReadResult::EOF) | Err(_) => break, + }; } + + let _ = progress_chan.send(Done(Ok(()))); } diff --git a/components/net/image_cache_task.rs b/components/net/image_cache_task.rs index 03ee5c8a52e..df38f9d7c31 100644 --- a/components/net/image_cache_task.rs +++ b/components/net/image_cache_task.rs @@ -428,11 +428,10 @@ mod tests { use net_traits::image_cache_task::ImageResponseMsg::*; use net_traits::image_cache_task::Msg::*; - use resource_task::{start_sending, ResponseSenders}; + use resource_task::start_sending; use net_traits::{ControlMsg, Metadata, ProgressMsg, ResourceTask}; use net_traits::image_cache_task::{ImageCacheTask, ImageResponseMsg, Msg}; use net_traits::ProgressMsg::{Payload, Done}; - use sniffer_task; use profile::time; use std::sync::mpsc::{Sender, channel, Receiver}; use url::Url; @@ -534,12 +533,7 @@ mod tests { loop { match port.recv().unwrap() { ControlMsg::Load(response) => { - let sniffer_task = sniffer_task::new_sniffer_task(); - let senders = ResponseSenders { - immediate_consumer: sniffer_task, - eventual_consumer: response.consumer.clone(), - }; - let chan = start_sending(senders, Metadata::default( + let chan = start_sending(response.consumer, Metadata::default( Url::parse("file:///fake").unwrap())); on_load.invoke(chan); } @@ -709,12 +703,7 @@ mod tests { loop { match port.recv().unwrap() { ControlMsg::Load(response) => { - let sniffer_task = sniffer_task::new_sniffer_task(); - let senders = ResponseSenders { - immediate_consumer: sniffer_task, - eventual_consumer: response.consumer.clone(), - }; - let chan = start_sending(senders, Metadata::default( + let chan = start_sending(response.consumer, Metadata::default( Url::parse("file:///fake").unwrap())); chan.send(Payload(test_image_bin())); chan.send(Done(Ok(()))); @@ -763,12 +752,7 @@ mod tests { loop { match port.recv().unwrap() { ControlMsg::Load(response) => { - let sniffer_task = sniffer_task::new_sniffer_task(); - let senders = ResponseSenders { - immediate_consumer: sniffer_task, - eventual_consumer: response.consumer.clone(), - }; - let chan = start_sending(senders, Metadata::default( + let chan = start_sending(response.consumer, Metadata::default( Url::parse("file:///fake").unwrap())); chan.send(Payload(test_image_bin())); chan.send(Done(Err("".to_string()))); diff --git a/components/net/lib.rs b/components/net/lib.rs index 46b123630d8..bd2bd882570 100644 --- a/components/net/lib.rs +++ b/components/net/lib.rs @@ -46,7 +46,7 @@ pub mod image_cache_task; pub mod pub_domains; pub mod resource_task; pub mod storage_task; -mod sniffer_task; +mod mime_classifier; /// An implementation of the [Fetch spec](http://fetch.spec.whatwg.org/) pub mod fetch { diff --git a/components/net/mime_classifier.rs b/components/net/mime_classifier.rs new file mode 100644 index 00000000000..773436d92d9 --- /dev/null +++ b/components/net/mime_classifier.rs @@ -0,0 +1,1415 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +use std::borrow::ToOwned; +use std::cmp::max; + +pub struct MIMEClassifier { + image_classifier: GroupedClassifier, + audio_video_classifer: GroupedClassifier, + scriptable_classifier: GroupedClassifier, + plaintext_classifier: GroupedClassifier, + archive_classifer: GroupedClassifier, + binary_or_plaintext: BinaryOrPlaintextClassifier, + feeds_classifier: FeedsClassifier +} + +impl MIMEClassifier { + //Performs MIME Type Sniffing Algorithm (section 7) + pub fn classify(&self, + no_sniff: bool, + check_for_apache_bug: bool, + supplied_type: &Option<(String, String)>, + data: &Vec) -> Option<(String, String)> { + + match *supplied_type{ + None => { + return self.sniff_unknown_type(!no_sniff, data); + } + Some((ref media_type, ref media_subtype)) => { + match (media_type.as_slice(), media_subtype.as_slice()) { + ("uknown", "unknown") | ("application", "uknown") | ("*", "*") => { + return self.sniff_unknown_type(!no_sniff,data); + } + _ => { + if no_sniff { + return supplied_type.clone(); + } + if check_for_apache_bug { + return self.sniff_text_or_data(data); + } + + if MIMEClassifier::is_xml(media_type, media_subtype) { + return supplied_type.clone(); + } + //Inplied in section 7.3, but flow is not clear + if MIMEClassifier::is_html(media_type, media_subtype) { + return self.feeds_classifier + .classify(data) + .or(supplied_type.clone()); + } + + if media_type.as_slice() == "image" { + let tp = self.image_classifier.classify(data); + if tp.is_some() { + return tp; + } + } + + match (media_type.as_slice(), media_subtype.as_slice()) { + ("audio", _) | ("video", _) | ("application", "ogg") => { + let tp = self.audio_video_classifer.classify(data); + if tp.is_some() { + return tp; + } + } + _ => {} + } + } + } + } + } + return supplied_type.clone(); + } + + pub fn new()->MIMEClassifier { + MIMEClassifier{ + image_classifier: GroupedClassifier::image_classifer(), + audio_video_classifer: GroupedClassifier::audio_video_classifer(), + scriptable_classifier: GroupedClassifier::scriptable_classifier(), + plaintext_classifier: GroupedClassifier::plaintext_classifier(), + archive_classifer: GroupedClassifier::archive_classifier(), + binary_or_plaintext: BinaryOrPlaintextClassifier, + feeds_classifier: FeedsClassifier + } + } + //some sort of iterator over the classifiers might be better? + fn sniff_unknown_type(&self, sniff_scriptable: bool, data: &Vec) -> + Option<(String,String)> { + if sniff_scriptable { + let tp = self.scriptable_classifier.classify(data); + if tp.is_some() {return tp;} + } + + let tp = self.plaintext_classifier.classify(data); + if tp.is_some() {return tp;} + + let tp = self.image_classifier.classify(data); + if tp.is_some() {return tp;} + + let tp = self.audio_video_classifer.classify(data); + if tp.is_some() {return tp;} + + let tp = self.archive_classifer.classify(data); + if tp.is_some() {return tp;} + + self.binary_or_plaintext.classify(data) + } + + fn sniff_text_or_data(&self, data: &Vec) -> Option<(String, String)> { + self.binary_or_plaintext.classify(data) + } + fn is_xml(tp: &str, sub_tp: &str) -> bool { + let suffix = &sub_tp[(max((sub_tp.len() as int) - ("+xml".len() as int), 0i) as uint)..]; + match (tp, sub_tp, suffix) { + (_, _, "+xml") | ("application", "xml",_) | ("text", "xml",_) => {true} + _ => {false} + } + } + fn is_html(tp: &str, sub_tp: &str) -> bool { + tp=="text" && sub_tp=="html" + } +} + +fn as_string_option(tup: Option<(&'static str, &'static str)>) -> Option<(String,String)> { + tup.map(|(a, b)| (a.to_owned(), b.to_owned())) +} + +//Interface used for composite types +trait MIMEChecker { + fn classify(&self, data: &Vec)->Option<(String, String)>; +} + +trait Matches { + fn matches(&mut self, matches: &[u8])->bool; +} + +impl <'a, T: Iterator + Clone> Matches for T { + + // Matching function that works on an iterator. + // see if the next matches.len() bytes in data_iterator equal matches + // move iterator and return true or just return false + // + // Params + // self: an iterator + // matches: a vector of bytes to match + // + // Return + // true if the next n elements of self match n elements of matches + // false otherwise + // + // Side effects + // moves the iterator when match is found + fn matches(&mut self, matches: &[u8]) -> bool { + for (byte_a, byte_b) in self.clone().take(matches.len()).zip(matches.iter()) { + if byte_a != byte_b { + return false; + } + } + self.nth(matches.len()); + true + } +} + +struct ByteMatcher { + pattern: &'static [u8], + mask: &'static [u8], + leading_ignore: &'static [u8], + content_type: (&'static str,&'static str) +} + +impl ByteMatcher { + fn matches(&self, data: &Vec) -> Option { + + if data.len() < self.pattern.len() { + return None; + } + //TODO replace with iterators if I ever figure them out... + let mut i = 0u; + let max_i = data.len()-self.pattern.len(); + + loop { + if !self.leading_ignore.iter().any(|x| *x == data[i]) { + break; + } + + i=i + 1; + if i > max_i { + return None; + } + } + for j in range(0u,self.pattern.len()) { + if (data[i] & self.mask[j]) != (self.pattern[j] & self.mask[j]) { + return None; + } + i = i + 1; + } + Some(i) + } +} + +impl MIMEChecker for ByteMatcher { + fn classify(&self, data: &Vec) -> Option<(String, String)> { + self.matches(data).map(|_| { + (self.content_type.0.to_owned(), self.content_type.1.to_owned()) + }) + } +} + +struct TagTerminatedByteMatcher { + matcher: ByteMatcher +} + +impl MIMEChecker for TagTerminatedByteMatcher { + fn classify(&self, data: &Vec) -> Option<(String, String)> { + let pattern = self.matcher.matches(data); + let pattern_matches = pattern.map(|j| j < data.len() && (data[j] == b' ' || data[j] == b'>')); + if pattern_matches.unwrap_or(false) { + Some((self.matcher.content_type.0.to_owned(), + self.matcher.content_type.1.to_owned())) + } else { + None + } + } +} +struct Mp4Matcher; + +impl Mp4Matcher { + fn matches(&self,data: &Vec) -> bool { + if data.len() < 12 { + return false; + } + let box_size = ((data[0] as u32) << 3 | (data[1] as u32) << 2 | + (data[2] as u32) << 1 | (data[3] as u32)) as uint; + if (data.len() < box_size) || (box_size % 4 != 0) { + return false; + } + //TODO replace with iterators + let ftyp = [0x66, 0x74, 0x79, 0x70]; + let mp4 = [0x6D, 0x70, 0x34]; + + for i in range(4u,8u) { + if data[i] != ftyp[i - 4] { + return false; + } + } + let mut all_match = true; + for i in range(8u,11u) { + if data[i]!=mp4[i - 8u] { + all_match = false; + break; + } + } + if all_match { + return true; + } + + let mut bytes_read = 16u; + + while bytes_read < box_size { + all_match = true; + for i in range(0u,3u) { + if mp4[i] != data[i + bytes_read] { + all_match = false; + break; + } + } + if all_match { + return true; + } + + bytes_read = bytes_read + 4; + } + false + } + +} +impl MIMEChecker for Mp4Matcher { + fn classify(&self, data: &Vec) -> Option<(String, String)> { + if self.matches(data) { + Some(("video".to_owned(), "mp4".to_owned())) + } else { + None + } + } +} + +struct BinaryOrPlaintextClassifier; + +impl BinaryOrPlaintextClassifier { + fn classify_impl(&self, data: &Vec) -> Option<(&'static str, &'static str)> { + if (data.len() >=2 && + ((data[0] == 0xFFu8 && data[1] == 0xFEu8) || + (data[0] == 0xFEu8 && data[1] == 0xFFu8))) || + (data.len() >= 3 && data[0] == 0xEFu8 && data[1] == 0xBBu8 && data[2] == 0xBFu8) + { + Some(("text", "plain")) + } + else if data.len() >= 1 && data.iter().any(|&x| x <= 0x08u8 || + x == 0x0Bu8 || + (x >= 0x0Eu8 && x <= 0x1Au8) || + (x >= 0x1Cu8 && x <= 0x1Fu8)) { + Some(("application", "octet-stream")) + } + else { + Some(("text", "plain")) + } + } +} +impl MIMEChecker for BinaryOrPlaintextClassifier { + fn classify(&self, data: &Vec) -> Option<(String, String)> { + return as_string_option(self.classify_impl(data)); + } +} +struct GroupedClassifier { + byte_matchers: Vec>, +} +impl GroupedClassifier { + fn image_classifer() -> GroupedClassifier { + GroupedClassifier { + byte_matchers: vec![ + box ByteMatcher::image_x_icon(), + box ByteMatcher::image_x_icon_cursor(), + box ByteMatcher::image_bmp(), + box ByteMatcher::image_gif89a(), + box ByteMatcher::image_gif87a(), + box ByteMatcher::image_webp(), + box ByteMatcher::image_png(), + box ByteMatcher::image_jpeg(), + ] + } + } + fn audio_video_classifer() -> GroupedClassifier { + GroupedClassifier{ + byte_matchers: vec![ + box ByteMatcher::video_webm(), + box ByteMatcher::audio_basic(), + box ByteMatcher::audio_aiff(), + box ByteMatcher::audio_mpeg(), + box ByteMatcher::application_ogg(), + box ByteMatcher::audio_midi(), + box ByteMatcher::video_avi(), + box ByteMatcher::audio_wave(), + box Mp4Matcher + ] + } + } + fn scriptable_classifier() -> GroupedClassifier { + GroupedClassifier{ + byte_matchers: vec![ + box ByteMatcher::text_html_doctype(), + box ByteMatcher::text_html_page(), + box ByteMatcher::text_html_head(), + box ByteMatcher::text_html_script(), + box ByteMatcher::text_html_iframe(), + box ByteMatcher::text_html_h1(), + box ByteMatcher::text_html_div(), + box ByteMatcher::text_html_font(), + box ByteMatcher::text_html_table(), + box ByteMatcher::text_html_a(), + box ByteMatcher::text_html_style(), + box ByteMatcher::text_html_title(), + box ByteMatcher::text_html_b(), + box ByteMatcher::text_html_body(), + box ByteMatcher::text_html_br(), + box ByteMatcher::text_html_p(), + box ByteMatcher::text_html_comment(), + box ByteMatcher::text_xml(), + box ByteMatcher::application_pdf() + ] + } + + } + fn plaintext_classifier() -> GroupedClassifier { + GroupedClassifier{ + byte_matchers: vec![ + box ByteMatcher::text_plain_utf_8_bom(), + box ByteMatcher::text_plain_utf_16le_bom(), + box ByteMatcher::text_plain_utf_16be_bom(), + box ByteMatcher::application_postscript() + ] + } + } + fn archive_classifier() -> GroupedClassifier { + GroupedClassifier { + byte_matchers: vec![ + box ByteMatcher::application_x_gzip(), + box ByteMatcher::application_zip(), + box ByteMatcher::application_x_rar_compressed() + ] + } + } + + // TODO: Use this in font context classifier + #[allow(dead_code)] + fn font_classifier() -> GroupedClassifier { + GroupedClassifier { + byte_matchers: vec![ + box ByteMatcher::application_font_woff(), + box ByteMatcher::true_type_collection(), + box ByteMatcher::open_type(), + box ByteMatcher::true_type(), + box ByteMatcher::application_vnd_ms_font_object(), + ] + } + } +} +impl MIMEChecker for GroupedClassifier { + fn classify(&self,data: &Vec) -> Option<(String, String)> { + self.byte_matchers + .iter() + .filter_map(|matcher| matcher.classify(data)) + .next() + } +} + +struct FeedsClassifier; +impl FeedsClassifier { + fn classify_impl(&self,data: &Vec) -> Option<(&'static str,&'static str)> { + let length = data.len(); + let mut data_iterator = data.iter(); + + // acceptable byte sequences + let utf8_bom = &[0xEFu8, 0xBBu8, 0xBFu8]; + + // can not be feed unless length is > 3 + if length < 3 { + return None; + } + + // eat the first three bytes if they are equal to UTF-8 BOM + data_iterator.matches(utf8_bom); + + // continuously search for next "<" until end of data_iterator + // TODO: need max_bytes to prevent inadvertently examining html document + // eg. an html page with a feed example + while !data_iterator.find(|&data_iterator| *data_iterator == b'<').is_none() { + + if data_iterator.matches(b"?") { + // eat until ?> + while !data_iterator.matches(b"?>") { + if data_iterator.next().is_none() { + return None; + } + } + } else if data_iterator.matches(b"!--") { + // eat until --> + while !data_iterator.matches(b"-->") { + if data_iterator.next().is_none() { + return None; + } + } + } else if data_iterator.matches(b"!") { + data_iterator.find(|&data_iterator| *data_iterator == b'>'); + } else if data_iterator.matches(b"rss") { + return Some(("application", "rss+xml")); + } else if data_iterator.matches(b"feed") { + return Some(("application", "atom+xml")); + } else if data_iterator.matches(b"rdf: RDF") { + while !data_iterator.next().is_none() { + if data_iterator.matches(b"http: //purl.org/rss/1.0/") { + while !data_iterator.next().is_none() { + if data_iterator.matches(b"http: //www.w3.org/1999/02/22-rdf-syntax-ns#") { + return Some(("application", "rss+xml")); + } + } + } else if data_iterator.matches(b"http: //www.w3.org/1999/02/22-rdf-syntax-ns#") { + while !data_iterator.next().is_none() { + if data_iterator.matches(b"http: //purl.org/rss/1.0/") { + return Some(("application", "rss+xml")); + } + } + } + } + } + } + + None + } +} + +impl MIMEChecker for FeedsClassifier { + fn classify(&self,data: &Vec) -> Option<(String, String)> { + as_string_option(self.classify_impl(data)) + } +} + +//Contains hard coded byte matchers +//TODO: These should be configured and not hard coded +impl ByteMatcher { + //A Windows Icon signature + fn image_x_icon()->ByteMatcher { + ByteMatcher{ + pattern: b"\x00\x00\x01\x00", + mask: b"\xFF\xFF\xFF\xFF", + content_type: ("image", "x-icon"), + leading_ignore: &[] + } + } + //A Windows Cursor signature. + fn image_x_icon_cursor()->ByteMatcher { + ByteMatcher{ + pattern: b"\x00\x00\x02\x00", + mask: b"\xFF\xFF\xFF\xFF", + content_type: ("image", "x-icon"), + leading_ignore: &[] + } + } + //The string "BM", a BMP signature. + fn image_bmp()->ByteMatcher { + ByteMatcher{ + pattern: b"BM", + mask: b"\xFF\xFF", + content_type: ("image", "bmp"), + leading_ignore: &[] + } + } + //The string "GIF89a", a GIF signature. + fn image_gif89a()->ByteMatcher { + ByteMatcher{ + pattern: b"GIF89a", + mask: b"\xFF\xFF\xFF\xFF\xFF\xFF", + content_type: ("image", "gif"), + leading_ignore: &[] + } + } + //The string "GIF87a", a GIF signature. + fn image_gif87a()->ByteMatcher { + ByteMatcher{ + pattern: b"GIF87a", + mask: b"\xFF\xFF\xFF\xFF\xFF\xFF", + content_type: ("image", "gif"), + leading_ignore: &[] + } + } + //The string "RIFF" followed by four bytes followed by the string "WEBPVP". + fn image_webp()->ByteMatcher { + ByteMatcher{ + pattern: b"RIFF\x00\x00\x00\x00WEBPVP", + mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00,\xFF\xFF\xFF\xFF\xFF\xFF", + content_type: ("image", "webp"), + leading_ignore: &[] + } + } + //An error-checking byte followed by the string "PNG" followed by CR LF SUB LF, the PNG + //signature. + fn image_png()->ByteMatcher { + ByteMatcher{ + pattern: b"\x89PNG\r\n\x1A\n", + mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", + content_type: ("image", "png"), + leading_ignore: &[] + } + } + // The JPEG Start of Image marker followed by the indicator byte of another marker. + fn image_jpeg()->ByteMatcher { + ByteMatcher{ + pattern: b"\xFF\xD8\xFF", + mask: b"\xFF\xFF\xFF", + content_type: ("image", "jpeg"), + leading_ignore: &[] + } + } + //The WebM signature. [TODO: Use more bytes?] + fn video_webm()->ByteMatcher { + ByteMatcher{ + pattern: b"\x1A\x45\xDF\xA3", + mask: b"\xFF\xFF\xFF\xFF", + content_type: ("video", "webm"), + leading_ignore: &[] + } + } + //The string ".snd", the basic audio signature. + fn audio_basic()->ByteMatcher { + ByteMatcher{ + pattern: b".snd", + mask: b"\xFF\xFF\xFF\xFF", + content_type: ("audio", "basic"), + leading_ignore: &[] + } + } + //The string "FORM" followed by four bytes followed by the string "AIFF", the AIFF signature. + fn audio_aiff()->ByteMatcher { + ByteMatcher{ + pattern: b"FORM\x00\x00\x00\x00AIFF", + mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", + content_type: ("audio", "aiff"), + leading_ignore: &[] + } + } + //The string "ID3", the ID3v2-tagged MP3 signature. + fn audio_mpeg()->ByteMatcher { + ByteMatcher{ + pattern: b"ID3", + mask: b"\xFF\xFF\xFF", + content_type: ("audio", "mpeg"), + leading_ignore: &[] + } + } + //The string "OggS" followed by NUL, the Ogg container signature. + fn application_ogg()->ByteMatcher { + ByteMatcher{ + pattern: b"OggS", + mask: b"\xFF\xFF\xFF\xFF\xFF", + content_type: ("application", "ogg"), + leading_ignore: &[] + } + } + //The string "MThd" followed by four bytes representing the number 6 in 32 bits (big-endian), + //the MIDI signature. + fn audio_midi()->ByteMatcher { + ByteMatcher{ + pattern: b"MThd\x00\x00\x00\x06", + mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", + content_type: ("audio", "midi"), + leading_ignore: &[] + } + } + //The string "RIFF" followed by four bytes followed by the string "AVI ", the AVI signature. + fn video_avi()->ByteMatcher { + ByteMatcher{ + pattern: b"RIFF\x00\x00\x00\x00AVI ", + mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", + content_type: ("video", "avi"), + leading_ignore: &[] + } + } + // The string "RIFF" followed by four bytes followed by the string "WAVE", the WAVE signature. + fn audio_wave()->ByteMatcher { + ByteMatcher{ + pattern: b"RIFF\x00\x00\x00\x00WAVE", + mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", + content_type: ("audio", "wave"), + leading_ignore: &[] + } + } + // doctype terminated with Tag terminating (TT) Byte + fn text_html_doctype()->TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher { + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b"TagTerminatedByteMatcher { + TagTerminatedByteMatcher { + matcher: ByteMatcher{ + pattern: b" diff --git a/tests/content/parsable_mime/text/html/text_html_comment_3e_u.html b/tests/content/parsable_mime/text/html/text_html_comment_3e_u.html new file mode 100644 index 00000000000..44a94ca5a7a --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_comment_3e_u.html @@ -0,0 +1,3 @@ + + + diff --git a/tests/content/parsable_mime/text/html/text_html_div_20.html b/tests/content/parsable_mime/text/html/text_html_div_20.html new file mode 100644 index 00000000000..2ed34363b2f --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_div_20.html @@ -0,0 +1,3 @@ + + +
diff --git a/tests/content/parsable_mime/text/html/text_html_div_3e_u.html b/tests/content/parsable_mime/text/html/text_html_div_3e_u.html new file mode 100644 index 00000000000..c117f0f4cdd --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_div_3e_u.html @@ -0,0 +1,3 @@ + + +
diff --git a/tests/content/parsable_mime/text/html/text_html_doctype_20.html b/tests/content/parsable_mime/text/html/text_html_doctype_20.html new file mode 100644 index 00000000000..dbeb5a41c2a --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_doctype_20.html @@ -0,0 +1,3 @@ + + + + diff --git a/tests/content/parsable_mime/text/html/text_html_doctype_3e_u.html b/tests/content/parsable_mime/text/html/text_html_doctype_3e_u.html new file mode 100644 index 00000000000..8b16e40458e --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_doctype_3e_u.html @@ -0,0 +1,4 @@ + + + + diff --git a/tests/content/parsable_mime/text/html/text_html_font_20.html b/tests/content/parsable_mime/text/html/text_html_font_20.html new file mode 100644 index 00000000000..a18fa850617 --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_font_20.html @@ -0,0 +1,3 @@ + + + diff --git a/tests/content/parsable_mime/text/html/text_html_font_3e_u.html b/tests/content/parsable_mime/text/html/text_html_font_3e_u.html new file mode 100644 index 00000000000..1181517947b --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_font_3e_u.html @@ -0,0 +1,3 @@ + + + diff --git a/tests/content/parsable_mime/text/html/text_html_h1_20.html b/tests/content/parsable_mime/text/html/text_html_h1_20.html new file mode 100644 index 00000000000..3ed0eb125ff --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_h1_20.html @@ -0,0 +1,3 @@ + + +

diff --git a/tests/content/parsable_mime/text/html/text_html_h1_3e_u.html b/tests/content/parsable_mime/text/html/text_html_h1_3e_u.html new file mode 100644 index 00000000000..bae85229fcf --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_h1_3e_u.html @@ -0,0 +1,3 @@ + + +

diff --git a/tests/content/parsable_mime/text/html/text_html_head_20.html b/tests/content/parsable_mime/text/html/text_html_head_20.html new file mode 100644 index 00000000000..eb322c946e0 --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_head_20.html @@ -0,0 +1,3 @@ + + + diff --git a/tests/content/parsable_mime/text/html/text_html_head_3e_u.html b/tests/content/parsable_mime/text/html/text_html_head_3e_u.html new file mode 100644 index 00000000000..8a33d623daa --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_head_3e_u.html @@ -0,0 +1,3 @@ + + + diff --git a/tests/content/parsable_mime/text/html/text_html_iframe_20.html b/tests/content/parsable_mime/text/html/text_html_iframe_20.html new file mode 100644 index 00000000000..e632915590a --- /dev/null +++ b/tests/content/parsable_mime/text/html/text_html_iframe_20.html @@ -0,0 +1,3 @@ + + +