Hookup mime classifier to main document loader (#39109)

While we don't perform any sniffing yet, it does now
check all relevant mime types. Before, we would only
check specific versions of specific mime types. However,
the specification is more elaborate, which MimeClassifier
already handles.

There are two new test failures, but they actually fail
in the same way in other browsers. These tests are still
being discussed what the correct behavior should be [1]

Part of #14024

[1]:
https://github.com/whatwg/mimesniff/issues/189#issuecomment-2081559661

---------

Signed-off-by: Tim van der Lippe <tvanderlippe@gmail.com>
This commit is contained in:
Tim van der Lippe 2025-09-04 10:30:55 +02:00 committed by GitHub
parent a44b98c358
commit dab195ca80
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 284 additions and 619 deletions

View file

@ -13,7 +13,6 @@ pub struct MimeClassifier {
plaintext_classifier: GroupedClassifier,
archive_classifier: GroupedClassifier,
binary_or_plaintext: BinaryOrPlaintextClassifier,
feeds_classifier: FeedsClassifier,
font_classifier: GroupedClassifier,
}
@ -26,8 +25,11 @@ pub enum MediaType {
JavaScript,
Json,
Font,
Text,
Css,
}
#[derive(PartialEq)]
pub enum ApacheBugFlag {
On,
Off,
@ -35,12 +37,11 @@ pub enum ApacheBugFlag {
impl ApacheBugFlag {
/// <https://mimesniff.spec.whatwg.org/#supplied-mime-type-detection-algorithm>
pub fn from_content_type(last_raw_content_type: &[u8]) -> ApacheBugFlag {
if last_raw_content_type == b"text/plain" ||
last_raw_content_type == b"text/plain; charset=ISO-8859-1" ||
last_raw_content_type == b"text/plain; charset=iso-8859-1" ||
last_raw_content_type == b"text/plain; charset=UTF-8"
{
pub fn from_content_type(mime_type: Option<&Mime>) -> ApacheBugFlag {
// TODO(36801): also handle charset ISO-8859-1
if mime_type.is_some_and(|mime_type| {
*mime_type == mime::TEXT_PLAIN || *mime_type == mime::TEXT_PLAIN_UTF_8
}) {
ApacheBugFlag::On
} else {
ApacheBugFlag::Off
@ -63,14 +64,13 @@ impl Default for MimeClassifier {
plaintext_classifier: GroupedClassifier::plaintext_classifier(),
archive_classifier: GroupedClassifier::archive_classifier(),
binary_or_plaintext: BinaryOrPlaintextClassifier,
feeds_classifier: FeedsClassifier,
font_classifier: GroupedClassifier::font_classifier(),
}
}
}
impl MimeClassifier {
// Performs MIME Type Sniffing Algorithm (sections 7 and 8)
/// <https://mimesniff.spec.whatwg.org/#mime-type-sniffing-algorithm>
pub fn classify<'a>(
&'a self,
context: LoadContext,
@ -82,39 +82,53 @@ impl MimeClassifier {
let supplied_type_or_octet_stream = supplied_type
.clone()
.unwrap_or(mime::APPLICATION_OCTET_STREAM);
// Step 1. If the supplied MIME type is an XML MIME type or HTML MIME type,
// the computed MIME type is the supplied MIME type.
if Self::is_xml(&supplied_type_or_octet_stream) ||
Self::is_html(&supplied_type_or_octet_stream)
{
return supplied_type_or_octet_stream;
}
match context {
LoadContext::Browsing => match *supplied_type {
// Step 2. If the supplied MIME type is undefined or if the supplied MIME types essence is "unknown/unknown",
// "application/unknown", or "*/*", execute the rules for identifying
// an unknown MIME type with the sniff-scriptable flag equal to the inverse of the no-sniff flag and abort these steps.
None => self.sniff_unknown_type(no_sniff_flag, data),
Some(ref supplied_type) => {
if MimeClassifier::is_explicit_unknown(supplied_type) {
self.sniff_unknown_type(no_sniff_flag, data)
} else {
match no_sniff_flag {
NoSniffFlag::On => supplied_type.clone(),
NoSniffFlag::Off => match apache_bug_flag {
ApacheBugFlag::On => self.sniff_text_or_data(data),
ApacheBugFlag::Off => {
match MimeClassifier::get_media_type(supplied_type) {
Some(MediaType::Html) => {
self.feeds_classifier.classify(data)
},
Some(MediaType::Image) => {
self.image_classifier.classify(data)
},
Some(MediaType::AudioVideo) => {
self.audio_video_classifier.classify(data)
},
Some(MediaType::JavaScript) |
Some(MediaType::Font) |
Some(MediaType::Json) |
Some(MediaType::Xml) |
None => None,
}
.unwrap_or(supplied_type.clone())
},
},
}
return self.sniff_unknown_type(no_sniff_flag, data);
}
// Step 3. If the no-sniff flag is set, the computed MIME type is the supplied MIME type.
// Abort these steps.
if no_sniff_flag == NoSniffFlag::On {
return supplied_type.clone();
}
// Step 4. If the check-for-apache-bug flag is set,
// execute the rules for distinguishing if a resource is text or binary and abort these steps.
if apache_bug_flag == ApacheBugFlag::On {
return self.sniff_text_or_data(data);
}
match MimeClassifier::get_media_type(supplied_type) {
// Step 5. If the supplied MIME type is an image MIME type supported by the user agent,
// let matched-type be the result of executing the image type pattern matching algorithm with
// the resource header as the byte sequence to be matched.
Some(MediaType::Image) => {
// Step 6. If matched-type is not undefined, the computed MIME type is matched-type.
self.image_classifier.classify(data)
},
// Step 7. If the supplied MIME type is an audio or video MIME type supported by the user agent,
// let matched-type be the result of executing the audio or video type pattern matching algorithm
// with the resource header as the byte sequence to be matched.
Some(MediaType::AudioVideo) => {
// Step 8. If matched-type is not undefined, the computed MIME type is matched-type.
self.audio_video_classifier.classify(data)
},
Some(MediaType::Html) | Some(MediaType::Xml) => unreachable!(),
_ => None,
}
// Step 9. The computed MIME type is the supplied MIME type.
.unwrap_or(supplied_type.clone())
},
},
LoadContext::Image => {
@ -195,7 +209,6 @@ impl MimeClassifier {
self.plaintext_classifier.validate()?;
self.archive_classifier.validate()?;
self.binary_or_plaintext.validate()?;
self.feeds_classifier.validate()?;
self.font_classifier.validate()?;
Ok(())
}
@ -300,6 +313,14 @@ impl MimeClassifier {
.contains(&mt.subtype().as_str())))
}
fn is_text(mt: &Mime) -> bool {
*mt == mime::TEXT_PLAIN || mt.essence_str() == "text/vtt"
}
fn is_css(mt: &Mime) -> bool {
*mt == mime::TEXT_CSS
}
pub fn get_media_type(mime: &Mime) -> Option<MediaType> {
if MimeClassifier::is_xml(mime) {
Some(MediaType::Xml)
@ -315,6 +336,10 @@ impl MimeClassifier {
Some(MediaType::Font)
} else if MimeClassifier::is_json(mime) {
Some(MediaType::Json)
} else if MimeClassifier::is_text(mime) {
Some(MediaType::Text)
} else if MimeClassifier::is_css(mime) {
Some(MediaType::Css)
} else {
None
}
@ -334,38 +359,6 @@ trait MIMEChecker {
fn validate(&self) -> Result<(), String>;
}
trait Matches {
fn matches(&mut self, matches: &[u8]) -> bool;
}
impl<'a, T: Iterator<Item = &'a u8> + Clone> Matches for T {
// Matching function that works on an iterator.
// see if the next matches.len() bytes in data_iterator equal matches
// move iterator and return true or just return false
//
// Params
// self: an iterator
// matches: a vector of bytes to match
//
// Return
// true if the next n elements of self match n elements of matches
// false otherwise
//
// Side effects
// moves the iterator when match is found
fn matches(&mut self, matches: &[u8]) -> bool {
if self.clone().nth(matches.len()).is_none() {
// there are less than matches.len() elements in self
return false;
}
let result = self.clone().zip(matches).all(|(s, m)| *s == *m);
if result {
self.nth(matches.len());
}
result
}
}
struct ByteMatcher {
pattern: &'static [u8],
mask: &'static [u8],
@ -452,29 +445,45 @@ impl MIMEChecker for TagTerminatedByteMatcher {
pub struct Mp4Matcher;
impl Mp4Matcher {
/// <https://mimesniff.spec.whatwg.org/#matches-the-signature-for-mp4>
pub fn matches(&self, data: &[u8]) -> bool {
// Step 1. Let sequence be the byte sequence to be matched,
// where sequence[s] is byte s in sequence and sequence[0] is the first byte in sequence.
// Step 2. Let length be the number of bytes in sequence.
// Step 3. If length is less than 12, return false.
if data.len() < 12 {
return false;
}
// Step 4. Let box-size be the four bytes from sequence[0] to sequence[3],
// interpreted as a 32-bit unsigned big-endian integer.
let box_size = (((data[0] as u32) << 24) |
((data[1] as u32) << 16) |
((data[2] as u32) << 8) |
(data[3] as u32)) as usize;
// Step 5. If length is less than box-size or if box-size modulo 4 is not equal to 0, return false.
if (data.len() < box_size) || (box_size % 4 != 0) {
return false;
}
// Step 6. If the four bytes from sequence[4] to sequence[7] are not equal to 0x66 0x74 0x79 0x70 ("ftyp"), return false.
let ftyp = [0x66, 0x74, 0x79, 0x70];
if !data[4..].starts_with(&ftyp) {
return false;
}
// Step 7. If the three bytes from sequence[8] to sequence[10] are equal to 0x6D 0x70 0x34 ("mp4"), return true.
let mp4 = [0x6D, 0x70, 0x34];
data[8..].starts_with(&mp4) ||
// Step 8. Let bytes-read be 16.
// Step 9. While bytes-read is less than box-size, continuously loop through these steps:
data[16..box_size]
// Step 11. Increment bytes-read by 4.
.chunks(4)
// Step 10. If the three bytes from sequence[bytes-read] to sequence[bytes-read + 2]
// are equal to 0x6D 0x70 0x34 ("mp4"), return true.
.any(|chunk| chunk.starts_with(&mp4))
// Step 12. Return false.
}
}
impl MIMEChecker for Mp4Matcher {
@ -494,7 +503,15 @@ impl MIMEChecker for Mp4Matcher {
struct BinaryOrPlaintextClassifier;
impl BinaryOrPlaintextClassifier {
/// <https://mimesniff.spec.whatwg.org/#rules-for-text-or-binary>
fn classify_impl(&self, data: &[u8]) -> Mime {
// Step 1. Let length be the number of bytes in the resource header.
// Step 2. If length is greater than or equal to 2 and
// the first 2 bytes of the resource header are equal to 0xFE 0xFF (UTF-16BE BOM)
// or 0xFF 0xFE (UTF-16LE BOM), the computed MIME type is "text/plain".
// Step 3. If length is greater than or equal to 3
// and the first 3 bytes of the resource header are equal to
// 0xEF 0xBB 0xBF (UTF-8 BOM), the computed MIME type is "text/plain".
if data.starts_with(&[0xFFu8, 0xFEu8]) ||
data.starts_with(&[0xFEu8, 0xFFu8]) ||
data.starts_with(&[0xEFu8, 0xBBu8, 0xBFu8])
@ -506,8 +523,11 @@ impl BinaryOrPlaintextClassifier {
(0x0Eu8..=0x1Au8).contains(&x) ||
(0x1Cu8..=0x1Fu8).contains(&x)
}) {
// Step 5. The computed MIME type is "application/octet-stream".
mime::APPLICATION_OCTET_STREAM
} else {
// Step 4. If the resource header contains no binary data bytes,
// the computed MIME type is "text/plain".
mime::TEXT_PLAIN
}
}
@ -629,120 +649,6 @@ impl MIMEChecker for GroupedClassifier {
}
}
enum Match {
None,
Start,
StartAndEnd,
}
impl Match {
fn chain<F: FnOnce() -> Match>(self, f: F) -> Match {
if let Match::None = self {
return f();
}
self
}
}
fn eats_until<'a, T>(matcher: &mut T, start: &[u8], end: &[u8]) -> Match
where
T: Iterator<Item = &'a u8> + Clone,
{
if !matcher.matches(start) {
Match::None
} else if end.len() == 1 {
if matcher.any(|&x| x == end[0]) {
Match::StartAndEnd
} else {
Match::Start
}
} else {
while !matcher.matches(end) {
if matcher.next().is_none() {
return Match::Start;
}
}
Match::StartAndEnd
}
}
struct FeedsClassifier;
impl FeedsClassifier {
// Implements sniffing for mislabeled feeds (https://mimesniff.spec.whatwg.org/#sniffing-a-mislabeled-feed)
fn classify_impl(&self, data: &[u8]) -> Option<Mime> {
// Step 4: can not be feed unless length is > 3
if data.len() < 3 {
return None;
}
let mut matcher = data.iter();
// eat the first three acceptable byte sequences if they are equal to UTF-8 BOM
let utf8_bom = &[0xEFu8, 0xBBu8, 0xBFu8];
matcher.matches(utf8_bom);
// continuously search for next "<" until end of matcher
// TODO: need max_bytes to prevent inadvertently examining html document
// eg. an html page with a feed example
loop {
if !matcher.any(|x| *x == b'<') {
return None;
}
// Steps 5.2.1 to 5.2.4
match eats_until(&mut matcher, b"?", b"?>")
.chain(|| eats_until(&mut matcher, b"!--", b"-->"))
.chain(|| eats_until(&mut matcher, b"!", b">"))
{
Match::StartAndEnd => continue,
Match::None => {},
Match::Start => return None,
}
// Step 5.2.5
if matcher.matches(b"rss") {
return Some("application/rss+xml".parse().unwrap());
}
// Step 5.2.6
if matcher.matches(b"feed") {
return Some("application/atom+xml".parse().unwrap());
}
// Step 5.2.7
if matcher.matches(b"rdf:RDF") {
while matcher.next().is_some() {
match eats_until(
&mut matcher,
b"http://purl.org/rss/1.0/",
b"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
)
.chain(|| {
eats_until(
&mut matcher,
b"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
b"http://purl.org/rss/1.0/",
)
}) {
Match::StartAndEnd => return Some("application/rss+xml".parse().unwrap()),
Match::None => {},
Match::Start => return None,
}
}
return None;
}
}
}
}
impl MIMEChecker for FeedsClassifier {
fn classify(&self, data: &[u8]) -> Option<Mime> {
self.classify_impl(data)
}
fn validate(&self) -> Result<(), String> {
Ok(())
}
}
// Contains hard coded byte matchers
// TODO: These should be configured and not hard coded
impl ByteMatcher {