mirror of
https://github.com/servo/servo.git
synced 2025-06-24 00:54:32 +01:00
Auto merge of #28006 - andreubotella:document-charset-bom, r=jdm
Fix `document.characterSet` not reflecting byte order marks. The process of decoding the network byte stream to Unicode is backed by an instance of `encoding_rs::Decoder`, which will switch the encoding it uses if it finds a BOM in the byte stream. However, this change in encoding is not communicated back to the caller and so `document.characterSet` gives the wrong result. This change fixes that. See whatwg/html#5359 and whatwg/encoding#203 for the spec-level backing for this change. --- <!-- Thank you for contributing to Servo! Please replace each `[ ]` by `[X]` when the step is complete, and replace `___` with appropriate data: --> - [X] `./mach build -d` does not report any errors - [X] `./mach test-tidy` does not report any errors - [X] These changes fix #28005 (GitHub issue number if applicable) <!-- Either: --> - [X] There are tests for these changes OR - [ ] These changes do not require tests because ___ <!-- Also, please make sure that "Allow edits from maintainers" checkbox is checked, so that we can help you if you get stuck somewhere along the way.--> <!-- Pull requests that do not address these steps are welcome, but they will require additional verification as part of the review process. -->
This commit is contained in:
commit
4d1641bf9b
8 changed files with 64 additions and 52 deletions
|
@ -77,7 +77,7 @@ impl DOMParserMethods for DOMParser {
|
||||||
None,
|
None,
|
||||||
Default::default(),
|
Default::default(),
|
||||||
);
|
);
|
||||||
ServoParser::parse_html_document(&document, s, url);
|
ServoParser::parse_html_document(&document, Some(s), url);
|
||||||
document.set_ready_state(DocumentReadyState::Complete);
|
document.set_ready_state(DocumentReadyState::Complete);
|
||||||
Ok(document)
|
Ok(document)
|
||||||
},
|
},
|
||||||
|
@ -97,7 +97,7 @@ impl DOMParserMethods for DOMParser {
|
||||||
None,
|
None,
|
||||||
Default::default(),
|
Default::default(),
|
||||||
);
|
);
|
||||||
ServoParser::parse_xml_document(&document, s, url);
|
ServoParser::parse_xml_document(&document, Some(s), url);
|
||||||
document.set_ready_state(DocumentReadyState::Complete);
|
document.set_ready_state(DocumentReadyState::Complete);
|
||||||
Ok(document)
|
Ok(document)
|
||||||
},
|
},
|
||||||
|
|
|
@ -83,6 +83,12 @@ pub struct ServoParser {
|
||||||
reflector: Reflector,
|
reflector: Reflector,
|
||||||
/// The document associated with this parser.
|
/// The document associated with this parser.
|
||||||
document: Dom<Document>,
|
document: Dom<Document>,
|
||||||
|
/// The BOM sniffing state.
|
||||||
|
///
|
||||||
|
/// `None` means we've found the BOM, we've found there isn't one, or
|
||||||
|
/// we're not parsing from a byte stream. `Some` contains the BOM bytes
|
||||||
|
/// found so far.
|
||||||
|
bom_sniff: DomRefCell<Option<Vec<u8>>>,
|
||||||
/// The decoder used for the network input.
|
/// The decoder used for the network input.
|
||||||
network_decoder: DomRefCell<Option<NetworkDecoder>>,
|
network_decoder: DomRefCell<Option<NetworkDecoder>>,
|
||||||
/// Input received from network.
|
/// Input received from network.
|
||||||
|
@ -142,7 +148,7 @@ impl ServoParser {
|
||||||
self.can_write() || self.tokenizer.try_borrow_mut().is_ok()
|
self.can_write() || self.tokenizer.try_borrow_mut().is_ok()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_html_document(document: &Document, input: DOMString, url: ServoUrl) {
|
pub fn parse_html_document(document: &Document, input: Option<DOMString>, url: ServoUrl) {
|
||||||
let parser = if pref!(dom.servoparser.async_html_tokenizer.enabled) {
|
let parser = if pref!(dom.servoparser.async_html_tokenizer.enabled) {
|
||||||
ServoParser::new(
|
ServoParser::new(
|
||||||
document,
|
document,
|
||||||
|
@ -163,7 +169,13 @@ impl ServoParser {
|
||||||
ParserKind::Normal,
|
ParserKind::Normal,
|
||||||
)
|
)
|
||||||
};
|
};
|
||||||
parser.parse_string_chunk(String::from(input));
|
|
||||||
|
// Set as the document's current parser and initialize with `input`, if given.
|
||||||
|
if let Some(input) = input {
|
||||||
|
parser.parse_string_chunk(String::from(input));
|
||||||
|
} else {
|
||||||
|
parser.document.set_current_parser(Some(&parser));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// https://html.spec.whatwg.org/multipage/#parsing-html-fragments
|
// https://html.spec.whatwg.org/multipage/#parsing-html-fragments
|
||||||
|
@ -242,17 +254,24 @@ impl ServoParser {
|
||||||
LastChunkState::NotReceived,
|
LastChunkState::NotReceived,
|
||||||
ParserKind::ScriptCreated,
|
ParserKind::ScriptCreated,
|
||||||
);
|
);
|
||||||
|
*parser.bom_sniff.borrow_mut() = None;
|
||||||
document.set_current_parser(Some(&parser));
|
document.set_current_parser(Some(&parser));
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_xml_document(document: &Document, input: DOMString, url: ServoUrl) {
|
pub fn parse_xml_document(document: &Document, input: Option<DOMString>, url: ServoUrl) {
|
||||||
let parser = ServoParser::new(
|
let parser = ServoParser::new(
|
||||||
document,
|
document,
|
||||||
Tokenizer::Xml(self::xml::Tokenizer::new(document, url)),
|
Tokenizer::Xml(self::xml::Tokenizer::new(document, url)),
|
||||||
LastChunkState::NotReceived,
|
LastChunkState::NotReceived,
|
||||||
ParserKind::Normal,
|
ParserKind::Normal,
|
||||||
);
|
);
|
||||||
parser.parse_string_chunk(String::from(input));
|
|
||||||
|
// Set as the document's current parser and initialize with `input`, if given.
|
||||||
|
if let Some(input) = input {
|
||||||
|
parser.parse_string_chunk(String::from(input));
|
||||||
|
} else {
|
||||||
|
parser.document.set_current_parser(Some(&parser));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn script_nesting_level(&self) -> usize {
|
pub fn script_nesting_level(&self) -> usize {
|
||||||
|
@ -402,6 +421,7 @@ impl ServoParser {
|
||||||
ServoParser {
|
ServoParser {
|
||||||
reflector: Reflector::new(),
|
reflector: Reflector::new(),
|
||||||
document: Dom::from_ref(document),
|
document: Dom::from_ref(document),
|
||||||
|
bom_sniff: DomRefCell::new(Some(Vec::with_capacity(3))),
|
||||||
network_decoder: DomRefCell::new(Some(NetworkDecoder::new(document.encoding()))),
|
network_decoder: DomRefCell::new(Some(NetworkDecoder::new(document.encoding()))),
|
||||||
network_input: DomRefCell::new(BufferQueue::new()),
|
network_input: DomRefCell::new(BufferQueue::new()),
|
||||||
script_input: DomRefCell::new(BufferQueue::new()),
|
script_input: DomRefCell::new(BufferQueue::new()),
|
||||||
|
@ -463,6 +483,35 @@ impl ServoParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn push_bytes_input_chunk(&self, chunk: Vec<u8>) {
|
fn push_bytes_input_chunk(&self, chunk: Vec<u8>) {
|
||||||
|
// BOM sniff. This is needed because NetworkDecoder will switch the
|
||||||
|
// encoding based on the BOM, but it won't change
|
||||||
|
// `self.document.encoding` in the process.
|
||||||
|
{
|
||||||
|
let set_bom_to_none = if let Some(partial_bom) = self.bom_sniff.borrow_mut().as_mut() {
|
||||||
|
if partial_bom.len() + chunk.len() >= 3 {
|
||||||
|
partial_bom.extend(chunk.iter().take(3 - partial_bom.len()).copied());
|
||||||
|
//println!("Full BOM: {:?}", partial_bom);
|
||||||
|
if let Some((encoding, _)) = Encoding::for_bom(&partial_bom) {
|
||||||
|
//println!("Encoding: {}", encoding.name());
|
||||||
|
self.document.set_encoding(encoding);
|
||||||
|
} else {
|
||||||
|
//println!("Bytes are not a BOM.");
|
||||||
|
};
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
partial_bom.extend(chunk.iter().copied());
|
||||||
|
//println!("Partial BOM: {:?}", partial_bom);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
//println!("partial_bom is None");
|
||||||
|
false
|
||||||
|
};
|
||||||
|
if set_bom_to_none {
|
||||||
|
*self.bom_sniff.borrow_mut() = None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// For byte input, we convert it to text using the network decoder.
|
// For byte input, we convert it to text using the network decoder.
|
||||||
let chunk = self
|
let chunk = self
|
||||||
.network_decoder
|
.network_decoder
|
||||||
|
@ -474,6 +523,11 @@ impl ServoParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn push_string_input_chunk(&self, chunk: String) {
|
fn push_string_input_chunk(&self, chunk: String) {
|
||||||
|
// If the input is a string, we don't have a BOM.
|
||||||
|
if self.bom_sniff.borrow().is_some() {
|
||||||
|
*self.bom_sniff.borrow_mut() = None;
|
||||||
|
}
|
||||||
|
|
||||||
// The input has already been decoded as a string, so doesn't need
|
// The input has already been decoded as a string, so doesn't need
|
||||||
// to be decoded by the network decoder again.
|
// to be decoded by the network decoder again.
|
||||||
let chunk = StrTendril::from(chunk);
|
let chunk = StrTendril::from(chunk);
|
||||||
|
|
|
@ -1488,7 +1488,7 @@ impl XMLHttpRequest {
|
||||||
let (decoded, _, _) = charset.decode(&response);
|
let (decoded, _, _) = charset.decode(&response);
|
||||||
let document = self.new_doc(IsHTMLDocument::HTMLDocument);
|
let document = self.new_doc(IsHTMLDocument::HTMLDocument);
|
||||||
// TODO: Disable scripting while parsing
|
// TODO: Disable scripting while parsing
|
||||||
ServoParser::parse_html_document(&document, DOMString::from(decoded), wr.get_url());
|
ServoParser::parse_html_document(&document, Some(DOMString::from(decoded)), wr.get_url());
|
||||||
document
|
document
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1499,7 +1499,7 @@ impl XMLHttpRequest {
|
||||||
let (decoded, _, _) = charset.decode(&response);
|
let (decoded, _, _) = charset.decode(&response);
|
||||||
let document = self.new_doc(IsHTMLDocument::NonHTMLDocument);
|
let document = self.new_doc(IsHTMLDocument::NonHTMLDocument);
|
||||||
// TODO: Disable scripting while parsing
|
// TODO: Disable scripting while parsing
|
||||||
ServoParser::parse_xml_document(&document, DOMString::from(decoded), wr.get_url());
|
ServoParser::parse_xml_document(&document, Some(DOMString::from(decoded)), wr.get_url());
|
||||||
document
|
document
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3415,15 +3415,13 @@ impl ScriptThread {
|
||||||
(incomplete.browsing_context_id, incomplete.pipeline_id, None),
|
(incomplete.browsing_context_id, incomplete.pipeline_id, None),
|
||||||
);
|
);
|
||||||
|
|
||||||
let parse_input = DOMString::new();
|
|
||||||
|
|
||||||
document.set_https_state(metadata.https_state);
|
document.set_https_state(metadata.https_state);
|
||||||
document.set_navigation_start(incomplete.navigation_start_precise);
|
document.set_navigation_start(incomplete.navigation_start_precise);
|
||||||
|
|
||||||
if is_html_document == IsHTMLDocument::NonHTMLDocument {
|
if is_html_document == IsHTMLDocument::NonHTMLDocument {
|
||||||
ServoParser::parse_xml_document(&document, parse_input, final_url);
|
ServoParser::parse_xml_document(&document, None, final_url);
|
||||||
} else {
|
} else {
|
||||||
ServoParser::parse_html_document(&document, parse_input, final_url);
|
ServoParser::parse_html_document(&document, None, final_url);
|
||||||
}
|
}
|
||||||
|
|
||||||
if incomplete.activity == DocumentActivity::FullyActive {
|
if incomplete.activity == DocumentActivity::FullyActive {
|
||||||
|
|
|
@ -1,4 +0,0 @@
|
||||||
[bom-handling.html]
|
|
||||||
[document.characterSet should match the BOM]
|
|
||||||
expected: FAIL
|
|
||||||
|
|
|
@ -1,4 +0,0 @@
|
||||||
[bom-handling.html]
|
|
||||||
[document.characterSet should match the BOM]
|
|
||||||
expected: FAIL
|
|
||||||
|
|
|
@ -1,10 +1,4 @@
|
||||||
[utf-32-from-win1252.html]
|
[utf-32-from-win1252.html]
|
||||||
[Expect resources/utf-32-little-endian-bom.xml to parse as UTF-16LE]
|
|
||||||
expected: FAIL
|
|
||||||
|
|
||||||
[Expect resources/utf-32-little-endian-bom.html to parse as UTF-16LE]
|
|
||||||
expected: FAIL
|
|
||||||
|
|
||||||
[Expect resources/utf-32-big-endian-bom.html to parse as windows-1252]
|
[Expect resources/utf-32-big-endian-bom.html to parse as windows-1252]
|
||||||
expected: FAIL
|
expected: FAIL
|
||||||
|
|
||||||
|
|
|
@ -1,26 +0,0 @@
|
||||||
[utf-32.html]
|
|
||||||
type: testharness
|
|
||||||
[Expect resources/utf-32-big-endian-bom.html to parse as windows-1252]
|
|
||||||
expected: FAIL
|
|
||||||
|
|
||||||
[Expect resources/utf-32-big-endian-bom.xml to parse as windows-1252]
|
|
||||||
expected: FAIL
|
|
||||||
|
|
||||||
[Expect resources/utf-32-big-endian-nobom.html to parse as windows-1252]
|
|
||||||
expected: FAIL
|
|
||||||
|
|
||||||
[Expect resources/utf-32-big-endian-nobom.xml to parse as windows-1252]
|
|
||||||
expected: FAIL
|
|
||||||
|
|
||||||
[Expect resources/utf-32-little-endian-bom.html to parse as UTF-16LE]
|
|
||||||
expected: FAIL
|
|
||||||
|
|
||||||
[Expect resources/utf-32-little-endian-bom.xml to parse as UTF-16LE]
|
|
||||||
expected: FAIL
|
|
||||||
|
|
||||||
[Expect resources/utf-32-little-endian-nobom.html to parse as windows-1252]
|
|
||||||
expected: FAIL
|
|
||||||
|
|
||||||
[Expect resources/utf-32-little-endian-nobom.xml to parse as windows-1252]
|
|
||||||
expected: FAIL
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue