Auto merge of #22432 - servo:encoding, r=SimonSapin

Make the parser decode input using the document's encoding

<!-- Reviewable:start -->
This change is [<img src="https://reviewable.io/review_button.svg" height="34" align="absmiddle" alt="Reviewable"/>](https://reviewable.io/reviews/servo/servo/22432)
<!-- Reviewable:end -->
This commit is contained in:
bors-servo 2018-12-12 15:17:32 -05:00 committed by GitHub
commit 2e01a23bad
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 43 additions and 55 deletions

8
Cargo.lock generated
View file

@ -2328,7 +2328,7 @@ dependencies = [
"serde_json 1.0.13 (registry+https://github.com/rust-lang/crates.io-index)",
"string_cache 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)",
"string_cache_codegen 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
"tendril 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"tendril 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -3310,6 +3310,7 @@ dependencies = [
"style 0.0.1",
"style_traits 0.0.1",
"swapper 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"tendril 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)",
"tinyfiledialogs 3.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-segmentation 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
@ -3996,9 +3997,10 @@ dependencies = [
[[package]]
name = "tendril"
version = "0.4.0"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding_rs 0.8.12 (registry+https://github.com/rust-lang/crates.io-index)",
"futf 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
"mac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"utf-8 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
@ -5117,7 +5119,7 @@ dependencies = [
"checksum syn 0.15.22 (registry+https://github.com/rust-lang/crates.io-index)" = "ae8b29eb5210bc5cf63ed6149cbf9adfc82ac0be023d8735c176ee74a2db4da7"
"checksum synstructure 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73687139bf99285483c96ac0add482c3776528beac1d97d444f6e91f203a2015"
"checksum tempfile 3.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "47776f63b85777d984a50ce49d6b9e58826b6a3766a449fc95bc66cd5663c15b"
"checksum tendril 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9de21546595a0873061940d994bbbc5c35f024ae4fd61ec5c5b159115684f508"
"checksum tendril 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "707feda9f2582d5d680d733e38755547a3e8fb471e7ba11452ecfd9ce93a5d3b"
"checksum termcolor 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4096add70612622289f2fdcdbd5086dc81c1e2675e6ae58d6c4f62a16c6d7f2f"
"checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096"
"checksum textwrap 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c0b59b6b4b44d867f1370ef1bd91bfb262bf07bf0ae65c202ea2fbc16153b693"

View file

@ -553,12 +553,10 @@ impl Metadata {
.as_mut()
.unwrap()
.typed_insert(ContentType::from(mime.clone()));
self.content_type = Some(Serde(ContentType::from(mime.clone())));
for (name, value) in mime.params() {
if mime::CHARSET == name {
self.charset = Some(value.to_string());
}
if let Some(charset) = mime.get_param(mime::CHARSET) {
self.charset = Some(charset.to_string());
}
self.content_type = Some(Serde(ContentType::from(mime.clone())));
}
}
}

View file

@ -103,6 +103,7 @@ smallvec = { version = "0.6", features = ["std", "union"] }
style = {path = "../style", features = ["servo"]}
style_traits = {path = "../style_traits"}
swapper = "0.1"
tendril = {version = "0.4.1", features = ["encoding_rs"]}
time = "0.1.12"
unicode-segmentation = "1.1.0"
url = "1.6"

View file

@ -56,9 +56,6 @@ use encoding_rs::{Decoder, Encoding};
use euclid::Length as EuclidLength;
use euclid::{Point2D, Rect, Transform2D, Transform3D, TypedScale, TypedSize2D, Vector2D};
use html5ever::buffer_queue::BufferQueue;
use html5ever::tendril::fmt::UTF8;
use html5ever::tendril::stream::Utf8LossyDecoder;
use html5ever::tendril::{StrTendril, TendrilSink};
use html5ever::{LocalName, Namespace, Prefix, QualName};
use http::header::HeaderMap;
use hyper::Method;
@ -128,6 +125,9 @@ use style::stylesheets::keyframes_rule::Keyframe;
use style::stylesheets::{CssRules, FontFaceRule, KeyframesRule, MediaRule, Stylesheet};
use style::stylesheets::{ImportRule, NamespaceRule, StyleRule, SupportsRule, ViewportRule};
use style::values::specified::Length;
use tendril::fmt::UTF8;
use tendril::stream::LossyDecoder;
use tendril::{StrTendril, TendrilSink};
use time::Duration;
use uuid::Uuid;
use webrender_api::{DocumentId, ImageKey, RenderApiSender};
@ -736,12 +736,12 @@ where
}
}
unsafe impl<Sink> JSTraceable for Utf8LossyDecoder<Sink>
unsafe impl<Sink> JSTraceable for LossyDecoder<Sink>
where
Sink: JSTraceable + TendrilSink<UTF8>,
{
unsafe fn trace(&self, tracer: *mut JSTracer) {
self.inner_sink.trace(tracer);
self.inner_sink().trace(tracer);
}
}

View file

@ -2586,26 +2586,32 @@ impl Document {
let interactive_time =
InteractiveMetrics::new(window.time_profiler_chan().clone(), url.clone());
let content_type = content_type.unwrap_or_else(|| {
match is_html_document {
// https://dom.spec.whatwg.org/#dom-domimplementation-createhtmldocument
IsHTMLDocument::HTMLDocument => mime::TEXT_HTML,
// https://dom.spec.whatwg.org/#concept-document-content-type
IsHTMLDocument::NonHTMLDocument => "application/xml".parse().unwrap(),
}
});
let encoding = content_type
.get_param(mime::CHARSET)
.and_then(|charset| Encoding::for_label(charset.as_str().as_bytes()))
.unwrap_or(UTF_8);
Document {
node: Node::new_document_node(),
window: Dom::from_ref(window),
has_browsing_context: has_browsing_context == HasBrowsingContext::Yes,
implementation: Default::default(),
content_type: match content_type {
Some(mime_data) => mime_data,
None => match is_html_document {
// https://dom.spec.whatwg.org/#dom-domimplementation-createhtmldocument
IsHTMLDocument::HTMLDocument => mime::TEXT_HTML,
// https://dom.spec.whatwg.org/#concept-document-content-type
IsHTMLDocument::NonHTMLDocument => "application/xml".parse().unwrap(),
},
},
content_type,
last_modified: last_modified,
url: DomRefCell::new(url),
// https://dom.spec.whatwg.org/#concept-document-quirks
quirks_mode: Cell::new(QuirksMode::NoQuirks),
// https://dom.spec.whatwg.org/#concept-document-encoding
encoding: Cell::new(UTF_8),
encoding: Cell::new(encoding),
is_html_document: is_html_document == IsHTMLDocument::HTMLDocument,
activity: Cell::new(activity),
id_map: DomRefCell::new(HashMap::new()),
@ -4340,7 +4346,7 @@ impl DocumentMethods for Document {
.clone();
*self.loader.borrow_mut() =
DocumentLoader::new_with_threads(resource_threads, Some(self.url()));
ServoParser::parse_html_script_input(self, self.url(), "text/html");
ServoParser::parse_html_script_input(self, self.url());
// Step 15
self.ready_state.set(DocumentReadyState::Loading);

View file

@ -37,9 +37,9 @@ use crate::network_listener::PreInvoke;
use crate::script_thread::ScriptThread;
use dom_struct::dom_struct;
use embedder_traits::resources::{self, Resource};
use encoding_rs::Encoding;
use html5ever::buffer_queue::BufferQueue;
use html5ever::tendril::fmt::UTF8;
use html5ever::tendril::stream::Utf8LossyDecoder;
use html5ever::tendril::{ByteTendril, StrTendril, TendrilSink};
use html5ever::tree_builder::{ElementFlags, NextParserState, NodeOrText, QuirksMode, TreeSink};
use html5ever::{Attribute, ExpandedName, LocalName, QualName};
@ -58,6 +58,7 @@ use std::borrow::Cow;
use std::cell::Cell;
use std::mem;
use style::context::QuirksMode as ServoQuirksMode;
use tendril::stream::LossyDecoder;
mod async_html;
mod html;
@ -225,7 +226,7 @@ impl ServoParser {
}
}
pub fn parse_html_script_input(document: &Document, url: ServoUrl, type_: &str) {
pub fn parse_html_script_input(document: &Document, url: ServoUrl) {
let parser = ServoParser::new(
document,
Tokenizer::Html(self::html::Tokenizer::new(
@ -238,10 +239,6 @@ impl ServoParser {
ParserKind::ScriptCreated,
);
document.set_current_parser(Some(&parser));
if !type_.eq_ignore_ascii_case("text/html") {
parser.parse_string_chunk("<pre>\n".to_owned());
parser.tokenizer.borrow_mut().set_plaintext_state();
}
}
pub fn parse_xml_document(document: &Document, input: DOMString, url: ServoUrl) {
@ -402,7 +399,7 @@ impl ServoParser {
ServoParser {
reflector: Reflector::new(),
document: Dom::from_ref(document),
network_decoder: DomRefCell::new(Some(NetworkDecoder::new())),
network_decoder: DomRefCell::new(Some(NetworkDecoder::new(document.encoding()))),
network_input: DomRefCell::new(BufferQueue::new()),
script_input: DomRefCell::new(BufferQueue::new()),
tokenizer: DomRefCell::new(tokenizer),
@ -1198,20 +1195,23 @@ fn create_element_for_token(
#[derive(JSTraceable, MallocSizeOf)]
struct NetworkDecoder {
#[ignore_malloc_size_of = "Defined in html5ever"]
decoder: Utf8LossyDecoder<NetworkSink>,
#[ignore_malloc_size_of = "Defined in tendril"]
decoder: LossyDecoder<NetworkSink>,
}
impl NetworkDecoder {
fn new() -> Self {
fn new(encoding: &'static Encoding) -> Self {
Self {
decoder: Utf8LossyDecoder::new(Default::default()),
decoder: LossyDecoder::new_encoding_rs(encoding, Default::default()),
}
}
fn decode(&mut self, chunk: Vec<u8>) -> StrTendril {
self.decoder.process(ByteTendril::from(&*chunk));
mem::replace(&mut self.decoder.inner_sink.output, Default::default())
mem::replace(
&mut self.decoder.inner_sink_mut().output,
Default::default(),
)
}
fn finish(self) -> StrTendril {

View file

@ -1,5 +0,0 @@
[the-input-byte-stream-001.html]
type: testharness
[The character encoding of a page can be set using the HTTP header charset declaration.]
expected: FAIL

View file

@ -1,5 +0,0 @@
[the-input-byte-stream-016.html]
type: testharness
[The HTTP header has a higher precedence than an encoding declaration in a meta content attribute.]
expected: FAIL

View file

@ -1,5 +0,0 @@
[the-input-byte-stream-018.html]
type: testharness
[The HTTP header has a higher precedence than an encoding declaration in a meta charset attribute.]
expected: FAIL

View file

@ -4,7 +4,3 @@
expected: FAIL
bug: https://github.com/servo/servo/issues/14912
[HTML document, shift-jis]
expected: FAIL
bug: https://github.com/servo/servo/issues/6414