From b1cd28e9c674467fc30ffc22c4ea99fa94022d0e Mon Sep 17 00:00:00 2001 From: Ms2ger Date: Fri, 18 Mar 2016 16:22:29 +0100 Subject: [PATCH 1/3] Store the encoding in the Document rather than its name. --- components/script/dom/document.rs | 76 ++++++++++++------------- components/script/dom/node.rs | 2 +- components/script/dom/xmlhttprequest.rs | 2 +- 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/components/script/dom/document.rs b/components/script/dom/document.rs index db565414ce1..ddaebcd35bf 100644 --- a/components/script/dom/document.rs +++ b/components/script/dom/document.rs @@ -76,6 +76,8 @@ use dom::touchlist::TouchList; use dom::treewalker::TreeWalker; use dom::uievent::UIEvent; use dom::window::{ReflowReason, Window}; +use encoding::EncodingRef; +use encoding::all::UTF_8; use euclid::point::Point2D; use html5ever::tree_builder::{LimitedQuirks, NoQuirks, Quirks, QuirksMode}; use ipc_channel::ipc::{self, IpcSender}; @@ -138,7 +140,7 @@ pub struct Document { location: MutNullableHeap>, content_type: DOMString, last_modified: Option, - encoding_name: DOMRefCell, + encoding: Cell, is_html_document: bool, url: Url, quirks_mode: Cell, @@ -295,11 +297,6 @@ impl Document { &*self.window } - #[inline] - pub fn encoding_name(&self) -> Ref { - self.encoding_name.borrow() - } - #[inline] pub fn is_html_document(&self) -> bool { self.is_html_document @@ -393,36 +390,12 @@ impl Document { } } - pub fn set_encoding_name(&self, name: DOMString) { - *self.encoding_name.borrow_mut() = DOMString::from( - match name.as_ref() { - "utf-8" => "UTF-8", - "ibm866" => "IBM866", - "iso-8859-2" => "ISO-8859-2", - "iso-8859-3" => "ISO-8859-3", - "iso-8859-4" => "ISO-8859-4", - "iso-8859-5" => "ISO-8859-5", - "iso-8859-6" => "ISO-8859-6", - "iso-8859-7" => "ISO-8859-7", - "iso-8859-8" => "ISO-8859-8", - "iso-8859-8-i" => "ISO-8859-8-I", - "iso-8859-10" => "ISO-8859-10", - "iso-8859-13" => "ISO-8859-13", - "iso-8859-14" => "ISO-8859-14", - "iso-8859-15" => "ISO-8859-15", - "iso-8859-16" => "ISO-8859-16", - "koi8-r" => "KOI8-R", - "koi8-u" => "KOI8-U", - "gbk" => "GBK", - "big5" => "Big5", - "euc-jp" => "EUC-JP", - "iso-2022-jp" => "ISO-2022-JP", - "shift_jis" => "Shift_JIS", - "euc-kr" => "EUC-KR", - "utf-16be" => "UTF-16BE", - "utf-16le" => "UTF-16LE", - _ => &*name - }); + pub fn encoding(&self) -> EncodingRef { + self.encoding.get() + } + + pub fn set_encoding(&self, encoding: EncodingRef) { + self.encoding.set(encoding); } pub fn content_changed(&self, node: &Node, damage: NodeDamage) { @@ -1561,7 +1534,7 @@ impl Document { // https://dom.spec.whatwg.org/#concept-document-quirks quirks_mode: Cell::new(NoQuirks), // https://dom.spec.whatwg.org/#concept-document-encoding - encoding_name: DOMRefCell::new(DOMString::from("UTF-8")), + encoding: Cell::new(UTF_8), is_html_document: is_html_document == IsHTMLDocument::HTMLDocument, id_map: DOMRefCell::new(HashMap::new()), tag_map: DOMRefCell::new(HashMap::new()), @@ -1818,7 +1791,34 @@ impl DocumentMethods for Document { // https://dom.spec.whatwg.org/#dom-document-characterset fn CharacterSet(&self) -> DOMString { - self.encoding_name.borrow().clone() + DOMString::from(match self.encoding.get().name() { + "utf-8" => "UTF-8", + "ibm866" => "IBM866", + "iso-8859-2" => "ISO-8859-2", + "iso-8859-3" => "ISO-8859-3", + "iso-8859-4" => "ISO-8859-4", + "iso-8859-5" => "ISO-8859-5", + "iso-8859-6" => "ISO-8859-6", + "iso-8859-7" => "ISO-8859-7", + "iso-8859-8" => "ISO-8859-8", + "iso-8859-8-i" => "ISO-8859-8-I", + "iso-8859-10" => "ISO-8859-10", + "iso-8859-13" => "ISO-8859-13", + "iso-8859-14" => "ISO-8859-14", + "iso-8859-15" => "ISO-8859-15", + "iso-8859-16" => "ISO-8859-16", + "koi8-r" => "KOI8-R", + "koi8-u" => "KOI8-U", + "gbk" => "GBK", + "big5" => "Big5", + "euc-jp" => "EUC-JP", + "iso-2022-jp" => "ISO-2022-JP", + "shift_jis" => "Shift_JIS", + "euc-kr" => "EUC-KR", + "utf-16be" => "UTF-16BE", + "utf-16le" => "UTF-16LE", + name => name + }) } // https://dom.spec.whatwg.org/#dom-document-charset diff --git a/components/script/dom/node.rs b/components/script/dom/node.rs index fed47bb7b24..730f69f24cd 100644 --- a/components/script/dom/node.rs +++ b/components/script/dom/node.rs @@ -1696,7 +1696,7 @@ impl Node { NodeTypeId::Document(_) => { let node_doc = node.downcast::().unwrap(); let copy_doc = copy.downcast::().unwrap(); - copy_doc.set_encoding_name(node_doc.encoding_name().clone()); + copy_doc.set_encoding(node_doc.encoding()); copy_doc.set_quirks_mode(node_doc.quirks_mode()); }, NodeTypeId::Element(..) => { diff --git a/components/script/dom/xmlhttprequest.rs b/components/script/dom/xmlhttprequest.rs index 7de3d956437..65ccb435dbf 100644 --- a/components/script/dom/xmlhttprequest.rs +++ b/components/script/dom/xmlhttprequest.rs @@ -1150,7 +1150,7 @@ impl XMLHttpRequest { _ => { return None; } } // Step 9 - temp_doc.set_encoding_name(DOMString::from(charset.name())); + temp_doc.set_encoding(charset); // Step 13 self.response_xml.set(Some(temp_doc.r())); return self.response_xml.get(); From 89b8499df8bd9a9c4a78d93fb644fb764c5fabbb Mon Sep 17 00:00:00 2001 From: askalski Date: Sat, 9 Jan 2016 21:40:45 +0100 Subject: [PATCH 2/3] Implement encoding determination for external scripts. --- components/script/dom/htmlscriptelement.rs | 21 +++-- tests/wpt/metadata/MANIFEST.json | 8 ++ .../external-script-utf8.js | 5 ++ .../external-script-windows1250.js | 5 ++ .../the-script-element/script-charset-01.html | 89 +++++++++++++++++++ .../the-script-element/script-charset-02.html | 40 +++++++++ .../serve-with-content-type.py | 15 ++++ 7 files changed, 175 insertions(+), 8 deletions(-) create mode 100644 tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/external-script-utf8.js create mode 100644 tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/external-script-windows1250.js create mode 100644 tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/script-charset-01.html create mode 100644 tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/script-charset-02.html create mode 100644 tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/serve-with-content-type.py diff --git a/components/script/dom/htmlscriptelement.rs b/components/script/dom/htmlscriptelement.rs index bbd1884dc4f..eaafcadd7cc 100644 --- a/components/script/dom/htmlscriptelement.rs +++ b/components/script/dom/htmlscriptelement.rs @@ -25,7 +25,6 @@ use dom::node::{ChildrenMutation, CloneChildrenFlag, Node}; use dom::node::{document_from_node, window_from_node}; use dom::virtualmethods::VirtualMethods; use dom::window::ScriptHelpers; -use encoding::all::UTF_8; use encoding::label::encoding_from_whatwg_label; use encoding::types::{DecoderTrap, Encoding, EncodingRef}; use html5ever::tree_builder::NextParserState; @@ -71,7 +70,7 @@ pub struct HTMLScriptElement { #[ignore_heap_size_of = "Defined in rust-encoding"] /// https://html.spec.whatwg.org/multipage/#concept-script-encoding - block_character_encoding: DOMRefCell, + block_character_encoding: DOMRefCell>, } impl HTMLScriptElement { @@ -86,7 +85,7 @@ impl HTMLScriptElement { ready_to_be_parser_executed: Cell::new(false), parser_document: JS::from_ref(document), load: DOMRefCell::new(None), - block_character_encoding: DOMRefCell::new(UTF_8 as EncodingRef), + block_character_encoding: DOMRefCell::new(None), } } @@ -248,7 +247,7 @@ impl HTMLScriptElement { // Step 13. if let Some(ref charset) = element.get_attribute(&ns!(), &atom!("charset")) { if let Some(encodingRef) = encoding_from_whatwg_label(&charset.Value()) { - *self.block_character_encoding.borrow_mut() = encodingRef; + *self.block_character_encoding.borrow_mut() = Some(encodingRef); } } @@ -391,10 +390,16 @@ impl HTMLScriptElement { // Step 2.b.1.a. ScriptOrigin::External(Ok((metadata, bytes))) => { - // TODO(#9185): implement encoding determination. - (DOMString::from(UTF_8.decode(&*bytes, DecoderTrap::Replace).unwrap()), - true, - metadata.final_url) + debug!("loading external script, url = {}", metadata.final_url); + + let encoding = metadata.charset + .and_then(|encoding| encoding_from_whatwg_label(&encoding)) + .or_else(|| *self.block_character_encoding.borrow()) + .unwrap_or_else(|| self.parser_document.encoding()); + + (DOMString::from(encoding.decode(&*bytes, DecoderTrap::Replace).unwrap()), + true, + metadata.final_url) }, // Step 2.b.1.c. diff --git a/tests/wpt/metadata/MANIFEST.json b/tests/wpt/metadata/MANIFEST.json index 6932cb31acb..feefdc2077e 100644 --- a/tests/wpt/metadata/MANIFEST.json +++ b/tests/wpt/metadata/MANIFEST.json @@ -19241,6 +19241,14 @@ "path": "html/semantics/scripting-1/the-script-element/script-before-after-events.html", "url": "/html/semantics/scripting-1/the-script-element/script-before-after-events.html" }, + { + "path": "html/semantics/scripting-1/the-script-element/script-charset-01.html", + "url": "/html/semantics/scripting-1/the-script-element/script-charset-01.html" + }, + { + "path": "html/semantics/scripting-1/the-script-element/script-charset-02.html", + "url": "/html/semantics/scripting-1/the-script-element/script-charset-02.html" + }, { "path": "html/semantics/scripting-1/the-script-element/script-for-event-xhtml.xhtml", "url": "/html/semantics/scripting-1/the-script-element/script-for-event-xhtml.xhtml" diff --git a/tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/external-script-utf8.js b/tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/external-script-utf8.js new file mode 100644 index 00000000000..eb442c97bc9 --- /dev/null +++ b/tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/external-script-utf8.js @@ -0,0 +1,5 @@ +(function() { + window.getSomeString = function() { + return "śćążź"; //<- these are five Polish letters, similar to scazz. It can be read correctly only with windows 1250 encoding. + }; +})(); diff --git a/tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/external-script-windows1250.js b/tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/external-script-windows1250.js new file mode 100644 index 00000000000..50de6932ba2 --- /dev/null +++ b/tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/external-script-windows1250.js @@ -0,0 +1,5 @@ +(function() { + window.getSomeString = function() { + return "œæ¹¿Ÿ"; //<- these are five Polish letters, similar to scazz. It can be read correctly only with windows 1250 encoding. + }; +})(); diff --git a/tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/script-charset-01.html b/tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/script-charset-01.html new file mode 100644 index 00000000000..c5ac0d0a62a --- /dev/null +++ b/tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/script-charset-01.html @@ -0,0 +1,89 @@ + + + + Script @type: unknown parameters + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + Script @type: unknown parameters + + + + +
+ + + + + + + + + + diff --git a/tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/serve-with-content-type.py b/tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/serve-with-content-type.py new file mode 100644 index 00000000000..7cfe6f4cec3 --- /dev/null +++ b/tests/wpt/web-platform-tests/html/semantics/scripting-1/the-script-element/serve-with-content-type.py @@ -0,0 +1,15 @@ +import os + +def main(request, response): + directory = os.path.dirname(__file__) + + try: + file_name = request.GET.first("fn") + content_type = request.GET.first("ct") + with open(os.path.join(directory, file_name), "rb") as fh: + content = fh.read() + + response.headers.set("Content-Type", content_type) + response.content = content + except: + response.set_error(400, "Not enough parameters or file not found") From f1f53468a072365fe9dae72422afe428d813e794 Mon Sep 17 00:00:00 2001 From: Ms2ger Date: Fri, 18 Mar 2016 16:38:17 +0100 Subject: [PATCH 3/3] Use a Cell for HTMLScriptElement::block_character_encoding. --- components/script/dom/htmlscriptelement.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/components/script/dom/htmlscriptelement.rs b/components/script/dom/htmlscriptelement.rs index eaafcadd7cc..eb63d524c9c 100644 --- a/components/script/dom/htmlscriptelement.rs +++ b/components/script/dom/htmlscriptelement.rs @@ -70,7 +70,7 @@ pub struct HTMLScriptElement { #[ignore_heap_size_of = "Defined in rust-encoding"] /// https://html.spec.whatwg.org/multipage/#concept-script-encoding - block_character_encoding: DOMRefCell>, + block_character_encoding: Cell>, } impl HTMLScriptElement { @@ -85,7 +85,7 @@ impl HTMLScriptElement { ready_to_be_parser_executed: Cell::new(false), parser_document: JS::from_ref(document), load: DOMRefCell::new(None), - block_character_encoding: DOMRefCell::new(None), + block_character_encoding: Cell::new(None), } } @@ -247,7 +247,7 @@ impl HTMLScriptElement { // Step 13. if let Some(ref charset) = element.get_attribute(&ns!(), &atom!("charset")) { if let Some(encodingRef) = encoding_from_whatwg_label(&charset.Value()) { - *self.block_character_encoding.borrow_mut() = Some(encodingRef); + self.block_character_encoding.set(Some(encodingRef)); } } @@ -394,7 +394,7 @@ impl HTMLScriptElement { let encoding = metadata.charset .and_then(|encoding| encoding_from_whatwg_label(&encoding)) - .or_else(|| *self.block_character_encoding.borrow()) + .or_else(|| self.block_character_encoding.get()) .unwrap_or_else(|| self.parser_document.encoding()); (DOMString::from(encoding.decode(&*bytes, DecoderTrap::Replace).unwrap()),