Auto merge of #10079 - servo:script-encoding, r=jdm

Implement encoding determination for external scripts.

<!-- Reviewable:start -->
This change is [<img src="https://reviewable.io/review_button.svg" height="35" align="absmiddle" alt="Reviewable"/>](https://reviewable.io/reviews/servo/servo/10079)
<!-- Reviewable:end -->
This commit is contained in:
bors-servo 2016-03-20 06:28:25 +05:30
commit 8e95f54501
10 changed files with 215 additions and 48 deletions

View file

@ -76,6 +76,8 @@ use dom::touchlist::TouchList;
use dom::treewalker::TreeWalker;
use dom::uievent::UIEvent;
use dom::window::{ReflowReason, Window};
use encoding::EncodingRef;
use encoding::all::UTF_8;
use euclid::point::Point2D;
use html5ever::tree_builder::{LimitedQuirks, NoQuirks, Quirks, QuirksMode};
use ipc_channel::ipc::{self, IpcSender};
@ -138,7 +140,7 @@ pub struct Document {
location: MutNullableHeap<JS<Location>>,
content_type: DOMString,
last_modified: Option<String>,
encoding_name: DOMRefCell<DOMString>,
encoding: Cell<EncodingRef>,
is_html_document: bool,
url: Url,
quirks_mode: Cell<QuirksMode>,
@ -295,11 +297,6 @@ impl Document {
&*self.window
}
#[inline]
pub fn encoding_name(&self) -> Ref<DOMString> {
self.encoding_name.borrow()
}
#[inline]
pub fn is_html_document(&self) -> bool {
self.is_html_document
@ -393,36 +390,12 @@ impl Document {
}
}
pub fn set_encoding_name(&self, name: DOMString) {
*self.encoding_name.borrow_mut() = DOMString::from(
match name.as_ref() {
"utf-8" => "UTF-8",
"ibm866" => "IBM866",
"iso-8859-2" => "ISO-8859-2",
"iso-8859-3" => "ISO-8859-3",
"iso-8859-4" => "ISO-8859-4",
"iso-8859-5" => "ISO-8859-5",
"iso-8859-6" => "ISO-8859-6",
"iso-8859-7" => "ISO-8859-7",
"iso-8859-8" => "ISO-8859-8",
"iso-8859-8-i" => "ISO-8859-8-I",
"iso-8859-10" => "ISO-8859-10",
"iso-8859-13" => "ISO-8859-13",
"iso-8859-14" => "ISO-8859-14",
"iso-8859-15" => "ISO-8859-15",
"iso-8859-16" => "ISO-8859-16",
"koi8-r" => "KOI8-R",
"koi8-u" => "KOI8-U",
"gbk" => "GBK",
"big5" => "Big5",
"euc-jp" => "EUC-JP",
"iso-2022-jp" => "ISO-2022-JP",
"shift_jis" => "Shift_JIS",
"euc-kr" => "EUC-KR",
"utf-16be" => "UTF-16BE",
"utf-16le" => "UTF-16LE",
_ => &*name
});
/// Returns the encoding currently held in this document's `encoding` cell.
pub fn encoding(&self) -> EncodingRef {
self.encoding.get()
}
/// Replaces the encoding held in this document's `encoding` cell with `encoding`.
pub fn set_encoding(&self, encoding: EncodingRef) {
self.encoding.set(encoding);
}
pub fn content_changed(&self, node: &Node, damage: NodeDamage) {
@ -1561,7 +1534,7 @@ impl Document {
// https://dom.spec.whatwg.org/#concept-document-quirks
quirks_mode: Cell::new(NoQuirks),
// https://dom.spec.whatwg.org/#concept-document-encoding
encoding_name: DOMRefCell::new(DOMString::from("UTF-8")),
encoding: Cell::new(UTF_8),
is_html_document: is_html_document == IsHTMLDocument::HTMLDocument,
id_map: DOMRefCell::new(HashMap::new()),
tag_map: DOMRefCell::new(HashMap::new()),
@ -1818,7 +1791,34 @@ impl DocumentMethods for Document {
// https://dom.spec.whatwg.org/#dom-document-characterset
fn CharacterSet(&self) -> DOMString {
self.encoding_name.borrow().clone()
DOMString::from(match self.encoding.get().name() {
"utf-8" => "UTF-8",
"ibm866" => "IBM866",
"iso-8859-2" => "ISO-8859-2",
"iso-8859-3" => "ISO-8859-3",
"iso-8859-4" => "ISO-8859-4",
"iso-8859-5" => "ISO-8859-5",
"iso-8859-6" => "ISO-8859-6",
"iso-8859-7" => "ISO-8859-7",
"iso-8859-8" => "ISO-8859-8",
"iso-8859-8-i" => "ISO-8859-8-I",
"iso-8859-10" => "ISO-8859-10",
"iso-8859-13" => "ISO-8859-13",
"iso-8859-14" => "ISO-8859-14",
"iso-8859-15" => "ISO-8859-15",
"iso-8859-16" => "ISO-8859-16",
"koi8-r" => "KOI8-R",
"koi8-u" => "KOI8-U",
"gbk" => "GBK",
"big5" => "Big5",
"euc-jp" => "EUC-JP",
"iso-2022-jp" => "ISO-2022-JP",
"shift_jis" => "Shift_JIS",
"euc-kr" => "EUC-KR",
"utf-16be" => "UTF-16BE",
"utf-16le" => "UTF-16LE",
name => name
})
}
// https://dom.spec.whatwg.org/#dom-document-charset

View file

@ -25,7 +25,6 @@ use dom::node::{ChildrenMutation, CloneChildrenFlag, Node};
use dom::node::{document_from_node, window_from_node};
use dom::virtualmethods::VirtualMethods;
use dom::window::ScriptHelpers;
use encoding::all::UTF_8;
use encoding::label::encoding_from_whatwg_label;
use encoding::types::{DecoderTrap, Encoding, EncodingRef};
use html5ever::tree_builder::NextParserState;
@ -71,7 +70,7 @@ pub struct HTMLScriptElement {
#[ignore_heap_size_of = "Defined in rust-encoding"]
/// https://html.spec.whatwg.org/multipage/#concept-script-encoding
block_character_encoding: DOMRefCell<EncodingRef>,
block_character_encoding: Cell<Option<EncodingRef>>,
}
impl HTMLScriptElement {
@ -86,7 +85,7 @@ impl HTMLScriptElement {
ready_to_be_parser_executed: Cell::new(false),
parser_document: JS::from_ref(document),
load: DOMRefCell::new(None),
block_character_encoding: DOMRefCell::new(UTF_8 as EncodingRef),
block_character_encoding: Cell::new(None),
}
}
@ -248,7 +247,7 @@ impl HTMLScriptElement {
// Step 13.
if let Some(ref charset) = element.get_attribute(&ns!(), &atom!("charset")) {
if let Some(encodingRef) = encoding_from_whatwg_label(&charset.Value()) {
*self.block_character_encoding.borrow_mut() = encodingRef;
self.block_character_encoding.set(Some(encodingRef));
}
}
@ -391,8 +390,14 @@ impl HTMLScriptElement {
// Step 2.b.1.a.
ScriptOrigin::External(Ok((metadata, bytes))) => {
// TODO(#9185): implement encoding determination.
(DOMString::from(UTF_8.decode(&*bytes, DecoderTrap::Replace).unwrap()),
debug!("loading external script, url = {}", metadata.final_url);
let encoding = metadata.charset
.and_then(|encoding| encoding_from_whatwg_label(&encoding))
.or_else(|| self.block_character_encoding.get())
.unwrap_or_else(|| self.parser_document.encoding());
(DOMString::from(encoding.decode(&*bytes, DecoderTrap::Replace).unwrap()),
true,
metadata.final_url)
},

View file

@ -1696,7 +1696,7 @@ impl Node {
NodeTypeId::Document(_) => {
let node_doc = node.downcast::<Document>().unwrap();
let copy_doc = copy.downcast::<Document>().unwrap();
copy_doc.set_encoding_name(node_doc.encoding_name().clone());
copy_doc.set_encoding(node_doc.encoding());
copy_doc.set_quirks_mode(node_doc.quirks_mode());
},
NodeTypeId::Element(..) => {

View file

@ -1150,7 +1150,7 @@ impl XMLHttpRequest {
_ => { return None; }
}
// Step 9
temp_doc.set_encoding_name(DOMString::from(charset.name()));
temp_doc.set_encoding(charset);
// Step 13
self.response_xml.set(Some(temp_doc.r()));
return self.response_xml.get();

View file

@ -19241,6 +19241,14 @@
"path": "html/semantics/scripting-1/the-script-element/script-before-after-events.html",
"url": "/html/semantics/scripting-1/the-script-element/script-before-after-events.html"
},
{
"path": "html/semantics/scripting-1/the-script-element/script-charset-01.html",
"url": "/html/semantics/scripting-1/the-script-element/script-charset-01.html"
},
{
"path": "html/semantics/scripting-1/the-script-element/script-charset-02.html",
"url": "/html/semantics/scripting-1/the-script-element/script-charset-02.html"
},
{
"path": "html/semantics/scripting-1/the-script-element/script-for-event-xhtml.xhtml",
"url": "/html/semantics/scripting-1/the-script-element/script-for-event-xhtml.xhtml"

View file

@ -0,0 +1,5 @@
// Test fixture: installs window.getSomeString(), which returns a fixed string of
// five non-ASCII Polish letters so that a page can check how this script was decoded.
(function() {
window.getSomeString = function() {
return "śćążź"; // five Polish letters, similar to "scazz". NOTE(review): this copy appears to be the UTF-8 variant of the fixture, so the original remark that it "can be read correctly only with windows 1250 encoding" looks copied from the windows-1250 sibling - confirm.
};
})();

View file

@ -0,0 +1,5 @@
// Test fixture: installs window.getSomeString(), which returns a fixed string of
// five non-ASCII Polish letters so that a page can check how this script was decoded.
// NOTE(review): the literal below looks garbled whenever the file's bytes are not
// interpreted as windows-1250; that is presumably intentional - do not "repair" it.
(function() {
window.getSomeString = function() {
return "œæ¹¿Ÿ"; // five Polish letters, similar to "scazz". They read correctly only when this file is decoded as windows-1250.
};
})();

View file

@ -0,0 +1,89 @@
<!DOCTYPE html>
<head>
<meta charset="utf-8">
<title>Script @type: unknown parameters</title>
<link rel="author" title="askalski" href="https://github.com/askalski">
<link rel="help" href="https://html.spec.whatwg.org/multipage/#scriptingLanguages">
<script src="/resources/testharness.js"></script>
<script src="/resources/testharnessreport.js"></script>
<div id="log"></div>
<!-- "Step1" tests -->
<!-- charset is set incorrectly via Content Type "text/javascript;charset=utf-8" in the response,
which takes priority over the correct setting in the "charset" attribute of the script tag.
-->
<script type="text/javascript"
src="serve-with-content-type.py?fn=external-script-windows1250.js&ct=text/javascript%3Bcharset=utf-8" charset="windows-1250">
</script>
<script>
test(function() {
//these strings should not match, since the file charset is set incorrectly
assert_not_equals(window.getSomeString(), "śćążź");
});
</script>
<!-- charset is set correctly via Content Type "text/javascript;charset=windows-1250" in the response,
which takes priority over the incorrect setting in the "charset" attribute of the script tag.
-->
<script type="text/javascript"
src="serve-with-content-type.py?fn=external-script-windows1250.js&ct=text/javascript%3Bcharset=windows-1250" charset="utf-8">
</script>
<script>
//the charset is set correctly via Content Type "text/javascript;charset=windows-1250" in the response
test(function() {
assert_equals(window.getSomeString(), "śćążź");
});
</script>
<!-- end of step1 tests, now step2 tests -->
<!-- in this case, the response's Content Type does not bring charset information.
Second step takes block character encoding if available.-->
<script type="text/javascript"
src="serve-with-content-type.py?fn=external-script-windows1250.js&ct=text/javascript" charset="utf-8">
</script>
<script>
test(function() {
//these strings should not match, since the file charset is set incorrectly in "charset" tag of <script> above
assert_not_equals(window.getSomeString(), "śćążź");
});
</script>
<!-- again the response's Content Type carries no charset information, so the
correct "windows-1250" setting in the "charset" attribute of the script tag is used.
-->
<script type="text/javascript"
src="serve-with-content-type.py?fn=external-script-windows1250.js&ct=text/javascript" charset="windows-1250">
</script>
<script>
//the charset is set correctly via content attribute in <script> above
test(function() {
assert_equals(window.getSomeString(), "śćążź");
});
</script>
<!-- end of step2 tests, now step3 tests -->
<!-- in this case, neither the response's Content Type nor a charset attribute provides charset information.
The third step takes this document's character encoding (declared correctly as UTF-8).-->
<script type="text/javascript"
src="serve-with-content-type.py?fn=external-script-windows1250.js&ct=text/javascript">
</script>
<script>
test(function() {
//these strings should not match, since the tested file is in windows-1250, and document is utf-8
assert_not_equals(window.getSomeString(), "śćążź");
});
</script>
<script type="text/javascript"
src="serve-with-content-type.py?fn=external-script-utf8.js&ct=text/javascript">
</script>
<script>
//these strings should match, both document and tested file are utf-8
test(function() {
assert_equals(window.getSomeString(), "śćążź");
});
</script>
<!-- the last portion of tests (step4) is in file script-charset-02.html -->
</head>

View file

@ -0,0 +1,40 @@
<!DOCTYPE html>
<head>
<!-- TODO:
askalski: while this test passes, it does not test anything right now.
It should test whether, with no document.charset set in any way, the
external scripts get decoded using utf-8 as the fallback character encoding.
It seems that utf-8 is also the fallback encoding for html (my guess), so
the part of the code I was attempting to test is never reached.
-->
<title>Script @type: unknown parameters</title>
<link rel="author" title="askalski" href="https://github.com/askalski">
<link rel="help" href="https://html.spec.whatwg.org/multipage/#scriptingLanguages">
<script src="/resources/testharness.js"></script>
<script src="/resources/testharnessreport.js"></script>
<div id="log"></div>
<!-- test of step4, which is taking utf-8 as fallback -->
<!-- in this case, neither the response's Content Type nor a charset attribute provides charset information.
Furthermore, the document's encoding is not set.-->
<script type="text/javascript"
src="serve-with-content-type.py?fn=external-script-windows1250.js&ct=text/javascript">
</script>
<script>
test(function() {
//these strings should not match, since the tested file is in windows-1250, and fallback is defined as utf-8
assert_not_equals(window.getSomeString().length, 5);
});
</script>
<script type="text/javascript"
src="serve-with-content-type.py?fn=external-script-utf8.js&ct=text/javascript">
</script>
<script>
//these strings should match, since fallback utf-8 is the correct setting.
test(function() {
assert_equals(window.getSomeString().length, 5);
});
</script>
</head>

View file

@ -0,0 +1,15 @@
import os
def main(request, response):
    """Serve a file from this handler's directory with a caller-chosen Content-Type.

    Query parameters (read through ``request.GET.first``):
      fn -- name of the file to serve; must be a bare file name that lives in
            the same directory as this handler.
      ct -- value to send back as the ``Content-Type`` response header.

    If a parameter is missing, ``fn`` contains a path component, or the file
    cannot be read, the response is turned into a 400 error instead.
    """
    directory = os.path.dirname(__file__)
    try:
        file_name = request.GET.first("fn")
        content_type = request.GET.first("ct")
        # "fn" comes straight from the query string: refuse anything that is
        # not a bare file name so requests cannot escape this directory.
        if os.path.basename(file_name) != file_name:
            raise IOError("file name must not contain a path component")
        with open(os.path.join(directory, file_name), "rb") as fh:
            content = fh.read()
    except (KeyError, IOError):
        # Only the expected failures (missing parameter, unreadable file) map
        # to a 400; anything else is a real bug and is allowed to propagate.
        response.set_error(400, "Not enough parameters or file not found")
    else:
        response.headers.set("Content-Type", content_type)
        response.content = content