mirror of
https://github.com/servo/servo.git
synced 2025-06-28 19:13:41 +01:00
Handle nonmappable code points in Document::encoding_parse_a_url (#37541)
This is a follow-up to https://github.com/servo/servo/pull/33825. Using `Encoding::encode` introduced a subtle bug: that function silently replaces nonmappable code points (such as `㐀` in euc-jp). The URL spec, however, expects nonmappable characters to be treated differently. There is actually an open bug in the `rust-url` repo about this: https://github.com/servo/rust-url/issues/649, with the conclusion apparently being that this should not be implemented by the url crate itself. Gecko's implementation of the equivalent algorithm, for reference: https://searchfox.org/mozilla-central/rev/d52edf7ea4236446e118a2edc815023c5479663f/netwerk/base/nsStandardURL.cpp#116-172. Testing: More web platform tests pass. Part of https://github.com/servo/servo/issues/5601 --------- Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
This commit is contained in:
parent
3a54ddd034
commit
a27c9ee691
14 changed files with 65 additions and 294148 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -7174,6 +7174,7 @@ dependencies = [
|
|||
name = "servo_url"
|
||||
version = "0.0.1"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
"malloc_size_of_derive",
|
||||
"serde",
|
||||
"servo_arc",
|
||||
|
|
|
@ -3672,7 +3672,9 @@ impl Document {
|
|||
// Step 5. Return the result of applying the URL parser to url, with baseURL and encoding.
|
||||
url::Url::options()
|
||||
.base_url(Some(base_url.as_url()))
|
||||
.encoding_override(Some(&|s| encoding.encode(s).0))
|
||||
.encoding_override(Some(&|input| {
|
||||
servo_url::encoding::encode_as_url_query_string(input, encoding)
|
||||
}))
|
||||
.parse(url)
|
||||
.map(ServoUrl::from)
|
||||
}
|
||||
|
|
|
@ -12,6 +12,7 @@ name = "servo_url"
|
|||
path = "lib.rs"
|
||||
|
||||
[dependencies]
|
||||
encoding_rs = { workspace = true }
|
||||
malloc_size_of = { workspace = true }
|
||||
malloc_size_of_derive = { workspace = true }
|
||||
serde = { workspace = true, features = ["derive"] }
|
||||
|
|
59
components/url/encoding.rs
Normal file
59
components/url/encoding.rs
Normal file
|
@ -0,0 +1,59 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
|
||||
|
||||
use std::borrow::Cow;
|
||||
|
||||
use encoding_rs::{EncoderResult, Encoding, UTF_8};
|
||||
|
||||
/// This is equivalent to [Encoding::encode], except nonmappable code points are handled
|
||||
/// according to the url specification, which expects nonmappable code points to be wrapped in `%26%23` and
|
||||
/// `%3B` (see [percent encode after encoding](https://url.spec.whatwg.org/#string-percent-encode-after-encoding)).
|
||||
pub fn encode_as_url_query_string<'a>(
|
||||
mut string: &'a str,
|
||||
encoding: &'static Encoding,
|
||||
) -> Cow<'a, [u8]> {
|
||||
let output_encoding = encoding.output_encoding();
|
||||
if output_encoding == UTF_8 {
|
||||
return Cow::Borrowed(string.as_bytes());
|
||||
}
|
||||
|
||||
let bytes = string.as_bytes();
|
||||
let valid_up_to = if output_encoding == encoding_rs::ISO_2022_JP {
|
||||
Encoding::iso_2022_jp_ascii_valid_up_to(bytes)
|
||||
} else {
|
||||
Encoding::ascii_valid_up_to(bytes)
|
||||
};
|
||||
|
||||
if valid_up_to == bytes.len() {
|
||||
// All the bytes are already correctly encoded - we don't need to do anything!
|
||||
return Cow::Borrowed(bytes);
|
||||
}
|
||||
|
||||
let mut encoder = encoding.new_encoder();
|
||||
let mut output = Vec::with_capacity(
|
||||
encoder
|
||||
.max_buffer_length_from_utf8_if_no_unmappables(string.len())
|
||||
.expect("string size would overflow `usize`"),
|
||||
);
|
||||
loop {
|
||||
match encoder.encode_from_utf8_to_vec_without_replacement(string, &mut output, true) {
|
||||
(EncoderResult::InputEmpty, _) => break,
|
||||
(EncoderResult::OutputFull, consumed) => {
|
||||
output.reserve(
|
||||
encoder
|
||||
.max_buffer_length_from_utf8_if_no_unmappables(string.len())
|
||||
.expect("string size would overflow `usize`"),
|
||||
);
|
||||
string = &string[consumed..];
|
||||
},
|
||||
(EncoderResult::Unmappable(character), consumed) => {
|
||||
use std::io::Write;
|
||||
write!(&mut output, "%26%23{}%3B", character as u32).unwrap();
|
||||
string = &string[consumed..];
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
Cow::Owned(output)
|
||||
}
|
|
@ -6,6 +6,7 @@
|
|||
#![crate_name = "servo_url"]
|
||||
#![crate_type = "rlib"]
|
||||
|
||||
pub mod encoding;
|
||||
pub mod origin;
|
||||
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -1,12 +1,3 @@
|
|||
[percent-encoding.window.html]
|
||||
[Input \x0eA with encoding iso-2022-jp]
|
||||
expected: FAIL
|
||||
|
||||
[Input with encoding gb18030]
|
||||
expected: FAIL
|
||||
|
||||
[Input † with encoding big5]
|
||||
expected: FAIL
|
||||
|
||||
[Input U+d800 with encoding windows-1252]
|
||||
expected: FAIL
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue