Handle nonmappable code points in Document::encoding_parse_a_url (#37541)

This is a followup to https://github.com/servo/servo/pull/33825. Using
`Encoding::encode` introduced a subtle bug: that function will silently
replace nonmappable code points (such as `㐀` in euc-jp). The URL spec,
however, expects nonmappable characters to be treated differently. There
is actually an open bug in the `rust-url` repo about this:
https://github.com/servo/rust-url/issues/649, with the conclusion
apparently being that this should not be implemented by the url crate
itself.

Gecko implementation of the equivalent algorithm for reference:
https://searchfox.org/mozilla-central/rev/d52edf7ea4236446e118a2edc815023c5479663f/netwerk/base/nsStandardURL.cpp#116-172.

Testing: More web platform tests pass

Part of https://github.com/servo/servo/issues/5601

---------

Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
This commit is contained in:
Simon Wülker 2025-06-19 12:14:45 +02:00 committed by GitHub
parent 3a54ddd034
commit a27c9ee691
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 65 additions and 294148 deletions

1
Cargo.lock generated
View file

@ -7174,6 +7174,7 @@ dependencies = [
name = "servo_url"
version = "0.0.1"
dependencies = [
"encoding_rs",
"malloc_size_of_derive",
"serde",
"servo_arc",

View file

@ -3672,7 +3672,9 @@ impl Document {
// Step 5. Return the result of applying the URL parser to url, with baseURL and encoding.
url::Url::options()
.base_url(Some(base_url.as_url()))
.encoding_override(Some(&|s| encoding.encode(s).0))
.encoding_override(Some(&|input| {
servo_url::encoding::encode_as_url_query_string(input, encoding)
}))
.parse(url)
.map(ServoUrl::from)
}

View file

@ -12,6 +12,7 @@ name = "servo_url"
path = "lib.rs"
[dependencies]
encoding_rs = { workspace = true }
malloc_size_of = { workspace = true }
malloc_size_of_derive = { workspace = true }
serde = { workspace = true, features = ["derive"] }

View file

@ -0,0 +1,59 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
use std::borrow::Cow;
use encoding_rs::{EncoderResult, Encoding, UTF_8};
/// Encodes `string` with the output encoding of `encoding`, for use in a URL query string.
///
/// This behaves like [Encoding::encode], except for code points the target encoding cannot
/// represent: each one is emitted as `%26%23`, followed by its value in decimal, followed by
/// `%3B`, which is what the url specification expects (see
/// [percent encode after encoding](https://url.spec.whatwg.org/#string-percent-encode-after-encoding)).
///
/// The input bytes are returned borrowed when no transcoding is required; otherwise the
/// encoded bytes are returned in a freshly allocated buffer.
///
/// # Panics
///
/// Panics if the worst-case output length for `string` does not fit in a `usize`.
pub fn encode_as_url_query_string<'a>(
    mut string: &'a str,
    encoding: &'static Encoding,
) -> Cow<'a, [u8]> {
    let target = encoding.output_encoding();
    let raw = string.as_bytes();

    // The input can be handed back untouched when the target is UTF-8 (a `&str` already is
    // UTF-8) or when the encoding-specific ASCII scan covers the entire input.
    let usable_as_is = target == UTF_8 || {
        let ascii_prefix_len = if target == encoding_rs::ISO_2022_JP {
            Encoding::iso_2022_jp_ascii_valid_up_to(raw)
        } else {
            Encoding::ascii_valid_up_to(raw)
        };
        ascii_prefix_len == raw.len()
    };
    if usable_as_is {
        return Cow::Borrowed(raw);
    }

    let mut encoder = encoding.new_encoder();
    let initial_capacity = encoder
        .max_buffer_length_from_utf8_if_no_unmappables(string.len())
        .expect("string size would overflow `usize`");
    let mut output = Vec::with_capacity(initial_capacity);

    loop {
        let (status, consumed) =
            encoder.encode_from_utf8_to_vec_without_replacement(string, &mut output, true);

        match status {
            // Everything has been encoded.
            EncoderResult::InputEmpty => break,
            // The escape sequences written for unmappable code points are not part of the
            // initial size estimate, so the buffer can run out of room: grow it and go on.
            EncoderResult::OutputFull => {
                let additional = encoder
                    .max_buffer_length_from_utf8_if_no_unmappables(string.len())
                    .expect("string size would overflow `usize`");
                output.reserve(additional);
            },
            // Write the percent-encoded form of `&#<decimal code point>;` in place of the
            // code point that the encoder could not map.
            EncoderResult::Unmappable(code_point) => {
                output.extend_from_slice(
                    format!("%26%23{}%3B", u32::from(code_point)).as_bytes(),
                );
            },
        }

        // Resume right after the part of the input that the encoder has already read.
        string = &string[consumed..];
    }

    Cow::Owned(output)
}

View file

@ -6,6 +6,7 @@
#![crate_name = "servo_url"]
#![crate_type = "rlib"]
pub mod encoding;
pub mod origin;
use std::collections::hash_map::DefaultHasher;

View file

@ -1,12 +1,3 @@
[percent-encoding.window.html]
[Input \x0eA with encoding iso-2022-jp]
expected: FAIL
[Input  with encoding gb18030]
expected: FAIL
[Input † with encoding big5]
expected: FAIL
[Input U+d800 with encoding windows-1252]
expected: FAIL