mirror of
https://github.com/servo/servo.git
synced 2025-06-28 11:03:39 +01:00
Handle nonmappable code points in Document::encoding_parse_a_url (#37541)
This is a followup to https://github.com/servo/servo/pull/33825. Using `Encoder::encode` introduced a subtle bug: That function will silently replace nonmappable code points (such as `㐀` in euc-jp). The url spec however expects nonmappable characters to be treated differently. There is actually an open bug in the `rust-url` repo about this: https://github.com/servo/rust-url/issues/649, with the conclusion apparently being that this should not be implemented by the url crate itself. Gecko implementation of the equivalent algorithm for reference: https://searchfox.org/mozilla-central/rev/d52edf7ea4236446e118a2edc815023c5479663f/netwerk/base/nsStandardURL.cpp#116-172. Testing: More web platform tests pass Part of https://github.com/servo/servo/issues/5601 --------- Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
This commit is contained in:
parent
3a54ddd034
commit
a27c9ee691
14 changed files with 65 additions and 294148 deletions
59
components/url/encoding.rs
Normal file
59
components/url/encoding.rs
Normal file
|
@ -0,0 +1,59 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
|
||||
|
||||
use std::borrow::Cow;
|
||||
|
||||
use encoding_rs::{EncoderResult, Encoding, UTF_8};
|
||||
|
||||
/// This is equivalent to [Encoding::encode], except nonmappable code points are handled
|
||||
/// according to the url specification, which expects nonmappable code points to be wrapped in `%26%23` and
|
||||
/// `%3B` (see [percent encode after encoding](https://url.spec.whatwg.org/#string-percent-encode-after-encoding)).
|
||||
pub fn encode_as_url_query_string<'a>(
|
||||
mut string: &'a str,
|
||||
encoding: &'static Encoding,
|
||||
) -> Cow<'a, [u8]> {
|
||||
let output_encoding = encoding.output_encoding();
|
||||
if output_encoding == UTF_8 {
|
||||
return Cow::Borrowed(string.as_bytes());
|
||||
}
|
||||
|
||||
let bytes = string.as_bytes();
|
||||
let valid_up_to = if output_encoding == encoding_rs::ISO_2022_JP {
|
||||
Encoding::iso_2022_jp_ascii_valid_up_to(bytes)
|
||||
} else {
|
||||
Encoding::ascii_valid_up_to(bytes)
|
||||
};
|
||||
|
||||
if valid_up_to == bytes.len() {
|
||||
// All the bytes are already correctly encoded - we don't need to do anything!
|
||||
return Cow::Borrowed(bytes);
|
||||
}
|
||||
|
||||
let mut encoder = encoding.new_encoder();
|
||||
let mut output = Vec::with_capacity(
|
||||
encoder
|
||||
.max_buffer_length_from_utf8_if_no_unmappables(string.len())
|
||||
.expect("string size would overflow `usize`"),
|
||||
);
|
||||
loop {
|
||||
match encoder.encode_from_utf8_to_vec_without_replacement(string, &mut output, true) {
|
||||
(EncoderResult::InputEmpty, _) => break,
|
||||
(EncoderResult::OutputFull, consumed) => {
|
||||
output.reserve(
|
||||
encoder
|
||||
.max_buffer_length_from_utf8_if_no_unmappables(string.len())
|
||||
.expect("string size would overflow `usize`"),
|
||||
);
|
||||
string = &string[consumed..];
|
||||
},
|
||||
(EncoderResult::Unmappable(character), consumed) => {
|
||||
use std::io::Write;
|
||||
write!(&mut output, "%26%23{}%3B", character as u32).unwrap();
|
||||
string = &string[consumed..];
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
Cow::Owned(output)
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue