servo/components/url/encoding.rs
Simon Wülker a27c9ee691
Handle nonmappable code points in Document::encoding_parse_a_url (#37541)
This is a follow-up to https://github.com/servo/servo/pull/33825. Using
`Encoding::encode` introduced a subtle bug: that function silently
replaces nonmappable code points (such as `㐀` in euc-jp). The URL spec,
however, expects nonmappable characters to be treated differently. There
is actually an open bug in the `rust-url` repo about this:
https://github.com/servo/rust-url/issues/649, with the conclusion
apparently being that this should not be implemented by the url crate
itself.

Gecko implementation of the equivalent algorithm for reference:
https://searchfox.org/mozilla-central/rev/d52edf7ea4236446e118a2edc815023c5479663f/netwerk/base/nsStandardURL.cpp#116-172.

Testing: More web platform tests pass

Part of https://github.com/servo/servo/issues/5601

---------

Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
2025-06-19 10:14:45 +00:00

59 lines
2.2 KiB
Rust

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
use std::borrow::Cow;
use encoding_rs::{EncoderResult, Encoding, UTF_8};
/// Encodes `string` with the output encoding of `encoding` for use in a URL query string.
///
/// This mirrors [Encoding::encode], except that a code point the encoder reports as
/// unmappable is written as `%26%23`, followed by the code point's decimal value,
/// followed by `%3B`, which is what the url specification expects (see
/// [percent encode after encoding](https://url.spec.whatwg.org/#string-percent-encode-after-encoding)).
///
/// The bytes of `string` are returned borrowed, without allocating, when the output
/// encoding is UTF-8 or when the encoding's ASCII-validity check covers the whole input.
/// Otherwise a freshly allocated buffer is returned.
///
/// # Panics
///
/// Panics if the encoder cannot provide a maximum buffer length for the input
/// (`max_buffer_length_from_utf8_if_no_unmappables` returns `None`).
pub fn encode_as_url_query_string<'a>(
    mut string: &'a str,
    encoding: &'static Encoding,
) -> Cow<'a, [u8]> {
    use std::io::Write;

    let target = encoding.output_encoding();
    if target == UTF_8 {
        return Cow::Borrowed(string.as_bytes());
    }

    let utf8 = string.as_bytes();
    let ascii_prefix_len = if target == encoding_rs::ISO_2022_JP {
        Encoding::iso_2022_jp_ascii_valid_up_to(utf8)
    } else {
        Encoding::ascii_valid_up_to(utf8)
    };
    if ascii_prefix_len == utf8.len() {
        // The check accepted every byte, so the input can be handed back untouched.
        return Cow::Borrowed(utf8);
    }

    let mut encoder = encoding.new_encoder();
    let initial_capacity = encoder
        .max_buffer_length_from_utf8_if_no_unmappables(string.len())
        .expect("string size would overflow `usize`");
    let mut encoded: Vec<u8> = Vec::with_capacity(initial_capacity);

    loop {
        let (status, consumed) =
            encoder.encode_from_utf8_to_vec_without_replacement(string, &mut encoded, true);
        match status {
            EncoderResult::InputEmpty => return Cow::Owned(encoded),
            EncoderResult::OutputFull => {
                // The size estimate does not account for the escapes written for
                // unmappable code points, so the buffer may need to grow.
                let additional = encoder
                    .max_buffer_length_from_utf8_if_no_unmappables(string.len())
                    .expect("string size would overflow `usize`");
                encoded.reserve(additional);
            },
            EncoderResult::Unmappable(character) => {
                // Percent-encoded form of the numeric character reference `&#<decimal>;`.
                write!(&mut encoded, "%26%23{}%3B", u32::from(character)).unwrap();
            },
        }
        // Resume after whatever the encoder has consumed so far.
        string = &string[consumed..];
    }
}