mirror of
https://github.com/servo/servo.git
synced 2025-06-06 16:45:39 +00:00
dom: Always replace unpaired surrogates when handling page text (#35381)
Background: > JavaScript strings are potentially ill-formed UTF-16 (arbitrary > Vec<u16>) and can contain unpaired surrogates. Rust’s String type is > well-formed UTF-8 and can not contain any surrogate. Surrogates are > never emitted when decoding bytes from the network, but they can sneak > in through document.write, the Element.innerHtml setter, or other DOM > APIs. In 2015, Servo launched an experiment to see if unpaired surrogates cropped up in page content. That experiment caused Servo to panic if unpaired surrogates were encountered with a request to report the page to bug #6564. During that time several pages were reported with unpaired surrogates, causing Servo to panic. In addition, when running the WPT tests Servo will never panic due to the `-Z replace-surrogates` option being passed by the test driver. Motivation: After this 10 year experiment, it's clear that unpaired surrogates are a real concern in page content. Several reports were filed of Servo panicking after encountering them in real world pages. A complete fix for this issue would be to somehow maintain unpaired surrogates in the DOM, but that is a much larger task than simply emitting U+FFD instead of an unpaired surrogate. Since it is clear that this kind of content exists, it is better for Servo to try its best to handle the content rather than crash as production browsers should not crash due to user content when possible. In this change, I modify Servo to always replace unpaired surrogates. It would have been ideal to only crash when debug assertions are enabled, but debug assertions are enabled by default in release mode -- so this wouldn't be effective for WPT tests. Signed-off-by: Martin Robinson <mrobinson@igalia.com>
This commit is contained in:
parent
b483cdb786
commit
75cf3d7265
10 changed files with 17 additions and 90 deletions
|
@ -154,10 +154,6 @@ pub struct DebugOptions {
|
|||
/// Translate mouse input into touch events.
|
||||
pub convert_mouse_to_touch: bool,
|
||||
|
||||
/// Replace unpaires surrogates in DOM strings with U+FFFD.
|
||||
/// See <https://github.com/servo/servo/issues/6564>
|
||||
pub replace_surrogates: bool,
|
||||
|
||||
/// Log GC passes and their durations.
|
||||
pub gc_profile: bool,
|
||||
|
||||
|
@ -184,7 +180,6 @@ impl DebugOptions {
|
|||
"gc-profile" => self.gc_profile = true,
|
||||
"profile-script-events" => self.profile_script_events = true,
|
||||
"relayout-event" => self.relayout_event = true,
|
||||
"replace-surrogates" => self.replace_surrogates = true,
|
||||
"signpost" => self.signpost = true,
|
||||
"dump-style-stats" => self.dump_style_statistics = true,
|
||||
"trace-layout" => self.trace_layout = true,
|
||||
|
|
|
@ -133,15 +133,10 @@ impl CharacterDataMethods<crate::DomTypeHolder> for CharacterData {
|
|||
|
||||
// https://dom.spec.whatwg.org/#dom-characterdata-substringdata
|
||||
fn SubstringData(&self, offset: u32, count: u32) -> Fallible<DOMString> {
|
||||
let replace_surrogates = self
|
||||
.upcast::<Node>()
|
||||
.owner_doc()
|
||||
.window()
|
||||
.replace_surrogates();
|
||||
let data = self.data.borrow();
|
||||
// Step 1.
|
||||
let mut substring = String::new();
|
||||
let remaining = match split_at_utf16_code_unit_offset(&data, offset, replace_surrogates) {
|
||||
let remaining = match split_at_utf16_code_unit_offset(&data, offset) {
|
||||
Ok((_, astral, s)) => {
|
||||
// As if we had split the UTF-16 surrogate pair in half
|
||||
// and then transcoded that to UTF-8 lossily,
|
||||
|
@ -154,7 +149,7 @@ impl CharacterDataMethods<crate::DomTypeHolder> for CharacterData {
|
|||
// Step 2.
|
||||
Err(()) => return Err(Error::IndexSize),
|
||||
};
|
||||
match split_at_utf16_code_unit_offset(remaining, count, replace_surrogates) {
|
||||
match split_at_utf16_code_unit_offset(remaining, count) {
|
||||
// Steps 3.
|
||||
Err(()) => substring += remaining,
|
||||
// Steps 4.
|
||||
|
@ -191,16 +186,11 @@ impl CharacterDataMethods<crate::DomTypeHolder> for CharacterData {
|
|||
fn ReplaceData(&self, offset: u32, count: u32, arg: DOMString) -> ErrorResult {
|
||||
let mut new_data;
|
||||
{
|
||||
let replace_surrogates = self
|
||||
.upcast::<Node>()
|
||||
.owner_doc()
|
||||
.window()
|
||||
.replace_surrogates();
|
||||
let data = self.data.borrow();
|
||||
let prefix;
|
||||
let replacement_before;
|
||||
let remaining;
|
||||
match split_at_utf16_code_unit_offset(&data, offset, replace_surrogates) {
|
||||
match split_at_utf16_code_unit_offset(&data, offset) {
|
||||
Ok((p, astral, r)) => {
|
||||
prefix = p;
|
||||
// As if we had split the UTF-16 surrogate pair in half
|
||||
|
@ -214,7 +204,7 @@ impl CharacterDataMethods<crate::DomTypeHolder> for CharacterData {
|
|||
};
|
||||
let replacement_after;
|
||||
let suffix;
|
||||
match split_at_utf16_code_unit_offset(remaining, count, replace_surrogates) {
|
||||
match split_at_utf16_code_unit_offset(remaining, count) {
|
||||
// Steps 3.
|
||||
Err(()) => {
|
||||
replacement_after = "";
|
||||
|
@ -318,17 +308,7 @@ impl<'dom> LayoutCharacterDataHelpers<'dom> for LayoutDom<'dom, CharacterData> {
|
|||
/// The two string slices are such that:
|
||||
/// `before == s.to_utf16()[..offset - 1].to_utf8()` and
|
||||
/// `after == s.to_utf16()[offset + 1..].to_utf8()`
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Note that the third variant is only ever returned when the `-Z replace-surrogates`
|
||||
/// command-line option is specified.
|
||||
/// When it *would* be returned but the option is *not* specified, this function panics.
|
||||
fn split_at_utf16_code_unit_offset(
|
||||
s: &str,
|
||||
offset: u32,
|
||||
replace_surrogates: bool,
|
||||
) -> Result<(&str, Option<char>, &str), ()> {
|
||||
fn split_at_utf16_code_unit_offset(s: &str, offset: u32) -> Result<(&str, Option<char>, &str), ()> {
|
||||
let mut code_units = 0;
|
||||
for (i, c) in s.char_indices() {
|
||||
if code_units == offset {
|
||||
|
@ -338,17 +318,9 @@ fn split_at_utf16_code_unit_offset(
|
|||
code_units += 1;
|
||||
if c > '\u{FFFF}' {
|
||||
if code_units == offset {
|
||||
if replace_surrogates {
|
||||
debug_assert_eq!(c.len_utf8(), 4);
|
||||
return Ok((&s[..i], Some(c), &s[i + c.len_utf8()..]));
|
||||
}
|
||||
panic!(
|
||||
"\n\n\
|
||||
Would split a surrogate pair in CharacterData API.\n\
|
||||
If you see this in real content, please comment with the URL\n\
|
||||
on https://github.com/servo/servo/issues/6873\n\
|
||||
\n"
|
||||
);
|
||||
debug_assert_eq!(c.len_utf8(), 4);
|
||||
warn!("Splitting a surrogate pair in CharacterData API.");
|
||||
return Ok((&s[..i], Some(c), &s[i + c.len_utf8()..]));
|
||||
}
|
||||
code_units += 1;
|
||||
}
|
||||
|
|
|
@ -374,10 +374,6 @@ pub(crate) struct Window {
|
|||
/// won't be loaded.
|
||||
userscripts_path: Option<String>,
|
||||
|
||||
/// Replace unpaired surrogates in DOM strings with U+FFFD.
|
||||
/// See <https://github.com/servo/servo/issues/6564>
|
||||
replace_surrogates: bool,
|
||||
|
||||
/// Window's GL context from application
|
||||
#[ignore_malloc_size_of = "defined in script_thread"]
|
||||
#[no_trace]
|
||||
|
@ -619,10 +615,6 @@ impl Window {
|
|||
self.userscripts_path.clone()
|
||||
}
|
||||
|
||||
pub(crate) fn replace_surrogates(&self) -> bool {
|
||||
self.replace_surrogates
|
||||
}
|
||||
|
||||
pub(crate) fn get_player_context(&self) -> WindowGLContext {
|
||||
self.player_context.clone()
|
||||
}
|
||||
|
@ -2778,7 +2770,6 @@ impl Window {
|
|||
unminify_css: bool,
|
||||
local_script_source: Option<String>,
|
||||
userscripts_path: Option<String>,
|
||||
replace_surrogates: bool,
|
||||
user_agent: Cow<'static, str>,
|
||||
player_context: WindowGLContext,
|
||||
#[cfg(feature = "webgpu")] gpu_id_hub: Arc<IdentityHub>,
|
||||
|
@ -2864,7 +2855,6 @@ impl Window {
|
|||
prepare_for_screenshot,
|
||||
unminify_css,
|
||||
userscripts_path,
|
||||
replace_surrogates,
|
||||
player_context,
|
||||
throttled: Cell::new(false),
|
||||
layout_marker: DomRefCell::new(Rc::new(Cell::new(true))),
|
||||
|
|
|
@ -318,10 +318,6 @@ pub struct ScriptThread {
|
|||
/// won't be loaded
|
||||
userscripts_path: Option<String>,
|
||||
|
||||
/// Replace unpaired surrogates in DOM strings with U+FFFD.
|
||||
/// See <https://github.com/servo/servo/issues/6564>
|
||||
replace_surrogates: bool,
|
||||
|
||||
/// An optional string allowing the user agent to be set for testing.
|
||||
user_agent: Cow<'static, str>,
|
||||
|
||||
|
@ -957,7 +953,6 @@ impl ScriptThread {
|
|||
local_script_source: opts.local_script_source.clone(),
|
||||
unminify_css: opts.unminify_css,
|
||||
userscripts_path: opts.userscripts.clone(),
|
||||
replace_surrogates: opts.debug.replace_surrogates,
|
||||
user_agent,
|
||||
player_context: state.player_context,
|
||||
node_ids: Default::default(),
|
||||
|
@ -3146,7 +3141,6 @@ impl ScriptThread {
|
|||
self.unminify_css,
|
||||
self.local_script_source.clone(),
|
||||
self.userscripts_path.clone(),
|
||||
self.replace_surrogates,
|
||||
self.user_agent.clone(),
|
||||
self.player_context.clone(),
|
||||
#[cfg(feature = "webgpu")]
|
||||
|
|
|
@ -17,7 +17,6 @@ use js::jsval::{ObjectValue, StringValue};
|
|||
use js::rust::{
|
||||
get_object_class, is_dom_class, maybe_wrap_value, HandleValue, MutableHandleValue, ToString,
|
||||
};
|
||||
use servo_config::opts;
|
||||
|
||||
use crate::inheritance::Castable;
|
||||
use crate::reflector::Reflector;
|
||||
|
@ -97,24 +96,8 @@ pub unsafe fn jsstring_to_str(cx: *mut JSContext, s: ptr::NonNull<JSString>) ->
|
|||
match item {
|
||||
Ok(c) => s.push(c),
|
||||
Err(_) => {
|
||||
// FIXME: Add more info like document URL in the message?
|
||||
macro_rules! message {
|
||||
() => {
|
||||
"Found an unpaired surrogate in a DOM string. \
|
||||
If you see this in real web content, \
|
||||
please comment on https://github.com/servo/servo/issues/6564"
|
||||
};
|
||||
}
|
||||
if opts::get().debug.replace_surrogates {
|
||||
error!(message!());
|
||||
s.push('\u{FFFD}');
|
||||
} else {
|
||||
panic!(concat!(
|
||||
message!(),
|
||||
" Use `-Z replace-surrogates` \
|
||||
on the command line to make this non-fatal."
|
||||
));
|
||||
}
|
||||
error!("Found an unpaired surrogate in a DOM string.");
|
||||
s.push('\u{FFFD}');
|
||||
},
|
||||
}
|
||||
}
|
||||
|
|
|
@ -174,11 +174,9 @@ pub fn is_token(s: &[u8]) -> bool {
|
|||
/// unpaired surrogates.
|
||||
///
|
||||
/// The hypothesis is that it does not matter much how exactly those values are
|
||||
/// transformed, because passing unpaired surrogates into the DOM is very rare.
|
||||
/// In order to test this hypothesis, Servo will panic when encountering any
|
||||
/// unpaired surrogates on conversion to `DOMString` by default. (The command
|
||||
/// line option `-Z replace-surrogates` instead causes Servo to replace the
|
||||
/// unpaired surrogate by a U+FFFD replacement character.)
|
||||
/// transformed, because passing unpaired surrogates into the DOM is very rare.
|
||||
/// Instead Servo withh replace the unpaired surrogate by a U+FFFD replacement
|
||||
/// character.
|
||||
///
|
||||
/// Currently, the lack of crash reports about this issue provides some
|
||||
/// evidence to support the hypothesis. This evidence will hopefully be used to
|
||||
|
|
|
@ -679,7 +679,6 @@ fn print_debug_options_usage(app: &str) {
|
|||
"relayout-event",
|
||||
"Print notifications when there is a relayout.",
|
||||
);
|
||||
print_option("replace-surrogates", "Replace unpaires surrogates in DOM strings with U+FFFD. See https://github.com/servo/servo/issues/6564");
|
||||
print_option(
|
||||
"show-fragment-borders",
|
||||
"Paint borders along fragment boundaries.",
|
||||
|
|
4
tests/wpt/meta/MANIFEST.json
vendored
4
tests/wpt/meta/MANIFEST.json
vendored
|
@ -505830,7 +505830,7 @@
|
|||
[]
|
||||
],
|
||||
"servodriver.py": [
|
||||
"4b82230bddddb56e7938d635feea55ba19c3d903",
|
||||
"18035f9331e0d5df1c3fa916ce33a8866bea091a",
|
||||
[]
|
||||
],
|
||||
"webkit.py": [
|
||||
|
@ -505892,7 +505892,7 @@
|
|||
[]
|
||||
],
|
||||
"executorservo.py": [
|
||||
"bc2021e1f5c05d55ce5a2a3fa35518efdd616415",
|
||||
"e3369d24ebc6d6aac1b4632ef673a36602417745",
|
||||
[]
|
||||
],
|
||||
"executorservodriver.py": [
|
||||
|
|
|
@ -99,7 +99,6 @@ class ServoWebDriverBrowser(WebDriverBrowser):
|
|||
# For some reason rustls does not like the certificate generated by the WPT tooling.
|
||||
"--ignore-certificate-errors",
|
||||
"--window-size", "800x600",
|
||||
"-Z", "replace-surrogates",
|
||||
"data:,",
|
||||
]
|
||||
|
||||
|
|
|
@ -80,7 +80,7 @@ class ServoExecutor(ProcessTestExecutor):
|
|||
self.logger.error("Unable to find wpt-prefs.json")
|
||||
return default_path
|
||||
|
||||
def build_servo_command(self, test, extra_args=None, debug_opts="replace-surrogates"):
|
||||
def build_servo_command(self, test, extra_args=None):
|
||||
args = [
|
||||
"--hard-fail", "-u", "Servo/wptrunner",
|
||||
# See https://github.com/servo/servo/issues/30080.
|
||||
|
@ -88,8 +88,6 @@ class ServoExecutor(ProcessTestExecutor):
|
|||
"--ignore-certificate-errors",
|
||||
"-z", self.test_url(test),
|
||||
]
|
||||
if debug_opts:
|
||||
args += ["-Z", debug_opts]
|
||||
for stylesheet in self.browser.user_stylesheets:
|
||||
args += ["--user-stylesheet", stylesheet]
|
||||
for pref, value in self.environment.get('prefs', {}).items():
|
||||
|
@ -231,12 +229,11 @@ class ServoRefTestExecutor(ServoExecutor):
|
|||
extra_args = ["--exit",
|
||||
"--output=%s" % output_path,
|
||||
"--window-size", viewport_size or "800x600"]
|
||||
debug_opts = "replace-surrogates"
|
||||
|
||||
if dpi:
|
||||
extra_args += ["--device-pixel-ratio", dpi]
|
||||
|
||||
self.command = self.build_servo_command(test, extra_args, debug_opts)
|
||||
self.command = self.build_servo_command(test, extra_args)
|
||||
|
||||
if not self.interactive:
|
||||
self.proc = ProcessHandler(self.command,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue