Auto merge of #10796 - servo:character-data-surrogates, r=nox

Make /dom/nodes/CharacterData-surrogates.html not panic. The test now fails instead of crashing, since `DOMString` is currently based on `std::string::String` on the Rust side, which is strictly well-formed UTF-8 and cannot contain unpaired surrogate code points. Fixes #10780. r? @Ms2ger  --- This change is [<img src="https://reviewable.io/review_button.svg" height="35" align="absmiddle" alt="Reviewable"/>](https://reviewable.io/reviews/servo/servo/10796)
2025-10-02 17:49:16 +01:00 · 2016-05-24 15:59:31 -07:00 · 2016-05-24 15:59:31 -07:00 · a04e30d247
commit a04e30d247
parent 2a2b88f42c c11a3b958d
2 changed files with 116 additions and 25 deletions
--- a/components/script/dom/characterdata.rs
+++ b/components/script/dom/characterdata.rs
@ -20,6 +20,7 @@ use dom::node::{Node, NodeDamage};
 use dom::processinginstruction::ProcessingInstruction;
 use dom::text::Text;
 use std::cell::Ref;
 use util::opts;
 // https://dom.spec.whatwg.org/#characterdata
 #[dom_struct]
@ -94,16 +95,34 @@ impl CharacterDataMethods for CharacterData {
    fn SubstringData(&self, offset: u32, count: u32) -> Fallible<DOMString> {
        let data = self.data.borrow();
        // Step 1.
-        let data_from_offset = match find_utf16_code_unit_offset(&data, offset) {
+        let mut substring = String::new();
-            Some(offset_bytes) => &data[offset_bytes..],
+        let remaining;
        match split_at_utf16_code_unit_offset(&data, offset) {
            Ok((_, astral, s)) => {
                // As if we had split the UTF-16 surrogate pair in half
                // and then transcoded that to UTF-8 lossily,
                // since our DOMString is currently strict UTF-8.
                if astral.is_some() {
                    substring = substring + "\u{FFFD}";
                }
                remaining = s;
            }
            // Step 2.
-            None => return Err(Error::IndexSize),
+            Err(()) => return Err(Error::IndexSize),
-        };
+        }
-        let substring = match find_utf16_code_unit_offset(data_from_offset, count) {
+        match split_at_utf16_code_unit_offset(remaining, count) {
            // Step 3.
-            None => data_from_offset,
+            Err(()) => substring = substring + remaining,
            // Step 4.
-            Some(count_bytes) => &data_from_offset[..count_bytes],
+            Ok((s, astral, _)) => {
                substring = substring + s;
                // As if we had split the UTF-16 surrogate pair in half
                // and then transcoded that to UTF-8 lossily,
                // since our DOMString is currently strict UTF-8.
                if astral.is_some() {
                    substring = substring + "\u{FFFD}";
                }
            }
        };
        Ok(DOMString::from(substring))
    }
@ -126,26 +145,54 @@ impl CharacterDataMethods for CharacterData {
    // https://dom.spec.whatwg.org/#dom-characterdata-replacedata
    fn ReplaceData(&self, offset: u32, count: u32, arg: DOMString) -> ErrorResult {
-        let new_data = {
+        let mut new_data;
        {
            let data = self.data.borrow();
-            let (prefix, data_from_offset) = match find_utf16_code_unit_offset(&data, offset) {
+            let prefix;
-                Some(offset_bytes) => data.split_at(offset_bytes),
+            let replacement_before;
            let remaining;
            match split_at_utf16_code_unit_offset(&data, offset) {
                Ok((p, astral, r)) => {
                    prefix = p;
                    // As if we had split the UTF-16 surrogate pair in half
                    // and then transcoded that to UTF-8 lossily,
                    // since our DOMString is currently strict UTF-8.
                    replacement_before = if astral.is_some() { "\u{FFFD}" } else { "" };
                    remaining = r;
                }
                // Step 2.
-                None => return Err(Error::IndexSize),
+                Err(()) => return Err(Error::IndexSize),
            };
-            let suffix = match find_utf16_code_unit_offset(data_from_offset, count) {
+            let replacement_after;
            let suffix;
            match split_at_utf16_code_unit_offset(remaining, count) {
                // Step 3.
-                None => "",
+                Err(()) => {
-                Some(count_bytes) => &data_from_offset[count_bytes..],
+                    replacement_after = "";
                    suffix = "";
                }
                Ok((_, astral, s)) => {
                    // As if we had split the UTF-16 surrogate pair in half
                    // and then transcoded that to UTF-8 lossily,
                    // since our DOMString is currently strict UTF-8.
                    replacement_after = if astral.is_some() { "\u{FFFD}" } else { "" };
                    suffix = s;
                }
            };
            // Step 4: Mutation observers.
            // Step 5 to 7.
-            let mut new_data = String::with_capacity(prefix.len() + arg.len() + suffix.len());
+            new_data = String::with_capacity(
                prefix.len() +
                replacement_before.len() +
                arg.len() +
                replacement_after.len() +
                suffix.len());
            new_data.push_str(prefix);
            new_data.push_str(replacement_before);
            new_data.push_str(&arg);
            new_data.push_str(replacement_after);
            new_data.push_str(suffix);
-            new_data
+        }
        };
        *self.data.borrow_mut() = DOMString::from(new_data);
        self.content_changed();
        // Steps 8-11.
@ -200,19 +247,40 @@ impl LayoutCharacterDataHelpers for LayoutJS<CharacterData> {
    }
 }
-/// Given a number of UTF-16 code units from the start of the given string,
+/// Split the given string at the given position measured in UTF-16 code units from the start.
 /// return the corresponding number of UTF-8 bytes.
 ///
-/// s[find_utf16_code_unit_offset(s, o).unwrap()..] == s.to_utf16()[o..].to_utf8()
+/// * `Err(())` indicates that `offset` is after the end of the string
-fn find_utf16_code_unit_offset(s: &str, offset: u32) -> Option<usize> {
+/// * `Ok((before, None, after))` indicates that `offset` is between Unicode code points.
 ///   The two string slices are such that:
 ///   `before == s.to_utf16()[..offset].to_utf8()` and
 ///   `after == s.to_utf16()[offset..].to_utf8()`
 /// * `Ok((before, Some(ch), after))` indicates that `offset` is "in the middle"
 ///   of a single Unicode code point that would be represented in UTF-16 by a surrogate pair
 ///   of two 16-bit code units.
 ///   `ch` is that code point.
 ///   The two string slices are such that:
 ///   `before == s.to_utf16()[..offset - 1].to_utf8()` and
 ///   `after == s.to_utf16()[offset + 1..].to_utf8()`
 ///
 /// # Panics
 ///
 /// Note that the third variant is only ever returned when the `-Z replace-surrogates`
 /// command-line option is specified.
 /// When it *would* be returned but the option is *not* specified, this function panics.
 fn split_at_utf16_code_unit_offset(s: &str, offset: u32) -> Result<(&str, Option<char>, &str), ()> {
    let mut code_units = 0;
    for (i, c) in s.char_indices() {
        if code_units == offset {
-            return Some(i);
+            let (a, b) = s.split_at(i);
            return Ok((a, None, b));
        }
        code_units += 1;
        if c > '\u{FFFF}' {
            if code_units == offset {
                if opts::get().replace_surrogates {
                    debug_assert!(c.len_utf8() == 4);
                    return Ok((&s[..i], Some(c), &s[i + c.len_utf8()..]))
                }
                panic!("\n\n\
                    Would split a surrogate pair in CharacterData API.\n\
                    If you see this in real content, please comment with the URL\n\
@ -223,8 +291,8 @@ fn find_utf16_code_unit_offset(s: &str, offset: u32) -> Option<usize> {
        }
    }
    if code_units == offset {
-        Some(s.len())
+        Ok((s, None, ""))
    } else {
-        None
+        Err(())
    }
 }
--- a/tests/wpt/metadata/dom/nodes/CharacterData-surrogates.html.ini
+++ b/tests/wpt/metadata/dom/nodes/CharacterData-surrogates.html.ini
@ -1,3 +1,26 @@
 [CharacterData-surrogates.html]
  type: testharness
-  expected: CRASH
+  [Text.substringData() splitting surrogate pairs]
    expected: FAIL
  [Text.replaceData() splitting and creating surrogate pairs]
    expected: FAIL
  [Text.deleteData() splitting and creating surrogate pairs]
    expected: FAIL
  [Text.insertData() splitting and creating surrogate pairs]
    expected: FAIL
  [Comment.substringData() splitting surrogate pairs]
    expected: FAIL
  [Comment.replaceData() splitting and creating surrogate pairs]
    expected: FAIL
  [Comment.deleteData() splitting and creating surrogate pairs]
    expected: FAIL
  [Comment.insertData() splitting and creating surrogate pairs]
    expected: FAIL