mirror of
https://github.com/servo/servo.git
synced 2025-06-12 18:34:39 +00:00
Auto merge of #10796 - servo:character-data-surrogates, r=nox
Make /dom/nodes/CharacterData-surrogates.html not panic. It now fails instead, since `DOMString` is currently based on `std::string::String` on the Rust side, which is strictly well-formed UTF-8 and cannot contain unpaired surrogate code points. Fixes #10780. r? @Ms2ger <!-- Reviewable:start --> --- This change is [<img src="https://reviewable.io/review_button.svg" height="35" align="absmiddle" alt="Reviewable"/>](https://reviewable.io/reviews/servo/servo/10796) <!-- Reviewable:end -->
This commit is contained in:
commit
a04e30d247
2 changed files with 116 additions and 25 deletions
|
@ -20,6 +20,7 @@ use dom::node::{Node, NodeDamage};
|
||||||
use dom::processinginstruction::ProcessingInstruction;
|
use dom::processinginstruction::ProcessingInstruction;
|
||||||
use dom::text::Text;
|
use dom::text::Text;
|
||||||
use std::cell::Ref;
|
use std::cell::Ref;
|
||||||
|
use util::opts;
|
||||||
|
|
||||||
// https://dom.spec.whatwg.org/#characterdata
|
// https://dom.spec.whatwg.org/#characterdata
|
||||||
#[dom_struct]
|
#[dom_struct]
|
||||||
|
@ -94,16 +95,34 @@ impl CharacterDataMethods for CharacterData {
|
||||||
fn SubstringData(&self, offset: u32, count: u32) -> Fallible<DOMString> {
|
fn SubstringData(&self, offset: u32, count: u32) -> Fallible<DOMString> {
|
||||||
let data = self.data.borrow();
|
let data = self.data.borrow();
|
||||||
// Step 1.
|
// Step 1.
|
||||||
let data_from_offset = match find_utf16_code_unit_offset(&data, offset) {
|
let mut substring = String::new();
|
||||||
Some(offset_bytes) => &data[offset_bytes..],
|
let remaining;
|
||||||
|
match split_at_utf16_code_unit_offset(&data, offset) {
|
||||||
|
Ok((_, astral, s)) => {
|
||||||
|
// As if we had split the UTF-16 surrogate pair in half
|
||||||
|
// and then transcoded that to UTF-8 lossily,
|
||||||
|
// since our DOMString is currently strict UTF-8.
|
||||||
|
if astral.is_some() {
|
||||||
|
substring = substring + "\u{FFFD}";
|
||||||
|
}
|
||||||
|
remaining = s;
|
||||||
|
}
|
||||||
// Step 2.
|
// Step 2.
|
||||||
None => return Err(Error::IndexSize),
|
Err(()) => return Err(Error::IndexSize),
|
||||||
};
|
}
|
||||||
let substring = match find_utf16_code_unit_offset(data_from_offset, count) {
|
match split_at_utf16_code_unit_offset(remaining, count) {
|
||||||
// Step 3.
|
// Step 3.
|
||||||
None => data_from_offset,
|
Err(()) => substring = substring + remaining,
|
||||||
// Step 4.
|
// Step 4.
|
||||||
Some(count_bytes) => &data_from_offset[..count_bytes],
|
Ok((s, astral, _)) => {
|
||||||
|
substring = substring + s;
|
||||||
|
// As if we had split the UTF-16 surrogate pair in half
|
||||||
|
// and then transcoded that to UTF-8 lossily,
|
||||||
|
// since our DOMString is currently strict UTF-8.
|
||||||
|
if astral.is_some() {
|
||||||
|
substring = substring + "\u{FFFD}";
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
Ok(DOMString::from(substring))
|
Ok(DOMString::from(substring))
|
||||||
}
|
}
|
||||||
|
@ -126,26 +145,54 @@ impl CharacterDataMethods for CharacterData {
|
||||||
|
|
||||||
// https://dom.spec.whatwg.org/#dom-characterdata-replacedata
|
// https://dom.spec.whatwg.org/#dom-characterdata-replacedata
|
||||||
fn ReplaceData(&self, offset: u32, count: u32, arg: DOMString) -> ErrorResult {
|
fn ReplaceData(&self, offset: u32, count: u32, arg: DOMString) -> ErrorResult {
|
||||||
let new_data = {
|
let mut new_data;
|
||||||
|
{
|
||||||
let data = self.data.borrow();
|
let data = self.data.borrow();
|
||||||
let (prefix, data_from_offset) = match find_utf16_code_unit_offset(&data, offset) {
|
let prefix;
|
||||||
Some(offset_bytes) => data.split_at(offset_bytes),
|
let replacement_before;
|
||||||
|
let remaining;
|
||||||
|
match split_at_utf16_code_unit_offset(&data, offset) {
|
||||||
|
Ok((p, astral, r)) => {
|
||||||
|
prefix = p;
|
||||||
|
// As if we had split the UTF-16 surrogate pair in half
|
||||||
|
// and then transcoded that to UTF-8 lossily,
|
||||||
|
// since our DOMString is currently strict UTF-8.
|
||||||
|
replacement_before = if astral.is_some() { "\u{FFFD}" } else { "" };
|
||||||
|
remaining = r;
|
||||||
|
}
|
||||||
// Step 2.
|
// Step 2.
|
||||||
None => return Err(Error::IndexSize),
|
Err(()) => return Err(Error::IndexSize),
|
||||||
};
|
};
|
||||||
let suffix = match find_utf16_code_unit_offset(data_from_offset, count) {
|
let replacement_after;
|
||||||
|
let suffix;
|
||||||
|
match split_at_utf16_code_unit_offset(remaining, count) {
|
||||||
// Step 3.
|
// Step 3.
|
||||||
None => "",
|
Err(()) => {
|
||||||
Some(count_bytes) => &data_from_offset[count_bytes..],
|
replacement_after = "";
|
||||||
|
suffix = "";
|
||||||
|
}
|
||||||
|
Ok((_, astral, s)) => {
|
||||||
|
// As if we had split the UTF-16 surrogate pair in half
|
||||||
|
// and then transcoded that to UTF-8 lossily,
|
||||||
|
// since our DOMString is currently strict UTF-8.
|
||||||
|
replacement_after = if astral.is_some() { "\u{FFFD}" } else { "" };
|
||||||
|
suffix = s;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
// Step 4: Mutation observers.
|
// Step 4: Mutation observers.
|
||||||
// Step 5 to 7.
|
// Step 5 to 7.
|
||||||
let mut new_data = String::with_capacity(prefix.len() + arg.len() + suffix.len());
|
new_data = String::with_capacity(
|
||||||
|
prefix.len() +
|
||||||
|
replacement_before.len() +
|
||||||
|
arg.len() +
|
||||||
|
replacement_after.len() +
|
||||||
|
suffix.len());
|
||||||
new_data.push_str(prefix);
|
new_data.push_str(prefix);
|
||||||
|
new_data.push_str(replacement_before);
|
||||||
new_data.push_str(&arg);
|
new_data.push_str(&arg);
|
||||||
|
new_data.push_str(replacement_after);
|
||||||
new_data.push_str(suffix);
|
new_data.push_str(suffix);
|
||||||
new_data
|
}
|
||||||
};
|
|
||||||
*self.data.borrow_mut() = DOMString::from(new_data);
|
*self.data.borrow_mut() = DOMString::from(new_data);
|
||||||
self.content_changed();
|
self.content_changed();
|
||||||
// Steps 8-11.
|
// Steps 8-11.
|
||||||
|
@ -200,19 +247,40 @@ impl LayoutCharacterDataHelpers for LayoutJS<CharacterData> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Given a number of UTF-16 code units from the start of the given string,
|
/// Split the given string at the given position measured in UTF-16 code units from the start.
|
||||||
/// return the corresponding number of UTF-8 bytes.
|
|
||||||
///
|
///
|
||||||
/// s[find_utf16_code_unit_offset(s, o).unwrap()..] == s.to_utf16()[o..].to_utf8()
|
/// * `Err(())` indicates that `offset` is after the end of the string
|
||||||
fn find_utf16_code_unit_offset(s: &str, offset: u32) -> Option<usize> {
|
/// * `Ok((before, None, after))` indicates that `offset` is between Unicode code points.
|
||||||
|
/// The two string slices are such that:
|
||||||
|
/// `before == s.to_utf16()[..offset].to_utf8()` and
|
||||||
|
/// `after == s.to_utf16()[offset..].to_utf8()`
|
||||||
|
/// * `Ok((before, Some(ch), after))` indicates that `offset` is "in the middle"
|
||||||
|
/// of a single Unicode code point that would be represented in UTF-16 by a surrogate pair
|
||||||
|
/// of two 16-bit code units.
|
||||||
|
/// `ch` is that code point.
|
||||||
|
/// The two string slices are such that:
|
||||||
|
/// `before == s.to_utf16()[..offset - 1].to_utf8()` and
|
||||||
|
/// `after == s.to_utf16()[offset + 1..].to_utf8()`
|
||||||
|
///
|
||||||
|
/// # Panics
|
||||||
|
///
|
||||||
|
/// Note that the third variant is only ever returned when the `-Z replace-surrogates`
|
||||||
|
/// command-line option is specified.
|
||||||
|
/// When it *would* be returned but the option is *not* specified, this function panics.
|
||||||
|
fn split_at_utf16_code_unit_offset(s: &str, offset: u32) -> Result<(&str, Option<char>, &str), ()> {
|
||||||
let mut code_units = 0;
|
let mut code_units = 0;
|
||||||
for (i, c) in s.char_indices() {
|
for (i, c) in s.char_indices() {
|
||||||
if code_units == offset {
|
if code_units == offset {
|
||||||
return Some(i);
|
let (a, b) = s.split_at(i);
|
||||||
|
return Ok((a, None, b));
|
||||||
}
|
}
|
||||||
code_units += 1;
|
code_units += 1;
|
||||||
if c > '\u{FFFF}' {
|
if c > '\u{FFFF}' {
|
||||||
if code_units == offset {
|
if code_units == offset {
|
||||||
|
if opts::get().replace_surrogates {
|
||||||
|
debug_assert!(c.len_utf8() == 4);
|
||||||
|
return Ok((&s[..i], Some(c), &s[i + c.len_utf8()..]))
|
||||||
|
}
|
||||||
panic!("\n\n\
|
panic!("\n\n\
|
||||||
Would split a surrogate pair in CharacterData API.\n\
|
Would split a surrogate pair in CharacterData API.\n\
|
||||||
If you see this in real content, please comment with the URL\n\
|
If you see this in real content, please comment with the URL\n\
|
||||||
|
@ -223,8 +291,8 @@ fn find_utf16_code_unit_offset(s: &str, offset: u32) -> Option<usize> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if code_units == offset {
|
if code_units == offset {
|
||||||
Some(s.len())
|
Ok((s, None, ""))
|
||||||
} else {
|
} else {
|
||||||
None
|
Err(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,3 +1,26 @@
|
||||||
[CharacterData-surrogates.html]
|
[CharacterData-surrogates.html]
|
||||||
type: testharness
|
type: testharness
|
||||||
expected: CRASH
|
[Text.substringData() splitting surrogate pairs]
|
||||||
|
expected: FAIL
|
||||||
|
|
||||||
|
[Text.replaceData() splitting and creating surrogate pairs]
|
||||||
|
expected: FAIL
|
||||||
|
|
||||||
|
[Text.deleteData() splitting and creating surrogate pairs]
|
||||||
|
expected: FAIL
|
||||||
|
|
||||||
|
[Text.insertData() splitting and creating surrogate pairs]
|
||||||
|
expected: FAIL
|
||||||
|
|
||||||
|
[Comment.substringData() splitting surrogate pairs]
|
||||||
|
expected: FAIL
|
||||||
|
|
||||||
|
[Comment.replaceData() splitting and creating surrogate pairs]
|
||||||
|
expected: FAIL
|
||||||
|
|
||||||
|
[Comment.deleteData() splitting and creating surrogate pairs]
|
||||||
|
expected: FAIL
|
||||||
|
|
||||||
|
[Comment.insertData() splitting and creating surrogate pairs]
|
||||||
|
expected: FAIL
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue