Remove UTF-8 BOM before parsing JSON (#35175)

Signed-off-by: Shane Handley <shanehandley@fastmail.com>
This commit is contained in:
shanehandley 2025-01-27 23:18:03 +11:00 committed by GitHub
parent 5a0a60efc1
commit cd93841ba1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 26 additions and 27 deletions

View file

@ -3,9 +3,9 @@
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
use std::rc::Rc;
use std::{ptr, str};
use std::{ptr, slice, str};
use encoding_rs::UTF_8;
use encoding_rs::{Encoding, UTF_8};
use ipc_channel::ipc::{self, IpcReceiver, IpcSender};
use ipc_channel::router::ROUTER;
use js::jsapi::{Heap, JSObject, JS_ClearPendingException, Value as JSValue};
@ -821,8 +821,11 @@ fn run_text_data_algorithm(bytes: Vec<u8>) -> Fallible<FetchedData> {
#[allow(unsafe_code)]
fn run_json_data_algorithm(cx: JSContext, bytes: Vec<u8>) -> Fallible<FetchedData> {
let json_text = String::from_utf8_lossy(&bytes);
let json_text: Vec<u16> = json_text.encode_utf16().collect();
// The JSON spec allows implementations to either ignore a leading UTF-8 BOM or treat it as an
// error. `JS_ParseJSON` treats a BOM as an error, so we must strip it if present.
//
// https://datatracker.ietf.org/doc/html/rfc8259#section-8.1
let json_text = decode_to_utf16_with_bom_removal(&bytes, UTF_8);
rooted!(in(*cx) let mut rval = UndefinedValue());
unsafe {
if !JS_ParseJSON(
@ -908,6 +911,23 @@ pub(crate) fn run_array_buffer_data_algorithm(
Ok(FetchedData::ArrayBuffer(rooted_heap))
}
#[allow(unsafe_code)]
pub(crate) fn decode_to_utf16_with_bom_removal(
bytes: &[u8],
encoding: &'static Encoding,
) -> Vec<u16> {
let mut decoder = encoding.new_decoder_with_bom_removal();
let capacity = decoder
.max_utf16_buffer_length(bytes.len())
.expect("Overflow");
let mut utf16 = Vec::with_capacity(capacity);
let extra = unsafe { slice::from_raw_parts_mut(utf16.as_mut_ptr(), capacity) };
let (_, read, written, _) = decoder.decode_to_utf16(bytes, extra, true);
assert_eq!(read, bytes.len());
unsafe { utf16.set_len(written) }
utf16
}
/// <https://fetch.spec.whatwg.org/#body>
pub(crate) trait BodyMixin {
/// <https://fetch.spec.whatwg.org/#concept-body-disturbed>

View file

@ -4,11 +4,11 @@
use std::borrow::ToOwned;
use std::cell::Cell;
use std::cmp;
use std::default::Default;
use std::str::{self, FromStr};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use std::{cmp, slice};
use dom_struct::dom_struct;
use encoding_rs::{Encoding, UTF_8};
@ -38,7 +38,7 @@ use servo_atoms::Atom;
use servo_url::ServoUrl;
use url::Position;
use crate::body::{BodySource, Extractable, ExtractedBody};
use crate::body::{decode_to_utf16_with_bom_removal, BodySource, Extractable, ExtractedBody};
use crate::document_loader::DocumentLoader;
use crate::dom::bindings::buffer_source::HeapBufferSource;
use crate::dom::bindings::cell::DomRefCell;
@ -1432,19 +1432,6 @@ impl XMLHttpRequest {
return rval.set(NullValue());
}
// Step 4
fn decode_to_utf16_with_bom_removal(bytes: &[u8], encoding: &'static Encoding) -> Vec<u16> {
let mut decoder = encoding.new_decoder_with_bom_removal();
let capacity = decoder
.max_utf16_buffer_length(bytes.len())
.expect("Overflow");
let mut utf16 = Vec::with_capacity(capacity);
let extra = unsafe { slice::from_raw_parts_mut(utf16.as_mut_ptr(), capacity) };
let last = true;
let (_, read, written, _) = decoder.decode_to_utf16(bytes, extra, last);
assert_eq!(read, bytes.len());
unsafe { utf16.set_len(written) }
utf16
}
// https://xhr.spec.whatwg.org/#json-response refers to
// https://infra.spec.whatwg.org/#parse-json-from-bytes which refers to
// https://encoding.spec.whatwg.org/#utf-8-decode which means