From 2c8533f38e854adfc50bf7d03f3d213985ccbe52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20W=C3=BClker?= Date: Sun, 21 Sep 2025 05:45:04 +0200 Subject: [PATCH] Parse qualified names with non-alpha characters in xpath (#39409) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing parsing rules are too strict and only allow alpha and alphanumeric characters. Instead, we should follow the production defined in https://www.w3.org/TR/REC-xml-names/#NT-NCName. Testing: New tests start to pass Part of https://github.com/servo/servo/issues/34527 --------- Signed-off-by: Simon Wülker --- components/script/xpath/parser.rs | 38 ++++++++++++++----- .../domxpath/text-html-attributes.html.ini | 12 ------ .../meta/domxpath/text-html-elements.html.ini | 3 -- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/components/script/xpath/parser.rs b/components/script/xpath/parser.rs index f1f38ea0da5..699c9bf235a 100644 --- a/components/script/xpath/parser.rs +++ b/components/script/xpath/parser.rs @@ -4,12 +4,14 @@ use nom::branch::alt; use nom::bytes::complete::{tag, take_while1}; -use nom::character::complete::{alpha1, alphanumeric1, char, digit1, multispace0}; +use nom::character::complete::{char, digit1, multispace0}; use nom::combinator::{map, opt, recognize, value}; use nom::error::{Error as NomError, ErrorKind as NomErrorKind, ParseError as NomParseError}; use nom::multi::{many0, separated_list0}; use nom::sequence::{delimited, pair, preceded}; -use nom::{Finish, IResult, Parser}; +use nom::{AsChar, Finish, IResult, Input, Parser}; + +use crate::dom::bindings::xmlname::{is_valid_continuation, is_valid_start}; pub(crate) fn parse(input: &str) -> Result { let (_, ast) = expr(input).finish().map_err(OwnedParserError::from)?; @@ -955,7 +957,7 @@ fn string_literal(input: &str) -> IResult<&str, Literal> { .parse(input) } -// QName parser +/// fn qname(input: &str) -> IResult<&str, QName> { let (input, prefix) = 
opt((ncname, char(':'))).parse(input)?; let (input, local) = ncname(input)?; @@ -969,13 +971,31 @@ fn qname(input: &str) -> IResult<&str, QName> { )) } -// NCName parser +/// <https://www.w3.org/TR/REC-xml-names/#NT-NCName> fn ncname(input: &str) -> IResult<&str, &str> { - recognize(pair( - alpha1, - many0(alt((alphanumeric1, tag("-"), tag("_")))), - )) - .parse(input) + fn name_start_character<T, E: NomParseError<T>>(input: T) -> IResult<T, T, E> + where + T: Input, + <T as Input>::Item: AsChar, + { + input.split_at_position1_complete( + |character| !is_valid_start(character.as_char()) || character.as_char() == ':', + NomErrorKind::OneOf, + ) + } + + fn name_character<T, E: NomParseError<T>>(input: T) -> IResult<T, T, E> + where + T: Input, + <T as Input>::Item: AsChar, + { + input.split_at_position1_complete( + |character| !is_valid_continuation(character.as_char()) || character.as_char() == ':', + NomErrorKind::OneOf, + ) + } + + recognize(pair(name_start_character, many0(name_character))).parse(input) } // Test functions to verify the parsers: diff --git a/tests/wpt/meta/domxpath/text-html-attributes.html.ini b/tests/wpt/meta/domxpath/text-html-attributes.html.ini index 22dc2e57ac1..de4111d0cec 100644 --- a/tests/wpt/meta/domxpath/text-html-attributes.html.ini +++ b/tests/wpt/meta/domxpath/text-html-attributes.html.ini @@ -2,20 +2,8 @@ [Select html element based on attribute mixed case] expected: FAIL - [Select HTML element with non-ascii attribute 1] - expected: FAIL - - [Select HTML element with non-ascii attribute 2] - expected: FAIL - [Select HTML element with non-ascii attribute 3] expected: FAIL [Select both HTML and SVG elements based on mixed case attribute] expected: FAIL - - [Select SVG element with non-ascii attribute 1] - expected: FAIL - - [Select SVG element with non-ascii attribute 2] - expected: FAIL diff --git a/tests/wpt/meta/domxpath/text-html-elements.html.ini b/tests/wpt/meta/domxpath/text-html-elements.html.ini index 5fdd314b4c7..dfac96a8954 100644 --- a/tests/wpt/meta/domxpath/text-html-elements.html.ini +++ b/tests/wpt/meta/domxpath/text-html-elements.html.ini @@ -2,9
+2,6 @@ [HTML elements mixed case] expected: FAIL - [Non-ascii HTML element] - expected: FAIL - [Non-ascii HTML element3] expected: FAIL