Parse qualified names with non-alpha characters in xpath (#39409)

The existing parsing rules are too strict: a name must start with ASCII
alphabetic characters and may only continue with ASCII alphanumeric
characters, `-` or `_`. Instead, we should follow the NCName production
defined in https://www.w3.org/TR/REC-xml-names/#NT-NCName.

Testing: More tests now pass (see the updated test expectations)
Part of https://github.com/servo/servo/issues/34527

---------

Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
This commit is contained in:
Simon Wülker 2025-09-21 05:45:04 +02:00 committed by GitHub
parent 4d43844ece
commit 2c8533f38e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 29 additions and 24 deletions

View file

@ -4,12 +4,14 @@
use nom::branch::alt;
use nom::bytes::complete::{tag, take_while1};
use nom::character::complete::{alpha1, alphanumeric1, char, digit1, multispace0};
use nom::character::complete::{char, digit1, multispace0};
use nom::combinator::{map, opt, recognize, value};
use nom::error::{Error as NomError, ErrorKind as NomErrorKind, ParseError as NomParseError};
use nom::multi::{many0, separated_list0};
use nom::sequence::{delimited, pair, preceded};
use nom::{Finish, IResult, Parser};
use nom::{AsChar, Finish, IResult, Input, Parser};
use crate::dom::bindings::xmlname::{is_valid_continuation, is_valid_start};
pub(crate) fn parse(input: &str) -> Result<Expr, OwnedParserError> {
let (_, ast) = expr(input).finish().map_err(OwnedParserError::from)?;
@ -955,7 +957,7 @@ fn string_literal(input: &str) -> IResult<&str, Literal> {
.parse(input)
}
// QName parser
/// <https://www.w3.org/TR/REC-xml-names/#NT-QName>
fn qname(input: &str) -> IResult<&str, QName> {
let (input, prefix) = opt((ncname, char(':'))).parse(input)?;
let (input, local) = ncname(input)?;
@ -969,13 +971,31 @@ fn qname(input: &str) -> IResult<&str, QName> {
))
}
// NCName parser
/// <https://www.w3.org/TR/REC-xml-names/#NT-NCName>
fn ncname(input: &str) -> IResult<&str, &str> {
recognize(pair(
alpha1,
many0(alt((alphanumeric1, tag("-"), tag("_")))),
))
.parse(input)
/// Consumes a non-empty run of characters that are allowed to begin an NCName.
///
/// Note that, despite the singular name, this matches one *or more* characters:
/// the input is split at the first character for which `is_valid_start` is false
/// or which is a `:`. The colon always terminates the run, whatever
/// `is_valid_start` says about it, because `qname` relies on `:` to separate
/// the prefix from the local part.
///
/// Fails with `NomErrorKind::OneOf` when no character could be consumed.
fn name_start_character<T, E: NomParseError<T>>(input: T) -> IResult<T, T, E>
where
T: Input,
<T as Input>::Item: AsChar,
{
input.split_at_position1_complete(
// The predicate marks the first character that does NOT belong to the run.
|character| !is_valid_start(character.as_char()) || character.as_char() == ':',
NomErrorKind::OneOf,
)
}
/// Consumes a non-empty run of characters that are allowed after the start of an NCName.
///
/// The input is split at the first character for which `is_valid_continuation`
/// is false or which is a `:`. The colon always terminates the run, whatever
/// `is_valid_continuation` says about it, because `qname` relies on `:` to
/// separate the prefix from the local part.
///
/// Fails with `NomErrorKind::OneOf` when no character could be consumed; this
/// failure is what lets the surrounding `many0` stop repeating.
fn name_character<T, E: NomParseError<T>>(input: T) -> IResult<T, T, E>
where
T: Input,
<T as Input>::Item: AsChar,
{
input.split_at_position1_complete(
// The predicate marks the first character that does NOT belong to the run.
|character| !is_valid_continuation(character.as_char()) || character.as_char() == ':',
NomErrorKind::OneOf,
)
}
recognize(pair(name_start_character, many0(name_character))).parse(input)
}
// Test functions to verify the parsers:

View file

@ -2,20 +2,8 @@
[Select html element based on attribute mixed case]
expected: FAIL
[Select HTML element with non-ascii attribute 1]
expected: FAIL
[Select HTML element with non-ascii attribute 2]
expected: FAIL
[Select HTML element with non-ascii attribute 3]
expected: FAIL
[Select both HTML and SVG elements based on mixed case attribute]
expected: FAIL
[Select SVG element with non-ascii attribute 1]
expected: FAIL
[Select SVG element with non-ascii attribute 2]
expected: FAIL

View file

@ -2,9 +2,6 @@
[HTML elements mixed case]
expected: FAIL
[Non-ascii HTML element]
expected: FAIL
[Non-ascii HTML element3]
expected: FAIL