Skip to content

Commit

Permalink
Fix valid character sets in Annex-B
Browse files Browse the repository at this point in the history
  • Loading branch information
HalidOdat committed Apr 23, 2023
1 parent ee710e5 commit 8fdd10c
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 20 deletions.
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ index-positions = []
# Prohibits all uses of unsafe code, for the paranoid.
prohibit-unsafe = []

# Enable ECMAScript's Annex-B RegExp syntax.
annex-b = []

[dependencies]
hashbrown = "0.13.2"
memchr = { version = "2.4.0", default-features = false }
43 changes: 25 additions & 18 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,39 +416,46 @@ where
}

// Parse a code point or character class.
let first = self.try_consume_bracket_class_atom()?;
if first.is_none() {
let Some(first) = self.try_consume_bracket_class_atom()? else {
continue;
}
};

// Check for a dash; we may have a range.
if !self.try_consume('-') {
add_class_atom(&mut result, first.unwrap());
add_class_atom(&mut result, first);
continue;
}

let second = self.try_consume_bracket_class_atom()?;
if second.is_none() {
let Some(second) = self.try_consume_bracket_class_atom()? else {
// No second atom. For example: [a-].
add_class_atom(&mut result, first.unwrap());
add_class_atom(&mut result, first);
add_class_atom(&mut result, ClassAtom::CodePoint(u32::from('-')));
continue;
}
};

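// Ranges can't contain character classes: [\d-z] is invalid, unless the `annex-b` feature is enabled and the unicode flag is off, in which case the atoms and the dash are added individually.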
// Ranges must also be in order: z-a is invalid.
// ES6 21.2.2.15.1 "If i > j, throw a SyntaxError exception"
match (first.unwrap(), second.unwrap()) {
(ClassAtom::CodePoint(c1), ClassAtom::CodePoint(c2)) if c1 <= c2 => {
result.cps.add(Interval {
first: c1 as u32,
last: c2 as u32,
})
}
_ => {
return error("Invalid character range");
if let (ClassAtom::CodePoint(c1), ClassAtom::CodePoint(c2)) = (&first, &second) {
if c1 > c2 {
return error(
"Range values reversed, start char code is greater than end char code.",
);
}
result.cps.add(Interval {
first: *c1,
last: *c2,
});
continue;
}

if cfg!(not(feature = "annex-b")) || self.flags.unicode {
return error("Invalid character range");
}

// Not a valid range: treat both atoms and the dash as individual class members.
add_class_atom(&mut result, first);
add_class_atom(&mut result, ClassAtom::CodePoint(u32::from('-')));
add_class_atom(&mut result, second);
}
}

Expand Down
5 changes: 5 additions & 0 deletions tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ impl TestCompiledRegex {
}

/// Match against a string, returning the first formatted match.
#[track_caller]
pub fn match1f(&self, input: &str) -> String {
match self.find(input) {
Some(m) => format_match(&m, input),
Expand Down Expand Up @@ -116,11 +117,13 @@ impl TestCompiledRegex {
}

/// Assert that searching \p input with this regex finds nothing.
///
/// Panics with "Should not have matched" when `find` returns a match.
#[track_caller]
pub fn test_fails(&self, input: &str) {
    let found = self.find(input);
    assert!(found.is_none(), "Should not have matched")
}

/// Assert that searching \p input with this regex finds a match.
///
/// Panics with "Should have matched" when `find` returns nothing.
#[track_caller]
pub fn test_succeeds(&self, input: &str) {
    let found = self.find(input);
    assert!(found.is_some(), "Should have matched")
}
Expand Down Expand Up @@ -184,6 +187,7 @@ impl TestConfig {
}

/// Compile a pattern to a regex, with given flags.
#[track_caller]
pub fn compilef(&self, pattern: &str, flags_str: &str) -> TestCompiledRegex {
let mut flags = regress::Flags::from(flags_str);
flags.no_opt = !self.optimize;
Expand All @@ -204,6 +208,7 @@ impl TestConfig {

/// Test that \p pattern and \p flags successfully parses, and matches
/// \p input.
#[track_caller]
pub fn test_match_succeeds(&self, pattern: &str, flags_str: &str, input: &str) {
let cr = self.compilef(pattern, flags_str);
cr.test_succeeds(input)
Expand Down
10 changes: 8 additions & 2 deletions tests/syntax_error_tests.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#[track_caller]
fn test_1_error(pattern: &str, expected_err: &str) {
let res = regress::Regex::with_flags(pattern, "u");
assert!(res.is_err(), "Pattern should not have parsed: {}", pattern);
Expand Down Expand Up @@ -39,8 +40,13 @@ fn test_syntax_errors() {
test_1_error(r"(?!", "Unbalanced parenthesis");
test_1_error(r"abc)", "Unbalanced parenthesis");

test_1_error(r"[z-a]", "Invalid character range");
test_1_error(r"[\d-z]", "Invalid character range");
test_1_error(
r"[z-a]",
"Range values reversed, start char code is greater than end char code.",
);

// In unicode mode this is not allowed.
test_1_error(r"[a-\s]", "Invalid character range");

test_1_error("\\", "Incomplete escape");

Expand Down
27 changes: 27 additions & 0 deletions tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1411,3 +1411,30 @@ fn unicode_escape_property_binary_assigned_tc(tc: TestConfig) {
}
}
}

/// Entry point for the Annex-B character-set checks; hands the per-config
/// body to `test_with_configs`. Only compiled with the `annex-b` feature.
#[test]
#[cfg(feature = "annex-b")]
fn test_valid_character_sets() {
    test_with_configs(test_valid_character_sets_tc);
}

/// Checks bracket classes that look like a range but have a character class
/// on one side, compiled with no flags under the given test configuration.
///
/// Each assertion stays on its own line so a failure points at the exact case.
#[rustfmt::skip]
#[cfg(feature = "annex-b")]
fn test_valid_character_sets_tc(tc: TestConfig) {
    // From: https://github.com/boa-dev/boa/issues/2794
    // Character class on the right-hand side of the dash: expected to match
    // 'a', a literal '-', and whitespace, but not 's' or '$'.
    let class_on_right = r"[a-\s]";
    tc.test_match_succeeds(class_on_right, "", "a");
    tc.test_match_succeeds(class_on_right, "", "-");
    tc.test_match_succeeds(class_on_right, "", " ");
    tc.test_match_fails(class_on_right, "", "s");
    tc.test_match_fails(class_on_right, "", "$");

    // Character class on the left-hand side of the dash: expected to match
    // 'z' and digits, but not other letters or whitespace.
    let class_on_left = r"[\d-z]";
    tc.test_match_succeeds(class_on_left, "", "z");
    tc.test_match_succeeds(class_on_left, "", "1");
    tc.test_match_succeeds(class_on_left, "", "7");
    tc.test_match_succeeds(class_on_left, "", "9");
    tc.test_match_fails(class_on_left, "", "a");
    tc.test_match_fails(class_on_left, "", "f");
    tc.test_match_fails(class_on_left, "", " ");
}

0 comments on commit 8fdd10c

Please sign in to comment.