From db751f0f8f1a6395ba3427faf21eb6c9c2a02349 Mon Sep 17 00:00:00 2001 From: camchenry <1514176+camchenry@users.noreply.github.com> Date: Sat, 28 Sep 2024 04:18:09 +0000 Subject: [PATCH] refactor(linter): use regexp AST visitor in `no-control-regex` (#6129) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - closes https://github.com/oxc-project/oxc/issues/5416 Rewrites the `no-control-regex` rule to use a regular expression AST visitor instead of the `regex` crate and parsing by hand. This change simplifies the code and makes it easier to maintain. One notable change in the snapshots is the printing of the control characters. Previously, we always printed from the source text. Now, we print a representation of the control character itself based on its numeric value. This resulted in the nonprintable chars being printed, which are invisible. The other reason for this change is that the spans output by the regex parser for unicode escapes do not match 1:1 when raw strings and escapes are involved. This resulted in goofy looking spans in the output: ``` ⚠ eslint(no-control-regex): Unexpected control character: '*\\x' ╭─[no_control_regex.tsx:1:22] 1 │ new RegExp('\\u{1111}*\\x1F', 'u') · ──── ╰──── ``` Not sure where the bug lies there yet. --- .../src/rules/eslint/no_control_regex.rs | 269 +++++++----------- .../src/snapshots/no_control_regex.snap | Bin 4513 -> 4513 bytes 2 files changed, 100 insertions(+), 169 deletions(-) diff --git a/crates/oxc_linter/src/rules/eslint/no_control_regex.rs b/crates/oxc_linter/src/rules/eslint/no_control_regex.rs index 2454e2a1dabd7..26d59c7c628c7 100644 --- a/crates/oxc_linter/src/rules/eslint/no_control_regex.rs +++ b/crates/oxc_linter/src/rules/eslint/no_control_regex.rs @@ -1,12 +1,16 @@ -use lazy_static::lazy_static; +use oxc_allocator::Allocator; use oxc_ast::{ - ast::{Argument, RegExpFlags, RegExpPattern}, + ast::{Argument, RegExpFlags}, AstKind, }; use oxc_diagnostics::OxcDiagnostic; use oxc_macros::declare_oxc_lint; +use oxc_regular_expression::{ + ast::{Character, Pattern}, + visit::Visit, + Parser, ParserOptions, +}; use oxc_span::{GetSpan, Span}; -use regex::{Matches, Regex}; use crate::{ast_util::extract_regex_flags, context::LintContext, rule::Rule, AstNode}; @@ -64,196 +68,123 @@ declare_oxc_lint!( impl Rule for NoControlRegex { fn run<'a>(&self, node: &AstNode<'a>, context: &LintContext<'a>) { - if let Some(RegexPatternData { pattern, flags, span }) = regex_pattern(node) { - let mut violations: Vec<&str> = Vec::new(); - let pattern = pattern.as_ref(); - let pattern_text = pattern.source_text(context.source_text()); - for matched_ctl_pattern in control_patterns(pattern_text.as_ref()) { - let ctl = matched_ctl_pattern.as_str(); + match node.kind() { + // regex literal + AstKind::RegExpLiteral(reg) => { + let Some(pattern) = reg.regex.pattern.as_pattern() else { + return; + }; - // check for an even number of backslashes, since these will - // prevent the pattern from being a control sequence - if ctl.starts_with('\\') && matched_ctl_pattern.start() > 0 { - let pattern_chars: Vec = pattern_text.chars().collect(); // ew + check_pattern(context, pattern, reg.span); + } - // Convert byte index to char index - let byte_start = matched_ctl_pattern.start(); - let char_start = pattern_text[..byte_start].chars().count(); + // new RegExp() + AstKind::NewExpression(expr) => { + // constructor is RegExp, + if expr.callee.is_specific_id("RegExp") + // which is provided at least 1 parameter, + && expr.arguments.len() > 0 + { + // where the first one is a string literal + // note: improvements required for strings used via identifier + // references + if let Argument::StringLiteral(pattern) = &expr.arguments[0] { + // get pattern from arguments. Missing or non-string arguments + // will be runtime errors, but are not covered by this rule. + let alloc = Allocator::default(); + let pattern_with_slashes = format!("/{}/", &pattern.value); + let flags = extract_regex_flags(&expr.arguments); + let parser = Parser::new( + &alloc, + pattern_with_slashes.as_str(), + ParserOptions { + span_offset: expr + .arguments + .first() + .map_or(0, |arg| arg.span().start), + unicode_mode: flags + .is_some_and(|flags| flags.contains(RegExpFlags::U)), + unicode_sets_mode: flags + .is_some_and(|flags| flags.contains(RegExpFlags::V)), + }, + ); - let mut first_backslash = char_start; - while first_backslash > 0 && pattern_chars[first_backslash] == '\\' { - first_backslash -= 1; - } + let Some(pattern) = parser.parse().ok().map(|pattern| pattern.pattern) + else { + return; + }; - let mut num_slashes = char_start - first_backslash; - if first_backslash == 0 && pattern_chars[first_backslash] == '\\' { - num_slashes += 1; - } - // even # of slashes - if num_slashes % 2 == 0 { - continue; + check_pattern(context, &pattern, expr.span); } } + } - if ctl.starts_with(r"\x") || ctl.starts_with(r"\u") { - let mut numeric_part = &ctl[2..]; + // RegExp() + AstKind::CallExpression(expr) => { + // constructor is RegExp, + if expr.callee.is_specific_id("RegExp") + // which is provided at least 1 parameter, + && expr.arguments.len() > 0 + { + // where the first one is a string literal + // note: improvements required for strings used via identifier + // references + if let Argument::StringLiteral(pattern) = &expr.arguments[0] { + // get pattern from arguments. Missing or non-string arguments + // will be runtime errors, but are not covered by this rule. + let alloc = Allocator::default(); + let pattern_with_slashes = format!("/{}/", &pattern.value); + let flags = extract_regex_flags(&expr.arguments); + let parser = Parser::new( + &alloc, + pattern_with_slashes.as_str(), + ParserOptions { + span_offset: expr + .arguments + .first() + .map_or(0, |arg| arg.span().start), + unicode_mode: flags + .is_some_and(|flags| flags.contains(RegExpFlags::U)), + unicode_sets_mode: flags + .is_some_and(|flags| flags.contains(RegExpFlags::V)), + }, + ); - // extract numeric part from \u{00} - if numeric_part.starts_with('{') { - let has_unicode_flag = match flags { - Some(flags) if flags.contains(RegExpFlags::U) => true, - _ => { - continue; - } + let Some(pattern) = parser.parse().ok().map(|pattern| pattern.pattern) + else { + return; }; - // 1. Unicode control pattern is missing a curly brace - // and is therefore invalid. (note: we may want to - // report this?) - // 2. Unicode flag is missing, which is needed for - // interpreting \u{`nn`} as a unicode character - if !has_unicode_flag || !numeric_part.ends_with('}') { - continue; - } - - numeric_part = &numeric_part[1..numeric_part.len() - 1]; - } - - match u32::from_str_radix(numeric_part, 16) { - Err(_) => continue, - Ok(as_num) if as_num > 0x1f => continue, - Ok(_) => { /* noop */ } + check_pattern(context, &pattern, expr.span); } } - - violations.push(ctl); } - - if !violations.is_empty() { - let violations = violations.join(", "); - context.diagnostic(no_control_regex_diagnostic(&violations, span)); - } - } + _ => {} + }; } } -/// Since we don't implement `ToOwned` trait. -enum PatRef<'a> { - Owned(RegExpPattern<'a>), - Borrowed(&'a RegExpPattern<'a>), -} +fn check_pattern(context: &LintContext, pattern: &Pattern, span: Span) { + let mut finder = ControlCharacterFinder { control_chars: Vec::new() }; + finder.visit_pattern(pattern); -impl<'a> AsRef> for PatRef<'a> { - fn as_ref(&self) -> &RegExpPattern<'a> { - match self { - Self::Owned(pat) => pat, - Self::Borrowed(pat) => pat, - } + if !finder.control_chars.is_empty() { + let violations = finder.control_chars.join(", "); + context.diagnostic(no_control_regex_diagnostic(&violations, span)); } } -struct RegexPatternData<'a> { - /// A regex pattern, either from a literal (`/foo/`) a RegExp constructor - /// (`new RegExp("foo")`), or a RegExp function call (`RegExp("foo")) - pattern: PatRef<'a>, - /// Regex flags, if found. It's possible for this to be `Some` but have - /// no flags. - /// - /// Note that flags are represented by a `u8` and therefore safely clonable - /// with low performance overhead. - flags: Option, - /// The pattern's span. For [`oxc_ast::ast::Expression::NewExpression`]s - /// and [`oxc_ast::ast::Expression::CallExpression`]s, - /// this will match the entire new/call expression. - /// - /// Note that spans are 8 bytes and safely clonable with low performance overhead - span: Span, +struct ControlCharacterFinder { + control_chars: Vec, } -/// Returns the regex pattern inside a node, if it's applicable. -/// -/// e.g.: -/// * /foo/ -> "foo" -/// * new RegExp("foo") -> foo -/// -/// note: [`RegExpFlags`] and [`Span`]s are both tiny and cloneable. -fn regex_pattern<'a>(node: &AstNode<'a>) -> Option> { - let kind = node.kind(); - match kind { - // regex literal - AstKind::RegExpLiteral(reg) => Some(RegexPatternData { - pattern: PatRef::Borrowed(®.regex.pattern), - flags: Some(reg.regex.flags), - span: reg.span, - }), - - // new RegExp() - AstKind::NewExpression(expr) => { - // constructor is RegExp, - if expr.callee.is_specific_id("RegExp") - // which is provided at least 1 parameter, - && expr.arguments.len() > 0 - { - // where the first one is a string literal - // note: improvements required for strings used via identifier - // references - if let Argument::StringLiteral(pattern) = &expr.arguments[0] { - // get pattern from arguments. Missing or non-string arguments - // will be runtime errors, but are not covered by this rule. - // Note that we're intentionally reporting the entire "new - // RegExp("pat") expression, not just "pat". - Some(RegexPatternData { - pattern: PatRef::Owned(RegExpPattern::Raw(pattern.value.as_ref())), - flags: extract_regex_flags(&expr.arguments), - span: kind.span(), - }) - } else { - None - } - } else { - None - } - } - - // RegExp() - AstKind::CallExpression(expr) => { - // constructor is RegExp, - if expr.callee.is_specific_id("RegExp") - // which is provided at least 1 parameter, - && expr.arguments.len() > 0 - { - // where the first one is a string literal - // note: improvements required for strings used via identifier - // references - if let Argument::StringLiteral(pattern) = &expr.arguments[0] { - // get pattern from arguments. Missing or non-string arguments - // will be runtime errors, but are not covered by this rule. - // Note that we're intentionally reporting the entire "new - // RegExp("pat") expression, not just "pat". - Some(RegexPatternData { - pattern: PatRef::Owned(RegExpPattern::Raw(pattern.value.as_ref())), - flags: extract_regex_flags(&expr.arguments), - span: kind.span(), - }) - } else { - None - } - } else { - None - } +impl<'a> Visit<'a> for ControlCharacterFinder { + fn visit_character(&mut self, ch: &Character) { + // Control characters are in the range 0x00 to 0x1F + if ch.value <= 0x1F { + self.control_chars.push(ch.to_string()); } - _ => None, - } -} - -fn control_patterns(pattern: &str) -> Matches<'static, '_> { - lazy_static! { - static ref CTL_PAT: Regex = Regex::new( - r"([\x00-\x1f]|(?:\\x\w{2})|(?:\\u\w{4})|(?:\\u\{\w{1,4}\}))" - // r"((?:\\x\w{2}))" - ).unwrap(); } - CTL_PAT.find_iter(pattern) } #[cfg(test)] diff --git a/crates/oxc_linter/src/snapshots/no_control_regex.snap b/crates/oxc_linter/src/snapshots/no_control_regex.snap index 994883bc7da9e2e31951c40fbed49e21b2847c9d..f931a8ab5d353991fcbe35d8af4f924f40c1fc1e 100644 GIT binary patch delta 116 zcmZ3eyij?A7UN_q-n7XR7`H%a*~!P57K7Orn9qP|2R60M8m!H%PzB