feat(transformer): support all /regex/ to new RegExp transforms (#5387)

related: #4754 The implementation is ported from [esbuild](https://github.com/evanw/esbuild/blob/332727499e62315cff4ecaff9fa8b86336555e46/internal/js_parser/js_parser.go#L12820-L12840) and covers all of Babel's regexp plugins. --- ## The following description was generated by `Graphite` 😋 ### TL;DR Added support for transforming various RegExp features to ensure compatibility with older JavaScript environments. ### What changed? - Implemented a new `RegExp` transformer to handle unsupported RegExp literal features - Added options to control different RegExp transformations (e.g., sticky flag, unicode flag, dot-all flag, etc.) - Updated the transformer to convert unsupported RegExp literals into `new RegExp()` constructor calls - Added test cases for different RegExp transformations - Integrated the new RegExp transformer into the existing transformation pipeline ### How to test? 1. Run the existing test suite to ensure no regressions 2. Execute the new RegExp-specific tests in the `tasks/transform_conformance/tests/esbuild-tests/test/fixtures/regexp/` directory 3. Try transforming code with various RegExp features using different target environments to verify correct transformations
oxc-project · Sep 5, 2024 · c59d8b3 · c59d8b3
1 parent d9d29f8
commit c59d8b3
Show file tree

Hide file tree

Showing 35 changed files with 476 additions and 43 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/oxc_transformer/Cargo.toml b/crates/oxc_transformer/Cargo.toml
@@ -21,13 +21,14 @@ test    = false
 doctest = false
 
 [dependencies]
-oxc_ast         = { workspace = true }
-oxc_span        = { workspace = true }
-oxc_allocator   = { workspace = true }
-oxc_diagnostics = { workspace = true }
-oxc_syntax      = { workspace = true, features = ["to_js_string"] }
-oxc_traverse    = { workspace = true }
-oxc_semantic    = { workspace = true }
+oxc_ast                = { workspace = true }
+oxc_span               = { workspace = true }
+oxc_allocator          = { workspace = true }
+oxc_diagnostics        = { workspace = true }
+oxc_syntax             = { workspace = true, features = ["to_js_string"] }
+oxc_traverse           = { workspace = true }
+oxc_semantic           = { workspace = true }
+oxc_regular_expression = { workspace = true }
 
 dashmap          = { workspace = true }
 indexmap         = { workspace = true }

diff --git a/crates/oxc_transformer/src/env/data/babel.rs b/crates/oxc_transformer/src/env/data/babel.rs
@@ -8,10 +8,17 @@ use crate::env::{targets::version::Version, Versions};
 fn features() -> &'static FxHashMap<String, Versions> {
     static FEATURES: OnceLock<FxHashMap<String, Versions>> = OnceLock::new();
     FEATURES.get_or_init(|| {
-        let map: FxHashMap<String, FxHashMap<String, String>> =
+        let mut map: FxHashMap<String, FxHashMap<String, String>> =
             serde_json::from_str(include_str!("./@babel/compat_data/data/plugins.json"))
                 .expect("failed to parse json");
 
+        map.extend(
+            serde_json::from_str::<FxHashMap<String, FxHashMap<String, String>>>(include_str!(
+                "./esbuild/features.json"
+            ))
+            .expect("failed to parse json"),
+        );
+
         map.into_iter()
             .map(|(feature, mut versions)| {
                 (feature, {

diff --git a/crates/oxc_transformer/src/env/data/esbuild/features.json b/crates/oxc_transformer/src/env/data/esbuild/features.json
@@ -0,0 +1,23 @@
+{
+  "esbuild-regexp-lookbehind-assertions": {
+    "chrome": "62",
+    "deno": "1.0",
+    "edge": "79",
+    "firefox": "78",
+    "hermes": "0.7",
+    "ios": "16.4",
+    "node": "8.10",
+    "opera": "49",
+    "safari": "16.4"
+  },
+  "esbuild-regexp-match-indices": {
+    "chrome": "90",
+    "deno": "1.8",
+    "edge": "90",
+    "firefox": "88",
+    "ios": "15.0",
+    "node": "16.0",
+    "opera": "76",
+    "safari": "15.0"
+  }
+}
diff --git a/crates/oxc_transformer/src/lib.rs b/crates/oxc_transformer/src/lib.rs
@@ -21,6 +21,7 @@ mod es2019;
 mod es2020;
 mod es2021;
 mod react;
+mod regexp;
 mod typescript;
 
 mod helpers {
@@ -41,6 +42,7 @@ use oxc_diagnostics::OxcDiagnostic;
 use oxc_semantic::{ScopeTree, SymbolTable};
 use oxc_span::{SourceType, SPAN};
 use oxc_traverse::{traverse_mut, Traverse, TraverseCtx};
+use regexp::RegExp;
 
 pub use crate::{
     compiler_assumptions::CompilerAssumptions,
@@ -74,6 +76,7 @@ pub struct Transformer<'a> {
     x2_es2018: ES2018<'a>,
     x2_es2016: ES2016<'a>,
     x3_es2015: ES2015<'a>,
+    x4_regexp: RegExp<'a>,
 }
 
 impl<'a> Transformer<'a> {
@@ -102,7 +105,8 @@ impl<'a> Transformer<'a> {
             x2_es2019: ES2019::new(options.es2019, Rc::clone(&ctx)),
             x2_es2018: ES2018::new(options.es2018, Rc::clone(&ctx)),
             x2_es2016: ES2016::new(options.es2016, Rc::clone(&ctx)),
-            x3_es2015: ES2015::new(options.es2015, ctx),
+            x3_es2015: ES2015::new(options.es2015, Rc::clone(&ctx)),
+            x4_regexp: RegExp::new(options.regexp, ctx),
         }
     }
 
@@ -177,6 +181,7 @@ impl<'a> Traverse<'a> for Transformer<'a> {
         self.x2_es2018.enter_expression(expr, ctx);
         self.x2_es2016.enter_expression(expr, ctx);
         self.x3_es2015.enter_expression(expr, ctx);
+        self.x4_regexp.enter_expression(expr, ctx);
     }
 
     fn exit_expression(&mut self, expr: &mut Expression<'a>, ctx: &mut TraverseCtx<'a>) {

diff --git a/crates/oxc_transformer/src/options/transformer.rs b/crates/oxc_transformer/src/options/transformer.rs
@@ -14,6 +14,7 @@ use crate::{
     es2021::ES2021Options,
     options::babel::BabelOptions,
     react::ReactOptions,
+    regexp::RegExpOptions,
     typescript::TypeScriptOptions,
 };
 
@@ -38,6 +39,8 @@ pub struct TransformOptions {
     /// [preset-react](https://babeljs.io/docs/babel-preset-react)
     pub react: ReactOptions,
 
+    pub regexp: RegExpOptions,
+
     pub es2015: ES2015Options,
 
     pub es2016: ES2016Options,
@@ -60,6 +63,7 @@ impl TransformOptions {
             es2019: ES2019Options::from_targets_and_bugfixes(targets, bugfixes),
             es2020: ES2020Options::from_targets_and_bugfixes(targets, bugfixes),
             es2021: ES2021Options::from_targets_and_bugfixes(targets, bugfixes),
+            regexp: RegExpOptions::from_targets_and_bugfixes(targets, bugfixes),
             ..Default::default()
         }
     }
@@ -215,6 +219,29 @@ impl TransformOptions {
             }
         };
 
+        let regexp = transformer_options.regexp;
+        if !regexp.sticky_flag {
+            transformer_options.regexp.sticky_flag = options.has_plugin("transform-sticky-regex");
+        }
+        if !regexp.unicode_flag {
+            transformer_options.regexp.unicode_flag = options.has_plugin("transform-unicode-regex");
+        }
+        if !regexp.dot_all_flag {
+            transformer_options.regexp.dot_all_flag = options.has_plugin("transform-dotall-regex");
+        }
+        if !regexp.named_capture_groups {
+            transformer_options.regexp.named_capture_groups =
+                options.has_plugin("transform-named-capturing-groups-regex");
+        }
+        if !regexp.unicode_property_escapes {
+            transformer_options.regexp.unicode_property_escapes =
+                options.has_plugin("transform-unicode-property-regex");
+        }
+        if !regexp.set_notation {
+            transformer_options.regexp.set_notation =
+                options.has_plugin("transform-unicode-sets-regex");
+        }
+
         transformer_options.assumptions = if options.assumptions.is_null() {
             CompilerAssumptions::default()
         } else {

diff --git a/crates/oxc_transformer/src/regexp/mod.rs b/crates/oxc_transformer/src/regexp/mod.rs
@@ -0,0 +1,218 @@
+//! RegExp Transformer
+//!
+//! This module supports various RegExp plugins to handle unsupported RegExp literal features.
+//! When an unsupported feature is detected, these plugins convert the RegExp literal into
+//! a `new RegExp()` constructor call to avoid syntax errors.
+//!
+//! Note: You will need to include a polyfill for the `RegExp` constructor in your code to have the correct runtime behavior.
+//!
+//! ### ES2015
+//!
+//! #### Sticky flag (`y`)
+//! - @babel/plugin-transform-sticky-regex: <https://babeljs.io/docs/en/babel-plugin-transform-sticky-regex>
+//!
+//! #### Unicode flag (`u`)
+//! - @babel/plugin-transform-unicode-regex: <https://babeljs.io/docs/en/babel-plugin-transform-unicode-regex>
+//!
+//! ### ES2018
+//!
+//! #### DotAll flag (`s`)
+//! - @babel/plugin-transform-dotall-regex: <https://babeljs.io/docs/en/babel-plugin-transform-dotall-regex>
+//! - Spec: ECMAScript 2018: <https://262.ecma-international.org/9.0/#sec-get-regexp.prototype.dotAll>
+//!
+//! #### Lookbehind assertions (`/(?<=x)/` and `/(?<!x)/`)
+//! - Implementation: Same as esbuild's handling
+//!
+//! #### Named capture groups (`(?<name>x)`)
+//! - @babel/plugin-transform-named-capturing-groups-regex: <https://babeljs.io/docs/en/babel-plugin-transform-named-capturing-groups-regex>
+//!
+//! #### Unicode property escapes (`\p{...}` and `\P{...}`)
+//! - @babel/plugin-transform-unicode-property-regex: <https://babeljs.io/docs/en/babel-plugin-proposal-unicode-property-regex>
+//!
+//! ### ES2022
+//!
+//! #### Match indices flag (`d`)
+//! - Implementation: Same as esbuild's handling
+//!
+//! ### ES2024
+//!
+//! #### Set notation + properties of strings (`v`)
+//! - @babel/plugin-transform-unicode-sets-regex: <https://babeljs.io/docs/en/babel-plugin-proposal-unicode-sets-regex>
+//! - TC39 Proposal: <https://github.com/tc39/proposal-regexp-set-notation>
+
+mod options;
+
+use std::borrow::Cow;
+use std::mem;
+
+pub use options::RegExpOptions;
+use oxc_allocator::Box;
+use oxc_allocator::Vec;
+use oxc_ast::ast::*;
+use oxc_regular_expression::ast::{
+    CharacterClass, CharacterClassContents, LookAroundAssertionKind, Pattern, Term,
+};
+use oxc_semantic::ReferenceFlags;
+use oxc_span::Atom;
+use oxc_traverse::{Traverse, TraverseCtx};
+
+use crate::context::Ctx;
+
+/// Transformer that rewrites RegExp literals using unsupported features into
+/// `new RegExp(...)` constructor calls.
+pub struct RegExp<'a> {
+    /// Shared transform context; stored but never read within this module.
+    _ctx: Ctx<'a>,
+    /// Switches selecting which flags and pattern features count as unsupported.
+    options: RegExpOptions,
+}
+
+impl<'a> RegExp<'a> {
+    /// Builds the transformer from its option set and the shared context.
+    pub fn new(options: RegExpOptions, ctx: Ctx<'a>) -> Self {
+        Self { options, _ctx: ctx }
+    }
+}
+
+impl<'a> Traverse<'a> for RegExp<'a> {
+    /// Rewrites a RegExp literal into a `new RegExp(pattern, flags)` call when it uses a
+    /// flag or a pattern feature that the options mark as unsupported.
+    ///
+    /// The literal is left as a literal when:
+    /// - it has no unsupported flag and no pattern-level option is enabled,
+    /// - its pattern parses and contains none of the unsupported syntax (the parsed
+    ///   pattern is stored back as `RegExpPattern::Pattern`), or
+    /// - its pattern fails to parse (the source is stored back as `RegExpPattern::Invalid`).
+    ///
+    /// Non-RegExp expressions are ignored.
+    fn enter_expression(
+        &mut self,
+        expr: &mut Expression<'a>,
+        ctx: &mut oxc_traverse::TraverseCtx<'a>,
+    ) {
+        let Expression::RegExpLiteral(ref mut regexp) = expr else {
+            return;
+        };
+
+        // An unsupported flag alone forces the rewrite; only inspect the pattern otherwise.
+        if !self.has_unsupported_regular_expression_flags(regexp.regex.flags) {
+            // Nothing about the flags is unsupported and no pattern feature is being
+            // lowered, so there is no reason to touch this literal. Without this guard
+            // every RegExp literal would fall through to the rewrite below.
+            if !self.requires_pattern_analysis() {
+                return;
+            }
+
+            match try_parse_pattern(regexp, ctx) {
+                Ok(pattern) => {
+                    let is_unsupported = self.has_unsupported_regular_expression_pattern(&pattern);
+                    // `try_parse_pattern` took the pattern out; always put the result back.
+                    regexp.regex.pattern = RegExpPattern::Pattern(pattern);
+                    if !is_unsupported {
+                        return;
+                    }
+                }
+                Err(err) => {
+                    // Unparseable patterns are kept as-is rather than rewritten.
+                    regexp.regex.pattern = RegExpPattern::Invalid(err);
+                    return;
+                }
+            }
+        }
+
+        // First constructor argument: the pattern source text.
+        let pattern_source: Cow<'_, str> = match &regexp.regex.pattern {
+            RegExpPattern::Raw(raw) | RegExpPattern::Invalid(raw) => Cow::Borrowed(raw),
+            RegExpPattern::Pattern(p) => Cow::Owned(p.to_string()),
+        };
+
+        // Callee: a read reference to `RegExp`, linked to a binding of that name if one is
+        // found from the current scope.
+        let callee = {
+            let symbol_id = ctx.scopes().find_binding(ctx.current_scope_id(), "RegExp");
+            let ident = ctx.create_reference_id(
+                regexp.span,
+                Atom::from("RegExp"),
+                symbol_id,
+                ReferenceFlags::read(),
+            );
+            ctx.ast.expression_from_identifier_reference(ident)
+        };
+
+        let mut arguments = ctx.ast.vec_with_capacity(2);
+        arguments.push(
+            ctx.ast.argument_expression(
+                ctx.ast.expression_string_literal(regexp.span, pattern_source),
+            ),
+        );
+
+        // Second constructor argument: the flags rendered as a string.
+        let flags = regexp.regex.flags.to_string();
+        let flags =
+            ctx.ast.argument_expression(ctx.ast.expression_string_literal(regexp.span, flags));
+        arguments.push(flags);
+
+        *expr = ctx.ast.expression_new(
+            regexp.span,
+            callee,
+            arguments,
+            None::<TSTypeParameterInstantiation>,
+        );
+    }
+}
+
+impl<'a> RegExp<'a> {
+    /// Whether any option that can only be decided by parsing the pattern is enabled.
+    fn requires_pattern_analysis(&self) -> bool {
+        let opts = &self.options;
+        opts.named_capture_groups || opts.unicode_property_escapes || opts.look_behind_assertions
+    }
+
+    /// Returns `true` if `flags` contains at least one flag whose transform is enabled.
+    fn has_unsupported_regular_expression_flags(&self, flags: RegExpFlags) -> bool {
+        let opts = &self.options;
+        flags.iter().any(|flag| match flag {
+            RegExpFlags::S => opts.dot_all_flag,
+            RegExpFlags::Y => opts.sticky_flag,
+            RegExpFlags::U => opts.unicode_flag,
+            RegExpFlags::D => opts.match_indices,
+            RegExpFlags::V => opts.set_notation,
+            _ => false,
+        })
+    }
+
+    /// Returns `true` if the parsed `pattern` uses syntax whose transform is enabled.
+    ///
+    /// Only the terms directly inside each top-level alternative are examined.
+    // NOTE(review): terms nested inside groups or other containers are not visited here, and
+    // every `Term::CapturingGroup` is treated as a named group — confirm this is intended.
+    fn has_unsupported_regular_expression_pattern(&self, pattern: &Pattern<'a>) -> bool {
+        let opts = &self.options;
+        pattern.body.body.iter().flat_map(|alternative| alternative.body.iter()).any(|term| {
+            match term {
+                Term::CapturingGroup(_) => opts.named_capture_groups,
+                Term::UnicodePropertyEscape(_) => opts.unicode_property_escapes,
+                Term::CharacterClass(class) => {
+                    opts.unicode_property_escapes
+                        && has_unicode_property_escape_character_class(class)
+                }
+                Term::LookAroundAssertion(assertion) => {
+                    opts.look_behind_assertions
+                        && matches!(
+                            assertion.kind,
+                            LookAroundAssertionKind::Lookbehind
+                                | LookAroundAssertionKind::NegativeLookbehind
+                        )
+                }
+                _ => false,
+            }
+        })
+    }
+}
+
+/// Returns `true` if `character_class`, or any character class nested inside it,
+/// contains a unicode property escape.
+fn has_unicode_property_escape_character_class(character_class: &CharacterClass) -> bool {
+    for content in character_class.body.iter() {
+        let found = match content {
+            CharacterClassContents::UnicodePropertyEscape(_) => true,
+            // Recurse: nested classes may hide the escape at any depth.
+            CharacterClassContents::NestedCharacterClass(nested) => {
+                has_unicode_property_escape_character_class(nested)
+            }
+            _ => false,
+        };
+        if found {
+            return true;
+        }
+    }
+    false
+}
+
+/// Takes the pattern out of `literal` and returns it in parsed form.
+///
+/// - An already parsed pattern is returned unchanged.
+/// - A raw pattern is parsed using the literal's flags to select the unicode modes.
+/// - A pattern already marked invalid, or one that fails to parse, yields `Err` with its
+///   source text.
+///
+/// On return `literal.regex.pattern` holds an empty `RegExpPattern::Raw` placeholder, so the
+/// caller has to store the outcome back.
+fn try_parse_pattern<'a>(
+    literal: &mut RegExpLiteral<'a>,
+    ctx: &mut TraverseCtx<'a>,
+) -> Result<Box<'a, Pattern<'a>>, &'a str> {
+    use oxc_regular_expression::{ParserOptions, PatternParser};
+
+    // Move the pattern out so the parsed form can be returned by value.
+    let taken = mem::replace(&mut literal.regex.pattern, RegExpPattern::Raw(""));
+
+    let raw = match taken {
+        RegExpPattern::Pattern(parsed) => return Ok(parsed),
+        RegExpPattern::Invalid(raw) => return Err(raw),
+        RegExpPattern::Raw(raw) => raw,
+    };
+
+    let flags = literal.regex.flags;
+    let unicode_sets_mode = flags.contains(RegExpFlags::V);
+    let options = ParserOptions {
+        span_offset: literal.span.start + 1, // exclude `/`
+        unicode_mode: flags.contains(RegExpFlags::U) || unicode_sets_mode,
+        unicode_sets_mode,
+    };
+
+    let parse_result = PatternParser::new(ctx.ast.allocator, raw, options).parse();
+    match parse_result {
+        Ok(parsed) => Ok(ctx.alloc(parsed)),
+        // The parser error is dropped; only the raw source is reported back.
+        Err(_) => Err(raw),
+    }
+}