From 5a5e2b3c9f3c298a707c438e04956feed059f533 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Fri, 26 Aug 2022 19:41:37 -0400
Subject: [PATCH] syntax: reject '(?-u)\W' when UTF-8 mode is enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When Unicode mode is disabled (i.e., (?-u)), the Perl character classes
(\w, \d and \s) revert to their ASCII definitions. The negated forms
of these classes are also derived from their ASCII definitions, and this
means that they may actually match bytes outside of ASCII and thus
possibly invalid UTF-8. For this reason, when the translator is
configured to only produce HIR that matches valid UTF-8, '(?-u)\W'
should be rejected.

Previously, it was not being rejected, which could actually lead to
matches that produced offsets that split codepoints, and thus lead to
panics when match offsets are used to slice a string. For example, this
code

  fn main() {
      let re = regex::Regex::new(r"(?-u)\W").unwrap();
      let haystack = "☃";
      if let Some(m) = re.find(haystack) {
          println!("{:?}", &haystack[m.range()]);
      }
  }

panics with

  byte index 1 is not a char boundary; it is inside '☃' (bytes 0..3) of `☃`

That is, it reports a match at 0..1, which is technically correct, but
the regex itself should have been rejected in the first place since the
top-level Regex API always has UTF-8 mode enabled.

Also, many of the replacement tests were using '(?-u)\W' (or similar)
for some reason. I'm not sure why, so I just removed the '(?-u)' to make
those tests pass. Whether Unicode is enabled or not doesn't seem to be
an interesting detail for those tests. (All haystacks and replacements
appear to be ASCII.)

Fixes #895, Partially addresses #738
---
 regex-syntax/src/hir/translate.rs | 95 +++++++++++++++++++++++++++----
 tests/replace.rs                  | 37 +++---------
 2 files changed, 92 insertions(+), 40 deletions(-)

diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index d7686988a..988384ede 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -305,7 +305,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                     let hcls = hir::Class::Unicode(cls);
                     self.push(HirFrame::Expr(Hir::class(hcls)));
                 } else {
-                    let cls = self.hir_perl_byte_class(x);
+                    let cls = self.hir_perl_byte_class(x)?;
                     let hcls = hir::Class::Bytes(cls);
                     self.push(HirFrame::Expr(Hir::class(hcls)));
                 }
@@ -445,7 +445,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                     cls.union(&xcls);
                     self.push(HirFrame::ClassUnicode(cls));
                 } else {
-                    let xcls = self.hir_perl_byte_class(x);
+                    let xcls = self.hir_perl_byte_class(x)?;
                     let mut cls = self.pop().unwrap().unwrap_class_bytes();
                     cls.union(&xcls);
                     self.push(HirFrame::ClassBytes(cls));
@@ -879,7 +879,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
     fn hir_perl_byte_class(
         &self,
         ast_class: &ast::ClassPerl,
-    ) -> hir::ClassBytes {
+    ) -> Result<hir::ClassBytes> {
         use crate::ast::ClassPerlKind::*;
 
         assert!(!self.flags().unicode());
@@ -893,7 +893,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         if ast_class.negated {
             class.negate();
         }
-        class
+        // Negating a Perl byte class is likely to cause it to match invalid
+        // UTF-8. That's only OK if the translator is configured to allow such
+        // things.
+        if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
+            return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
+        }
+        Ok(class)
     }
 
     /// Converts the given Unicode specific error to an HIR translation error.
@@ -1971,7 +1977,7 @@ mod tests {
 
     #[test]
     #[cfg(feature = "unicode-perl")]
-    fn class_perl() {
+    fn class_perl_unicode() {
         // Unicode
         assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
         assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
@@ -2011,7 +2017,10 @@ mod tests {
         );
         #[cfg(feature = "unicode-case")]
         assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
+    }
 
+    #[test]
+    fn class_perl_ascii() {
         // ASCII only
         assert_eq!(
             t(r"(?-u)\d"),
@@ -2040,29 +2049,93 @@ mod tests {
 
         // ASCII only, negated
         assert_eq!(
-            t(r"(?-u)\D"),
+            t_bytes(r"(?-u)\D"),
             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
         );
         assert_eq!(
-            t(r"(?-u)\S"),
+            t_bytes(r"(?-u)\S"),
             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
         );
         assert_eq!(
-            t(r"(?-u)\W"),
+            t_bytes(r"(?-u)\W"),
             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
         );
         assert_eq!(
-            t(r"(?i-u)\D"),
+            t_bytes(r"(?i-u)\D"),
             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
         );
         assert_eq!(
-            t(r"(?i-u)\S"),
+            t_bytes(r"(?i-u)\S"),
             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
         );
         assert_eq!(
-            t(r"(?i-u)\W"),
+            t_bytes(r"(?i-u)\W"),
             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
         );
+
+        // ASCII only, negated, with UTF-8 mode enabled.
+        // In this case, negating any Perl class results in an error because
+        // all such classes can match invalid UTF-8.
+        assert_eq!(
+            t_err(r"(?-u)\D"),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(5, 1, 6),
+                    Position::new(7, 1, 8),
+                ),
+            },
+        );
+        assert_eq!(
+            t_err(r"(?-u)\S"),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(5, 1, 6),
+                    Position::new(7, 1, 8),
+                ),
+            },
+        );
+        assert_eq!(
+            t_err(r"(?-u)\W"),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(5, 1, 6),
+                    Position::new(7, 1, 8),
+                ),
+            },
+        );
+        assert_eq!(
+            t_err(r"(?i-u)\D"),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(6, 1, 7),
+                    Position::new(8, 1, 9),
+                ),
+            },
+        );
+        assert_eq!(
+            t_err(r"(?i-u)\S"),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(6, 1, 7),
+                    Position::new(8, 1, 9),
+                ),
+            },
+        );
+        assert_eq!(
+            t_err(r"(?i-u)\W"),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(6, 1, 7),
+                    Position::new(8, 1, 9),
+                ),
+            },
+        );
     }
 
     #[test]
diff --git a/tests/replace.rs b/tests/replace.rs
index d65be072f..ec2ef9eff 100644
--- a/tests/replace.rs
+++ b/tests/replace.rs
@@ -12,18 +12,11 @@ macro_rules! replace(
 replace!(first, replace, r"[0-9]", "age: 26", t!("Z"), "age: Z6");
 replace!(plus, replace, r"[0-9]+", "age: 26", t!("Z"), "age: Z");
 replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ");
-replace!(
-    groups,
-    replace,
-    r"(?-u)(\S+)\s+(\S+)",
-    "w1 w2",
-    t!("$2 $1"),
-    "w2 w1"
-);
+replace!(groups, replace, r"(\S+)\s+(\S+)", "w1 w2", t!("$2 $1"), "w2 w1");
 replace!(
     double_dollar,
     replace,
-    r"(?-u)(\S+)\s+(\S+)",
+    r"(\S+)\s+(\S+)",
     "w1 w2",
     t!("$2 $$1"),
     "w2 $1"
@@ -33,7 +26,7 @@ replace!(
 replace!(
     named,
     replace_all,
-    r"(?-u)(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
+    r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
     "w1 w2 w3 w4",
     t!("$last $first$space"),
     "w2 w1 w4 w3"
@@ -48,26 +41,12 @@ replace!(
 );
 replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b");
 // replace!(number_underscore, replace, r"(.)(.)", "ab", t!("$1_$2"), "a_b");
-replace!(
-    simple_expand,
-    replace_all,
-    r"(?-u)(\w) (\w)",
-    "a b",
-    t!("$2 $1"),
-    "b a"
-);
-replace!(
-    literal_dollar1,
-    replace_all,
-    r"(?-u)(\w+) (\w+)",
-    "a b",
-    t!("$$1"),
-    "$1"
-);
+replace!(simple_expand, replace_all, r"(\w) (\w)", "a b", t!("$2 $1"), "b a");
+replace!(literal_dollar1, replace_all, r"(\w+) (\w+)", "a b", t!("$$1"), "$1");
 replace!(
     literal_dollar2,
     replace_all,
-    r"(?-u)(\w+) (\w+)",
+    r"(\w+) (\w+)",
     "a b",
     t!("$2 $$c $1"),
     "b $c a"
@@ -75,7 +54,7 @@ replace!(
 replace!(
     no_expand1,
     replace,
-    r"(?-u)(\S+)\s+(\S+)",
+    r"(\S+)\s+(\S+)",
     "w1 w2",
     no_expand!("$2 $1"),
     "$2 $1"
@@ -83,7 +62,7 @@ replace!(
 replace!(
     no_expand2,
     replace,
-    r"(?-u)(\S+)\s+(\S+)",
+    r"(\S+)\s+(\S+)",
     "w1 w2",
     no_expand!("$$1"),
     "$$1"