syntax: make Unicode completely optional

This commit refactors the way this library handles Unicode data by making it completely optional. Several features are introduced which permit callers to select only the Unicode data they need (up to a point of granularity). An important property of these changes is that presence or absence of crate features will never change the match semantics of a regular expression. Instead, the presence or absence of a crate feature can only add or subtract from the set of all possible valid regular expressions. So for example, if the `unicode-case` feature is disabled, then attempting to produce `Hir` for the regex `(?i)a` will fail. Instead, callers must use `(?i-u)a` (or enable the `unicode-case` feature). This partially addresses #583 since it permits callers to decrease binary size.
rust-lang · Sep 2, 2019 · 85f2c0d · 85f2c0d
1 parent a88b696
commit 85f2c0d
Show file tree

Hide file tree

Showing 15 changed files with 1,381 additions and 246 deletions.
diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@ linear time with respect to the size of the regular expression and search text.
 Much of the syntax and implementation is inspired
 by [RE2](https://github.com/google/re2).
 
-[![Build Status](https://travis-ci.com/rust-lang/regex.svg?branch=master)](https://travis-ci.com/rust-lang/regex)
+[![Build status](https://travis-ci.com/rust-lang/regex.svg?branch=master)](https://travis-ci.com/rust-lang/regex)
 [![Build status](https://ci.appveyor.com/api/projects/status/github/rust-lang/regex?svg=true)](https://ci.appveyor.com/project/rust-lang-libs/regex)
 [![Coverage Status](https://coveralls.io/repos/github/rust-lang/regex/badge.svg?branch=master)](https://coveralls.io/github/rust-lang/regex?branch=master)
 [![](https://meritbadge.herokuapp.com/regex)](https://crates.io/crates/regex)

diff --git a/ci/script.sh b/ci/script.sh
@@ -1,5 +1,7 @@
 #!/bin/sh
 
+# vim: tabstop=2 shiftwidth=2 softtabstop=2
+
 # This is the main CI script for testing the regex crate and its sub-crates.
 
 set -ex
@@ -42,8 +44,13 @@ RUST_REGEX_RANDOM_TEST=1 \
 ci/run-shootout-test
 
 # Run tests on regex-syntax crate.
-cargo test --verbose --manifest-path regex-syntax/Cargo.toml
 cargo doc --verbose --manifest-path regex-syntax/Cargo.toml
+# Only run the full test suite on one job, to conserve resources.
+if [ "$TRAVIS_RUST_VERSION" = "stable" ]; then
+  (cd regex-syntax && ./test)
+else
+  cargo test --verbose --manifest-path regex-syntax/Cargo.toml
+fi
 
 # Run tests on regex-capi crate.
 ci/test-regex-capi

diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml
@@ -8,3 +8,25 @@ documentation = "https://docs.rs/regex-syntax"
 homepage = "https://github.com/rust-lang/regex"
 description = "A regular expression parser."
 workspace = ".."
+
+# Features are documented in the "Crate features" section of the crate docs:
+# https://docs.rs/regex-syntax/*/#crate-features
+[features]
+default = ["unicode"]
+
+unicode = [
+  "unicode-age",
+  "unicode-bool",
+  "unicode-case",
+  "unicode-gencat",
+  "unicode-perl",
+  "unicode-script",
+  "unicode-segment",
+]
+unicode-age = []
+unicode-bool = []
+unicode-case = []
+unicode-gencat = []
+unicode-perl = []
+unicode-script = []
+unicode-segment = []
diff --git a/regex-syntax/README.md b/regex-syntax/README.md
@@ -0,0 +1,82 @@
+regex-syntax
+============
+This crate provides a robust regular expression parser.
+
+[![Build status](https://travis-ci.com/rust-lang/regex.svg?branch=master)](https://travis-ci.com/rust-lang/regex)
+[![Build status](https://ci.appveyor.com/api/projects/status/github/rust-lang/regex?svg=true)](https://ci.appveyor.com/project/rust-lang-libs/regex)
+[![](https://meritbadge.herokuapp.com/regex-syntax)](https://crates.io/crates/regex-syntax)
+[![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex)
+
+
+### Documentation
+
+https://docs.rs/regex-syntax
+
+
+### Overview
+
+There are two primary types exported by this crate: `Ast` and `Hir`. The former
+is a faithful abstract syntax of a regular expression, and can convert regular
+expressions back to their concrete syntax while mostly preserving their original
+form. The latter type is a high level intermediate representation of a regular
+expression that is amenable to analysis and compilation into byte codes or
+automata. An `Hir` achieves this by drastically simplifying the syntactic
+structure of the regular expression. While an `Hir` can be converted back to
+its equivalent concrete syntax, the result is unlikely to resemble the original
+concrete syntax that produced the `Hir`.
+
+
+### Example
+
+This example shows how to parse a pattern string into its HIR:
+
+```rust
+use regex_syntax::Parser;
+use regex_syntax::hir::{self, Hir};
+
+let hir = Parser::new().parse("a|b").unwrap();
+assert_eq!(hir, Hir::alternation(vec![
+    Hir::literal(hir::Literal::Unicode('a')),
+    Hir::literal(hir::Literal::Unicode('b')),
+]));
+```
+
+
+### Crate features
+
+By default, this crate bundles a fairly large amount of Unicode data tables
+(a source size of ~750KB). Because of their large size, one can disable some
+or all of these data tables. If a regular expression attempts to use Unicode
+data that is not available, then an error will occur when translating the `Ast`
+to the `Hir`.
+
+The full set of features one can disable is listed
+[in the "Crate features" section of the documentation](https://docs.rs/regex-syntax/*/#crate-features).
+
+
+### Testing
+
+Simply running `cargo test` will give you very good coverage. However, because
+of the large number of features exposed by this crate, a `test` script is
+included in this directory which will test several feature combinations. This
+is the same script that is run in CI.
+
+
+### Motivation
+
+The primary purpose of this crate is to provide the parser used by `regex`.
+Specifically, this crate is treated as an implementation detail of `regex`,
+and is primarily developed for the needs of `regex`.
+
+Since this crate is an implementation detail of `regex`, it may experience
+breaking change releases at a different cadence from `regex`. This is only
+possible because this crate is _not_ a public dependency of `regex`.
+
+Another consequence of this de-coupling is that there is no direct way to
+compile a `regex::Regex` from a `regex_syntax::hir::Hir`. Instead, one must
+first convert the `Hir` to a string (via its `std::fmt::Display`) and then
+compile that via `Regex::new`. While this does repeat some work, compilation
+typically takes much longer than parsing.
+
+Stated differently, the coupling between `regex` and `regex-syntax` exists only
+at the level of the concrete syntax.
diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs
@@ -4,6 +4,8 @@ use std::fmt::Debug;
 use std::slice;
 use std::u8;
 
+use unicode;
+
 // This module contains an *internal* implementation of interval sets.
 //
 // The primary invariant that interval sets guard is canonical ordering. That
@@ -14,7 +16,8 @@ use std::u8;
 //
 // Since case folding (as implemented below) breaks that invariant, we roll
 // that into this API even though it is a little out of place in an otherwise
-// generic interval set.
+// generic interval set. (Hence the reason why the `unicode` module is imported
+// here.)
 //
 // Some of the implementation complexity here is a result of me wanting to
 // preserve the sequential representation without using additional memory.
@@ -72,13 +75,20 @@ impl<I: Interval> IntervalSet<I> {
     /// characters. For example, if this class consists of the range `a-z`,
     /// then applying case folding will result in the class containing both the
     /// ranges `a-z` and `A-Z`.
-    pub fn case_fold_simple(&mut self) {
+    ///
+    /// This returns an error if the necessary case mapping data is not
+    /// available.
+    pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
         let len = self.ranges.len();
         for i in 0..len {
             let range = self.ranges[i];
-            range.case_fold_simple(&mut self.ranges);
+            if let Err(err) = range.case_fold_simple(&mut self.ranges) {
+                self.canonicalize();
+                return Err(err);
+            }
         }
         self.canonicalize();
+        Ok(())
     }
 
     /// Union this set with the given set, in place.
@@ -331,7 +341,10 @@ pub trait Interval:
     fn upper(&self) -> Self::Bound;
     fn set_lower(&mut self, bound: Self::Bound);
     fn set_upper(&mut self, bound: Self::Bound);
-    fn case_fold_simple(&self, intervals: &mut Vec<Self>);
+    fn case_fold_simple(
+        &self,
+        intervals: &mut Vec<Self>,
+    ) -> Result<(), unicode::CaseFoldError>;
 
     /// Create a new interval.
     fn create(lower: Self::Bound, upper: Self::Bound) -> Self {

diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs
@@ -1105,6 +1105,7 @@ mod tests {
     test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
     test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
     test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
+    #[cfg(feature = "unicode-case")]
     test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
     test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
     test_lit!(
@@ -1114,6 +1115,7 @@ mod tests {
         M("\\xe2\\x85\\xa0"),
         M("\\xe2\\x98\\x83")
     );
+    #[cfg(feature = "unicode-case")]
     test_lit!(
         pfx_class3,
         prefixes,
@@ -1122,11 +1124,11 @@ mod tests {
         M("\\xe2\\x85\\xb0"),
         M("\\xe2\\x98\\x83")
     );
-    test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a", M("A"), M("a"));
+    test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a"));
     test_lit!(
         pfx_one_lit_casei2,
         prefixes,
-        "(?i)abc",
+        "(?i-u)abc",
         M("ABC"),
         M("aBC"),
         M("AbC"),
@@ -1158,7 +1160,7 @@ mod tests {
     test_lit!(
         pfx_cat3,
         prefixes,
-        "(?i)[ab]z",
+        "(?i-u)[ab]z",
         M("AZ"),
         M("BZ"),
         M("aZ"),
@@ -1295,7 +1297,7 @@ mod tests {
     test_exhausted!(
         pfx_exhausted4,
         prefixes,
-        "(?i)foobar",
+        "(?i-u)foobar",
         C("FO"),
         C("fO"),
         C("Fo"),
@@ -1336,6 +1338,7 @@ mod tests {
     test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
     test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
     test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
+    #[cfg(feature = "unicode-case")]
     test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
     test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
     test_lit!(
@@ -1345,6 +1348,7 @@ mod tests {
         M("\\xe2\\x85\\xa0"),
         M("\\xe2\\x98\\x83")
     );
+    #[cfg(feature = "unicode-case")]
     test_lit!(
         sfx_class3,
         suffixes,
@@ -1353,11 +1357,11 @@ mod tests {
         M("\\xe2\\x85\\xb0"),
         M("\\xe2\\x98\\x83")
     );
-    test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a", M("A"), M("a"));
+    test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a"));
     test_lit!(
         sfx_one_lit_casei2,
         suffixes,
-        "(?i)abc",
+        "(?i-u)abc",
         M("ABC"),
         M("ABc"),
         M("AbC"),
@@ -1389,7 +1393,7 @@ mod tests {
     test_lit!(
         sfx_cat3,
         suffixes,
-        "(?i)[ab]z",
+        "(?i-u)[ab]z",
         M("AZ"),
         M("Az"),
         M("BZ"),
@@ -1480,7 +1484,7 @@ mod tests {
     test_exhausted!(
         sfx_exhausted4,
         suffixes,
-        "(?i)foobar",
+        "(?i-u)foobar",
         C("AR"),
         C("Ar"),
         C("aR"),