From 85f2c0d4a44b69a3015b69d4bf79d83f9d16e5e0 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 Sep 2019 01:01:16 -0400 Subject: [PATCH] syntax: make Unicode completely optional This commit refactors the way this library handles Unicode data by making it completely optional. Several features are introduced which permit callers to select only the Unicode data they need (up to a point of granularity). An important property of these changes is that the presence or absence of crate features will never change the match semantics of a regular expression. Instead, the presence or absence of a crate feature can only add or subtract from the set of all possible valid regular expressions. So for example, if the `unicode-case` feature is disabled, then attempting to produce `Hir` for the regex `(?i)a` will fail. Instead, callers must use `(?i-u)a` (or enable the `unicode-case` feature). This partially addresses #583 since it permits callers to decrease binary size. --- README.md | 2 +- ci/script.sh | 9 +- regex-syntax/Cargo.toml | 22 + regex-syntax/README.md | 82 ++ regex-syntax/src/hir/interval.rs | 21 +- regex-syntax/src/hir/literal/mod.rs | 20 +- regex-syntax/src/hir/mod.rs | 105 ++- regex-syntax/src/hir/translate.rs | 325 +++++++- regex-syntax/src/lib.rs | 124 ++- regex-syntax/src/unicode.rs | 746 ++++++++++++++---- regex-syntax/src/unicode_tables/mod.rs | 45 ++ .../src/unicode_tables/perl_decimal.rs | 70 ++ regex-syntax/src/unicode_tables/perl_space.rs | 21 + regex-syntax/test | 20 + scripts/generate-unicode-tables | 15 +- 15 files changed, 1381 insertions(+), 246 deletions(-) create mode 100644 regex-syntax/README.md create mode 100644 regex-syntax/src/unicode_tables/perl_decimal.rs create mode 100644 regex-syntax/src/unicode_tables/perl_space.rs create mode 100755 regex-syntax/test diff --git a/README.md b/README.md index cea3da5549..16099495c1 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ linear time with respect to the size of the regular expression and 
search text. Much of the syntax and implementation is inspired by [RE2](https://github.com/google/re2). -[![Build Status](https://travis-ci.com/rust-lang/regex.svg?branch=master)](https://travis-ci.com/rust-lang/regex) +[![Build status](https://travis-ci.com/rust-lang/regex.svg?branch=master)](https://travis-ci.com/rust-lang/regex) [![Build status](https://ci.appveyor.com/api/projects/status/github/rust-lang/regex?svg=true)](https://ci.appveyor.com/project/rust-lang-libs/regex) [![Coverage Status](https://coveralls.io/repos/github/rust-lang/regex/badge.svg?branch=master)](https://coveralls.io/github/rust-lang/regex?branch=master) [![](https://meritbadge.herokuapp.com/regex)](https://crates.io/crates/regex) diff --git a/ci/script.sh b/ci/script.sh index 0bbb4cbd1c..15e0adae89 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -1,5 +1,7 @@ #!/bin/sh +# vim: tabstop=2 shiftwidth=2 softtabstop=2 + # This is the main CI script for testing the regex crate and its sub-crates. set -ex @@ -42,8 +44,13 @@ RUST_REGEX_RANDOM_TEST=1 \ ci/run-shootout-test # Run tests on regex-syntax crate. -cargo test --verbose --manifest-path regex-syntax/Cargo.toml cargo doc --verbose --manifest-path regex-syntax/Cargo.toml +# Only run the full test suite on one job, to conserve resources. +if [ "$TRAVIS_RUST_VERSION" = "stable" ]; then + (cd regex-syntax && ./test) +else + cargo test --verbose --manifest-path regex-syntax/Cargo.toml +fi # Run tests on regex-capi crate. ci/test-regex-capi diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index ac8f5d5c81..b77b1d573e 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -8,3 +8,25 @@ documentation = "https://docs.rs/regex-syntax" homepage = "https://github.com/rust-lang/regex" description = "A regular expression parser." workspace = ".." 
+ +# Features are documented in the "Crate features" section of the crate docs: +# https://docs.rs/regex-syntax/*/#crate-features +[features] +default = ["unicode"] + +unicode = [ + "unicode-age", + "unicode-bool", + "unicode-case", + "unicode-gencat", + "unicode-perl", + "unicode-script", + "unicode-segment", +] +unicode-age = [] +unicode-bool = [] +unicode-case = [] +unicode-gencat = [] +unicode-perl = [] +unicode-script = [] +unicode-segment = [] diff --git a/regex-syntax/README.md b/regex-syntax/README.md new file mode 100644 index 0000000000..5149ed95f6 --- /dev/null +++ b/regex-syntax/README.md @@ -0,0 +1,82 @@ +regex-syntax +============ +This crate provides a robust regular expression parser. + +[![Build status](https://travis-ci.com/rust-lang/regex.svg?branch=master)](https://travis-ci.com/rust-lang/regex) +[![Build status](https://ci.appveyor.com/api/projects/status/github/rust-lang/regex?svg=true)](https://ci.appveyor.com/project/rust-lang-libs/regex) +[![](https://meritbadge.herokuapp.com/regex-syntax)](https://crates.io/crates/regex-syntax) +[![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) + + +### Documentation + +https://docs.rs/regex-syntax + + +### Overview + +There are two primary types exported by this crate: `Ast` and `Hir`. The former +is a faithful abstract syntax of a regular expression, and can convert regular +expressions back to their concrete syntax while mostly preserving its original +form. The latter type is a high level intermediate representation of a regular +expression that is amenable to analysis and compilation into byte codes or +automata. An `Hir` achieves this by drastically simplifying the syntactic +structure of the regular expression. While an `Hir` can be converted back to +its equivalent concrete syntax, the result is unlikely to resemble the original +concrete syntax that produced the `Hir`. 
+ + +### Example + +This example shows how to parse a pattern string into its HIR: + +```rust +use regex_syntax::Parser; +use regex_syntax::hir::{self, Hir}; + +let hir = Parser::new().parse("a|b").unwrap(); +assert_eq!(hir, Hir::alternation(vec![ + Hir::literal(hir::Literal::Unicode('a')), + Hir::literal(hir::Literal::Unicode('b')), +])); +``` + + +### Crate features + +By default, this crate bundles a fairly large amount of Unicode data tables +(a source size of ~750KB). Because of their large size, one can disable some +or all of these data tables. If a regular expression attempts to use Unicode +data that is not available, then an error will occur when translating the `Ast` +to the `Hir`. + +The full set of features one can disable is listed +[in the "Crate features" section of the documentation](https://docs.rs/regex-syntax/*/#crate-features). + + +### Testing + +Simply running `cargo test` will give you very good coverage. However, because +of the large number of features exposed by this crate, a `test` script is +included in this directory which will test several feature combinations. This +is the same script that is run in CI. + + +### Motivation + +The primary purpose of this crate is to provide the parser used by `regex`. +Specifically, this crate is treated as an implementation detail of `regex`, +and is primarily developed for the needs of `regex`. + +Since this crate is an implementation detail of `regex`, it may experience +breaking change releases at a different cadence from `regex`. This is only +possible because this crate is _not_ a public dependency of `regex`. + +Another consequence of this de-coupling is that there is no direct way to +compile a `regex::Regex` from a `regex_syntax::hir::Hir`. Instead, one must +first convert the `Hir` to a string (via its `std::fmt::Display` implementation) and then +compile that via `Regex::new`. While this does repeat some work, compilation +typically takes much longer than parsing. 
+ +Stated differently, the coupling between `regex` and `regex-syntax` exists only +at the level of the concrete syntax. diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index 0fc1a8ecdf..51eed52595 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -4,6 +4,8 @@ use std::fmt::Debug; use std::slice; use std::u8; +use unicode; + // This module contains an *internal* implementation of interval sets. // // The primary invariant that interval sets guards is canonical ordering. That @@ -14,7 +16,8 @@ use std::u8; // // Since case folding (as implemented below) breaks that invariant, we roll // that into this API even though it is a little out of place in an otherwise -// generic interval set. +// generic interval set. (Hence the reason why the `unicode` module is imported +// here.) // // Some of the implementation complexity here is a result of me wanting to // preserve the sequential representation without using additional memory. @@ -72,13 +75,20 @@ impl IntervalSet { /// characters. For example, if this class consists of the range `a-z`, /// then applying case folding will result in the class containing both the /// ranges `a-z` and `A-Z`. - pub fn case_fold_simple(&mut self) { + /// + /// This returns an error if the necessary case mapping data is not + /// available. + pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { let len = self.ranges.len(); for i in 0..len { let range = self.ranges[i]; - range.case_fold_simple(&mut self.ranges); + if let Err(err) = range.case_fold_simple(&mut self.ranges) { + self.canonicalize(); + return Err(err); + } } self.canonicalize(); + Ok(()) } /// Union this set with the given set, in place. 
@@ -331,7 +341,10 @@ pub trait Interval: fn upper(&self) -> Self::Bound; fn set_lower(&mut self, bound: Self::Bound); fn set_upper(&mut self, bound: Self::Bound); - fn case_fold_simple(&self, intervals: &mut Vec); + fn case_fold_simple( + &self, + intervals: &mut Vec, + ) -> Result<(), unicode::CaseFoldError>; /// Create a new interval. fn create(lower: Self::Bound, upper: Self::Bound) -> Self { diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs index 0971410874..3ba225c657 100644 --- a/regex-syntax/src/hir/literal/mod.rs +++ b/regex-syntax/src/hir/literal/mod.rs @@ -1105,6 +1105,7 @@ mod tests { test_lit!(pfx_one_lit1, prefixes, "a", M("a")); test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); + #[cfg(feature = "unicode-case")] test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); test_lit!( @@ -1114,6 +1115,7 @@ mod tests { M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83") ); + #[cfg(feature = "unicode-case")] test_lit!( pfx_class3, prefixes, @@ -1122,11 +1124,11 @@ mod tests { M("\\xe2\\x85\\xb0"), M("\\xe2\\x98\\x83") ); - test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a", M("A"), M("a")); + test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a")); test_lit!( pfx_one_lit_casei2, prefixes, - "(?i)abc", + "(?i-u)abc", M("ABC"), M("aBC"), M("AbC"), @@ -1158,7 +1160,7 @@ mod tests { test_lit!( pfx_cat3, prefixes, - "(?i)[ab]z", + "(?i-u)[ab]z", M("AZ"), M("BZ"), M("aZ"), @@ -1295,7 +1297,7 @@ mod tests { test_exhausted!( pfx_exhausted4, prefixes, - "(?i)foobar", + "(?i-u)foobar", C("FO"), C("fO"), C("Fo"), @@ -1336,6 +1338,7 @@ mod tests { test_lit!(sfx_one_lit1, suffixes, "a", M("a")); test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); + #[cfg(feature = "unicode-case")] test_lit!(sfx_one_lit4, suffixes, 
"(?ui)☃", M("\\xe2\\x98\\x83")); test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); test_lit!( @@ -1345,6 +1348,7 @@ mod tests { M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83") ); + #[cfg(feature = "unicode-case")] test_lit!( sfx_class3, suffixes, @@ -1353,11 +1357,11 @@ mod tests { M("\\xe2\\x85\\xb0"), M("\\xe2\\x98\\x83") ); - test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a", M("A"), M("a")); + test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a")); test_lit!( sfx_one_lit_casei2, suffixes, - "(?i)abc", + "(?i-u)abc", M("ABC"), M("ABc"), M("AbC"), @@ -1389,7 +1393,7 @@ mod tests { test_lit!( sfx_cat3, suffixes, - "(?i)[ab]z", + "(?i-u)[ab]z", M("AZ"), M("Az"), M("BZ"), @@ -1480,7 +1484,7 @@ mod tests { test_exhausted!( sfx_exhausted4, suffixes, - "(?i)foobar", + "(?i-u)foobar", C("AR"), C("Ar"), C("aR"), diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index e938de80de..ee08e83dba 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -5,6 +5,7 @@ use std::char; use std::cmp; use std::error; use std::fmt; +use std::result; use std::u8; use ast::Span; @@ -12,6 +13,7 @@ use hir::interval::{Interval, IntervalSet, IntervalSetIter}; use unicode; pub use hir::visitor::{visit, Visitor}; +pub use unicode::CaseFoldError; mod interval; pub mod literal; @@ -65,6 +67,14 @@ pub enum ErrorKind { /// This occurs when an unrecognized Unicode property value could not /// be found. UnicodePropertyValueNotFound, + /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or + /// `\d`) could not be found. This can occur when the `unicode-perl` + /// crate feature is not enabled. + UnicodePerlClassNotFound, + /// This occurs when the Unicode simple case mapping tables are not + /// available, and the regular expression required Unicode aware case + /// insensitivity. + UnicodeCaseUnavailable, /// This occurs when the translator attempts to construct a character class /// that is empty. 
/// @@ -88,8 +98,16 @@ impl ErrorKind { InvalidUtf8 => "pattern can match invalid UTF-8", UnicodePropertyNotFound => "Unicode property not found", UnicodePropertyValueNotFound => "Unicode property value not found", + UnicodePerlClassNotFound => { + "Unicode-aware Perl class not found \ + (make sure the unicode-perl feature is enabled)" + } + UnicodeCaseUnavailable => { + "Unicode-aware case insensitivity matching is not available \ + (make sure the unicode-case feature is enabled)" + } EmptyClassNotAllowed => "empty character classes are not allowed", - _ => unreachable!(), + __Nonexhaustive => unreachable!(), } } } @@ -848,8 +866,38 @@ impl ClassUnicode { /// characters, according to Unicode's "simple" mapping. For example, if /// this class consists of the range `a-z`, then applying case folding will /// result in the class containing both the ranges `a-z` and `A-Z`. + /// + /// # Panics + /// + /// This routine panics when the case mapping data necessary for this + /// routine to complete is unavailable. This occurs when the `unicode-case` + /// feature is not enabled. + /// + /// Callers should prefer using `try_case_fold_simple` instead, which will + /// return an error instead of panicking. pub fn case_fold_simple(&mut self) { - self.set.case_fold_simple(); + self.set + .case_fold_simple() + .expect("unicode-case feature must be enabled"); + } + + /// Expand this character class such that it contains all case folded + /// characters, according to Unicode's "simple" mapping. For example, if + /// this class consists of the range `a-z`, then applying case folding will + /// result in the class containing both the ranges `a-z` and `A-Z`. + /// + /// # Errors + /// + /// This routine returns an error when the case mapping data necessary + /// for this routine to complete is unavailable. This occurs when the + /// `unicode-case` feature is not enabled. 
+ /// + /// Unlike `case_fold_simple`, this routine never panics on missing data; + /// the failure is reported through the returned `Result` instead. + pub fn try_case_fold_simple( + &mut self, + ) -> result::Result<(), CaseFoldError> { + self.set.case_fold_simple() } /// Negate this character class. @@ -957,9 +1005,12 @@ impl Interval for ClassUnicodeRange { /// /// Additional ranges are appended to the given vector. Canonical ordering /// is *not* maintained in the given vector. - fn case_fold_simple(&self, ranges: &mut Vec) { - if !unicode::contains_simple_case_mapping(self.start, self.end) { - return; + fn case_fold_simple( + &self, + ranges: &mut Vec, + ) -> Result<(), unicode::CaseFoldError> { + if !unicode::contains_simple_case_mapping(self.start, self.end)? { + return Ok(()); } let start = self.start as u32; let end = (self.end as u32).saturating_add(1); @@ -968,7 +1019,7 @@ impl Interval for ClassUnicodeRange { if next_simple_cp.map_or(false, |next| cp < next) { continue; } - let it = match unicode::simple_fold(cp) { + let it = match unicode::simple_fold(cp)? { Ok(it) => it, Err(next) => { next_simple_cp = next; @@ -979,6 +1030,7 @@ impl Interval for ClassUnicodeRange { ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); } } + Ok(()) } } @@ -1057,7 +1109,7 @@ impl ClassBytes { /// Note that this only applies ASCII case folding, which is limited to the /// characters `a-z` and `A-Z`. pub fn case_fold_simple(&mut self) { - self.set.case_fold_simple(); + self.set.case_fold_simple().expect("ASCII case folding never fails"); } /// Negate this byte class. @@ -1151,7 +1203,10 @@ impl Interval for ClassBytesRange { /// /// Additional ranges are appended to the given vector. Canonical ordering /// is *not* maintained in the given vector. 
- fn case_fold_simple(&self, ranges: &mut Vec) { + fn case_fold_simple( + &self, + ranges: &mut Vec, + ) -> Result<(), unicode::CaseFoldError> { if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { let lower = cmp::max(self.start, b'a'); let upper = cmp::min(self.end, b'z'); @@ -1162,6 +1217,7 @@ impl Interval for ClassBytesRange { let upper = cmp::min(self.end, b'Z'); ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); } + Ok(()) } } @@ -1473,6 +1529,7 @@ mod tests { cls.iter().map(|x| (x.start(), x.end())).collect() } + #[cfg(feature = "unicode-case")] fn ucasefold(cls: &ClassUnicode) -> ClassUnicode { let mut cls_ = cls.clone(); cls_.case_fold_simple(); @@ -1643,6 +1700,7 @@ mod tests { } #[test] + #[cfg(feature = "unicode-case")] fn class_case_fold_unicode() { let cls = uclass(&[ ('C', 'F'), @@ -1700,6 +1758,37 @@ mod tests { assert_eq!(cls, ucasefold(&cls)); } + #[test] + #[cfg(not(feature = "unicode-case"))] + fn class_case_fold_unicode_disabled() { + let mut cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + assert!(cls.try_case_fold_simple().is_err()); + } + + #[test] + #[should_panic] + #[cfg(not(feature = "unicode-case"))] + fn class_case_fold_unicode_disabled_panics() { + let mut cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + cls.case_fold_simple(); + } + #[test] fn class_case_fold_bytes() { let cls = bclass(&[ diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 4086aceca6..3db8796140 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -313,7 +313,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { Ast::Class(ast::Class::Bracketed(ref ast)) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); - self.unicode_fold_and_negate(ast.negated, &mut cls); + self.unicode_fold_and_negate( 
+ &ast.span, + ast.negated, + &mut cls, + )?; if cls.iter().next().is_none() { return Err(self.error( ast.span, @@ -431,7 +435,9 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { for &(s, e) in ascii_class(&x.kind) { cls.push(hir::ClassUnicodeRange::new(s, e)); } - self.unicode_fold_and_negate(x.negated, &mut cls); + self.unicode_fold_and_negate( + &x.span, x.negated, &mut cls, + )?; self.push(HirFrame::ClassUnicode(cls)); } else { let mut cls = self.pop().unwrap().unwrap_class_bytes(); @@ -464,7 +470,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { ast::ClassSetItem::Bracketed(ref ast) => { if self.flags().unicode() { let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); - self.unicode_fold_and_negate(ast.negated, &mut cls1); + self.unicode_fold_and_negate( + &ast.span, + ast.negated, + &mut cls1, + )?; let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); cls2.union(&cls1); @@ -527,8 +537,18 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { let mut lhs = self.pop().unwrap().unwrap_class_unicode(); let mut cls = self.pop().unwrap().unwrap_class_unicode(); if self.flags().case_insensitive() { - rhs.case_fold_simple(); - lhs.case_fold_simple(); + rhs.try_case_fold_simple().map_err(|_| { + self.error( + op.rhs.span().clone(), + ErrorKind::UnicodeCaseUnavailable, + ) + })?; + lhs.try_case_fold_simple().map_err(|_| { + self.error( + op.lhs.span().clone(), + ErrorKind::UnicodeCaseUnavailable, + ) + })?; } match op.kind { Intersection => lhs.intersect(&rhs), @@ -659,21 +679,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> { span: Span, c: char, ) -> Result { - // If case folding won't do anything, then don't bother trying. - if !unicode::contains_simple_case_mapping(c, c) { - return self.hir_from_char(span, c); - } if self.flags().unicode() { + // If case folding won't do anything, then don't bother trying. 
+ let map = + unicode::contains_simple_case_mapping(c, c).map_err(|_| { + self.error(span, ErrorKind::UnicodeCaseUnavailable) + })?; + if !map { + return self.hir_from_char(span, c); + } let mut cls = hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( c, c, )]); - cls.case_fold_simple(); + cls.try_case_fold_simple().map_err(|_| { + self.error(span, ErrorKind::UnicodeCaseUnavailable) + })?; Ok(Hir::class(hir::Class::Unicode(cls))) } else { if c.len_utf8() > 1 { return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); } + // If case folding won't do anything, then don't bother trying. + match c { + 'A'..='Z' | 'a'..='z' => {} + _ => return self.hir_from_char(span, c), + } let mut cls = hir::ClassBytes::new(vec![hir::ClassBytesRange::new( c as u8, c as u8, @@ -805,7 +836,11 @@ impl<'t, 'p> TranslatorI<'t, 'p> { unicode::class(query), ); if let Ok(ref mut class) = result { - self.unicode_fold_and_negate(ast_class.negated, class); + self.unicode_fold_and_negate( + &ast_class.span, + ast_class.negated, + class, + )?; } result } @@ -870,25 +905,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> { unicode::Error::PropertyValueNotFound => { self.error(sp, ErrorKind::UnicodePropertyValueNotFound) } + unicode::Error::PerlClassNotFound => { + self.error(sp, ErrorKind::UnicodePerlClassNotFound) + } } }) } fn unicode_fold_and_negate( &self, + span: &Span, negated: bool, class: &mut hir::ClassUnicode, - ) { + ) -> Result<()> { // Note that we must apply case folding before negation! // Consider `(?i)[^x]`. If we applied negation field, then // the result would be the character class that matched any // Unicode scalar value. 
if self.flags().case_insensitive() { - class.case_fold_simple(); + class.try_case_fold_simple().map_err(|_| { + self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable) + })?; } if negated { class.negate(); } + Ok(()) } fn bytes_fold_and_negate( @@ -1205,13 +1247,14 @@ mod tests { Hir::concat(exprs) } + #[allow(dead_code)] fn hir_uclass_query(query: ClassQuery) -> Hir { Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) } + #[allow(dead_code)] fn hir_uclass_perl_word() -> Hir { - use unicode_tables::perl_word::PERL_WORD; - Hir::class(hir::Class::Unicode(unicode::hir_class(PERL_WORD))) + Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) } fn hir_uclass(ranges: &[(char, char)]) -> Hir { @@ -1262,6 +1305,7 @@ mod tests { } } + #[allow(dead_code)] fn hir_union(expr1: Hir, expr2: Hir) -> Hir { use hir::Class::{Bytes, Unicode}; @@ -1278,6 +1322,7 @@ mod tests { } } + #[allow(dead_code)] fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { use hir::Class::{Bytes, Unicode}; @@ -1377,11 +1422,14 @@ mod tests { #[test] fn literal_case_insensitive() { + #[cfg(feature = "unicode-case")] assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),])); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i:a)"), hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],)) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("a(?i)a(?-i)a"), hir_cat(vec![ @@ -1390,6 +1438,7 @@ mod tests { hir_lit("a"), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)ab@c"), hir_cat(vec![ @@ -1399,12 +1448,14 @@ mod tests { hir_uclass(&[('C', 'C'), ('c', 'c')]), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)β"), hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) ); assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?-u)a(?i)a(?-i)a"), hir_cat(vec![ @@ -1575,6 +1626,7 @@ mod tests { #[test] fn flags() { + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i:a)a"), hir_cat(vec![ @@ 
-1589,6 +1641,7 @@ mod tests { hir_lit("β"), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)(?-i:a)a"), hir_cat(vec![ @@ -1596,6 +1649,7 @@ mod tests { hir_uclass(&[('A', 'A'), ('a', 'a')]), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?im)a^"), hir_cat(vec![ @@ -1603,6 +1657,7 @@ mod tests { hir_anchor(hir::Anchor::StartLine), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?im)a^(?i-m)a^"), hir_cat(vec![ @@ -1621,6 +1676,7 @@ mod tests { hir_star(false, hir_lit("a")), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?:a(?i)a)a"), hir_cat(vec![ @@ -1631,6 +1687,7 @@ mod tests { hir_lit("a"), ]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)(?:a(?-i)a)a"), hir_cat(vec![ @@ -1820,6 +1877,7 @@ mod tests { t("[[:^lower:]]"), hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[[:lower:]]"), hir_uclass(&[ @@ -1864,19 +1922,23 @@ mod tests { } #[test] + #[cfg(feature = "unicode-perl")] fn class_perl() { // Unicode assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); assert_eq!(t(r"\w"), hir_uclass_perl_word()); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)\d"), hir_uclass_query(ClassQuery::Binary("digit")) ); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)\s"), hir_uclass_query(ClassQuery::Binary("space")) ); + #[cfg(feature = "unicode-case")] assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word()); // Unicode, negated @@ -1889,14 +1951,17 @@ mod tests { hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) ); assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word())); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)\D"), hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) ); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)\S"), hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) ); + #[cfg(feature = "unicode-case")] 
assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); // ASCII only @@ -1965,7 +2030,56 @@ mod tests { } #[test] - fn class_unicode() { + #[cfg(not(feature = "unicode-perl"))] + fn class_perl_word_disabled() { + assert_eq!( + t_err(r"\w"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))] + fn class_perl_space_disabled() { + assert_eq!( + t_err(r"\s"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(all( + not(feature = "unicode-perl"), + not(feature = "unicode-gencat") + ))] + fn class_perl_digit_disabled() { + assert_eq!( + t_err(r"\d"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn class_unicode_gencat() { assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z"))); assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z"))); assert_eq!( @@ -2003,21 +2117,6 @@ mod tests { hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) ); - assert_eq!( - t(r"\p{Greek}"), - hir_uclass_query(ClassQuery::Binary("Greek")) - ); - assert_eq!( - t(r"(?i)\p{Greek}"), - hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek"))) - ); - assert_eq!( - t(r"(?i)\P{Greek}"), - hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( - "Greek" - )))) - ); - assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any"))); assert_eq!( t(r"\p{assigned}"), @@ -2090,6 +2189,54 @@ mod tests { ), } ); + } + + #[test] + #[cfg(not(feature = "unicode-gencat"))] + fn class_unicode_gencat_disabled() { + assert_eq!( + t_err(r"\p{Separator}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, 
+ span: Span::new( + Position::new(0, 1, 1), + Position::new(13, 1, 14) + ), + } + ); + + assert_eq!( + t_err(r"\p{Any}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-script")] + fn class_unicode_script() { + assert_eq!( + t(r"\p{Greek}"), + hir_uclass_query(ClassQuery::Binary("Greek")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\p{Greek}"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\P{Greek}"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "Greek" + )))) + ); + assert_eq!( t_err(r"\p{sc:Foo}"), TestError { @@ -2110,6 +2257,37 @@ mod tests { ), } ); + } + + #[test] + #[cfg(not(feature = "unicode-script"))] + fn class_unicode_script_disabled() { + assert_eq!( + t_err(r"\p{Greek}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(9, 1, 10) + ), + } + ); + + assert_eq!( + t_err(r"\p{scx:Greek}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(13, 1, 14) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-age")] + fn class_unicode_age() { assert_eq!( t_err(r"\p{age:Foo}"), TestError { @@ -2122,6 +2300,21 @@ mod tests { ); } + #[test] + #[cfg(not(feature = "unicode-age"))] + fn class_unicode_age_disabled() { + assert_eq!( + t_err(r"\p{age:3.0}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + #[test] fn class_bracketed() { assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')])); @@ -2132,28 +2325,39 @@ mod tests { assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')])); assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')])); assert_eq!(t("[\n]"), 
hir_uclass(&[('\n', '\n')])); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit"))); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[\pZ]"), hir_uclass_query(ClassQuery::Binary("separator")) ); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[\p{separator}]"), hir_uclass_query(ClassQuery::Binary("separator")) ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit"))); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[^\PZ]"), hir_uclass_query(ClassQuery::Binary("separator")) ); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[^\P{separator}]"), hir_uclass_query(ClassQuery::Binary("separator")) ); + #[cfg(all( + feature = "unicode-case", + any(feature = "unicode-perl", feature = "unicode-gencat") + ))] assert_eq!( t(r"(?i)[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")) ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] assert_eq!( t(r"(?i)[^\P{greek}]"), hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek"))) @@ -2163,11 +2367,14 @@ mod tests { assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')])); assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')])); + #[cfg(feature = "unicode-case")] assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')])); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[k]"), hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),]) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[β]"), hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) @@ -2180,24 +2387,29 @@ mod tests { t_bytes("(?-u)[^a]"), hir_negate(hir_bclass(&[(b'a', b'a')])) ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] assert_eq!( t(r"[^\d]"), hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) ); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[^\pZ]"), 
hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) ); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[^\p{separator}]"), hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] assert_eq!( t(r"(?i)[^\p{greek}]"), hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( "greek" )))) ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] assert_eq!( t(r"(?i)[\P{greek}]"), hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( @@ -2236,6 +2448,7 @@ mod tests { ), } ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] assert_eq!( t_err(r"[^\s\S]"), TestError { @@ -2246,6 +2459,7 @@ mod tests { ), } ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] assert_eq!( t_err(r"(?-u)[^\s\S]"), TestError { @@ -2261,6 +2475,7 @@ mod tests { #[test] fn class_bracketed_union() { assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[a\pZb]"), hir_union( @@ -2268,6 +2483,7 @@ mod tests { hir_uclass_query(ClassQuery::Binary("separator")) ) ); + #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))] assert_eq!( t(r"[\pZ\p{Greek}]"), hir_union( @@ -2275,6 +2491,11 @@ mod tests { hir_uclass_query(ClassQuery::Binary("separator")) ) ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] assert_eq!( t(r"[\p{age:3.0}\pZ\p{Greek}]"), hir_union( @@ -2288,6 +2509,11 @@ mod tests { ) ) ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] assert_eq!( t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"), hir_union( @@ -2305,6 +2531,12 @@ mod tests { ) ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-case", + feature = "unicode-gencat", + feature = "unicode-script" + ))] assert_eq!( t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"), hir_case_fold(hir_union( @@ 
-2318,6 +2550,11 @@ mod tests { ) )) ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] assert_eq!( t(r"[^\p{age:3.0}\pZ\p{Greek}]"), hir_negate(hir_union( @@ -2331,6 +2568,12 @@ mod tests { ) )) ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-case", + feature = "unicode-gencat", + feature = "unicode-script" + ))] assert_eq!( t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"), hir_negate(hir_case_fold(hir_union( @@ -2355,16 +2598,20 @@ mod tests { assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')])); assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')])); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)[a[^c]]"), hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) ); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)[a-b[^c]]"), hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) ); + #[cfg(feature = "unicode-case")] assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')])); + #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)[^a-b[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')]) @@ -2380,6 +2627,7 @@ mod tests { ), } ); + #[cfg(feature = "unicode-case")] assert_eq!( t_err(r"(?i)[^a-c[^c]]"), TestError { @@ -2411,26 +2659,32 @@ mod tests { assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')])); assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')])); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[abc&&b-c]"), hir_case_fold(hir_uclass(&[('b', 'c')])) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[abc&&[b-c]]"), hir_case_fold(hir_uclass(&[('b', 'c')])) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[[abc]&&[b-c]]"), hir_case_fold(hir_uclass(&[('b', 'c')])) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[a-z&&b-y&&c-x]"), hir_case_fold(hir_uclass(&[('c', 'x')])) ); + #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)[c-da-b&&a-d]"), hir_case_fold(hir_uclass(&[('a', 'd')])) ); + #[cfg(feature = "unicode-case")] assert_eq!( 
t("(?i)[a-d&&c-da-b]"), hir_case_fold(hir_uclass(&[('a', 'd')])) @@ -2478,21 +2732,26 @@ mod tests { #[test] fn class_bracketed_intersect_negate() { + #[cfg(feature = "unicode-perl")] assert_eq!( t(r"[^\w&&\d]"), hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) ); assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); + #[cfg(feature = "unicode-perl")] assert_eq!( t(r"[^[\w&&\d]]"), hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) ); + #[cfg(feature = "unicode-perl")] assert_eq!( t(r"[^[^\w&&\d]]"), hir_uclass_query(ClassQuery::Binary("digit")) ); + #[cfg(feature = "unicode-perl")] assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word())); + #[cfg(feature = "unicode-perl")] assert_eq!( t_bytes(r"(?-u)[^\w&&\d]"), hir_negate(hir_bclass_from_char(ascii_class( @@ -2523,6 +2782,7 @@ mod tests { #[test] fn class_bracketed_difference() { + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"[\pL--[:ascii:]]"), hir_difference( @@ -2539,6 +2799,7 @@ mod tests { #[test] fn class_bracketed_symmetric_difference() { + #[cfg(feature = "unicode-script")] assert_eq!( t(r"[\p{sc:Greek}~~\p{scx:Greek}]"), hir_uclass(&[ @@ -2575,6 +2836,7 @@ mod tests { ); assert_eq!(t(r"(?x)\x5 3"), hir_lit("S")); + #[cfg(feature = "unicode-gencat")] assert_eq!( t(r"(?x)\p # comment { # comment @@ -2797,6 +3059,7 @@ mod tests { assert!(t(r"a{0,}").is_match_empty()); assert!(t(r"a{0,1}").is_match_empty()); assert!(t(r"a{0,10}").is_match_empty()); + #[cfg(feature = "unicode-gencat")] assert!(t(r"\pL*").is_match_empty()); assert!(t(r"a*|b").is_match_empty()); assert!(t(r"b|a*").is_match_empty()); diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 238c5dd3b8..e832be5c01 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -91,12 +91,74 @@ know a regular expression must match a prefix or suffix literal, then it is often quicker to search for instances of that literal, and then confirm or deny the match using the full 
regular expression engine. These optimizations are done automatically in the `regex` crate. + + +# Crate features + +An important feature provided by this crate is its Unicode support. This +includes things like case folding, boolean properties, general categories, +scripts and Unicode-aware support for the Perl classes `\w`, `\s` and `\d`. +However, a downside of this support is that it requires bundling several +Unicode data tables that are substantial in size. + +A fair number of use cases do not require full Unicode support. For this +reason, this crate exposes a number of features to control which Unicode +data is available. + +If a regular expression attempts to use a Unicode feature that is not available +because the corresponding crate feature was disabled, then translating that +regular expression to an `Hir` will return an error. (It is still possible to +construct an `Ast` for such a regular expression, since Unicode data is not +used until translation to an `Hir`.) Stated differently, enabling or disabling +any of the features below can only add or subtract from the total set of valid +regular expressions. Enabling or disabling a feature will never modify the +match semantics of a regular expression. + +The following features are available: + +* **unicode** - + Enables all Unicode features. This feature is enabled by default, and will + always cover all Unicode features, even if more are added in the future. +* **unicode-age** - + Provide the data for the + [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). + This makes it possible to use classes like `\p{Age:6.0}` to refer to all + codepoints introduced in Unicode 6.0 or earlier. +* **unicode-bool** - + Provide the data for numerous Unicode boolean properties. The full list + is not included here, but contains properties like `Alphabetic`, `Emoji`, + `Lowercase`, `Math`, `Uppercase` and `White_Space`.
+* **unicode-case** - + Provide the data for case insensitive matching using + [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). +* **unicode-gencat** - + Provide the data for + [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). + This includes, but is not limited to, `Decimal_Number`, `Letter`, + `Math_Symbol`, `Number` and `Punctuation`. +* **unicode-perl** - + Provide the data for supporting the Unicode-aware Perl character classes, + corresponding to `\w`, `\s` and `\d`. This is also necessary for using + Unicode-aware word boundary assertions. Note that if this feature is + disabled, the `\s` and `\d` character classes are still available if the + `unicode-bool` and `unicode-gencat` features are enabled, respectively. +* **unicode-script** - + Provide the data for + [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). + This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, + `Latin` and `Thai`. +* **unicode-segment** - + Provide the data necessary to provide the properties used to implement the + [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). + This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and + `\p{sb=ATerm}`. */ #![deny(missing_docs)] pub use error::{Error, Result}; pub use parser::{Parser, ParserBuilder}; +pub use unicode::UnicodeWordError; pub mod ast; mod either; @@ -156,24 +218,35 @@ pub fn is_meta_character(c: char) -> bool { /// is considered a word character if it is in either of the `Alphabetic` or /// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` /// or `Connector_Punctuation` general categories. +/// +/// # Panics +/// +/// If the `unicode-perl` feature is not enabled, then this function panics.
+/// For this reason, it is recommended that callers use +/// [`try_is_word_character`](fn.try_is_word_character.html) +/// instead. pub fn is_word_character(c: char) -> bool { - use std::cmp::Ordering; - use unicode_tables::perl_word::PERL_WORD; + try_is_word_character(c).expect("unicode-perl feature must be enabled") +} - if c <= 0x7F as char && is_word_byte(c as u8) { - return true; - } - PERL_WORD - .binary_search_by(|&(start, end)| { - if start <= c && c <= end { - Ordering::Equal - } else if start > c { - Ordering::Greater - } else { - Ordering::Less - } - }) - .is_ok() +/// Returns true if and only if the given character is a Unicode word +/// character. +/// +/// A Unicode word character is defined by +/// [UTS#18 Annex C](http://unicode.org/reports/tr18/#Compatibility_Properties). +/// In particular, a character +/// is considered a word character if it is in either of the `Alphabetic` or +/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` +/// or `Connector_Punctuation` general categories. +/// +/// # Errors +/// +/// If the `unicode-perl` feature is not enabled, then this function always +/// returns an error. +pub fn try_is_word_character( + c: char, +) -> std::result::Result { + unicode::is_word_character(c) } /// Returns true if and only if the given character is an ASCII word character. 
@@ -200,10 +273,14 @@ mod tests { } #[test] - fn word() { + fn word_byte() { assert!(is_word_byte(b'a')); assert!(!is_word_byte(b'-')); + } + #[test] + #[cfg(feature = "unicode-perl")] + fn word_char() { assert!(is_word_character('a'), "ASCII"); assert!(is_word_character('à'), "Latin-1"); assert!(is_word_character('β'), "Greek"); @@ -216,4 +293,17 @@ mod tests { assert!(!is_word_character('-')); assert!(!is_word_character('☃')); } + + #[test] + #[should_panic] + #[cfg(not(feature = "unicode-perl"))] + fn word_char_disabled_panic() { + assert!(is_word_character('a')); + } + + #[test] + #[cfg(not(feature = "unicode-perl"))] + fn word_char_disabled_error() { + assert!(try_is_word_character('a').is_err()); + } } diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index 6126e77d86..ea3f9c167d 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -1,22 +1,16 @@ -use std::cmp::Ordering; +use std::error; +use std::fmt; use std::result; use hir; -use unicode_tables::age; -use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; -use unicode_tables::general_category; -use unicode_tables::grapheme_cluster_break; -use unicode_tables::property_bool; -use unicode_tables::property_names::PROPERTY_NAMES; -use unicode_tables::property_values::PROPERTY_VALUES; -use unicode_tables::script; -use unicode_tables::script_extension; -use unicode_tables::sentence_break; -use unicode_tables::word_break; /// A type alias for errors specific to Unicode handling of classes. pub type Result = result::Result; +/// An inclusive range of codepoints from a generated file (hence the static +/// lifetime). +type Range = &'static [(char, char)]; + /// An error that occurs when dealing with Unicode. /// /// We don't impl the Error trait here because these always get converted @@ -25,17 +19,51 @@ pub type Result = result::Result; pub enum Error { PropertyNotFound, PropertyValueNotFound, + // Not used when unicode-perl is enabled. 
+ #[allow(dead_code)] + PerlClassNotFound, +} + +/// A type alias for errors specific to Unicode case folding. +pub type FoldResult = result::Result; + +/// An error that occurs when Unicode-aware simple case folding fails. +/// +/// This error can occur when the case mapping tables necessary for Unicode +/// aware case folding are unavailable. This only occurs when the +/// `unicode-case` feature is disabled. (The feature is enabled by default.) +#[derive(Debug)] +pub struct CaseFoldError(()); + +impl error::Error for CaseFoldError {} + +impl fmt::Display for CaseFoldError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "Unicode-aware case folding is not available \ + (probably because the unicode-case feature is not enabled)" + ) + } } -/// An iterator over a codepoint's simple case equivalence class. +/// An error that occurs when the Unicode-aware `\w` class is unavailable. +/// +/// This error can occur when the data tables necessary for the Unicode aware +/// Perl character class `\w` are unavailable. This only occurs when the +/// `unicode-perl` feature is disabled. (The feature is enabled by default.) #[derive(Debug)] -pub struct SimpleFoldIter(::std::slice::Iter<'static, char>); +pub struct UnicodeWordError(()); -impl Iterator for SimpleFoldIter { - type Item = char; +impl error::Error for UnicodeWordError {} - fn next(&mut self) -> Option { - self.0.next().map(|c| *c) +impl fmt::Display for UnicodeWordError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "Unicode-aware \\w class is not available \ + (probably because the unicode-perl feature is not enabled)" + ) } } @@ -48,17 +76,40 @@ impl Iterator for SimpleFoldIter { /// scalar value exists, then `None` is returned. The point of this behavior /// is to permit callers to avoid calling `simple_fold` more than they need /// to, since there is some cost to fetching the equivalence class. 
-pub fn simple_fold(c: char) -> result::Result> { - CASE_FOLDING_SIMPLE - .binary_search_by_key(&c, |&(c1, _)| c1) - .map(|i| SimpleFoldIter(CASE_FOLDING_SIMPLE[i].1.iter())) - .map_err(|i| { - if i >= CASE_FOLDING_SIMPLE.len() { - None - } else { - Some(CASE_FOLDING_SIMPLE[i].0) - } - }) +/// +/// This returns an error if the Unicode case folding tables are not available. +pub fn simple_fold( + c: char, +) -> FoldResult, Option>> { + #[cfg(not(feature = "unicode-case"))] + fn imp( + _: char, + ) -> FoldResult, Option>> + { + use std::option::IntoIter; + Err::, _>, _>(CaseFoldError(())) + } + + #[cfg(feature = "unicode-case")] + fn imp( + c: char, + ) -> FoldResult, Option>> + { + use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; + + Ok(CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |&(c1, _)| c1) + .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().map(|&c| c)) + .map_err(|i| { + if i >= CASE_FOLDING_SIMPLE.len() { + None + } else { + Some(CASE_FOLDING_SIMPLE[i].0) + } + })) + } + + imp(c) } /// Returns true if and only if the given (inclusive) range contains at least @@ -66,19 +117,37 @@ pub fn simple_fold(c: char) -> result::Result> { /// mapping. /// /// This function panics if `end < start`. -pub fn contains_simple_case_mapping(start: char, end: char) -> bool { - assert!(start <= end); - CASE_FOLDING_SIMPLE - .binary_search_by(|&(c, _)| { - if start <= c && c <= end { - Ordering::Equal - } else if c > end { - Ordering::Greater - } else { - Ordering::Less - } - }) - .is_ok() +/// +/// This returns an error if the Unicode case folding tables are not available. 
+pub fn contains_simple_case_mapping( + start: char, + end: char, +) -> FoldResult { + #[cfg(not(feature = "unicode-case"))] + fn imp(_: char, _: char) -> FoldResult { + Err(CaseFoldError(())) + } + + #[cfg(feature = "unicode-case")] + fn imp(start: char, end: char) -> FoldResult { + use std::cmp::Ordering; + use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; + + assert!(start <= end); + Ok(CASE_FOLDING_SIMPLE + .binary_search_by(|&(c, _)| { + if start <= c && c <= end { + Ordering::Equal + } else if c > end { + Ordering::Greater + } else { + Ordering::Less + } + }) + .is_ok()) + } + + imp(start, end) } /// A query for finding a character class defined by Unicode. This supports @@ -124,27 +193,27 @@ impl<'a> ClassQuery<'a> { let property_name = symbolic_name_normalize(property_name); let property_value = symbolic_name_normalize(property_value); - let canon_name = match canonical_prop(&property_name) { + let canon_name = match canonical_prop(&property_name)? { None => return Err(Error::PropertyNotFound), Some(canon_name) => canon_name, }; Ok(match canon_name { "General_Category" => { - let canon = match canonical_gencat(&property_value) { + let canon = match canonical_gencat(&property_value)? { None => return Err(Error::PropertyValueNotFound), Some(canon) => canon, }; CanonicalClassQuery::GeneralCategory(canon) } "Script" => { - let canon = match canonical_script(&property_value) { + let canon = match canonical_script(&property_value)? { None => return Err(Error::PropertyValueNotFound), Some(canon) => canon, }; CanonicalClassQuery::Script(canon) } _ => { - let vals = match property_values(canon_name) { + let vals = match property_values(canon_name)? { None => return Err(Error::PropertyValueNotFound), Some(vals) => vals, }; @@ -168,13 +237,13 @@ impl<'a> ClassQuery<'a> { fn canonical_binary(&self, name: &str) -> Result { let norm = symbolic_name_normalize(name); - if let Some(canon) = canonical_prop(&norm) { + if let Some(canon) = canonical_prop(&norm)? 
{ return Ok(CanonicalClassQuery::Binary(canon)); } - if let Some(canon) = canonical_gencat(&norm) { + if let Some(canon) = canonical_gencat(&norm)? { return Ok(CanonicalClassQuery::GeneralCategory(canon)); } - if let Some(canon) = canonical_script(&norm) { + if let Some(canon) = canonical_script(&norm)? { return Ok(CanonicalClassQuery::Script(canon)); } Err(Error::PropertyNotFound) @@ -212,25 +281,9 @@ pub fn class<'a>(query: ClassQuery<'a>) -> Result { use self::CanonicalClassQuery::*; match query.canonicalize()? { - Binary(name) => property_set(property_bool::BY_NAME, name) - .map(hir_class) - .ok_or(Error::PropertyNotFound), - GeneralCategory("Any") => Ok(hir_class(&[('\0', '\u{10FFFF}')])), - GeneralCategory("Assigned") => { - let mut cls = - property_set(general_category::BY_NAME, "Unassigned") - .map(hir_class) - .ok_or(Error::PropertyNotFound)?; - cls.negate(); - Ok(cls) - } - GeneralCategory("ASCII") => Ok(hir_class(&[('\0', '\x7F')])), - GeneralCategory(name) => property_set(general_category::BY_NAME, name) - .map(hir_class) - .ok_or(Error::PropertyValueNotFound), - Script(name) => property_set(script::BY_NAME, name) - .map(hir_class) - .ok_or(Error::PropertyValueNotFound), + Binary(name) => bool_property(name), + GeneralCategory(name) => gencat(name), + Script(name) => script(name), ByValue { property_name: "Age", property_value } => { let mut class = hir::ClassUnicode::empty(); for set in ages(property_value)? 
{ @@ -239,25 +292,17 @@ pub fn class<'a>(query: ClassQuery<'a>) -> Result { Ok(class) } ByValue { property_name: "Script_Extensions", property_value } => { - property_set(script_extension::BY_NAME, property_value) - .map(hir_class) - .ok_or(Error::PropertyValueNotFound) + script_extension(property_value) } ByValue { property_name: "Grapheme_Cluster_Break", property_value, - } => property_set(grapheme_cluster_break::BY_NAME, property_value) - .map(hir_class) - .ok_or(Error::PropertyValueNotFound), + } => gcb(property_value), ByValue { property_name: "Sentence_Break", property_value } => { - property_set(sentence_break::BY_NAME, property_value) - .map(hir_class) - .ok_or(Error::PropertyValueNotFound) + sb(property_value) } ByValue { property_name: "Word_Break", property_value } => { - property_set(word_break::BY_NAME, property_value) - .map(hir_class) - .ok_or(Error::PropertyValueNotFound) + wb(property_value) } _ => { // What else should we support? @@ -270,24 +315,66 @@ pub fn class<'a>(query: ClassQuery<'a>) -> Result { /// /// This returns an error if the data is not available for \w. pub fn perl_word() -> Result { - use unicode_tables::perl_word::PERL_WORD; - Ok(hir_class(PERL_WORD)) + #[cfg(not(feature = "unicode-perl"))] + fn imp() -> Result { + Err(Error::PerlClassNotFound) + } + + #[cfg(feature = "unicode-perl")] + fn imp() -> Result { + use unicode_tables::perl_word::PERL_WORD; + Ok(hir_class(PERL_WORD)) + } + + imp() } /// Returns a Unicode aware class for \s. /// /// This returns an error if the data is not available for \s. 
pub fn perl_space() -> Result { - let query = ClassQuery::Binary("Whitespace"); - class(query) + #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] + fn imp() -> Result { + Err(Error::PerlClassNotFound) + } + + #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] + fn imp() -> Result { + use unicode_tables::perl_space::WHITE_SPACE; + Ok(hir_class(WHITE_SPACE)) + } + + #[cfg(feature = "unicode-bool")] + fn imp() -> Result { + use unicode_tables::property_bool::WHITE_SPACE; + Ok(hir_class(WHITE_SPACE)) + } + + imp() } /// Returns a Unicode aware class for \d. /// /// This returns an error if the data is not available for \d. pub fn perl_digit() -> Result { - let query = ClassQuery::Binary("Decimal_Number"); - class(query) + #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] + fn imp() -> Result { + Err(Error::PerlClassNotFound) + } + + #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] + fn imp() -> Result { + use unicode_tables::perl_decimal::DECIMAL_NUMBER; + Ok(hir_class(DECIMAL_NUMBER)) + } + + #[cfg(feature = "unicode-gencat")] + fn imp() -> Result { + use unicode_tables::general_category::DECIMAL_NUMBER; + Ok(hir_class(DECIMAL_NUMBER)) + } + + imp() } /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. @@ -299,6 +386,40 @@ pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { hir::ClassUnicode::new(hir_ranges) } +/// Returns true only if the given codepoint is in the `\w` character class. +/// +/// If the `unicode-perl` feature is not enabled, then this returns an error. 
+pub fn is_word_character(c: char) -> result::Result { + #[cfg(not(feature = "unicode-perl"))] + fn imp(_: char) -> result::Result { + Err(UnicodeWordError(())) + } + + #[cfg(feature = "unicode-perl")] + fn imp(c: char) -> result::Result { + use is_word_byte; + use std::cmp::Ordering; + use unicode_tables::perl_word::PERL_WORD; + + if c <= 0x7F as char && is_word_byte(c as u8) { + return Ok(true); + } + Ok(PERL_WORD + .binary_search_by(|&(start, end)| { + if start <= c && c <= end { + Ordering::Equal + } else if start > c { + Ordering::Greater + } else { + Ordering::Less + } + }) + .is_ok()) + } + + imp(c) +} + /// A mapping of property values for a specific property. /// /// The first element of each tuple is a normalized property value while the @@ -306,21 +427,21 @@ pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { /// value. type PropertyValues = &'static [(&'static str, &'static str)]; -fn canonical_gencat(normalized_value: &str) -> Option<&'static str> { - match normalized_value { +fn canonical_gencat(normalized_value: &str) -> Result> { + Ok(match normalized_value { "any" => Some("Any"), "assigned" => Some("Assigned"), "ascii" => Some("ASCII"), _ => { - let gencats = property_values("General_Category").unwrap(); + let gencats = property_values("General_Category")?.unwrap(); canonical_value(gencats, normalized_value) } - } + }) } -fn canonical_script(normalized_value: &str) -> Option<&'static str> { - let scripts = property_values("Script").unwrap(); - canonical_value(scripts, normalized_value) +fn canonical_script(normalized_value: &str) -> Result> { + let scripts = property_values("Script")?.unwrap(); + Ok(canonical_value(scripts, normalized_value)) } /// Find the canonical property name for the given normalized property name. 
@@ -329,11 +450,39 @@ fn canonical_script(normalized_value: &str) -> Option<&'static str> { /// /// The normalized property name must have been normalized according to /// UAX44 LM3, which can be done using `symbolic_name_normalize`. -fn canonical_prop(normalized_name: &str) -> Option<&'static str> { - PROPERTY_NAMES - .binary_search_by_key(&normalized_name, |&(n, _)| n) - .ok() - .map(|i| PROPERTY_NAMES[i].1) +/// +/// If the property names data is not available, then an error is returned. +fn canonical_prop(normalized_name: &str) -> Result> { + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + fn imp(_: &str) -> Result> { + Err(Error::PropertyNotFound) + } + + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + fn imp(name: &str) -> Result> { + use unicode_tables::property_names::PROPERTY_NAMES; + + Ok(PROPERTY_NAMES + .binary_search_by_key(&name, |&(n, _)| n) + .ok() + .map(|i| PROPERTY_NAMES[i].1)) + } + + imp(normalized_name) } /// Find the canonical property value for the given normalized property @@ -355,79 +504,291 @@ fn canonical_value( .map(|i| vals[i].1) } +/// Return the table of property values for the given property name. +/// +/// If the property values data is not available, then an error is returned. 
fn property_values( canonical_property_name: &'static str, -) -> Option { - PROPERTY_VALUES - .binary_search_by_key(&canonical_property_name, |&(n, _)| n) - .ok() - .map(|i| PROPERTY_VALUES[i].1) +) -> Result> { + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + fn imp(_: &'static str) -> Result> { + Err(Error::PropertyValueNotFound) + } + + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + fn imp(name: &'static str) -> Result> { + use unicode_tables::property_values::PROPERTY_VALUES; + + Ok(PROPERTY_VALUES + .binary_search_by_key(&name, |&(n, _)| n) + .ok() + .map(|i| PROPERTY_VALUES[i].1)) + } + + imp(canonical_property_name) } +// This is only used in some cases, but small enough to just let it be dead +// instead of figuring out (and maintaining) the right set of features. +#[allow(dead_code)] fn property_set( - name_map: &'static [(&'static str, &'static [(char, char)])], + name_map: &'static [(&'static str, Range)], canonical: &'static str, -) -> Option<&'static [(char, char)]> { +) -> Option { name_map .binary_search_by_key(&canonical, |x| x.0) .ok() .map(|i| name_map[i].1) } -/// An iterator over Unicode Age sets. Each item corresponds to a set of -/// codepoints that were added in a particular revision of Unicode. The +/// Returns an iterator over Unicode Age sets. Each item corresponds to a set +/// of codepoints that were added in a particular revision of Unicode. The /// iterator yields items in chronological order. -#[derive(Debug)] -struct AgeIter { - ages: &'static [(&'static str, &'static [(char, char)])], -} +/// +/// If the given age value isn't valid or if the data isn't available, then an +/// error is returned instead. 
+fn ages(canonical_age: &str) -> Result> { + #[cfg(not(feature = "unicode-age"))] + fn imp(_: &str) -> Result> { + use std::option::IntoIter; + Err::, _>(Error::PropertyNotFound) + } -fn ages(canonical_age: &str) -> Result { - const AGES: &'static [(&'static str, &'static [(char, char)])] = &[ - ("V1_1", age::V1_1), - ("V2_0", age::V2_0), - ("V2_1", age::V2_1), - ("V3_0", age::V3_0), - ("V3_1", age::V3_1), - ("V3_2", age::V3_2), - ("V4_0", age::V4_0), - ("V4_1", age::V4_1), - ("V5_0", age::V5_0), - ("V5_1", age::V5_1), - ("V5_2", age::V5_2), - ("V6_0", age::V6_0), - ("V6_1", age::V6_1), - ("V6_2", age::V6_2), - ("V6_3", age::V6_3), - ("V7_0", age::V7_0), - ("V8_0", age::V8_0), - ("V9_0", age::V9_0), - ("V10_0", age::V10_0), - ("V11_0", age::V11_0), - ("V12_0", age::V12_0), - ("V12_1", age::V12_1), - ]; - assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); - - let pos = AGES.iter().position(|&(age, _)| canonical_age == age); - match pos { - None => Err(Error::PropertyValueNotFound), - Some(i) => Ok(AgeIter { ages: &AGES[..i + 1] }), + #[cfg(feature = "unicode-age")] + fn imp(canonical_age: &str) -> Result> { + use unicode_tables::age; + + const AGES: &'static [(&'static str, Range)] = &[ + ("V1_1", age::V1_1), + ("V2_0", age::V2_0), + ("V2_1", age::V2_1), + ("V3_0", age::V3_0), + ("V3_1", age::V3_1), + ("V3_2", age::V3_2), + ("V4_0", age::V4_0), + ("V4_1", age::V4_1), + ("V5_0", age::V5_0), + ("V5_1", age::V5_1), + ("V5_2", age::V5_2), + ("V6_0", age::V6_0), + ("V6_1", age::V6_1), + ("V6_2", age::V6_2), + ("V6_3", age::V6_3), + ("V7_0", age::V7_0), + ("V8_0", age::V8_0), + ("V9_0", age::V9_0), + ("V10_0", age::V10_0), + ("V11_0", age::V11_0), + ("V12_0", age::V12_0), + ("V12_1", age::V12_1), + ]; + assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); + + let pos = AGES.iter().position(|&(age, _)| canonical_age == age); + match pos { + None => Err(Error::PropertyValueNotFound), + Some(i) => Ok(AGES[..i + 1].iter().map(|&(_, classes)| 
classes)), + } } + imp(canonical_age) } -impl Iterator for AgeIter { - type Item = &'static [(char, char)]; +/// Returns the Unicode HIR class corresponding to the given general category. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given general category could not be found, or if the general +/// category data is not available, then an error is returned. +fn gencat(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-gencat"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } - fn next(&mut self) -> Option<&'static [(char, char)]> { - if self.ages.is_empty() { - None - } else { - let set = self.ages[0]; - self.ages = &self.ages[1..]; - Some(set.1) + #[cfg(feature = "unicode-gencat")] + fn imp(name: &'static str) -> Result { + use unicode_tables::general_category::BY_NAME; + match name { + "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), + "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), + "Assigned" => { + let mut cls = gencat("Unassigned")?; + cls.negate(); + Ok(cls) + } + name => property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound), } } + + match canonical_name { + "Decimal_Number" => perl_digit(), + name => imp(name), + } +} + +/// Returns the Unicode HIR class corresponding to the given script. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given script could not be found, or if the script data is not +/// available, then an error is returned. 
+fn script(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-script"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-script")] + fn imp(name: &'static str) -> Result { + use unicode_tables::script::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given script extension. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given script extension could not be found, or if the script data is +/// not available, then an error is returned. +fn script_extension( + canonical_name: &'static str, +) -> Result { + #[cfg(not(feature = "unicode-script"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-script")] + fn imp(name: &'static str) -> Result { + use unicode_tables::script_extension::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given Unicode boolean +/// property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given boolean property could not be found, or if the boolean +/// property data is not available, then an error is returned. 
+fn bool_property(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-bool"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-bool")] + fn imp(name: &'static str) -> Result { + use unicode_tables::property_bool::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyNotFound) + } + + match canonical_name { + "Decimal_Number" => perl_digit(), + "White_Space" => perl_space(), + name => imp(name), + } +} + +/// Returns the Unicode HIR class corresponding to the given grapheme cluster +/// break property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. +fn gcb(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result { + use unicode_tables::grapheme_cluster_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given word break +/// property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. 
+fn wb(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result { + use unicode_tables::word_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given sentence +/// break property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. +fn sb(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result { + use unicode_tables::sentence_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) } /// Like symbolic_name_normalize_bytes, but operates on a string. @@ -438,8 +799,9 @@ fn symbolic_name_normalize(x: &str) -> String { // This should always succeed because `symbolic_name_normalize_bytes` // guarantees that `&tmp[..len]` is always valid UTF-8. // - // N.B. We could use unsafe here to avoid the additional UTF-8 check here, - // but it's unlikely to be worth it. A benchmark must justify it first. + // N.B. We could avoid the additional UTF-8 check here, but it's unlikely + // to be worth skipping the additional safety check. A benchmark must + // justify it first. 
String::from_utf8(tmp).unwrap() } @@ -471,7 +833,7 @@ fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] { } let mut next_write = 0; for i in start..slice.len() { - // SAFETY ARGUMENT: To guarantee that the resulting slice is valid + // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid // UTF-8, we ensure that the slice contains only ASCII bytes. In // particular, we drop every non-ASCII byte from the normalized string. let b = slice[i]; @@ -505,57 +867,93 @@ mod tests { symbolic_name_normalize_bytes, }; + #[cfg(feature = "unicode-case")] + fn simple_fold_ok(c: char) -> impl Iterator { + simple_fold(c).unwrap().unwrap() + } + + #[cfg(feature = "unicode-case")] + fn simple_fold_err(c: char) -> Option { + match simple_fold(c).unwrap() { + Ok(_) => unreachable!("simple_fold returned Ok iterator"), + Err(next) => next, + } + } + + #[cfg(feature = "unicode-case")] + fn contains_case_map(start: char, end: char) -> bool { + contains_simple_case_mapping(start, end).unwrap() + } + #[test] + #[cfg(feature = "unicode-case")] fn simple_fold_k() { - let xs: Vec = simple_fold('k').unwrap().collect(); + let xs: Vec = simple_fold_ok('k').collect(); assert_eq!(xs, vec!['K', 'K']); - let xs: Vec = simple_fold('K').unwrap().collect(); + let xs: Vec = simple_fold_ok('K').collect(); assert_eq!(xs, vec!['k', 'K']); - let xs: Vec = simple_fold('K').unwrap().collect(); + let xs: Vec = simple_fold_ok('K').collect(); assert_eq!(xs, vec!['K', 'k']); } #[test] + #[cfg(feature = "unicode-case")] fn simple_fold_a() { - let xs: Vec = simple_fold('a').unwrap().collect(); + let xs: Vec = simple_fold_ok('a').collect(); assert_eq!(xs, vec!['A']); - let xs: Vec = simple_fold('A').unwrap().collect(); + let xs: Vec = simple_fold_ok('A').collect(); assert_eq!(xs, vec!['a']); } #[test] + #[cfg(feature = "unicode-case")] fn simple_fold_empty() { - assert_eq!(Some('A'), simple_fold('?').unwrap_err()); - assert_eq!(Some('A'), simple_fold('@').unwrap_err()); - 
assert_eq!(Some('a'), simple_fold('[').unwrap_err()); - assert_eq!(Some('Ⰰ'), simple_fold('☃').unwrap_err()); + assert_eq!(Some('A'), simple_fold_err('?')); + assert_eq!(Some('A'), simple_fold_err('@')); + assert_eq!(Some('a'), simple_fold_err('[')); + assert_eq!(Some('Ⰰ'), simple_fold_err('☃')); } #[test] + #[cfg(feature = "unicode-case")] fn simple_fold_max() { - assert_eq!(None, simple_fold('\u{10FFFE}').unwrap_err()); - assert_eq!(None, simple_fold('\u{10FFFF}').unwrap_err()); + assert_eq!(None, simple_fold_err('\u{10FFFE}')); + assert_eq!(None, simple_fold_err('\u{10FFFF}')); } #[test] + #[cfg(not(feature = "unicode-case"))] + fn simple_fold_disabled() { + assert!(simple_fold('a').is_err()); + } + + #[test] + #[cfg(feature = "unicode-case")] fn range_contains() { - assert!(contains_simple_case_mapping('A', 'A')); - assert!(contains_simple_case_mapping('Z', 'Z')); - assert!(contains_simple_case_mapping('A', 'Z')); - assert!(contains_simple_case_mapping('@', 'A')); - assert!(contains_simple_case_mapping('Z', '[')); - assert!(contains_simple_case_mapping('☃', 'Ⰰ')); + assert!(contains_case_map('A', 'A')); + assert!(contains_case_map('Z', 'Z')); + assert!(contains_case_map('A', 'Z')); + assert!(contains_case_map('@', 'A')); + assert!(contains_case_map('Z', '[')); + assert!(contains_case_map('☃', 'Ⰰ')); + + assert!(!contains_case_map('[', '[')); + assert!(!contains_case_map('[', '`')); - assert!(!contains_simple_case_mapping('[', '[')); - assert!(!contains_simple_case_mapping('[', '`')); + assert!(!contains_case_map('☃', '☃')); + } - assert!(!contains_simple_case_mapping('☃', '☃')); + #[test] + #[cfg(not(feature = "unicode-case"))] + fn range_contains_disabled() { + assert!(contains_simple_case_mapping('a', 'a').is_err()); } #[test] + #[cfg(feature = "unicode-gencat")] fn regression_466() { use super::{CanonicalClassQuery, ClassQuery}; diff --git a/regex-syntax/src/unicode_tables/mod.rs b/regex-syntax/src/unicode_tables/mod.rs index b9adf7ec5c..20736c7ac8 100644 
--- a/regex-syntax/src/unicode_tables/mod.rs +++ b/regex-syntax/src/unicode_tables/mod.rs @@ -1,12 +1,57 @@ +#[cfg(feature = "unicode-age")] pub mod age; + +#[cfg(feature = "unicode-case")] pub mod case_folding_simple; + +#[cfg(feature = "unicode-gencat")] pub mod general_category; + +#[cfg(feature = "unicode-segment")] pub mod grapheme_cluster_break; + +#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] +#[allow(dead_code)] +pub mod perl_decimal; + +#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] +#[allow(dead_code)] +pub mod perl_space; + +#[cfg(feature = "unicode-perl")] pub mod perl_word; + +#[cfg(feature = "unicode-bool")] pub mod property_bool; + +#[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", +))] pub mod property_names; + +#[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", +))] pub mod property_values; + +#[cfg(feature = "unicode-script")] pub mod script; + +#[cfg(feature = "unicode-script")] pub mod script_extension; + +#[cfg(feature = "unicode-segment")] pub mod sentence_break; + +#[cfg(feature = "unicode-segment")] pub mod word_break; diff --git a/regex-syntax/src/unicode_tables/perl_decimal.rs b/regex-syntax/src/unicode_tables/perl_decimal.rs new file mode 100644 index 0000000000..8f6a046f65 --- /dev/null +++ b/regex-syntax/src/unicode_tables/perl_decimal.rs @@ -0,0 +1,70 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate general-category /tmp/ucd/12.1.0/ --chars --include decimalnumber +// +// ucd-generate is available on crates.io. 
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = + &[("Decimal_Number", DECIMAL_NUMBER)]; + +pub const DECIMAL_NUMBER: &'static [(char, char)] = &[ + ('0', '9'), + ('٠', '٩'), + ('۰', '۹'), + ('߀', '߉'), + ('०', '९'), + ('০', '৯'), + ('੦', '੯'), + ('૦', '૯'), + ('୦', '୯'), + ('௦', '௯'), + ('౦', '౯'), + ('೦', '೯'), + ('൦', '൯'), + ('෦', '෯'), + ('๐', '๙'), + ('໐', '໙'), + ('༠', '༩'), + ('၀', '၉'), + ('႐', '႙'), + ('០', '៩'), + ('᠐', '᠙'), + ('᥆', '᥏'), + ('᧐', '᧙'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᭐', '᭙'), + ('᮰', '᮹'), + ('᱀', '᱉'), + ('᱐', '᱙'), + ('꘠', '꘩'), + ('꣐', '꣙'), + ('꤀', '꤉'), + ('꧐', '꧙'), + ('꧰', '꧹'), + ('꩐', '꩙'), + ('꯰', '꯹'), + ('0', '9'), + ('𐒠', '𐒩'), + ('𐴰', '𐴹'), + ('𑁦', '𑁯'), + ('𑃰', '𑃹'), + ('𑄶', '𑄿'), + ('𑇐', '𑇙'), + ('𑋰', '𑋹'), + ('𑑐', '𑑙'), + ('𑓐', '𑓙'), + ('𑙐', '𑙙'), + ('𑛀', '𑛉'), + ('𑜰', '𑜹'), + ('𑣠', '𑣩'), + ('𑱐', '𑱙'), + ('𑵐', '𑵙'), + ('𑶠', '𑶩'), + ('𖩠', '𖩩'), + ('𖭐', '𖭙'), + ('𝟎', '𝟿'), + ('\u{1e140}', '\u{1e149}'), + ('\u{1e2f0}', '\u{1e2f9}'), + ('𞥐', '𞥙'), +]; diff --git a/regex-syntax/src/unicode_tables/perl_space.rs b/regex-syntax/src/unicode_tables/perl_space.rs new file mode 100644 index 0000000000..515724521c --- /dev/null +++ b/regex-syntax/src/unicode_tables/perl_space.rs @@ -0,0 +1,21 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-bool /tmp/ucd/12.1.0/ --chars --include whitespace +// +// ucd-generate is available on crates.io. 
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = + &[("White_Space", WHITE_SPACE)]; + +pub const WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), + (' ', ' '), + ('\u{85}', '\u{85}'), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; diff --git a/regex-syntax/test b/regex-syntax/test new file mode 100755 index 0000000000..9970a9945a --- /dev/null +++ b/regex-syntax/test @@ -0,0 +1,20 @@ +#!/bin/bash + +# This is a convenience script for running a broad swath of the syntax tests. +echo "===== DEFAULT FEATURES ===" +cargo test + +features=( + unicode + unicode-age + unicode-bool + unicode-case + unicode-gencat + unicode-perl + unicode-script + unicode-segment +) +for f in "${features[@]}"; do + echo "===== FEATURE: $f ===" + cargo test --no-default-features --features "$f" +done diff --git a/scripts/generate-unicode-tables b/scripts/generate-unicode-tables index ca3db7decd..c01df16e7e 100755 --- a/scripts/generate-unicode-tables +++ b/scripts/generate-unicode-tables @@ -31,8 +31,6 @@ ucd-generate general-category "$ucddir" \ --chars --exclude surrogate > "$out/general_category.rs" ucd-generate grapheme-cluster-break "$ucddir" \ --chars > "$out/grapheme_cluster_break.rs" -ucd-generate perl-word "$ucddir" \ - --chars > "$out/perl_word.rs" ucd-generate property-bool "$ucddir" \ --chars > "$out/property_bool.rs" ucd-generate property-names "$ucddir" \ @@ -48,5 +46,18 @@ ucd-generate sentence-break "$ucddir" \ ucd-generate word-break "$ucddir" \ --chars > "$out/word_break.rs" +# These generate the \w, \d and \s Unicode-aware character classes. \d and \s +# are technically part of the general category and boolean properties generated +# above. 
However, these are generated separately to make it possible to enable +# or disable them via Cargo features independently of whether all boolean +# properties or general categories are enabled or disabled. The crate ensures +# that only one copy is compiled. +ucd-generate perl-word "$ucddir" \ + --chars > "$out/perl_word.rs" +ucd-generate general-category "$ucddir" \ + --chars --include decimalnumber > "$out/perl_decimal.rs" +ucd-generate property-bool "$ucddir" \ + --chars --include whitespace > "$out/perl_space.rs" + # Make sure everything is formatted. cargo +stable fmt --all