From 0e84e194e1d236c34c928fb9b3813daaf4c1e76e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 30 Apr 2016 12:08:39 -0400 Subject: [PATCH 1/3] Use Unicode instructions more aggressively. Previously, a byte based regex would always use byte based instructions. This isn't actually necessary. If the regex has the Unicode flag enabled, for example, then Unicode instructions can and should be used. We can't mix and match though. If any part of the regex could match invalid UTF-8, then we need to drop back to byte-based matching. We could be more aggressive in the parser. In particular, we could check if arbitrary character classes only match UTF-8, and if so, represent them as Unicode classes instead of byte classes. This optimization is only applicable to the NFA engines, since the DFA operates exclusively on bytes. In particular, the NFA engines are much faster with Unicode instructions. --- bench/src/bench.rs | 10 ++++ bench/src/sherlock.rs | 19 -------- regex-syntax/src/parser.rs | 97 ++++++++++++++++++++------------------ src/exec.rs | 6 ++- src/prog.rs | 2 +- 5 files changed, 67 insertions(+), 67 deletions(-) diff --git a/bench/src/bench.rs b/bench/src/bench.rs index d0804d42bf..d15518a210 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -59,11 +59,21 @@ pub use ffi::tcl::Regex; // Due to macro scoping rules, this definition only applies for the modules // defined below. Effectively, it allows us to use the same tests for both // native and dynamic regexes. +#[cfg(not(feature = "re-rust-bytes"))] #[cfg(not(feature = "re-rust-plugin"))] macro_rules! regex { ($re:expr) => { ::Regex::new($re).unwrap() } } +#[cfg(feature = "re-rust-bytes")] +#[cfg(not(feature = "re-rust-plugin"))] +macro_rules! regex { + ($re:expr) => {{ + use regex::bytes::RegexBuilder; + RegexBuilder::new($re).unicode(true).compile().unwrap() + }} +} + // Usage: text!(haystack) // // Builds a ::Text from an owned string. diff --git a/bench/src/sherlock.rs b/bench/src/sherlock.rs index 8c704ae07d..a7b741fd63 100644 --- a/bench/src/sherlock.rs +++ b/bench/src/sherlock.rs @@ -101,32 +101,17 @@ sherlock!(everything_greedy_nl, r"(?s).*", 1); // How fast can we match every letter? This also defeats any clever prefix // tricks. #[cfg(not(feature = "re-tcl"))] -#[cfg(not(feature = "re-rust-bytes"))] sherlock!(letters, r"\p{L}", 447160); -#[cfg(not(feature = "re-tcl"))] -#[cfg(feature = "re-rust-bytes")] -sherlock!(letters, r"(?u)\p{L}", 447160); #[cfg(not(feature = "re-tcl"))] -#[cfg(not(feature = "re-rust-bytes"))] sherlock!(letters_upper, r"\p{Lu}", 14180); -#[cfg(not(feature = "re-tcl"))] -#[cfg(feature = "re-rust-bytes")] -sherlock!(letters_upper, r"(?u)\p{Lu}", 14180); #[cfg(not(feature = "re-tcl"))] -#[cfg(not(feature = "re-rust-bytes"))] sherlock!(letters_lower, r"\p{Ll}", 432980); -#[cfg(not(feature = "re-tcl"))] -#[cfg(feature = "re-rust-bytes")] -sherlock!(letters_lower, r"(?u)\p{Ll}", 432980); // Similarly, for words. -#[cfg(not(feature = "re-rust-bytes"))] #[cfg(not(feature = "re-re2"))] sherlock!(words, r"\w+", 109214); -#[cfg(feature = "re-rust-bytes")] -sherlock!(words, r"(?u)\w+", 109214); #[cfg(feature = "re-re2")] sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here? @@ -195,8 +180,4 @@ sherlock!(ing_suffix, r"[a-zA-Z]+ing", 2824); // // Onig does surprisingly well on this benchmark and yet does quite poorly on // the ing_suffix benchmark. That one has me stumped. -// -// Interestingly, this is slower in the rust-bytes benchmark, presumably -// because scanning for one of the bytes in the Unicode *unaware* `\s` ends -// up being slower than avoiding the prefix scan at all. sherlock!(ing_suffix_limited_space, r"\s[a-zA-Z]{0,12}ing\s", 2081); diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index 03a56e55dc..65604d764e 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -114,13 +114,7 @@ impl Parser { '{' => try!(self.parse_counted_repeat()), '[' => match self.maybe_parse_ascii() { None => try!(self.parse_class()), - Some(cls) => { - Build::Expr(if self.flags.unicode { - Expr::Class(cls) - } else { - Expr::ClassBytes(cls.to_byte_class()) - }) - } + Some(cls) => Build::Expr(Expr::Class(cls)), }, '^' => { if self.flags.multi { @@ -224,11 +218,7 @@ impl Parser { } 'd'|'s'|'w'|'D'|'S'|'W' => { self.bump(); - Ok(Build::Expr(if self.flags.unicode { - Expr::Class(self.parse_perl_class(c)) - } else { - Expr::ClassBytes(self.parse_perl_class(c).to_byte_class()) - })) + Ok(Build::Expr(Expr::Class(self.parse_perl_class(c)))) } c => Err(self.err(ErrorKind::UnrecognizedEscape(c))), } @@ -1328,16 +1318,28 @@ mod tests { ByteClass::new(ranges) } - fn asciid() -> ByteClass { - ascii_class("digit").unwrap().to_byte_class() + fn asciid() -> CharClass { + ascii_class("digit").unwrap() + } + + fn asciis() -> CharClass { + ascii_class("space").unwrap() + } + + fn asciiw() -> CharClass { + ascii_class("word").unwrap() } - fn asciis() -> ByteClass { - ascii_class("space").unwrap().to_byte_class() + fn asciid_bytes() -> ByteClass { + asciid().to_byte_class() } - fn asciiw() -> ByteClass { - ascii_class("word").unwrap().to_byte_class() + fn asciis_bytes() -> ByteClass { + asciis().to_byte_class() + } + + fn asciiw_bytes() -> ByteClass { + asciiw().to_byte_class() } #[test] @@ -1905,55 +1907,55 @@ mod tests { #[test] fn escape_perl_d() { assert_eq!(p(r"\d"), Expr::Class(class(PERLD))); - assert_eq!(pb(r"(?-u)\d"), Expr::ClassBytes(asciid())); + assert_eq!(pb(r"(?-u)\d"), Expr::Class(asciid())); } #[test] fn escape_perl_s() { assert_eq!(p(r"\s"), Expr::Class(class(PERLS))); - assert_eq!(pb(r"(?-u)\s"), Expr::ClassBytes(asciis())); + assert_eq!(pb(r"(?-u)\s"), Expr::Class(asciis())); } #[test] fn escape_perl_w() { assert_eq!(p(r"\w"), Expr::Class(class(PERLW))); - assert_eq!(pb(r"(?-u)\w"), Expr::ClassBytes(asciiw())); + assert_eq!(pb(r"(?-u)\w"), Expr::Class(asciiw())); } #[test] fn escape_perl_d_negate() { assert_eq!(p(r"\D"), Expr::Class(class(PERLD).negate())); - assert_eq!(pb(r"(?-u)\D"), Expr::ClassBytes(asciid().negate())); + assert_eq!(pb(r"(?-u)\D"), Expr::Class(asciid().negate())); } #[test] fn escape_perl_s_negate() { assert_eq!(p(r"\S"), Expr::Class(class(PERLS).negate())); - assert_eq!(pb(r"(?-u)\S"), Expr::ClassBytes(asciis().negate())); + assert_eq!(pb(r"(?-u)\S"), Expr::Class(asciis().negate())); } #[test] fn escape_perl_w_negate() { assert_eq!(p(r"\W"), Expr::Class(class(PERLW).negate())); - assert_eq!(pb(r"(?-u)\W"), Expr::ClassBytes(asciiw().negate())); + assert_eq!(pb(r"(?-u)\W"), Expr::Class(asciiw().negate())); } #[test] fn escape_perl_d_case_fold() { assert_eq!(p(r"(?i)\d"), Expr::Class(class(PERLD).case_fold())); - assert_eq!(pb(r"(?i-u)\d"), Expr::ClassBytes(asciid().case_fold())); + assert_eq!(pb(r"(?i-u)\d"), Expr::Class(asciid().case_fold())); } #[test] fn escape_perl_s_case_fold() { assert_eq!(p(r"(?i)\s"), Expr::Class(class(PERLS).case_fold())); - assert_eq!(pb(r"(?i-u)\s"), Expr::ClassBytes(asciis().case_fold())); + assert_eq!(pb(r"(?i-u)\s"), Expr::Class(asciis().case_fold())); } #[test] fn escape_perl_w_case_fold() { assert_eq!(p(r"(?i)\w"), Expr::Class(class(PERLW).case_fold())); - assert_eq!(pb(r"(?i-u)\w"), Expr::ClassBytes(asciiw().case_fold())); + assert_eq!(pb(r"(?i-u)\w"), Expr::Class(asciiw().case_fold())); } #[test] @@ -1961,7 +1963,7 @@ mod tests { assert_eq!(p(r"(?i)\D"), Expr::Class(class(PERLD).case_fold().negate())); let bytes = asciid().case_fold().negate(); - assert_eq!(pb(r"(?i-u)\D"), Expr::ClassBytes(bytes)); + assert_eq!(pb(r"(?i-u)\D"), Expr::Class(bytes)); } #[test] @@ -1969,7 +1971,7 @@ mod tests { assert_eq!(p(r"(?i)\S"), Expr::Class(class(PERLS).case_fold().negate())); let bytes = asciis().case_fold().negate(); - assert_eq!(pb(r"(?i-u)\S"), Expr::ClassBytes(bytes)); + assert_eq!(pb(r"(?i-u)\S"), Expr::Class(bytes)); } #[test] @@ -1977,7 +1979,7 @@ mod tests { assert_eq!(p(r"(?i)\W"), Expr::Class(class(PERLW).case_fold().negate())); let bytes = asciiw().case_fold().negate(); - assert_eq!(pb(r"(?i-u)\W"), Expr::ClassBytes(bytes)); + assert_eq!(pb(r"(?i-u)\W"), Expr::Class(bytes)); } #[test] @@ -2039,11 +2041,11 @@ mod tests { assert_eq!(p(r"[^\w]"), Expr::Class(class(PERLW).negate())); assert_eq!(p(r"[^\s]"), Expr::Class(class(PERLS).negate())); - let bytes = asciid().negate(); + let bytes = asciid_bytes().negate(); assert_eq!(pb(r"(?-u)[^\d]"), Expr::ClassBytes(bytes)); - let bytes = asciiw().negate(); + let bytes = asciiw_bytes().negate(); assert_eq!(pb(r"(?-u)[^\w]"), Expr::ClassBytes(bytes)); - let bytes = asciis().negate(); + let bytes = asciis_bytes().negate(); assert_eq!(pb(r"(?-u)[^\s]"), Expr::ClassBytes(bytes)); } @@ -2053,9 +2055,9 @@ mod tests { assert_eq!(p(r"[^\W]"), Expr::Class(class(PERLW))); assert_eq!(p(r"[^\S]"), Expr::Class(class(PERLS))); - assert_eq!(pb(r"(?-u)[^\D]"), Expr::ClassBytes(asciid())); - assert_eq!(pb(r"(?-u)[^\W]"), Expr::ClassBytes(asciiw())); - assert_eq!(pb(r"(?-u)[^\S]"), Expr::ClassBytes(asciis())); + assert_eq!(pb(r"(?-u)[^\D]"), Expr::ClassBytes(asciid_bytes())); + assert_eq!(pb(r"(?-u)[^\W]"), Expr::ClassBytes(asciiw_bytes())); + assert_eq!(pb(r"(?-u)[^\S]"), Expr::ClassBytes(asciis_bytes())); } #[test] @@ -2063,7 +2065,8 @@ mod tests { assert_eq!(p(r"(?i)[\d]"), Expr::Class(class(PERLD).case_fold())); assert_eq!(p(r"(?i)[\p{Yi}]"), Expr::Class(class(YI).case_fold())); - assert_eq!(pb(r"(?i-u)[\d]"), Expr::ClassBytes(asciid().case_fold())); + assert_eq!(pb(r"(?i-u)[\d]"), + Expr::ClassBytes(asciid_bytes().case_fold())); } #[test] @@ -2075,11 +2078,11 @@ mod tests { assert_eq!(p(r"(?i)[^\s]"), Expr::Class(class(PERLS).case_fold().negate())); - let bytes = asciid().case_fold().negate(); + let bytes = asciid_bytes().case_fold().negate(); assert_eq!(pb(r"(?i-u)[^\d]"), Expr::ClassBytes(bytes)); - let bytes = asciiw().case_fold().negate(); + let bytes = asciiw_bytes().case_fold().negate(); assert_eq!(pb(r"(?i-u)[^\w]"), Expr::ClassBytes(bytes)); - let bytes = asciis().case_fold().negate(); + let bytes = asciis_bytes().case_fold().negate(); assert_eq!(pb(r"(?i-u)[^\s]"), Expr::ClassBytes(bytes)); } @@ -2089,9 +2092,12 @@ mod tests { assert_eq!(p(r"(?i)[^\W]"), Expr::Class(class(PERLW).case_fold())); assert_eq!(p(r"(?i)[^\S]"), Expr::Class(class(PERLS).case_fold())); - assert_eq!(pb(r"(?i-u)[^\D]"), Expr::ClassBytes(asciid().case_fold())); - assert_eq!(pb(r"(?i-u)[^\W]"), Expr::ClassBytes(asciiw().case_fold())); - assert_eq!(pb(r"(?i-u)[^\S]"), Expr::ClassBytes(asciis().case_fold())); + assert_eq!(pb(r"(?i-u)[^\D]"), + Expr::ClassBytes(asciid_bytes().case_fold())); + assert_eq!(pb(r"(?i-u)[^\W]"), + Expr::ClassBytes(asciiw_bytes().case_fold())); + assert_eq!(pb(r"(?i-u)[^\S]"), + Expr::ClassBytes(asciis_bytes().case_fold())); } #[test] @@ -2184,8 +2190,7 @@ mod tests { assert_eq!(p("[:upper:]"), Expr::Class(class(UPPER))); assert_eq!(p("[[:upper:]]"), Expr::Class(class(UPPER))); - assert_eq!(pb("(?-u)[:upper:]"), - Expr::ClassBytes(class(UPPER).to_byte_class())); + assert_eq!(pb("(?-u)[:upper:]"), Expr::Class(class(UPPER))); assert_eq!(pb("(?-u)[[:upper:]]"), Expr::ClassBytes(class(UPPER).to_byte_class())); } @@ -2233,7 +2238,7 @@ mod tests { Expr::Class(class(UPPER).case_fold())); assert_eq!(pb("(?i-u)[:upper:]"), - Expr::ClassBytes(class(UPPER).to_byte_class().case_fold())); + Expr::Class(class(UPPER).case_fold())); assert_eq!(pb("(?i-u)[[:upper:]]"), Expr::ClassBytes(class(UPPER).to_byte_class().case_fold())); } diff --git a/src/exec.rs b/src/exec.rs index 11f4fde621..37228e005b 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -105,6 +105,7 @@ struct Parsed { exprs: Vec, prefixes: Literals, suffixes: Literals, + bytes: bool, } impl ExecBuilder { @@ -208,6 +209,7 @@ impl ExecBuilder { let mut exprs = Vec::with_capacity(self.options.pats.len()); let mut prefixes = Some(Literals::empty()); let mut suffixes = Some(Literals::empty()); + let mut bytes = false; for pat in &self.options.pats { let parser = ExprBuilder::new() @@ -219,6 +221,7 @@ impl ExecBuilder { .unicode(self.options.unicode) .allow_bytes(!self.only_utf8); let expr = try!(parser.parse(pat)); + bytes = bytes || expr.has_bytes(); prefixes = prefixes.and_then(|mut prefixes| { if !prefixes.union_prefixes(&expr) { None @@ -239,6 +242,7 @@ impl ExecBuilder { exprs: exprs, prefixes: prefixes.unwrap_or(Literals::empty()), suffixes: suffixes.unwrap_or(Literals::empty()), + bytes: bytes, }) } @@ -261,7 +265,7 @@ impl ExecBuilder { let mut nfa = try!( Compiler::new() .size_limit(self.options.size_limit) - .bytes(self.bytes) + .bytes(self.bytes || parsed.bytes) .only_utf8(self.only_utf8) .compile(&parsed.exprs)); let mut dfa = try!( diff --git a/src/prog.rs b/src/prog.rs index 165c20a355..44e3228262 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -132,7 +132,7 @@ impl Program { /// Returns true if this program uses Byte instructions instead of /// Char/Range instructions. pub fn uses_bytes(&self) -> bool { - self.is_bytes || self.is_dfa || !self.only_utf8 + self.is_bytes || self.is_dfa } /// Returns true if this program exclusively matches valid UTF-8 bytes. From e9b7cb11deea5e60abaa5b9c317fb2faa96a20df Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 1 May 2016 12:13:36 -0400 Subject: [PATCH 2/3] A few comment touchups. --- bench/src/bench.rs | 2 ++ src/literals.rs | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bench/src/bench.rs b/bench/src/bench.rs index d15518a210..eabd5ecc2f 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -69,6 +69,8 @@ macro_rules! regex { #[cfg(not(feature = "re-rust-plugin"))] macro_rules! regex { ($re:expr) => {{ + // Always enable the Unicode flag for byte based regexes. + // Really, this should have been enabled by default. *sigh* use regex::bytes::RegexBuilder; RegexBuilder::new($re).unicode(true).compile().unwrap() }} diff --git a/src/literals.rs b/src/literals.rs index 934aefa29d..2de5a7e500 100644 --- a/src/literals.rs +++ b/src/literals.rs @@ -361,8 +361,8 @@ pub struct SingleSearch { /// The second rarest byte is used as a type of guard for quickly detecting /// a mismatch after memchr locates an instance of the rarest byte. This /// is a hedge against pathological cases where the pre-computed frequency - /// analysis may be off. (But of course, does not prevent pathological - /// cases.) + /// analysis may be off. (But of course, does not prevent *all* + /// pathological cases.) rare2: u8, /// The offset of the second rarest byte in `pat`. rare2i: usize, From 7ea96425ed7e59269ca299e80f19201169de3e73 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 1 May 2016 12:19:12 -0400 Subject: [PATCH 3/3] Remove the StateKey type. It has exactly the same information and therefore was purely redundant. We now uses states themselves as keys in the compiled map. --- src/dfa.rs | 45 +++++++++++++-------------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/src/dfa.rs b/src/dfa.rs index fbd7d81ac2..56b1bd880d 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -101,10 +101,10 @@ pub struct Cache { qnext: SparseSet, } -#[derive(Clone, Debug)] /// CacheInner is logically just a part of Cache, but groups together fields /// that aren't passed as function parameters throughout search. (This split /// is mostly an artifact of the borrow checker. It is happily paid.) +#[derive(Clone, Debug)] struct CacheInner { /// A cache of pre-compiled DFA states, keyed by the set of NFA states /// and the set of empty-width flags set at the byte in the input when the @@ -114,7 +114,7 @@ struct CacheInner { /// things, we just pass indexes around manually. The performance impact of /// this is probably an instruction or two in the inner loop. However, on /// 64 bit, each StatePtr is half the size of a *State. - compiled: HashMap, + compiled: HashMap, /// The transition table. /// /// The transition table is laid out in row-major order, where states are @@ -230,7 +230,7 @@ impl Result { /// /// States don't carry their transitions. Instead, transitions are stored in /// a single row-major table. -#[derive(Clone)] +#[derive(Clone, Eq, Hash, PartialEq)] struct State { /// The set of NFA states in this DFA state, which are computed by /// following epsilon transitions from `insts[0]`. Note that not all @@ -244,20 +244,6 @@ struct State { flags: StateFlags, } -/// A state's key for identifying it in the cache. In particular, if two -/// state's cache keys are equivalent, then they cannot be discriminatory in -/// a match. -/// -/// We capture two bits of information: an ordered set of NFA states for the -/// DFA state and its flags. -#[derive(Clone, Debug, Eq, Hash, PartialEq)] -struct StateKey { - /// An ordered set of NFA states. - insts: Box<[InstPtr]>, - /// The state's flags. (Match, word, empty assertions.) - flags: StateFlags, -} - /// InstPtr is a 32 bit pointer into a sequence of opcodes (i.e., it indexes /// an NFA state). /// @@ -1090,8 +1076,7 @@ impl<'a> Fsm<'a> { } } // Allocate room for our state and add it. - let state = State { insts: key.insts.clone(), flags: state_flags }; - self.add_state(key, state) + self.add_state(key) } /// Produces a key suitable for describing a state in the DFA cache. @@ -1108,7 +1093,7 @@ impl<'a> Fsm<'a> { &mut self, q: &SparseSet, state_flags: &mut StateFlags, - ) -> Option { + ) -> Option { use prog::Inst::*; use prog::EmptyLook::*; @@ -1171,7 +1156,7 @@ impl<'a> Fsm<'a> { if insts.len() == 0 && !state_flags.is_match() { None } else { - Some(StateKey { + Some(State { insts: insts.into_boxed_slice(), flags: *state_flags, }) @@ -1256,16 +1241,12 @@ impl<'a> Fsm<'a> { /// Restores the given state back into the cache, and returns a pointer /// to it. fn restore_state(&mut self, state: State) -> Option { - let key = StateKey { - insts: state.insts.clone(), - flags: state.flags, - }; // If we've already stored this state, just return a pointer to it. // None will be the wiser. - if let Some(&si) = self.cache.compiled.get(&key) { + if let Some(&si) = self.cache.compiled.get(&state) { return Some(si); } - self.add_state(key, state) + self.add_state(state) } /// Returns the next state given the current state si and current byte @@ -1395,7 +1376,7 @@ impl<'a> Fsm<'a> { /// /// If None is returned, then the state limit was reached and the DFA /// should quit. - fn add_state(&mut self, key: StateKey, state: State) -> Option { + fn add_state(&mut self, state: State) -> Option { // This will fail if the next state pointer exceeds STATE_PTR. In // practice, the cache limit will prevent us from ever getting here, // but maybe callers will set the cache size to something ridiculous... @@ -1415,8 +1396,8 @@ impl<'a> Fsm<'a> { } // Finally, put our actual state on to our heap of states and index it // so we can find it later. - self.cache.states.push(state); - self.cache.compiled.insert(key, si); + self.cache.states.push(state.clone()); + self.cache.compiled.insert(state, si); // Transition table and set of states and map should all be in sync. debug_assert!(self.cache.states.len() == self.cache.trans.num_states()); @@ -1505,7 +1486,7 @@ impl<'a> Fsm<'a> { // Estimate that there are about 16 instructions per state consuming // 64 = 16 * 4 bytes of space. let compiled = - (self.cache.compiled.len() * (size::() + 64)) + (self.cache.compiled.len() * (size::() + 64)) + (self.cache.compiled.len() * size::()); let states = self.cache.states.len() @@ -1522,7 +1503,7 @@ impl<'a> Fsm<'a> { fn actual_size(&self) -> usize { let mut compiled = 0; for k in self.cache.compiled.keys() { - compiled += mem::size_of::(); + compiled += mem::size_of::(); compiled += mem::size_of::(); compiled += k.insts.len() * mem::size_of::(); }