diff --git a/bench/src/bench.rs b/bench/src/bench.rs index d0804d42bf..eabd5ecc2f 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -59,11 +59,23 @@ pub use ffi::tcl::Regex; // Due to macro scoping rules, this definition only applies for the modules // defined below. Effectively, it allows us to use the same tests for both // native and dynamic regexes. +#[cfg(not(feature = "re-rust-bytes"))] #[cfg(not(feature = "re-rust-plugin"))] macro_rules! regex { ($re:expr) => { ::Regex::new($re).unwrap() } } +#[cfg(feature = "re-rust-bytes")] +#[cfg(not(feature = "re-rust-plugin"))] +macro_rules! regex { + ($re:expr) => {{ + // Always enable the Unicode flag for byte based regexes. + // Really, this should have been enabled by default. *sigh* + use regex::bytes::RegexBuilder; + RegexBuilder::new($re).unicode(true).compile().unwrap() + }} +} + // Usage: text!(haystack) // // Builds a ::Text from an owned string. diff --git a/bench/src/sherlock.rs b/bench/src/sherlock.rs index 8c704ae07d..a7b741fd63 100644 --- a/bench/src/sherlock.rs +++ b/bench/src/sherlock.rs @@ -101,32 +101,17 @@ sherlock!(everything_greedy_nl, r"(?s).*", 1); // How fast can we match every letter? This also defeats any clever prefix // tricks. #[cfg(not(feature = "re-tcl"))] -#[cfg(not(feature = "re-rust-bytes"))] sherlock!(letters, r"\p{L}", 447160); -#[cfg(not(feature = "re-tcl"))] -#[cfg(feature = "re-rust-bytes")] -sherlock!(letters, r"(?u)\p{L}", 447160); #[cfg(not(feature = "re-tcl"))] -#[cfg(not(feature = "re-rust-bytes"))] sherlock!(letters_upper, r"\p{Lu}", 14180); -#[cfg(not(feature = "re-tcl"))] -#[cfg(feature = "re-rust-bytes")] -sherlock!(letters_upper, r"(?u)\p{Lu}", 14180); #[cfg(not(feature = "re-tcl"))] -#[cfg(not(feature = "re-rust-bytes"))] sherlock!(letters_lower, r"\p{Ll}", 432980); -#[cfg(not(feature = "re-tcl"))] -#[cfg(feature = "re-rust-bytes")] -sherlock!(letters_lower, r"(?u)\p{Ll}", 432980); // Similarly, for words. -#[cfg(not(feature = "re-rust-bytes"))] #[cfg(not(feature = "re-re2"))] sherlock!(words, r"\w+", 109214); -#[cfg(feature = "re-rust-bytes")] -sherlock!(words, r"(?u)\w+", 109214); #[cfg(feature = "re-re2")] sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here? @@ -195,8 +180,4 @@ sherlock!(ing_suffix, r"[a-zA-Z]+ing", 2824); // // Onig does surprisingly well on this benchmark and yet does quite poorly on // the ing_suffix benchmark. That one has me stumped. -// -// Interestingly, this is slower in the rust-bytes benchmark, presumably -// because scanning for one of the bytes in the Unicode *unaware* `\s` ends -// up being slower than avoiding the prefix scan at all. sherlock!(ing_suffix_limited_space, r"\s[a-zA-Z]{0,12}ing\s", 2081); diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index 03a56e55dc..65604d764e 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -114,13 +114,7 @@ impl Parser { '{' => try!(self.parse_counted_repeat()), '[' => match self.maybe_parse_ascii() { None => try!(self.parse_class()), - Some(cls) => { - Build::Expr(if self.flags.unicode { - Expr::Class(cls) - } else { - Expr::ClassBytes(cls.to_byte_class()) - }) - } + Some(cls) => Build::Expr(Expr::Class(cls)), }, '^' => { if self.flags.multi { @@ -224,11 +218,7 @@ impl Parser { } 'd'|'s'|'w'|'D'|'S'|'W' => { self.bump(); - Ok(Build::Expr(if self.flags.unicode { - Expr::Class(self.parse_perl_class(c)) - } else { - Expr::ClassBytes(self.parse_perl_class(c).to_byte_class()) - })) + Ok(Build::Expr(Expr::Class(self.parse_perl_class(c)))) } c => Err(self.err(ErrorKind::UnrecognizedEscape(c))), } @@ -1328,16 +1318,28 @@ mod tests { ByteClass::new(ranges) } - fn asciid() -> ByteClass { - ascii_class("digit").unwrap().to_byte_class() + fn asciid() -> CharClass { + ascii_class("digit").unwrap() + } + + fn asciis() -> CharClass { + ascii_class("space").unwrap() + } + + fn asciiw() -> CharClass { + ascii_class("word").unwrap() } - fn asciis() -> ByteClass { - ascii_class("space").unwrap().to_byte_class() + fn asciid_bytes() -> ByteClass { + asciid().to_byte_class() } - fn asciiw() -> ByteClass { - ascii_class("word").unwrap().to_byte_class() + fn asciis_bytes() -> ByteClass { + asciis().to_byte_class() + } + + fn asciiw_bytes() -> ByteClass { + asciiw().to_byte_class() } #[test] @@ -1905,55 +1907,55 @@ mod tests { #[test] fn escape_perl_d() { assert_eq!(p(r"\d"), Expr::Class(class(PERLD))); - assert_eq!(pb(r"(?-u)\d"), Expr::ClassBytes(asciid())); + assert_eq!(pb(r"(?-u)\d"), Expr::Class(asciid())); } #[test] fn escape_perl_s() { assert_eq!(p(r"\s"), Expr::Class(class(PERLS))); - assert_eq!(pb(r"(?-u)\s"), Expr::ClassBytes(asciis())); + assert_eq!(pb(r"(?-u)\s"), Expr::Class(asciis())); } #[test] fn escape_perl_w() { assert_eq!(p(r"\w"), Expr::Class(class(PERLW))); - assert_eq!(pb(r"(?-u)\w"), Expr::ClassBytes(asciiw())); + assert_eq!(pb(r"(?-u)\w"), Expr::Class(asciiw())); } #[test] fn escape_perl_d_negate() { assert_eq!(p(r"\D"), Expr::Class(class(PERLD).negate())); - assert_eq!(pb(r"(?-u)\D"), Expr::ClassBytes(asciid().negate())); + assert_eq!(pb(r"(?-u)\D"), Expr::Class(asciid().negate())); } #[test] fn escape_perl_s_negate() { assert_eq!(p(r"\S"), Expr::Class(class(PERLS).negate())); - assert_eq!(pb(r"(?-u)\S"), Expr::ClassBytes(asciis().negate())); + assert_eq!(pb(r"(?-u)\S"), Expr::Class(asciis().negate())); } #[test] fn escape_perl_w_negate() { assert_eq!(p(r"\W"), Expr::Class(class(PERLW).negate())); - assert_eq!(pb(r"(?-u)\W"), Expr::ClassBytes(asciiw().negate())); + assert_eq!(pb(r"(?-u)\W"), Expr::Class(asciiw().negate())); } #[test] fn escape_perl_d_case_fold() { assert_eq!(p(r"(?i)\d"), Expr::Class(class(PERLD).case_fold())); - assert_eq!(pb(r"(?i-u)\d"), Expr::ClassBytes(asciid().case_fold())); + assert_eq!(pb(r"(?i-u)\d"), Expr::Class(asciid().case_fold())); } #[test] fn escape_perl_s_case_fold() { assert_eq!(p(r"(?i)\s"), Expr::Class(class(PERLS).case_fold())); - assert_eq!(pb(r"(?i-u)\s"), Expr::ClassBytes(asciis().case_fold())); + assert_eq!(pb(r"(?i-u)\s"), Expr::Class(asciis().case_fold())); } #[test] fn escape_perl_w_case_fold() { assert_eq!(p(r"(?i)\w"), Expr::Class(class(PERLW).case_fold())); - assert_eq!(pb(r"(?i-u)\w"), Expr::ClassBytes(asciiw().case_fold())); + assert_eq!(pb(r"(?i-u)\w"), Expr::Class(asciiw().case_fold())); } #[test] @@ -1961,7 +1963,7 @@ mod tests { assert_eq!(p(r"(?i)\D"), Expr::Class(class(PERLD).case_fold().negate())); let bytes = asciid().case_fold().negate(); - assert_eq!(pb(r"(?i-u)\D"), Expr::ClassBytes(bytes)); + assert_eq!(pb(r"(?i-u)\D"), Expr::Class(bytes)); } #[test] @@ -1969,7 +1971,7 @@ mod tests { assert_eq!(p(r"(?i)\S"), Expr::Class(class(PERLS).case_fold().negate())); let bytes = asciis().case_fold().negate(); - assert_eq!(pb(r"(?i-u)\S"), Expr::ClassBytes(bytes)); + assert_eq!(pb(r"(?i-u)\S"), Expr::Class(bytes)); } #[test] @@ -1977,7 +1979,7 @@ mod tests { assert_eq!(p(r"(?i)\W"), Expr::Class(class(PERLW).case_fold().negate())); let bytes = asciiw().case_fold().negate(); - assert_eq!(pb(r"(?i-u)\W"), Expr::ClassBytes(bytes)); + assert_eq!(pb(r"(?i-u)\W"), Expr::Class(bytes)); } #[test] @@ -2039,11 +2041,11 @@ mod tests { assert_eq!(p(r"[^\w]"), Expr::Class(class(PERLW).negate())); assert_eq!(p(r"[^\s]"), Expr::Class(class(PERLS).negate())); - let bytes = asciid().negate(); + let bytes = asciid_bytes().negate(); assert_eq!(pb(r"(?-u)[^\d]"), Expr::ClassBytes(bytes)); - let bytes = asciiw().negate(); + let bytes = asciiw_bytes().negate(); assert_eq!(pb(r"(?-u)[^\w]"), Expr::ClassBytes(bytes)); - let bytes = asciis().negate(); + let bytes = asciis_bytes().negate(); assert_eq!(pb(r"(?-u)[^\s]"), Expr::ClassBytes(bytes)); } @@ -2053,9 +2055,9 @@ mod tests { assert_eq!(p(r"[^\W]"), Expr::Class(class(PERLW))); assert_eq!(p(r"[^\S]"), Expr::Class(class(PERLS))); - assert_eq!(pb(r"(?-u)[^\D]"), Expr::ClassBytes(asciid())); - assert_eq!(pb(r"(?-u)[^\W]"), Expr::ClassBytes(asciiw())); - assert_eq!(pb(r"(?-u)[^\S]"), Expr::ClassBytes(asciis())); + assert_eq!(pb(r"(?-u)[^\D]"), Expr::ClassBytes(asciid_bytes())); + assert_eq!(pb(r"(?-u)[^\W]"), Expr::ClassBytes(asciiw_bytes())); + assert_eq!(pb(r"(?-u)[^\S]"), Expr::ClassBytes(asciis_bytes())); } #[test] @@ -2063,7 +2065,8 @@ mod tests { assert_eq!(p(r"(?i)[\d]"), Expr::Class(class(PERLD).case_fold())); assert_eq!(p(r"(?i)[\p{Yi}]"), Expr::Class(class(YI).case_fold())); - assert_eq!(pb(r"(?i-u)[\d]"), Expr::ClassBytes(asciid().case_fold())); + assert_eq!(pb(r"(?i-u)[\d]"), + Expr::ClassBytes(asciid_bytes().case_fold())); } #[test] @@ -2075,11 +2078,11 @@ mod tests { assert_eq!(p(r"(?i)[^\s]"), Expr::Class(class(PERLS).case_fold().negate())); - let bytes = asciid().case_fold().negate(); + let bytes = asciid_bytes().case_fold().negate(); assert_eq!(pb(r"(?i-u)[^\d]"), Expr::ClassBytes(bytes)); - let bytes = asciiw().case_fold().negate(); + let bytes = asciiw_bytes().case_fold().negate(); assert_eq!(pb(r"(?i-u)[^\w]"), Expr::ClassBytes(bytes)); - let bytes = asciis().case_fold().negate(); + let bytes = asciis_bytes().case_fold().negate(); assert_eq!(pb(r"(?i-u)[^\s]"), Expr::ClassBytes(bytes)); } @@ -2089,9 +2092,12 @@ mod tests { assert_eq!(p(r"(?i)[^\W]"), Expr::Class(class(PERLW).case_fold())); assert_eq!(p(r"(?i)[^\S]"), Expr::Class(class(PERLS).case_fold())); - assert_eq!(pb(r"(?i-u)[^\D]"), Expr::ClassBytes(asciid().case_fold())); - assert_eq!(pb(r"(?i-u)[^\W]"), Expr::ClassBytes(asciiw().case_fold())); - assert_eq!(pb(r"(?i-u)[^\S]"), Expr::ClassBytes(asciis().case_fold())); + assert_eq!(pb(r"(?i-u)[^\D]"), + Expr::ClassBytes(asciid_bytes().case_fold())); + assert_eq!(pb(r"(?i-u)[^\W]"), + Expr::ClassBytes(asciiw_bytes().case_fold())); + assert_eq!(pb(r"(?i-u)[^\S]"), + Expr::ClassBytes(asciis_bytes().case_fold())); } #[test] @@ -2184,8 +2190,7 @@ mod tests { assert_eq!(p("[:upper:]"), Expr::Class(class(UPPER))); assert_eq!(p("[[:upper:]]"), Expr::Class(class(UPPER))); - assert_eq!(pb("(?-u)[:upper:]"), - Expr::ClassBytes(class(UPPER).to_byte_class())); + assert_eq!(pb("(?-u)[:upper:]"), Expr::Class(class(UPPER))); assert_eq!(pb("(?-u)[[:upper:]]"), Expr::ClassBytes(class(UPPER).to_byte_class())); } @@ -2233,7 +2238,7 @@ mod tests { Expr::Class(class(UPPER).case_fold())); assert_eq!(pb("(?i-u)[:upper:]"), - Expr::ClassBytes(class(UPPER).to_byte_class().case_fold())); + Expr::Class(class(UPPER).case_fold())); assert_eq!(pb("(?i-u)[[:upper:]]"), Expr::ClassBytes(class(UPPER).to_byte_class().case_fold())); } diff --git a/src/dfa.rs b/src/dfa.rs index fbd7d81ac2..56b1bd880d 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -101,10 +101,10 @@ pub struct Cache { qnext: SparseSet, } -#[derive(Clone, Debug)] /// CacheInner is logically just a part of Cache, but groups together fields /// that aren't passed as function parameters throughout search. (This split /// is mostly an artifact of the borrow checker. It is happily paid.) +#[derive(Clone, Debug)] struct CacheInner { /// A cache of pre-compiled DFA states, keyed by the set of NFA states /// and the set of empty-width flags set at the byte in the input when the @@ -114,7 +114,7 @@ struct CacheInner { /// things, we just pass indexes around manually. The performance impact of /// this is probably an instruction or two in the inner loop. However, on /// 64 bit, each StatePtr is half the size of a *State. - compiled: HashMap, + compiled: HashMap, /// The transition table. /// /// The transition table is laid out in row-major order, where states are @@ -230,7 +230,7 @@ impl Result { /// /// States don't carry their transitions. Instead, transitions are stored in /// a single row-major table. -#[derive(Clone)] +#[derive(Clone, Eq, Hash, PartialEq)] struct State { /// The set of NFA states in this DFA state, which are computed by /// following epsilon transitions from `insts[0]`. Note that not all @@ -244,20 +244,6 @@ struct State { flags: StateFlags, } -/// A state's key for identifying it in the cache. In particular, if two -/// state's cache keys are equivalent, then they cannot be discriminatory in -/// a match. -/// -/// We capture two bits of information: an ordered set of NFA states for the -/// DFA state and its flags. -#[derive(Clone, Debug, Eq, Hash, PartialEq)] -struct StateKey { - /// An ordered set of NFA states. - insts: Box<[InstPtr]>, - /// The state's flags. (Match, word, empty assertions.) - flags: StateFlags, -} - /// InstPtr is a 32 bit pointer into a sequence of opcodes (i.e., it indexes /// an NFA state). /// @@ -1090,8 +1076,7 @@ impl<'a> Fsm<'a> { } } // Allocate room for our state and add it. - let state = State { insts: key.insts.clone(), flags: state_flags }; - self.add_state(key, state) + self.add_state(key) } /// Produces a key suitable for describing a state in the DFA cache. @@ -1108,7 +1093,7 @@ impl<'a> Fsm<'a> { &mut self, q: &SparseSet, state_flags: &mut StateFlags, - ) -> Option { + ) -> Option { use prog::Inst::*; use prog::EmptyLook::*; @@ -1171,7 +1156,7 @@ impl<'a> Fsm<'a> { if insts.len() == 0 && !state_flags.is_match() { None } else { - Some(StateKey { + Some(State { insts: insts.into_boxed_slice(), flags: *state_flags, }) @@ -1256,16 +1241,12 @@ impl<'a> Fsm<'a> { /// Restores the given state back into the cache, and returns a pointer /// to it. fn restore_state(&mut self, state: State) -> Option { - let key = StateKey { - insts: state.insts.clone(), - flags: state.flags, - }; // If we've already stored this state, just return a pointer to it. // None will be the wiser. - if let Some(&si) = self.cache.compiled.get(&key) { + if let Some(&si) = self.cache.compiled.get(&state) { return Some(si); } - self.add_state(key, state) + self.add_state(state) } /// Returns the next state given the current state si and current byte @@ -1395,7 +1376,7 @@ impl<'a> Fsm<'a> { /// /// If None is returned, then the state limit was reached and the DFA /// should quit. - fn add_state(&mut self, key: StateKey, state: State) -> Option { + fn add_state(&mut self, state: State) -> Option { // This will fail if the next state pointer exceeds STATE_PTR. In // practice, the cache limit will prevent us from ever getting here, // but maybe callers will set the cache size to something ridiculous... @@ -1415,8 +1396,8 @@ impl<'a> Fsm<'a> { } // Finally, put our actual state on to our heap of states and index it // so we can find it later. - self.cache.states.push(state); - self.cache.compiled.insert(key, si); + self.cache.states.push(state.clone()); + self.cache.compiled.insert(state, si); // Transition table and set of states and map should all be in sync. debug_assert!(self.cache.states.len() == self.cache.trans.num_states()); @@ -1505,7 +1486,7 @@ impl<'a> Fsm<'a> { // Estimate that there are about 16 instructions per state consuming // 64 = 16 * 4 bytes of space. let compiled = - (self.cache.compiled.len() * (size::() + 64)) + (self.cache.compiled.len() * (size::() + 64)) + (self.cache.compiled.len() * size::()); let states = self.cache.states.len() @@ -1522,7 +1503,7 @@ impl<'a> Fsm<'a> { fn actual_size(&self) -> usize { let mut compiled = 0; for k in self.cache.compiled.keys() { - compiled += mem::size_of::(); + compiled += mem::size_of::(); compiled += mem::size_of::(); compiled += k.insts.len() * mem::size_of::(); } diff --git a/src/exec.rs b/src/exec.rs index 11f4fde621..37228e005b 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -105,6 +105,7 @@ struct Parsed { exprs: Vec, prefixes: Literals, suffixes: Literals, + bytes: bool, } impl ExecBuilder { @@ -208,6 +209,7 @@ impl ExecBuilder { let mut exprs = Vec::with_capacity(self.options.pats.len()); let mut prefixes = Some(Literals::empty()); let mut suffixes = Some(Literals::empty()); + let mut bytes = false; for pat in &self.options.pats { let parser = ExprBuilder::new() @@ -219,6 +221,7 @@ impl ExecBuilder { .unicode(self.options.unicode) .allow_bytes(!self.only_utf8); let expr = try!(parser.parse(pat)); + bytes = bytes || expr.has_bytes(); prefixes = prefixes.and_then(|mut prefixes| { if !prefixes.union_prefixes(&expr) { None @@ -239,6 +242,7 @@ impl ExecBuilder { exprs: exprs, prefixes: prefixes.unwrap_or(Literals::empty()), suffixes: suffixes.unwrap_or(Literals::empty()), + bytes: bytes, }) } @@ -261,7 +265,7 @@ impl ExecBuilder { let mut nfa = try!( Compiler::new() .size_limit(self.options.size_limit) - .bytes(self.bytes) + .bytes(self.bytes || parsed.bytes) .only_utf8(self.only_utf8) .compile(&parsed.exprs)); let mut dfa = try!( diff --git a/src/literals.rs b/src/literals.rs index 934aefa29d..2de5a7e500 100644 --- a/src/literals.rs +++ b/src/literals.rs @@ -361,8 +361,8 @@ pub struct SingleSearch { /// The second rarest byte is used as a type of guard for quickly detecting /// a mismatch after memchr locates an instance of the rarest byte. This /// is a hedge against pathological cases where the pre-computed frequency - /// analysis may be off. (But of course, does not prevent pathological - /// cases.) + /// analysis may be off. (But of course, does not prevent *all* + /// pathological cases.) rare2: u8, /// The offset of the second rarest byte in `pat`. rare2i: usize, diff --git a/src/prog.rs b/src/prog.rs index 165c20a355..44e3228262 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -132,7 +132,7 @@ impl Program { /// Returns true if this program uses Byte instructions instead of /// Char/Range instructions. pub fn uses_bytes(&self) -> bool { - self.is_bytes || self.is_dfa || !self.only_utf8 + self.is_bytes || self.is_dfa } /// Returns true if this program exclusively matches valid UTF-8 bytes.