diff --git a/bench/src/bench.rs b/bench/src/bench.rs index eabd5ecc2f..308abdcdfb 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -62,7 +62,7 @@ pub use ffi::tcl::Regex; #[cfg(not(feature = "re-rust-bytes"))] #[cfg(not(feature = "re-rust-plugin"))] macro_rules! regex { - ($re:expr) => { ::Regex::new($re).unwrap() } + ($re:expr) => { ::Regex::new(&$re.to_owned()).unwrap() } } #[cfg(feature = "re-rust-bytes")] @@ -72,7 +72,7 @@ macro_rules! regex { // Always enable the Unicode flag for byte based regexes. // Really, this should have been enabled by default. *sigh* use regex::bytes::RegexBuilder; - RegexBuilder::new($re).unicode(true).compile().unwrap() + RegexBuilder::new(&$re.to_owned()).unicode(true).compile().unwrap() }} } diff --git a/bench/src/misc.rs b/bench/src/misc.rs index c6dc52711a..86f93c4878 100644 --- a/bench/src/misc.rs +++ b/bench/src/misc.rs @@ -16,19 +16,16 @@ use test::Bencher; use {Regex, Text}; -/* #[cfg(not(feature = "re-onig"))] #[cfg(not(feature = "re-pcre1"))] #[cfg(not(feature = "re-pcre2"))] #[cfg(not(feature = "re-rust-plugin"))] bench_match!(no_exponential, { - let re = format!( + format!( "{}{}", repeat("a?").take(100).collect::(), - repeat("a").take(100).collect::()); - regex!(&re) + repeat("a").take(100).collect::()) }, repeat("a").take(100).collect()); -*/ bench_match!(literal, r"y", { format!("{}y", repeat("x").take(50).collect::()) diff --git a/bench/src/sherlock.rs b/bench/src/sherlock.rs index a7b741fd63..530554db97 100644 --- a/bench/src/sherlock.rs +++ b/bench/src/sherlock.rs @@ -119,6 +119,10 @@ sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here? // optimizations. sherlock!(before_holmes, r"\w+\s+Holmes", 319); +// Find complete words before Holmes. Both of the `\w`s defeat any prefix +// and suffix optimizations. +sherlock!(before_after_holmes, r"\w+\s+Holmes\s+\w+", 137); + // Find Holmes co-occuring with Watson in a particular window of characters. // This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for // the rest. diff --git a/src/dfa.rs b/src/dfa.rs index a4be1bb675..d216f2cbae 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -275,25 +275,12 @@ struct State{ /// `u32` here for the DFA to save on space. type InstPtr = u32; -// Used to construct new states. +/// Adds ip to data using delta encoding with respect to prev. +/// +/// After completion, `data` will contain `ip` and `prev` will be set to `ip`. fn push_inst_ptr(data: &mut Vec, prev: &mut InstPtr, ip: InstPtr) { let delta = (ip as i32) - (*prev as i32); - if delta.abs() <= 127 { - data.push(delta as u8); - *prev = ip; - return; - } - let delta = delta as u32; - // Write 4 bytes in little-endian format. - let a = (delta & (0xFF << 0 * 8)) >> 0 * 8; - let b = (delta & (0xFF << 1 * 8)) >> 1 * 8; - let c = (delta & (0xFF << 2 * 8)) >> 2 * 8; - let d = (delta & (0xFF << 3 * 8)) >> 3 * 8; - data.push(128); - data.push(a as u8); - data.push(b as u8); - data.push(c as u8); - data.push(d as u8); + write_vari32(data, delta); *prev = ip; } @@ -306,31 +293,20 @@ impl <'a>Iterator for InstPtrs<'a> { type Item = usize; fn next(&mut self) -> Option { - let x = match self.data.get(0){ - Some(&x) => x, - None => return None, - }; - let delta = if x == 128 { - //Read 4 bytes in little-endian format. - let a = self.data[1] as u32; - let b = self.data[2] as u32; - let c = self.data[3] as u32; - let d = self.data[4] as u32; - self.data = &self.data[5..]; - (a << 0 * 8 | b << 1 * 8 | c << 2 * 8 | d << 3 * 8) as i32 as isize - } else { - self.data = &self.data[1..]; - x as i8 as isize - }; - let base = self.base as isize + delta; + if self.data.is_empty() { + return None; + } + let (delta, nread) = read_vari32(self.data); + let base = self.base as i32 + delta; debug_assert!(base >= 0); + debug_assert!(nread > 0); + self.data = &self.data[nread..]; self.base = base as usize; Some(self.base) } } impl State { - fn flags(&self) -> StateFlags { StateFlags(self.data[0]) } @@ -1566,14 +1542,15 @@ impl<'a> Fsm<'a> { fn approximate_size(&self) -> usize { use std::mem::size_of as size; // Estimate that there are about 16 instructions per state consuming - // 64 = 16 * 4 bytes of space. + // 20 = 4 + (15 * 1) bytes of space (1 byte because of delta encoding). + const STATE_HEAP: usize = 20 + 1; // one extra byte for flags let compiled = - (self.cache.compiled.len() * (size::() + 64)) + (self.cache.compiled.len() * (size::() + STATE_HEAP)) + (self.cache.compiled.len() * size::()); let states = self.cache.states.len() * (size::() - + 64 + + STATE_HEAP + (self.num_byte_classes() * size::())); let start_states = self.cache.start_states.len() * size::(); self.prog.approximate_size() + compiled + states + start_states @@ -1802,11 +1779,56 @@ fn show_state_ptr(si: StatePtr) -> String { s } +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn write_vari32(data: &mut Vec, n: i32) { + let mut un = (n as u32) << 1; + if n < 0 { + un = !un; + } + write_varu32(data, un) +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn read_vari32(data: &[u8]) -> (i32, usize) { + let (un, i) = read_varu32(data); + let mut n = (un >> 1) as i32; + if un & 1 != 0 { + n = !n; + } + (n, i) +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn write_varu32(data: &mut Vec, mut n: u32) { + while n >= 0b1000_0000 { + data.push((n as u8) | 0b1000_0000); + n >>= 7; + } + data.push(n as u8); +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn read_varu32(data: &[u8]) -> (u32, usize) { + let mut n: u32 = 0; + let mut shift: u32 = 0; + for (i, &b) in data.iter().enumerate() { + if b < 0b1000_0000 { + return (n | ((b as u32) << shift), i + 1); + } + n |= ((b as u32) & 0b0111_1111) << shift; + shift += 7; + } + (0, 0) +} + #[cfg(test)] mod tests { - use quickcheck::quickcheck; + extern crate rand; + + use quickcheck::{QuickCheck, StdGen, quickcheck}; use super::{ StateFlags, State, push_inst_ptr, + write_varu32, read_varu32, write_vari32, read_vari32, }; #[test] @@ -1818,10 +1840,36 @@ mod tests { push_inst_ptr(&mut data, &mut prev, ip); } let state = State { data: data.into_boxed_slice() }; - state.inst_ptrs().zip(ips.iter()).all(|(x, &y)| x == y as usize) - && - state.flags() == StateFlags(flags) + + let expected: Vec = + ips.into_iter().map(|ip| ip as usize).collect(); + let got: Vec = state.inst_ptrs().collect(); + expected == got && state.flags() == StateFlags(flags) + } + QuickCheck::new() + .gen(StdGen::new(self::rand::thread_rng(), 70_000)) + .quickcheck(p as fn(Vec, u8) -> bool); + } + + #[test] + fn prop_read_write_u32() { + fn p(n: u32) -> bool { + let mut buf = vec![]; + write_varu32(&mut buf, n); + let (got, nread) = read_varu32(&buf); + nread == buf.len() && got == n + } + quickcheck(p as fn(u32) -> bool); + } + + #[test] + fn prop_read_write_i32() { + fn p(n: i32) -> bool { + let mut buf = vec![]; + write_vari32(&mut buf, n); + let (got, nread) = read_vari32(&buf); + nread == buf.len() && got == n } - quickcheck(p as fn(Vec, u8) -> bool) + quickcheck(p as fn(i32) -> bool); } } diff --git a/tests/suffix_reverse.rs b/tests/suffix_reverse.rs index d89143268a..f2993553f9 100644 --- a/tests/suffix_reverse.rs +++ b/tests/suffix_reverse.rs @@ -14,4 +14,3 @@ mat!(t03, r".*(?:abcd)+", r"abcdabcd", Some((0, 8))); mat!(t04, r".*(?:abcd)+", r"abcdxabcd", Some((0, 9))); mat!(t05, r".*x(?:abcd)+", r"abcdxabcd", Some((0, 9))); mat!(t06, r"[^abcd]*x(?:abcd)+", r"abcdxabcd", Some((4, 9))); -// mat!(t05, r".*(?:abcd)+", r"abcdabcd", Some((0, 4)));