diff --git a/Cargo.toml b/Cargo.toml index 91256321f3..0bd85e3b60 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,8 @@ memchr = "0.1.9" thread_local = "0.2.4" # For parsing regular expressions. regex-syntax = { path = "regex-syntax", version = "0.3.1" } +# For accelerating text search. +simd = { version = "0.1.0", optional = true } # For compiling UTF-8 decoding into automata. utf8-ranges = "0.1.3" @@ -35,6 +37,8 @@ rand = "0.3" [features] # Enable to use the unstable pattern traits defined in std. pattern = [] +# Enable to use simd acceleration. +simd-accel = ["simd"] [lib] # There are no benchmarks in the library code itself diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 3155cd804c..05654e072c 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -17,7 +17,7 @@ libc = "0.2" onig = { version = "0.4", optional = true } libpcre-sys = { version = "0.2", optional = true } memmap = "0.2" -regex = { version = "0.1", path = ".." } +regex = { version = "0.1", path = "..", features = ["simd-accel"] } regex_macros = { version = "0.1", path = "../regex_macros", optional = true } regex-syntax = { version = "0.3", path = "../regex-syntax" } rustc-serialize = "0.3" diff --git a/bench/compile b/bench/compile index 8825ad1955..420d9873b1 100755 --- a/bench/compile +++ b/bench/compile @@ -1,5 +1,8 @@ #!/bin/sh +# Enable SIMD. +export RUSTFLAGS="-C target-feature=+ssse3" + exec cargo build \ --release \ --features 're-onig re-pcre1 re-pcre2 re-re2 re-rust re-rust-bytes re-tcl' \ diff --git a/bench/run b/bench/run index 79feab05ce..ccebd3a720 100755 --- a/bench/run +++ b/bench/run @@ -9,6 +9,9 @@ if [ $# = 0 ] || [ $1 = '-h' ] || [ $1 = '--help' ]; then usage fi +# Enable SIMD. 
+export RUSTFLAGS="-C target-feature=+ssse3" + which="$1" shift case $which in diff --git a/src/lib.rs b/src/lib.rs index a2c3c76b7a..66bd8c02a2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -449,6 +449,7 @@ #![deny(missing_docs)] #![cfg_attr(test, deny(warnings))] #![cfg_attr(feature = "pattern", feature(pattern))] +#![cfg_attr(feature = "simd-accel", feature(cfg_target_feature))] #![doc(html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", html_favicon_url = "https://www.rust-lang.org/favicon.ico", html_root_url = "https://doc.rust-lang.org/regex/")] @@ -458,6 +459,7 @@ extern crate memchr; extern crate thread_local; #[cfg(test)] extern crate quickcheck; extern crate regex_syntax as syntax; +#[cfg(feature = "simd-accel")] extern crate simd; extern crate utf8_ranges; pub use error::Error; @@ -582,6 +584,12 @@ mod re_plugin; mod re_set; mod re_trait; mod re_unicode; +#[cfg(feature = "simd-accel")] +mod simd_accel; +// Without `simd-accel`, mount the stub as `simd_accel` so imports always resolve. +#[cfg(not(feature = "simd-accel"))] +#[path = "simd_fallback/mod.rs"] +mod simd_accel; mod sparse; /// The `internal` module exists to support the `regex!` macro and other diff --git a/src/literals.rs b/src/literals.rs index 6e1b2a000c..69870e081e 100644 --- a/src/literals.rs +++ b/src/literals.rs @@ -15,6 +15,7 @@ use memchr::{memchr, memchr2, memchr3}; use syntax; use freqs::BYTE_FREQUENCIES; +use simd_accel::teddy128::Teddy; /// A prefix extracted from a compiled regular expression. /// @@ -51,6 +52,8 @@ enum Matcher { Single(SingleSearch), /// An Aho-Corasick automaton. AC(FullAcAutomaton<syntax::Lit>), + /// A simd accelerated multiple string matcher. 
+ Teddy128(Teddy), } impl LiteralSearcher { @@ -100,6 +103,7 @@ impl LiteralSearcher { Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), Single(ref s) => s.find(haystack).map(|i| (i, i + s.len())), AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)), + Teddy128(ref ted) => ted.find(haystack).map(|m| (m.start, m.end)), } } @@ -136,6 +140,9 @@ impl LiteralSearcher { Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), Matcher::Single(ref s) => LiteralIter::Single(&s.pat), Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()), + Matcher::Teddy128(ref ted) => { + LiteralIter::Teddy128(ted.patterns()) + } } } @@ -162,6 +169,7 @@ impl LiteralSearcher { Bytes(ref sset) => sset.dense.len(), Single(_) => 1, AC(ref aut) => aut.len(), + Teddy128(ref ted) => ted.len(), } } @@ -173,6 +181,7 @@ impl LiteralSearcher { Bytes(ref sset) => sset.approximate_size(), Single(ref single) => single.approximate_size(), AC(ref aut) => aut.heap_bytes(), + Teddy128(ref ted) => ted.approximate_size(), } } } @@ -190,23 +199,34 @@ impl Matcher { fn new(lits: &syntax::Literals, sset: SingleByteSet) -> Self { if lits.literals().is_empty() { - Matcher::Empty - } else if sset.dense.len() >= 26 { + return Matcher::Empty; + } + if sset.dense.len() >= 26 { // Avoid trying to match a large number of single bytes. // This is *very* sensitive to a frequency analysis comparison // between the bytes in sset and the composition of the haystack. // No matter the size of sset, if its members all are rare in the // haystack, then it'd be worth using it. How to tune this... IDK. 
// ---AG - Matcher::Empty - } else if sset.complete { - Matcher::Bytes(sset) - } else if lits.literals().len() == 1 { - Matcher::Single(SingleSearch::new(lits.literals()[0].to_vec())) - } else { - let pats = lits.literals().to_owned(); - Matcher::AC(AcAutomaton::new(pats).into_full()) + return Matcher::Empty; + } + if sset.complete { + return Matcher::Bytes(sset); + } + if lits.literals().len() == 1 { + let lit = lits.literals()[0].to_vec(); + return Matcher::Single(SingleSearch::new(lit)); } + // Only try Teddy if Aho-Corasick can't use memchr. + // Also, in its current form, Teddy doesn't scale well to lots of + // literals. + if sset.dense.len() > 1 && lits.literals().len() <= 32 { + if let Some(ted) = Teddy::new(lits) { + return Matcher::Teddy128(ted); + } + } + let pats = lits.literals().to_owned(); + Matcher::AC(AcAutomaton::new(pats).into_full()) } } @@ -215,6 +235,7 @@ pub enum LiteralIter<'a> { Bytes(&'a [u8]), Single(&'a [u8]), AC(&'a [syntax::Lit]), + Teddy128(&'a [Vec<u8>]), } impl<'a> Iterator for LiteralIter<'a> { @@ -250,6 +271,15 @@ impl<'a> Iterator for LiteralIter<'a> { Some(&**next) } } + LiteralIter::Teddy128(ref mut lits) => { + if lits.is_empty() { + None + } else { + let next = &lits[0]; + *lits = &lits[1..]; + Some(&**next) + } + } } } } diff --git a/src/simd_accel/mod.rs b/src/simd_accel/mod.rs new file mode 100644 index 0000000000..f3c868dd88 --- /dev/null +++ b/src/simd_accel/mod.rs @@ -0,0 +1,5 @@ +#[cfg(target_feature = "ssse3")] +pub mod teddy128; +#[cfg(not(target_feature = "ssse3"))] +#[path = "../simd_fallback/teddy128.rs"] +pub mod teddy128; diff --git a/src/simd_accel/teddy128.rs b/src/simd_accel/teddy128.rs new file mode 100644 index 0000000000..07d1d37c0d --- /dev/null +++ b/src/simd_accel/teddy128.rs @@ -0,0 +1,792 @@ +/*! +Teddy is a simd accelerated multiple substring matching algorithm. The name +and the core ideas in the algorithm were learned from the Hyperscan[1] +project. 
+ + +Background +---------- +The key idea of Teddy is to do *packed* substring matching. In the literature, +packed substring matching is the idea of examining multiple bytes in a haystack +at a time to detect matches. Implementations of, for example, memchr (which +detects matches of a single byte) have been doing this for years. Only +recently, with the introduction of various SIMD instructions, has this been +extended to substring matching. The PCMPESTRI instruction (and its relatives), +for example, implements substring matching in hardware. It is, however, limited +to substrings of length 16 bytes or fewer, but this restriction is fine in a +regex engine, since we rarely care about the performance difference between +searching for a 16 byte literal and a 16 + N literal---16 is already long +enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs +at least, is its latency and throughput. As a result, it is often faster to do +substring search with a Boyer-Moore variant and a well placed memchr to quickly +skip through the haystack. + +There are fewer results from the literature on packed substring matching, +and even fewer for packed multiple substring matching. Ben-Kiki et al.[2a, 2b] +describes use of PCMPESTRI for substring matching, but is mostly theoretical +and hand-waves performance. There is other theoretical work done by Bille[3] +as well. + +The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci +and is generally focused on multiple pattern search. Their first paper[4a] +introduces the concept of a fingerprint, which is computed for every block of +N bytes in every pattern. The haystack is then scanned N bytes at a time and +a fingerprint is computed in the same way it was computed for blocks in the +patterns. 
If the fingerprint corresponds to one that was found in a pattern, +then a verification step follows to confirm that one of the substrings with the +corresponding fingerprint actually matches at the current location. Various +implementation tricks are employed to make sure the fingerprint lookup is fast; +typically by truncating the fingerprint. (This may, of course, provoke more +steps in the verification process, so a balance must be struck.) + +The main downside of [4a] is that the minimum substring length is 32 bytes, +presumably because of how the algorithm uses certain SIMD instructions. This +essentially makes it useless for general purpose regex matching, where a small +number of short patterns is far more likely. + +Faro and Kulekci published another paper[4b] that is conceptually very similar +to [4a]. The key difference is that it uses the CRC32 instruction (introduced +as part of SSE 4.2) to compute fingerprint values. This also enables the +algorithm to work effectively on substrings as short as 7 bytes with 4 byte +windows. 7 bytes is unfortunately still too long. The window could be +technically shrunk to 2 bytes, thereby reducing minimum length to 3, but the +small window size ends up negating most performance benefits---and it's likely +the common case in a general purpose regex engine. + +Faro and Kulekci also published [4c] that appears to be intended as a +replacement to using PCMPESTRI. In particular, it is specifically motivated by +the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD +instructions that are faster. While this approach works for short substrings, +I personally couldn't see a way to generalize it to multiple substring search. + +Faro and Kulekci have another paper[4d] that I haven't been able to read +because it is behind a paywall. + + +Teddy +----- +Finally, we get to Teddy. If the above literature review is complete, then it +appears that Teddy is a novel algorithm. 
More than that, in my experience, it +completely blows away the competition for short substrings, which is exactly +what we want in a general purpose regex engine. Again, the algorithm appears +to be developed by the authors of Hyperscan[1]. Hyperscan was open sourced late +2015, and no earlier history could be found. Therefore, tracking the exact +provenance of the algorithm with respect to the published literature seems +difficult. + +DISCLAIMER: My understanding of Teddy is limited to reading auto-generated C +code, its disassembly and observing its runtime behavior. + +At a high level, Teddy works somewhat similarly to the fingerprint algorithms +published by Faro and Kulekci, but Teddy does it in a way that scales a bit +better. Namely: + +1. Teddy's core algorithm scans the haystack in 16 byte chunks. 16 is + significant because it corresponds to the number of bytes in a SIMD vector. + If one used AVX2 instructions, then we could scan the haystack in 32 byte + chunks. Similarly, if one used AVX512 instructions, we could scan the + haystack in 64 byte chunks. Hyperscan implements SIMD + AVX2, we only + implement SIMD for the moment. (The author doesn't have a CPU with AVX2 + support... yet.) +2. Bitwise operations are performed on each chunk to discover if any region of + it matches a set of precomputed fingerprints from the patterns. If there are + matches, then a verification step is performed. In this implementation, our + verification step is naive. This can be improved upon. + +The details to make this work are quite clever. First, we must choose how to +pick our fingerprints. In Hyperscan's implementation, I *believe* they use the +last N bytes of each substring, where N must be at most the minimum length of +any substring in the set being searched. In this implementation, we use the +first N bytes of each substring. (The tradeoffs between these choices aren't +yet clear to me.) 
We then must figure out how to quickly test whether an +occurrence of any fingerprint from the set of patterns appears in a 16 byte +block from the haystack. To keep things simple, let's assume N = 1 and examine +some examples to motivate the approach. Here are our patterns: + +```ignore +foo +bar +baz +``` + +The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set +our 16 byte block to: + +```ignore +bat cat foo bump +xxxxxxxxxxxxxxxx +``` + +To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates +a mask that allows us to quickly compute membership of a fingerprint in a 16 +byte block that also tells which pattern the fingerprint corresponds to. In +this case, our fingerprint is a single byte, so an appropriate abstraction is +a map from a single byte to a list of patterns that contain that fingerprint: + +```ignore +f |--> foo +b |--> bar, baz +``` + +Now, all we need to do is figure out how to represent this map in vector space +and use normal SIMD operations to perform a lookup. The first simplification +we can make is to represent our patterns as bit fields occupying a single +byte. This is important, because a single SIMD vector can store 16 bytes. + +```ignore +f |--> 00000001 +b |--> 00000010, 00000100 +``` + +How do we perform lookup though? It turns out that SSSE3 introduced a very cool +instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`, +and returns a third vector `C`. All vectors are treated as 16 8-bit integers. +`C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true +for the purposes of this algorithm. For full details, see Intel's Intrinsics +Guide[5].) This essentially lets us use the values in `B` to lookup values in +`A`. + +If we could somehow cause `B` to contain our 16 byte block from the haystack, +and if `A` could contain our bitmasks, then we'd end up with something like +this for `A`: + +```ignore + 0x00 0x01 ... 0x62 ... 0x66 ... 
0xFF +A = 0 0 00000110 00000001 0 +``` + +And if `B` contains our window from our haystack, we could use shuffle to take +the values from `B` and use them to look up our bitsets in `A`. But of course, +we can't do this because `A` in the above example contains 256 bytes, which +is much larger than the size of a SIMD vector. + +Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of +our bitsets, we can use two masks, where one mask corresponds to the lower four +bits of our fingerprint and the other mask corresponds to the upper four bits. +So our map now looks like: + +```ignore +'f' & 0xF = 0x6 |--> 00000001 +'f' >> 4 = 0x6 |--> 00000111 +'b' & 0xF = 0x2 |--> 00000110 +'b' >> 4 = 0x6 |--> 00000111 +``` + +Notice that the bitsets for each nybble correspond to the union of all +fingerprints that contain that nybble. For example, both `f` and `b` have the +same upper 4 bits but differ on the lower 4 bits. Putting this together, we +have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is +our mask for the upper nybble and `B` is our 16 byte block from the haystack: + +```ignore + 0x00 0x01 0x02 0x03 ... 0x06 ... 0xF +A0 = 0 0 00000110 0 00000001 0 +A1 = 0 0 0 0 00000111 0 +B = b a t _ t p +B = 0x62 0x61 0x74 0x20 0x74 0x70 +``` + +But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits, +and we need indexes that are at most 4 bits (corresponding to one of 16 +values). We can apply the same transformation to split `B` into lower and upper +nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and +`B1` corresponds to the upper nybbles: + +```ignore + b a t _ c a t _ f o o _ b u m p +B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0 +B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7 +``` + +And now we have a nice correspondence. `B0` can index `A0` and `B1` can index +`A1`. 
Here's what we get when we apply `C0 = PSHUFB(A0, B0)`: + +```ignore + b a ... f o ... p + A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0] +C0 = 00000110 0 00000001 0 0 +``` + +And `C1 = PSHUFB(A1, B1)`: + +```ignore + b a ... f o ... p + A1[0x6] A1[0x6] A1[0x6] A1[0x6] A1[0x7] +C1 = 00000111 00000111 00000111 00000111 0 +``` + +Notice how neither one of `C0` or `C1` is guaranteed to report fully correct +results all on its own. For example, `C1` claims that `b` is a fingerprint for +the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint +for all of our patterns. But if we combined `C0` and `C1` with an `AND` +operation: + +```ignore + b a ... f o ... p +C = 00000110 0 00000001 0 0 +``` + +Then we now have that `C[i]` contains a bitset corresponding to the matching +fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that +block. + +Once we have that, we can look for the position of the least significant bit +in `C`. That position, modulo `8`, gives us the pattern that the fingerprint +matches. That position, integer divided by `8`, also gives us the byte offset +that the fingerprint occurs in inside the 16 byte haystack block. Using those +two pieces of information, we can run a verification procedure that tries +to match all substrings containing that fingerprint at that position in the +haystack. + + +Implementation notes +-------------------- +The problem with the algorithm as described above is that it uses a single byte +for a fingerprint. This will work well if the fingerprints are rare in the +haystack (e.g., capital letters or special characters in normal English text), +but if the fingerprints are common, you'll wind up spending too much time in +the verification step, which effectively negates the performance benefits of +scanning 16 bytes at a time. Remember, the key to the performance of this +algorithm is to do as little work as possible per 16 bytes. 
+ +This algorithm can be extrapolated in a relatively straight-forward way to use +larger fingerprints. That is, instead of a single byte prefix, we might use a +three byte prefix. The implementation below implements N = {1, 2, 3} and always +picks the largest N possible. The rationale is that the bigger the fingerprint, +the fewer verification steps we'll do. Of course, if N is too large, then we'll +end up doing too much on each step. + +The way to extend it is: + +1. Add a mask for each byte in the fingerprint. (Remember that each mask is + composed of two SIMD vectors.) This results in a value of `C` for each byte + in the fingerprint while searching. +2. When testing each 16 byte block, each value of `C` must be shifted so that + they are aligned. Once aligned, they should all be `AND`'d together. This + will give you only the bitsets corresponding to the full match of the + fingerprint. + +The implementation below is commented to fill in the nitty gritty details. + +[1] - https://github.com/01org/hyperscan +[2a] - http://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf +[2b] - http://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf +[3] - http://www.sciencedirect.com/science/article/pii/S1570866710000353 +[4a] - http://www.dmi.unict.it/~faro/papers/conference/faro32.pdf +[4b] - https://pdfs.semanticscholar.org/fed7/ca62dc469314f3552017d0da7ebd669d4649.pdf +[4c] - http://arxiv.org/pdf/1209.6449.pdf +[4d] - http://www.sciencedirect.com/science/article/pii/S1570866714000471 +[5] - https://software.intel.com/sites/landingpage/IntrinsicsGuide +*/ + +// TODO: Extend this to use AVX2 instructions. +// TODO: Extend this to use AVX512 instructions. +// TODO: Extend this to cleverly use Aho-Corasick. Possibly to replace both +// "slow" searching and the verification step. +// TODO: Make the inner loop do aligned loads. 
+ +use std::cmp; +use std::mem::transmute; +use std::ptr; + +use simd::u8x16; +use simd::x86::sse2::u64x2; +use simd::x86::ssse3::Ssse3U8x16; + +use syntax; + +/// Corresponds to the number of bytes read at a time in the haystack. +const BLOCK_SIZE: usize = 16; + +/// Match reports match information. +#[derive(Debug, Clone)] +pub struct Match { + /// The index of the pattern that matched. The index is in correspondence + /// with the order of the patterns given at construction. + pub pat: usize, + /// The start byte offset of the match. + pub start: usize, + /// The end byte offset of the match. This is always start + pat.len(). + pub end: usize, +} + +/// A SIMD accelerated multi substring searcher. +#[derive(Debug, Clone)] +pub struct Teddy { + /// A list of substrings to match. + pats: Vec<Vec<u8>>, + /// A set of 8 buckets. Each bucket corresponds to a single member of a + /// bitset and holds the indices (into `pats`) of zero or more substrings. + /// This is useful when the number of substrings exceeds 8, since our + /// bitsets cannot have more than 8 members. + buckets: Vec<Vec<usize>>, + /// Our set of masks. There's one mask for each byte in the fingerprint. + masks: Masks, +} + +/// A list of masks. This has length equal to the length of the fingerprint. +/// The length of the fingerprint is always `min(3, len(smallest substring))`. +#[derive(Debug, Clone)] +struct Masks(Vec<Mask>); + +/// A single mask. +#[derive(Debug, Clone, Copy)] +struct Mask { + /// Bitsets for the low nybbles in a fingerprint. + lo: u8x16, + /// Bitsets for the high nybbles in a fingerprint. + hi: u8x16, +} + +impl Teddy { + /// Create a new Teddy multi substring matcher. + /// + /// If a Teddy matcher could not be created (e.g., `pats` is empty or has + /// an empty substring), then `None` is returned. 
+ pub fn new(pats: &syntax::Literals) -> Option<Teddy> { + let pats: Vec<_> = pats.literals().iter().map(|p| p.to_vec()).collect(); + let min_len = pats.iter().map(|p| p.len()).min().unwrap_or(0); + // Don't allow any empty patterns and require that we have at + // least one pattern. + if min_len < 1 { + return None; + } + // Pick the largest mask possible, but no larger than 3. + let nmasks = cmp::min(3, min_len); + let mut masks = Masks::new(nmasks); + let mut buckets = vec![vec![]; 8]; + // Assign a substring to each bucket, and add the bucket's bitfield to + // the appropriate position in the mask. + for (pati, pat) in pats.iter().enumerate() { + let bucket = pati % 8; + buckets[bucket].push(pati); + masks.add(bucket as u8, pat); + } + Some(Teddy { + pats: pats, + buckets: buckets, + masks: masks, + }) + } + + /// Returns all of the substrings matched by this Teddy. + pub fn patterns(&self) -> &[Vec<u8>] { + &self.pats + } + + /// Returns the number of substrings in this matcher. + pub fn len(&self) -> usize { + self.pats.len() + } + + /// Returns the approximate heap usage in bytes (pattern bytes only). + pub fn approximate_size(&self) -> usize { + self.pats.iter().fold(0, |a, b| a + b.len()) + } + + /// Searches `haystack` for the substrings in this Teddy. If a match was + /// found, then it is returned. Otherwise, `None` is returned. + pub fn find(&self, haystack: &[u8]) -> Option<Match> { + // If our haystack is smaller than the block size, then fall back to + // a naive brute force search. + // + // TODO: Use Aho-Corasick. + if haystack.len() < BLOCK_SIZE { + return self.slow(haystack, 0); + } + match self.masks.len() { + 0 => None, + 1 => self.find1(haystack), + 2 => self.find2(haystack), + 3 => self.find3(haystack), + _ => unreachable!(), + } + } + + /// find1 is used when there is only 1 mask. This is the easy case and is + /// pretty much as described in the module documentation. 
+ #[inline(always)] + fn find1(&self, haystack: &[u8]) -> Option<Match> { + let mut pos = 0; + let zero = u8x16::splat(0); + let len = haystack.len(); + debug_assert!(len >= BLOCK_SIZE); + while pos <= len - BLOCK_SIZE { + let h = unsafe { u8x16::load_unchecked(haystack, pos) }; + // N.B. res0 is our `C` in the module documentation. + let res0 = self.masks.members1(h); + // Only do expensive verification if there are any non-zero bits. + if res0.ne(zero).any() { + if let Some(m) = self.verify_128(haystack, pos, res0) { + return Some(m); + } + } + pos += BLOCK_SIZE; + } + self.slow(haystack, pos) + } + + /// find2 is used when there are 2 masks, e.g., the fingerprint is 2 bytes + /// long. + #[inline(always)] + fn find2(&self, haystack: &[u8]) -> Option<Match> { + // This is an exotic way to right shift a SIMD vector across lanes. + // See below at use for more details. + let res0shuffle = u8x16::new( + 0, 0, 1, 2, + 3, 4, 5, 6, + 7, 8, 9, 10, + 11, 12, 13, 14, + ); + let zero = u8x16::splat(0); + let len = haystack.len(); + // The previous value of C (from the module documentation) for the + // *first* byte in the fingerprint. On subsequent iterations, we take + // the last bitset from the previous C and insert it into the first + // position of the current C, shifting all other bitsets to the right + // one lane. This causes C for the first byte to line up with C for the + // second byte, so that they can be AND'd together. + let mut prev0 = u8x16::splat(0); + let mut pos = 0; + debug_assert!(len >= BLOCK_SIZE); + while pos <= len - BLOCK_SIZE { + let h = unsafe { u8x16::load_unchecked(haystack, pos) }; + let (res0, res1) = self.masks.members2(h); + + // The next three lines are essentially equivalent to + // + // (prev0 << 15) | (res0 >> 1) + // + // ... if SIMD vectors could shift across lanes. There is the + // PALIGNR instruction, but apparently LLVM doesn't expose it as + // a proper intrinsic. 
Thankfully, it appears the following + // sequence does indeed compile down to a PALIGNR. + let prev0byte0 = prev0.extract(15); + let res0shiftr8 = res0.shuffle_bytes(res0shuffle); + let res0prev0 = res0shiftr8.replace(0, prev0byte0); + // AND's our C values together. On the first block lane 0 is always 0 + // (prev0 starts zeroed), so the wrapped `pos - 1` only meets lanes >= 1. + let res = res0prev0 & res1; + prev0 = res0; + if res.ne(zero).any() { + if let Some(m) = self.verify_128(haystack, pos.wrapping_sub(1), res) { + return Some(m); + } + } + pos += BLOCK_SIZE; + } + // The windowing above doesn't check the last byte in the last + // window, so start the slow search at the last byte of the last + // window. + self.slow(haystack, pos - 1) + } + + /// find3 is used when there are 3 masks, e.g., the fingerprint is 3 bytes + /// long. + /// + /// N.B. This is a straight-forward extrapolation of find2. The only + /// difference is that we need to keep track of two previous values of + /// C, since we now need to align for three bytes. + #[inline(always)] + fn find3(&self, haystack: &[u8]) -> Option<Match> { + let zero = u8x16::splat(0); + let len = haystack.len(); + debug_assert!(len >= BLOCK_SIZE); + let res0shuffle = u8x16::new( + 0, 0, 0, 1, + 2, 3, 4, 5, + 6, 7, 8, 9, + 10, 11, 12, 13, + ); + let res1shuffle = u8x16::new( + 0, 0, 1, 2, + 3, 4, 5, 6, + 7, 8, 9, 10, + 11, 12, 13, 14, + ); + let mut prev0 = u8x16::splat(0); + let mut prev1 = u8x16::splat(0); + let mut pos = 0; + while pos <= len - BLOCK_SIZE { + let h = unsafe { u8x16::load_unchecked(haystack, pos) }; + let (res0, res1, res2) = self.masks.members3(h); + + let prev0byte0 = prev0.extract(14); + let prev0byte1 = prev0.extract(15); + let res0shiftr16 = res0.shuffle_bytes(res0shuffle); + let res0prev0 = res0shiftr16.replace(0, prev0byte0) + .replace(1, prev0byte1); + + let prev1byte0 = prev1.extract(15); + let res1shiftr8 = res1.shuffle_bytes(res1shuffle); + let res1prev1 = res1shiftr8.replace(0, prev1byte0); + + let res = res0prev0 & res1prev1 & res2; + // On the first block lanes 0 and 1 are 0, so wrapping `pos - 2` is safe. + prev0 = res0; + prev1 = res1; + if res.ne(zero).any() { + if let Some(m) = self.verify_128(haystack, 
pos.wrapping_sub(2), res) { + return Some(m); + } + } + pos += BLOCK_SIZE; + } + // The windowing above doesn't check the last two bytes in the last + // window, so start the slow search at the penultimate byte of the + // last window. + self.slow(haystack, pos - 2) + } + + /// Runs the verification procedure on `res` (i.e., `C` from the module + /// documentation), where the haystack block starts at `pos` in + /// `haystack`. + /// + /// If a match exists, it returns the first one. + #[inline(always)] + fn verify_128( + &self, + haystack: &[u8], + pos: usize, + res: u8x16, + ) -> Option<Match> { + // The verification procedure is more amenable to standard 64 bit + // values, so get those. Lane 0 covers block bytes 0..8, lane 1 bytes 8..16. + let res64: u64x2 = unsafe { transmute(res) }; + let reslo = res64.extract(0); + let reshi = res64.extract(1); + if let Some(m) = self.verify_64(haystack, pos, reslo, 0) { + return Some(m); + } + if let Some(m) = self.verify_64(haystack, pos, reshi, 8) { + return Some(m); + } + None + } + + /// Runs the verification procedure on half of `C`. + /// + /// If a match exists, it returns the first one. + /// + /// offset is an additional byte offset to add to the position before + /// substring match verification. + #[inline(always)] + fn verify_64( + &self, + haystack: &[u8], + pos: usize, + mut res: u64, + offset: usize, + ) -> Option<Match> { + // There's a possible match so long as there's at least one bit set. + while res != 0 { + // The next possible match is at the least significant bit. + let bit = res.trailing_zeros(); + // The position of the bit in its corresponding lane gives us the + // corresponding bucket. + let bucket = (bit % 8) as usize; + // The lane that the bit is in gives us its offset. + let bytei = (bit / 8) as usize; + // Start of the candidate; wraps back in range if `pos` wrapped below 0. + let start = pos.wrapping_add(offset + bytei); + // Kill off this bit. If we couldn't match anything, we'll go to + // the next bit. + res &= !(1 << bit); + // Actual substring search verification. 
+ if let Some(m) = self.verify_bucket(haystack, bucket, start) { + return Some(m); + } + } + None + } + + /// Verifies whether any substring in the given bucket matches in haystack + /// at the given starting position. + #[inline(always)] + fn verify_bucket( + &self, + haystack: &[u8], + bucket: usize, + start: usize, + ) -> Option<Match> { + // Patterns within a bucket are tried in the order they were given. + // NOTE(review): across buckets, verify_64 tries lower bucket indices + // first, so with more than 8 patterns priority may differ - confirm. + for &pati in &self.buckets[bucket] { + let pat = &*self.pats[pati]; + if start + pat.len() > haystack.len() { + continue; + } + if pat == &haystack[start..start + pat.len()] { + return Some(Match { + pat: pati, + start: start, + end: start + pat.len(), + }); + } + } + None + } + + /// Slow substring search through all patterns in this matcher. + /// + /// This is used when we don't have enough bytes in the haystack for our + /// block based approach. + fn slow(&self, haystack: &[u8], pos: usize) -> Option<Match> { + // TODO: Use Aho-Corasick, or otherwise adapt the block based approach + // to be capable of using smaller blocks. + let mut m = None; + for (pi, p) in self.pats.iter().enumerate() { + if let Some(i) = find_slow(p, &haystack[pos..]) { + let candidate = Match { + pat: pi, + start: pos + i, + end: pos + i + p.len(), + }; + match m { + None => m = Some(candidate), + Some(ref mut m) => { + if candidate.start < m.start { + *m = candidate; + } + } + } + } + } + m + } +} + +impl Masks { + /// Create a new set of masks of size `n`, where `n` corresponds to the + /// number of bytes in a fingerprint. + fn new(n: usize) -> Masks { + Masks(vec![Mask::new(); n]) + } + + /// Returns the number of masks. + fn len(&self) -> usize { + self.0.len() + } + + /// Adds the given pattern to the given bucket. The bucket should be an + /// index in the range 0..8. 
+ fn add(&mut self, bucket: u8, pat: &[u8]) { + for (i, mask) in self.0.iter_mut().enumerate() { + mask.add(bucket, pat[i]); // requires pat.len() >= self.len() + } + } + + /// Finds the fingerprints that are in the given haystack block. i.e., this + /// returns C as described in the module documentation. + /// + /// More specifically, for i in 0..16 and j in 0..8, C[i][j] == 1 if and + /// only if `haystack_block[i]` corresponds to a fingerprint that is part + /// of a pattern in bucket `j`. + #[inline(always)] + fn members1(&self, haystack_block: u8x16) -> u8x16 { + let masklo = u8x16::splat(0xF); + let hlo = haystack_block & masklo; + let hhi = (haystack_block >> 4) & masklo; + // Look up each nybble's bitset (see PSHUFB in the module docs) and AND them. + self.0[0].lo.shuffle_bytes(hlo) & self.0[0].hi.shuffle_bytes(hhi) + } + + /// Like members1, but computes C for the first and second bytes in the + /// fingerprint. + #[inline(always)] + fn members2(&self, haystack_block: u8x16) -> (u8x16, u8x16) { + let masklo = u8x16::splat(0xF); + let hlo = haystack_block & masklo; + let hhi = (haystack_block >> 4) & masklo; + + let res0 = self.0[0].lo.shuffle_bytes(hlo) + & self.0[0].hi.shuffle_bytes(hhi); + let res1 = self.0[1].lo.shuffle_bytes(hlo) + & self.0[1].hi.shuffle_bytes(hhi); + (res0, res1) + } + + /// Like members1, but computes C for the first, second and third bytes in + /// the fingerprint. + #[inline(always)] + fn members3(&self, haystack_block: u8x16) -> (u8x16, u8x16, u8x16) { + let masklo = u8x16::splat(0xF); + let hlo = haystack_block & masklo; + let hhi = (haystack_block >> 4) & masklo; + + let res0 = self.0[0].lo.shuffle_bytes(hlo) + & self.0[0].hi.shuffle_bytes(hhi); + let res1 = self.0[1].lo.shuffle_bytes(hlo) + & self.0[1].hi.shuffle_bytes(hhi); + let res2 = self.0[2].lo.shuffle_bytes(hlo) + & self.0[2].hi.shuffle_bytes(hhi); + (res0, res1, res2) + } +} + +impl Mask { + /// Create a new mask with no members. + fn new() -> Mask { + Mask { + lo: u8x16::splat(0), + hi: u8x16::splat(0), + } + } + + /// Adds the given byte to the given bucket (a bit index in 0..8). 
+ fn add(&mut self, bucket: u8, byte: u8) { + // Split our byte into two nybbles, and add each nybble to our + // mask. + let byte_lo = (byte & 0xF) as u32; + let byte_hi = (byte >> 4) as u32; + + let lo = self.lo.extract(byte_lo); + self.lo = self.lo.replace(byte_lo, ((1 << bucket) as u8) | lo); + + let hi = self.hi.extract(byte_hi); + self.hi = self.hi.replace(byte_hi, ((1 << bucket) as u8) | hi); + } +} + +/// UnsafeLoad permits loading data into a SIMD vector without bounds checks. +/// +/// Ideally, this would be part of the `simd` crate, or even better, we could +/// figure out how to do it without `unsafe` at all. +trait UnsafeLoad { + type Elem; + + /// load_unchecked creates a new SIMD vector from the elements in `slice` + /// starting at `offset`. `slice[offset..]` must have at least the number + /// of elements required to fill a SIMD vector. + unsafe fn load_unchecked(slice: &[Self::Elem], offset: usize) -> Self; +} + +impl UnsafeLoad for u8x16 { + type Elem = u8; + + unsafe fn load_unchecked(slice: &[u8], offset: usize) -> u8x16 { + // TODO: Can we just do pointer casting here? I don't think so, since + // this could be an unaligned load? Help me. + let mut x = u8x16::splat(0); + ptr::copy_nonoverlapping( + slice.get_unchecked(offset), + &mut x as *mut u8x16 as *mut u8, + 16); + x + } +} + +/// Slow single-substring search used for naive brute force matching. 
+#[cold] +pub fn find_slow(pattern: &[u8], haystack: &[u8]) -> Option<usize> { + if pattern.len() > haystack.len() { + return None; + } + for i in 0..(haystack.len() - pattern.len() + 1) { + if pattern == &haystack[i..i + pattern.len()] { + return Some(i); + } + } + None +} diff --git a/src/simd_fallback/mod.rs b/src/simd_fallback/mod.rs new file mode 100644 index 0000000000..fe097ff43b --- /dev/null +++ b/src/simd_fallback/mod.rs @@ -0,0 +1 @@ +mod teddy128; diff --git a/src/simd_fallback/teddy128.rs b/src/simd_fallback/teddy128.rs new file mode 100644 index 0000000000..8ffae702a5 --- /dev/null +++ b/src/simd_fallback/teddy128.rs @@ -0,0 +1,19 @@ +use syntax; + +#[derive(Debug, Clone)] +pub struct Teddy(()); + +#[derive(Debug, Clone)] +pub struct Match { + pub pat: usize, + pub start: usize, + pub end: usize, +} + +impl Teddy { + pub fn new(_pats: &syntax::Literals) -> Option<Teddy> { None } + pub fn patterns(&self) -> &[Vec<u8>] { &[] } + pub fn len(&self) -> usize { 0 } + pub fn approximate_size(&self) -> usize { 0 } + pub fn find(&self, _haystack: &[u8]) -> Option<Match> { None } +}