From 203c509df9e1dcba61eaa60ba9c09be9ca0c4b25 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 18 May 2016 10:24:30 -0400 Subject: [PATCH] Add SIMD accelerated multiple pattern search. This uses the "Teddy" algorithm, as learned from the Hyperscan regular expression library: https://01.org/hyperscan This support optional, subject to the following: 1. A nightly compiler. 2. Enabling the `simd-accel` feature. 3. Adding `RUSTFLAGS="-C target-feature=+ssse3"` when compiling. --- .travis.yml | 2 +- Cargo.toml | 4 + bench/Cargo.toml | 2 +- bench/compile | 3 + bench/run | 3 + regex-syntax/src/properties.rs | 1 - src/lib.rs | 7 + src/literals.rs | 50 ++- src/simd_accel/mod.rs | 5 + src/simd_accel/teddy128.rs | 795 +++++++++++++++++++++++++++++++++ src/simd_fallback/mod.rs | 1 + src/simd_fallback/teddy128.rs | 19 + 12 files changed, 879 insertions(+), 13 deletions(-) create mode 100644 src/simd_accel/mod.rs create mode 100644 src/simd_accel/teddy128.rs create mode 100644 src/simd_fallback/mod.rs create mode 100644 src/simd_fallback/teddy128.rs diff --git a/.travis.yml b/.travis.yml index e4d7218ddf..b8a3de88b5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ script: - cargo build --verbose - cargo build --verbose --manifest-path=regex-debug/Cargo.toml - if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then - travis_wait cargo test --verbose --features pattern; + RUSTFLAGS="-C target-feature=+ssse3" cargo test --verbose --features 'simd-accel pattern'; else travis_wait cargo test --verbose; fi diff --git a/Cargo.toml b/Cargo.toml index 91256321f3..0bd85e3b60 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,8 @@ memchr = "0.1.9" thread_local = "0.2.4" # For parsing regular expressions. regex-syntax = { path = "regex-syntax", version = "0.3.1" } +# For accelerating text search. +simd = { version = "0.1.0", optional = true } # For compiling UTF-8 decoding into automata. utf8-ranges = "0.1.3" @@ -35,6 +37,8 @@ rand = "0.3" [features] # Enable to use the unstable pattern traits defined in std. pattern = [] +# Enable to use simd acceleration. +simd-accel = ["simd"] [lib] # There are no benchmarks in the library code itself diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 3155cd804c..05654e072c 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -17,7 +17,7 @@ libc = "0.2" onig = { version = "0.4", optional = true } libpcre-sys = { version = "0.2", optional = true } memmap = "0.2" -regex = { version = "0.1", path = ".." } +regex = { version = "0.1", path = "..", features = ["simd-accel"] } regex_macros = { version = "0.1", path = "../regex_macros", optional = true } regex-syntax = { version = "0.3", path = "../regex-syntax" } rustc-serialize = "0.3" diff --git a/bench/compile b/bench/compile index 8825ad1955..420d9873b1 100755 --- a/bench/compile +++ b/bench/compile @@ -1,5 +1,8 @@ #!/bin/sh +# Enable SIMD. +export RUSTFLAGS="-C target-feature=+ssse3" + exec cargo build \ --release \ --features 're-onig re-pcre1 re-pcre2 re-re2 re-rust re-rust-bytes re-tcl' \ diff --git a/bench/run b/bench/run index 79feab05ce..ccebd3a720 100755 --- a/bench/run +++ b/bench/run @@ -9,6 +9,9 @@ if [ $# = 0 ] || [ $1 = '-h' ] || [ $1 = '--help' ]; then usage fi +# Enable SIMD. +export RUSTFLAGS="-C target-feature=+ssse3" + which="$1" shift case $which in diff --git a/regex-syntax/src/properties.rs b/regex-syntax/src/properties.rs index 76a1441037..804a90ba12 100644 --- a/regex-syntax/src/properties.rs +++ b/regex-syntax/src/properties.rs @@ -9,7 +9,6 @@ // except according to those terms. 
use quickcheck::{Arbitrary, Gen, Testable, QuickCheck, StdGen}; -use rand::Rng; use { Expr, ExprBuilder, diff --git a/src/lib.rs b/src/lib.rs index a2c3c76b7a..04df5e6033 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -449,6 +449,7 @@ #![deny(missing_docs)] #![cfg_attr(test, deny(warnings))] #![cfg_attr(feature = "pattern", feature(pattern))] +#![cfg_attr(feature = "simd-accel", feature(cfg_target_feature))] #![doc(html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", html_favicon_url = "https://www.rust-lang.org/favicon.ico", html_root_url = "https://doc.rust-lang.org/regex/")] @@ -458,6 +459,7 @@ extern crate memchr; extern crate thread_local; #[cfg(test)] extern crate quickcheck; extern crate regex_syntax as syntax; +#[cfg(feature = "simd-accel")] extern crate simd; extern crate utf8_ranges; pub use error::Error; @@ -582,6 +584,11 @@ mod re_plugin; mod re_set; mod re_trait; mod re_unicode; +#[cfg(feature = "simd-accel")] +mod simd_accel; +#[cfg(not(feature = "simd-accel"))] +#[path = "simd_fallback/mod.rs"] +mod simd_accel; mod sparse; /// The `internal` module exists to support the `regex!` macro and other diff --git a/src/literals.rs b/src/literals.rs index 6e1b2a000c..69870e081e 100644 --- a/src/literals.rs +++ b/src/literals.rs @@ -15,6 +15,7 @@ use memchr::{memchr, memchr2, memchr3}; use syntax; use freqs::BYTE_FREQUENCIES; +use simd_accel::teddy128::Teddy; /// A prefix extracted from a compiled regular expression. /// @@ -51,6 +52,8 @@ enum Matcher { Single(SingleSearch), /// An Aho-Corasick automaton. AC(FullAcAutomaton), + /// A simd accelerated multiple string matcher. + Teddy128(Teddy), } impl LiteralSearcher { @@ -100,6 +103,7 @@ impl LiteralSearcher { Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), Single(ref s) => s.find(haystack).map(|i| (i, i + s.len())), AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)), + Teddy128(ref ted) => ted.find(haystack).map(|m| (m.start, m.end)), } } @@ -136,6 +140,9 @@ impl LiteralSearcher { Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), Matcher::Single(ref s) => LiteralIter::Single(&s.pat), Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()), + Matcher::Teddy128(ref ted) => { + LiteralIter::Teddy128(ted.patterns()) + } } } @@ -162,6 +169,7 @@ impl LiteralSearcher { Bytes(ref sset) => sset.dense.len(), Single(_) => 1, AC(ref aut) => aut.len(), + Teddy128(ref ted) => ted.len(), } } @@ -173,6 +181,7 @@ impl LiteralSearcher { Bytes(ref sset) => sset.approximate_size(), Single(ref single) => single.approximate_size(), AC(ref aut) => aut.heap_bytes(), + Teddy128(ref ted) => ted.approximate_size(), } } } @@ -190,23 +199,34 @@ impl Matcher { fn new(lits: &syntax::Literals, sset: SingleByteSet) -> Self { if lits.literals().is_empty() { - Matcher::Empty - } else if sset.dense.len() >= 26 { + return Matcher::Empty; + } + if sset.dense.len() >= 26 { // Avoid trying to match a large number of single bytes. // This is *very* sensitive to a frequency analysis comparison // between the bytes in sset and the composition of the haystack. // No matter the size of sset, if its members all are rare in the // haystack, then it'd be worth using it. How to tune this... IDK. 
// ---AG - Matcher::Empty - } else if sset.complete { - Matcher::Bytes(sset) - } else if lits.literals().len() == 1 { - Matcher::Single(SingleSearch::new(lits.literals()[0].to_vec())) - } else { - let pats = lits.literals().to_owned(); - Matcher::AC(AcAutomaton::new(pats).into_full()) + return Matcher::Empty; + } + if sset.complete { + return Matcher::Bytes(sset); + } + if lits.literals().len() == 1 { + let lit = lits.literals()[0].to_vec(); + return Matcher::Single(SingleSearch::new(lit)); } + // Only try Teddy if Aho-Corasick can't use memchr. + // Also, in its current form, Teddy doesn't scale well to lots of + // literals. + if sset.dense.len() > 1 && lits.literals().len() <= 32 { + if let Some(ted) = Teddy::new(lits) { + return Matcher::Teddy128(ted); + } + } + let pats = lits.literals().to_owned(); + Matcher::AC(AcAutomaton::new(pats).into_full()) } } @@ -215,6 +235,7 @@ pub enum LiteralIter<'a> { Bytes(&'a [u8]), Single(&'a [u8]), AC(&'a [syntax::Lit]), + Teddy128(&'a [Vec]), } impl<'a> Iterator for LiteralIter<'a> { @@ -250,6 +271,15 @@ impl<'a> Iterator for LiteralIter<'a> { Some(&**next) } } + LiteralIter::Teddy128(ref mut lits) => { + if lits.is_empty() { + None + } else { + let next = &lits[0]; + *lits = &lits[1..]; + Some(&**next) + } + } } } } diff --git a/src/simd_accel/mod.rs b/src/simd_accel/mod.rs new file mode 100644 index 0000000000..f3c868dd88 --- /dev/null +++ b/src/simd_accel/mod.rs @@ -0,0 +1,5 @@ +#[cfg(target_feature = "ssse3")] +pub mod teddy128; +#[cfg(not(target_feature = "ssse3"))] +#[path = "../simd_fallback/teddy128.rs"] +pub mod teddy128; diff --git a/src/simd_accel/teddy128.rs b/src/simd_accel/teddy128.rs new file mode 100644 index 0000000000..e0cd69c7bc --- /dev/null +++ b/src/simd_accel/teddy128.rs @@ -0,0 +1,795 @@ +/*! +Teddy is a simd accelerated multiple substring matching algorithm. The name +and the core ideas in the algorithm were learned from the Hyperscan[1] +project. + + +Background +---------- +The key idea of Teddy is to do *packed* substring matching. In the literature, +packed substring matching is the idea of examing multiple bytes in a haystack +at a time to detect matches. Implementations of, for example, memchr (which +detects matches of a single byte) have been doing this for years. Only +recently, with the introduction of various SIMD instructions, has this been +extended to substring matching. The PCMPESTRI instruction (and its relatives), +for example, implements substring matching in hardware. It is, however, limited +to substrings of length 16 bytes or fewer, but this restriction is fine in a +regex engine, since we rarely care about the performance difference between +searching for a 16 byte literal and a 16 + N literal---16 is already long +enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs +at least, is its latency and throughput. As a result, it is often faster to do +substring search with a Boyer-Moore variant and a well placed memchr to quickly +skip through the haystack. + +There are fewer results from the literature on packed substring matching, +and even fewer for packed multiple substring matching. Ben-Kiki et al.[2] +describes use of PCMPESTRI for substring matching, but is mostly theoretical +and hand-waves performance. There is other theoretical work done by Bille[3] +as well. + +The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci +and is generally focused on multiple pattern search. 
Their first paper[4a]
+introduces the concept of a fingerprint, which is computed for every block of
+N bytes in every pattern. The haystack is then scanned N bytes at a time and
+a fingerprint is computed in the same way it was computed for blocks in the
+patterns. If the fingerprint corresponds to one that was found in a pattern,
+then a verification step follows to confirm that one of the substrings with
+the corresponding fingerprint actually matches at the current location.
+Various implementation tricks are employed to make sure the fingerprint
+lookup is fast, typically by truncating the fingerprint. (This may, of course,
+provoke more steps in the verification process, so a balance must be struck.)
+
+The main downside of [4a] is that the minimum substring length is 32 bytes,
+presumably because of how the algorithm uses certain SIMD instructions. This
+essentially makes it useless for general purpose regex matching, where a small
+number of short patterns is far more likely.
+
+Faro and Kulekci published another paper[4b] that is conceptually very similar
+to [4a]. The key difference is that it uses the CRC32 instruction (introduced
+as part of SSE 4.2) to compute fingerprint values. This also enables the
+algorithm to work effectively on substrings as short as 7 bytes with 4 byte
+windows. 7 bytes is unfortunately still too long. The window could technically
+be shrunk to 2 bytes, thereby reducing the minimum substring length to 3, but
+the small window size ends up negating most of the performance benefits---and
+short patterns are likely the common case in a general purpose regex engine.
+
+Faro and Kulekci also published a paper[4c] that appears to be intended as a
+replacement for PCMPESTRI. In particular, it is specifically motivated by the
+high latency and low throughput of PCMPESTRI and therefore chooses other SIMD
+instructions that are faster. While this approach works for short substrings,
+I personally couldn't see a way to generalize it to multiple substring search.
+
+Faro and Kulekci have another paper[4d] that I haven't been able to read
+because it is behind a paywall.
+
+
+Teddy
+-----
+Finally, we get to Teddy. If the above literature review is complete, then it
+appears that Teddy is a novel algorithm. More than that, in my experience, it
+completely blows away the competition for short substrings, which is exactly
+what we want in a general purpose regex engine. Again, the algorithm appears
+to have been developed by the authors of Hyperscan[1]. Hyperscan was open
+sourced in late 2015, and no earlier history could be found. Therefore,
+tracking the exact provenance of the algorithm with respect to the published
+literature seems difficult.
+
+DISCLAIMER: My understanding of Teddy is limited to reading auto-generated C
+code, its disassembly and observing its runtime behavior.
+
+At a high level, Teddy works somewhat similarly to the fingerprint algorithms
+published by Faro and Kulekci, but Teddy does it in a way that scales a bit
+better. Namely:
+
+1. Teddy's core algorithm scans the haystack in 16 byte chunks. 16 is
+   significant because it corresponds to the number of bytes in a SIMD vector.
+   If one used AVX2 instructions, then we could scan the haystack in 32 byte
+   chunks. Similarly, if one used AVX512 instructions, we could scan the
+   haystack in 64 byte chunks. Hyperscan implements both 128-bit (SSSE3) and
+   AVX2 versions; we only implement the 128-bit version for the moment. (The
+   author doesn't have a CPU with AVX2 support... yet.)
+2. Bitwise operations are performed on each chunk to discover whether any
+   region of it matches a set of precomputed fingerprints from the patterns.
+   If there are matches, then a verification step is performed. In this
+   implementation, our verification step is naive; this can be improved upon.
+   (A schematic sketch of this two-step flow is given just after this list.)
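+
+To make this flow concrete before diving into the SIMD details, here is a
+scalar sketch of the N = 1 case. It is illustrative only: the names
+(`scalar_teddy1`, `masks`, `buckets`) are made up, `masks` and `buckets`
+loosely correspond to the masks and bucket assignment used by the real
+`Teddy` type below, the byte-at-a-time inner loop stands in for the SIMD mask
+computation described later, and the real implementation handles the haystack
+tail and match reporting differently.
+
+```ignore
+/// A scalar model of Teddy with a 1 byte fingerprint. `masks[b]` is the
+/// bucket bitset for fingerprint byte `b`, and `buckets[j]` lists the
+/// patterns assigned to bucket `j`. Returns `(start, end)` of a match.
+fn scalar_teddy1(
+    masks: &[u8; 256],
+    buckets: &[Vec<Vec<u8>>; 8],
+    haystack: &[u8],
+) -> Option<(usize, usize)> {
+    let mut pos = 0;
+    while pos + 16 <= haystack.len() {
+        // In the real code, this whole inner loop is a few SIMD instructions
+        // that produce a candidate bitset for all 16 bytes at once.
+        for i in 0..16 {
+            let bits = masks[haystack[pos + i] as usize];
+            if bits == 0 {
+                continue;
+            }
+            // Verification: try every pattern in every candidate bucket.
+            for j in 0..8 {
+                if (bits & (1u8 << j)) == 0 {
+                    continue;
+                }
+                for pat in &buckets[j] {
+                    if haystack[pos + i..].starts_with(&pat[..]) {
+                        return Some((pos + i, pos + i + pat.len()));
+                    }
+                }
+            }
+        }
+        pos += 16;
+    }
+    None // (a real implementation would still search the trailing <16 bytes)
+}
+```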
+The details to make this work are quite clever. First, we must choose how to
+pick our fingerprints. In Hyperscan's implementation, I *believe* they use the
+last N bytes of each substring, where N must be at least the minimum length of
+any substring in the set being searched. In this implementation, we use the
+first N bytes of each substring. (The tradeoffs between these choices aren't
+yet clear to me.) We then must figure out how to quickly test whether an
+occurrence of any fingerprint from the set of patterns appears in a 16 byte
+block from the haystack. To keep things simple, let's assume N = 1 and examine
+some examples to motivate the approach. Here are our patterns:
+
+```ignore
+foo
+bar
+baz
+```
+
+The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set
+our 16 byte block to:
+
+```ignore
+bat cat foo bump
+xxxxxxxxxxxxxxxx
+```
+
+To cut to the chase, Teddy works by using bitsets. In particular, Teddy
+creates a mask that lets us quickly test whether any fingerprint occurs in a
+16 byte block, and that also tells us which pattern each fingerprint
+corresponds to. In this case, our fingerprint is a single byte, so an
+appropriate abstraction is a map from a single byte to a list of patterns
+that contain that fingerprint:
+
+```ignore
+f |--> foo
+b |--> bar, baz
+```
+
+Now, all we need to do is figure out how to represent this map in vector space
+and use normal SIMD operations to perform a lookup. The first simplification
+we can make is to represent our patterns as bit fields occupying a single
+byte. This is important, because a single SIMD vector can store 16 bytes.
+
+```ignore
+f |--> 00000001
+b |--> 00000010, 00000100
+```
+
+How do we perform lookup though? It turns out that SSSE3 introduced a very
+cool instruction called PSHUFB. The instruction takes two SIMD vectors, `A`
+and `B`, and returns a third vector `C`. All vectors are treated as 16 8-bit
+integers. `C` is formed by `C[i] = A[B[i]]`. (This is a bit of a
+simplification, but true for the purposes of this algorithm. For full details,
+see Intel's Intrinsics Guide[5].) This essentially lets us use the values in
+`B` to look up values in `A`.
+
+If we could somehow cause `B` to contain our 16 byte block from the haystack,
+and if `A` could contain our bitmasks, then we'd end up with something like
+this for `A`:
+
+```ignore
+     0x00 0x01 ... 0x62     ... 0x66     ... 0xFF
+A =  0    0        00000110     00000001     0
+```
+
+And if `B` contains our window from our haystack, we could use shuffle to take
+the values from `B` and use them to look up our bitsets in `A`. But of course,
+we can't do this because `A` in the above example contains 256 bytes, which
+is much larger than the size of a SIMD vector.
+
+Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of
+our bitsets, we can use two masks, where one mask corresponds to the lower
+four bits of our fingerprint and the other mask corresponds to the upper four
+bits. So our map now looks like:
+
+```ignore
+'f' & 0xF = 0x6 |--> 00000001
+'f' >> 4  = 0x6 |--> 00000111
+'b' & 0xF = 0x2 |--> 00000110
+'b' >> 4  = 0x6 |--> 00000111
+```
+
+Notice that the bitsets for each nybble correspond to the union of all
+fingerprints that contain that nybble. For example, both `f` and `b` have the
+same upper 4 bits but differ in the lower 4 bits. The sketch below shows how
+these two nybble tables can be built.
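+
+Building these two tables does not itself require any SIMD; the following is
+a small, self-contained sketch of the construction for the N = 1 case. The
+name `build_nybble_masks` is made up for illustration; in the implementation
+below the equivalent work is done by `Mask::add`, which writes into SIMD
+vectors directly.
+
+```ignore
+/// Build the low/high nybble tables for a set of patterns, using the first
+/// byte of each pattern as its fingerprint. `lo[n]` (resp. `hi[n]`) is the
+/// bucket bitset for all fingerprint bytes whose low (resp. high) nybble is
+/// `n`. Buckets are assigned round-robin, as in the implementation below.
+fn build_nybble_masks(patterns: &[&[u8]]) -> ([u8; 16], [u8; 16]) {
+    let (mut lo, mut hi) = ([0u8; 16], [0u8; 16]);
+    for (i, pat) in patterns.iter().enumerate() {
+        let byte = pat[0];
+        let bucket_bit = 1u8 << (i % 8);
+        lo[(byte & 0xF) as usize] |= bucket_bit;
+        hi[(byte >> 4) as usize] |= bucket_bit;
+    }
+    (lo, hi)
+}
+```
+
+For `foo`, `bar` and `baz` this yields `lo[0x6] = 00000001`,
+`lo[0x2] = 00000110` and `hi[0x6] = 00000111`, i.e., exactly the map above.
+The two tables correspond to `A0` and `A1` below.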
+Putting this together, we have `A0`, `A1` and `B`, where `A0` is our mask for
+the lower nybble, `A1` is our mask for the upper nybble and `B` is our 16 byte
+block from the haystack:
+
+```ignore
+      0x00 0x01 0x02     0x03 ... 0x06     ... 0xF
+A0 =  0    0    00000110 0        00000001     0
+A1 =  0    0    0        0        00000111     0
+B  =  b    a    t    _    t    p
+B  =  0x62 0x61 0x74 0x20 0x74 0x70
+```
+
+But of course, we can't use `B` with `PSHUFB` yet, since its values are 8
+bits, and we need indexes that are at most 4 bits (corresponding to one of 16
+values). We can apply the same transformation to split `B` into lower and
+upper nybbles as we did for `A`. As before, `B0` corresponds to the lower
+nybbles and `B1` corresponds to the upper nybbles:
+
+```ignore
+      b   a   t   _   c   a   t   _   f   o   o   _   b   u   m   p
+B0 =  0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0
+B1 =  0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7
+```
+
+And now we have a nice correspondence. `B0` can index `A0` and `B1` can index
+`A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`:
+
+```ignore
+      b        a       ... f        o       ... p
+      A0[0x2]  A0[0x1]     A0[0x6]  A0[0xF]     A0[0x0]
+C0 =  00000110 0           00000001 0           0
+```
+
+And `C1 = PSHUFB(A1, B1)`:
+
+```ignore
+      b        a        ... f        o        ... p
+      A1[0x6]  A1[0x6]      A1[0x6]  A1[0x6]      A1[0x7]
+C1 =  00000111 00000111     00000111 00000111     0
+```
+
+Notice how neither one of `C0` or `C1` is guaranteed to report fully correct
+results all on its own. For example, `C1` claims that `b` is a fingerprint for
+the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint
+for all of our patterns. But if we combine `C0` and `C1` with an `AND`
+operation:
+
+```ignore
+      b        a ... f        o ... p
+C  =  00000110 0     00000001 0     0
+```
+
+we now have that `C[i]` contains a bitset corresponding to the matching
+fingerprints in a haystack's 16 byte block, where `i` is the `i`th byte in
+that block.
+
+Once we have that, we can look for the position of the least significant bit
+in `C`. That position, modulo `8`, gives us the pattern that the fingerprint
+matches. That position, integer divided by `8`, also gives us the byte offset
+at which the fingerprint occurs inside the 16 byte haystack block. Using those
+two pieces of information, we can run a verification procedure that tries
+to match all substrings containing that fingerprint at that position in the
+haystack.
+
+
+Implementation notes
+--------------------
+The problem with the algorithm as described above is that it uses a single
+byte for a fingerprint. This will work well if the fingerprints are rare in
+the haystack (e.g., capital letters or special characters in normal English
+text), but if the fingerprints are common, you'll wind up spending too much
+time in the verification step, which effectively negates the performance
+benefits of scanning 16 bytes at a time. Remember, the key to the performance
+of this algorithm is to do as little work as possible per 16 bytes.
+
+This algorithm can be extrapolated in a relatively straightforward way to use
+larger fingerprints. That is, instead of a single byte prefix, we might use a
+three byte prefix. The implementation below implements N = {1, 2, 3} and
+always picks the largest N possible. The rationale is that the bigger the
+fingerprint, the fewer verification steps we'll do. Of course, if N is too
+large, then we'll end up doing too much work on each step.
+
+The way to extend it is:
+
+1. Add a mask for each byte in the fingerprint.
(Remember that each mask is + composed of two SIMD vectors.) This results in a value of `C` for each byte + in the fingerprint while searching. +2. When testing each 16 byte block, each value of `C` must be shifted so that + they are aligned. Once aligned, they should all be `AND`'d together. This + will give you only the bitsets corresponding to the full match of the + fingerprint. + +The implementation below is commented to fill in the nitty gritty details. + +[1] - https://github.com/01org/hyperscan +[2a] - http://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf +[2b] - http://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf +[3] - http://www.sciencedirect.com/science/article/pii/S1570866710000353 +[4a] - http://www.dmi.unict.it/~faro/papers/conference/faro32.pdf +[4b] - https://pdfs.semanticscholar.org/fed7/ca62dc469314f3552017d0da7ebd669d4649.pdf +[4c] - http://arxiv.org/pdf/1209.6449.pdf +[4d] - http://www.sciencedirect.com/science/article/pii/S1570866714000471 +[5] - https://software.intel.com/sites/landingpage/IntrinsicsGuide +*/ + +// TODO: Extend this to use AVX2 instructions. +// TODO: Extend this to use AVX512 instructions. +// TODO: Extend this to cleverly use Aho-Corasick. Possibly to replace both +// "slow" searching and the verification step. +// TODO: Make the inner loop do aligned loads. + +use std::cmp; +use std::mem::transmute; +use std::ptr; + +use simd::u8x16; +use simd::x86::sse2::u64x2; +use simd::x86::ssse3::Ssse3U8x16; + +use syntax; + +/// Corresponds to the number of bytes read at a time in the haystack. +const BLOCK_SIZE: usize = 16; + +/// Match reports match information. +#[derive(Debug, Clone)] +pub struct Match { + /// The index of the pattern that matched. The index is in correspondence + /// with the order of the patterns given at construction. + pub pat: usize, + /// The start byte offset of the match. + pub start: usize, + /// The end byte offset of the match. This is always start + pat.len(). + pub end: usize, +} + +/// A SIMD accelerated multi substring searcher. +#[derive(Debug, Clone)] +pub struct Teddy { + /// A list of substrings to match. + pats: Vec>, + /// A set of 8 buckets. Each bucket corresponds to a single member of a + /// bitset. A bucket contains zero or more substrings. This is useful + /// when the number of substrings exceeds 8, since our bitsets cannot have + /// more than 8 members. + buckets: Vec>, + /// Our set of masks. There's one mask for each byte in the fingerprint. + masks: Masks, +} + +/// A list of masks. This has length equal to the length of the fingerprint. +/// The length of the fingerprint is always `max(3, len(smallest substring))`. +#[derive(Debug, Clone)] +struct Masks(Vec); + +/// A single mask. +#[derive(Debug, Clone, Copy)] +struct Mask { + /// Bitsets for the low nybbles in a fingerprint. + lo: u8x16, + /// Bitsets for the high nybbles in a fingerprint. + hi: u8x16, +} + +impl Teddy { + /// Create a new Teddy multi substring matcher. + /// + /// If a Teddy matcher could not be created (e.g., `pats` is empty or has + /// an empty substring), then `None` is returned. + pub fn new(pats: &syntax::Literals) -> Option { + let pats: Vec<_> = pats.literals().iter().map(|p|p.to_vec()).collect(); + let min_len = pats.iter().map(|p| p.len()).min().unwrap_or(0); + // Don't allow any empty patterns and require that we have at + // least one pattern. + if min_len < 1 { + return None; + } + // Pick the largest mask possible, but no larger than 3. 
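+        // (Rationale, per the module docs: a longer fingerprint means fewer
+        // verification calls, but each extra byte costs another pair of SIMD
+        // masks and more work per 16 byte block, so we cap it at 3 even when
+        // every pattern is longer.)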
+ let nmasks = cmp::min(3, min_len); + let mut masks = Masks::new(nmasks); + let mut buckets = vec![vec![]; 8]; + // Assign a substring to each bucket, and add the bucket's bitfield to + // the appropriate position in the mask. + for (pati, pat) in pats.iter().enumerate() { + let bucket = pati % 8; + buckets[bucket].push(pati); + masks.add(bucket as u8, pat); + } + Some(Teddy { + pats: pats.to_vec(), + buckets: buckets, + masks: masks, + }) + } + + /// Returns all of the substrings matched by this Teddy. + pub fn patterns(&self) -> &[Vec] { + &self.pats + } + + /// Returns the number of substrings in this matcher. + pub fn len(&self) -> usize { + self.pats.len() + } + + /// Returns the approximate size on the heap used by this matcher. + pub fn approximate_size(&self) -> usize { + self.pats.iter().fold(0, |a, b| a + b.len()) + } + + /// Searches `haystack` for the substrings in this Teddy. If a match was + /// found, then it is returned. Otherwise, `None` is returned. + pub fn find(&self, haystack: &[u8]) -> Option { + // If our haystack is smaller than the block size, then fall back to + // a naive brute force search. + // + // TODO: Use Aho-Corasick. + if haystack.is_empty() || haystack.len() < (BLOCK_SIZE + 2) { + return self.slow(haystack, 0); + } + match self.masks.len() { + 0 => None, + 1 => self.find1(haystack), + 2 => self.find2(haystack), + 3 => self.find3(haystack), + _ => unreachable!(), + } + } + + /// find1 is used when there is only 1 mask. This is the easy case and is + /// pretty much as described in the module documentation. + #[inline(always)] + fn find1(&self, haystack: &[u8]) -> Option { + let mut pos = 0; + let zero = u8x16::splat(0); + let len = haystack.len(); + debug_assert!(len >= BLOCK_SIZE); + while pos <= len - BLOCK_SIZE { + let h = unsafe { u8x16::load_unchecked(haystack, pos) }; + // N.B. res0 is our `C` in the module documentation. + let res0 = self.masks.members1(h); + // Only do expensive verification if there are any non-zero bits. + if res0.ne(zero).any() { + if let Some(m) = self.verify_128(haystack, pos, res0) { + return Some(m); + } + } + pos += BLOCK_SIZE; + } + self.slow(haystack, pos) + } + + /// find2 is used when there are 2 masks, e.g., the fingerprint is 2 bytes + /// long. + #[inline(always)] + fn find2(&self, haystack: &[u8]) -> Option { + // This is an exotic way to right shift a SIMD vector across lanes. + // See below at use for more details. + let res0shuffle = u8x16::new( + 0, 0, 1, 2, + 3, 4, 5, 6, + 7, 8, 9, 10, + 11, 12, 13, 14, + ); + let zero = u8x16::splat(0); + let len = haystack.len(); + // The previous value of C (from the module documentation) for the + // *first* byte in the fingerprint. On subsequent iterations, we take + // the last bitset from the previous C and insert it into the first + // position of the current C, shifting all other bitsets to the right + // one lane. This causes C for the first byte to line up with C for the + // second byte, so that they can be AND'd together. + let mut prev0 = u8x16::splat(0xFF); + let mut pos = 1; + debug_assert!(len >= BLOCK_SIZE); + while pos <= len - BLOCK_SIZE { + let h = unsafe { u8x16::load_unchecked(haystack, pos) }; + let (res0, res1) = self.masks.members2(h); + + // The next three lines are essentially equivalent to + // + // (prev0 << 15) | (res0 >> 1) + // + // ... if SIMD vectors could shift across lanes. There is the + // PALIGNR instruction, but apparently LLVM doesn't expose it as + // a proper intrinsic. 
Thankfully, it appears the following + // sequence does indeed compile down to a PALIGNR. + let prev0byte0 = prev0.extract(15); + let res0shiftr8 = res0.shuffle_bytes(res0shuffle); + let res0prev0 = res0shiftr8.replace(0, prev0byte0); + + // AND's our C values together. + let res = res0prev0 & res1; + prev0 = res0; + if res.ne(zero).any() { + let pos = pos.checked_sub(1).unwrap(); + if let Some(m) = self.verify_128(haystack, pos, res) { + return Some(m); + } + } + pos += BLOCK_SIZE; + } + // The windowing above doesn't check the last byte in the last + // window, so start the slow search at the last byte of the last + // window. + self.slow(haystack, pos.checked_sub(1).unwrap()) + } + + /// find3 is used when there are 3 masks, e.g., the fingerprint is 3 bytes + /// long. + /// + /// N.B. This is a straight-forward extrapolation of find2. The only + /// difference is that we need to keep track of two previous values of + /// C, since we now need to align for three bytes. + #[inline(always)] + fn find3(&self, haystack: &[u8]) -> Option { + let zero = u8x16::splat(0); + let len = haystack.len(); + + let res0shuffle = u8x16::new( + 0, 0, 0, 1, + 2, 3, 4, 5, + 6, 7, 8, 9, + 10, 11, 12, 13, + ); + let res1shuffle = u8x16::new( + 0, 0, 1, 2, + 3, 4, 5, 6, + 7, 8, 9, 10, + 11, 12, 13, 14, + ); + let mut prev0 = u8x16::splat(0xFF); + let mut prev1 = u8x16::splat(0xFF); + let mut pos = 2; + while pos <= len - BLOCK_SIZE { + let h = unsafe { u8x16::load_unchecked(haystack, pos) }; + let (res0, res1, res2) = self.masks.members3(h); + + let prev0byte0 = prev0.extract(14); + let prev0byte1 = prev0.extract(15); + let res0shiftr16 = res0.shuffle_bytes(res0shuffle); + let res0prev0 = res0shiftr16.replace(0, prev0byte0) + .replace(1, prev0byte1); + + let prev1byte0 = prev1.extract(15); + let res1shiftr8 = res1.shuffle_bytes(res1shuffle); + let res1prev1 = res1shiftr8.replace(0, prev1byte0); + + let res = res0prev0 & res1prev1 & res2; + + prev0 = res0; + prev1 = res1; + if res.ne(zero).any() { + let pos = pos.checked_sub(2).unwrap(); + if let Some(m) = self.verify_128(haystack, pos, res) { + return Some(m); + } + } + pos += BLOCK_SIZE; + } + // The windowing above doesn't check the last two bytes in the last + // window, so start the slow search at the penultimate byte of the + // last window. + // self.slow(haystack, pos.saturating_sub(2)) + self.slow(haystack, pos.checked_sub(2).unwrap()) + } + + /// Runs the verification procedure on `res` (i.e., `C` from the module + /// documentation), where the haystack block starts at `pos` in + /// `haystack`. + /// + /// If a match exists, it returns the first one. + #[inline(always)] + fn verify_128( + &self, + haystack: &[u8], + pos: usize, + res: u8x16, + ) -> Option { + // The verification procedure is more amenable to standard 64 bit + // values, so get those. + let res64: u64x2 = unsafe { transmute(res) }; + let reshi = res64.extract(0); + let reslo = res64.extract(1); + if let Some(m) = self.verify_64(haystack, pos, reshi, 0) { + return Some(m); + } + if let Some(m) = self.verify_64(haystack, pos, reslo, 8) { + return Some(m); + } + None + } + + /// Runs the verification procedure on half of `C`. + /// + /// If a match exists, it returns the first one. + /// + /// offset is an additional byte offset to add to the position before + /// substring match verification. + #[inline(always)] + fn verify_64( + &self, + haystack: &[u8], + pos: usize, + mut res: u64, + offset: usize, + ) -> Option { + // There's a possible match so long as there's at least one bit set. 
+ while res != 0 { + // The next possible match is at the least significant bit. + let bit = res.trailing_zeros(); + // The position of the bit in its corresponding lane gives us the + // corresponding bucket. + let bucket = (bit % 8) as usize; + // The lane that the bit is in gives us its offset. + let bytei = (bit / 8) as usize; + // Compute the start of where a substring would start. + let start = pos + offset + bytei; + // Kill off this bit. If we couldn't match anything, we'll go to + // the next bit. + res &= !(1 << bit); + // Actual substring search verification. + if let Some(m) = self.verify_bucket(haystack, bucket, start) { + return Some(m); + } + } + None + } + + /// Verifies whether any substring in the given bucket matches in haystack + /// at the given starting position. + #[inline(always)] + fn verify_bucket( + &self, + haystack: &[u8], + bucket: usize, + start: usize, + ) -> Option { + // This cycles through the patterns in the bucket in the order that + // the patterns were given. Therefore, we guarantee leftmost-first + // semantics. + for &pati in &self.buckets[bucket] { + let pat = &*self.pats[pati]; + if start + pat.len() > haystack.len() { + continue; + } + if pat == &haystack[start..start + pat.len()] { + return Some(Match { + pat: pati, + start: start, + end: start + pat.len(), + }); + } + } + None + } + + /// Slow substring search through all patterns in this matcher. + /// + /// This is used when we don't have enough bytes in the haystack for our + /// block based approach. + fn slow(&self, haystack: &[u8], pos: usize) -> Option { + // TODO: Use Aho-Corasick, or otherwise adapt the block based approach + // to be capable of using smaller blocks. + let mut m = None; + for (pi, p) in self.pats.iter().enumerate() { + if let Some(i) = find_slow(p, &haystack[pos..]) { + let candidate = Match { + pat: pi, + start: pos + i, + end: pos + i + p.len(), + }; + match m { + None => m = Some(candidate), + Some(ref mut m) => { + if candidate.start < m.start { + *m = candidate; + } + } + } + } + } + m + } +} + +impl Masks { + /// Create a new set of masks of size `n`, where `n` corresponds to the + /// number of bytes in a fingerprint. + fn new(n: usize) -> Masks { + Masks(vec![Mask::new(); n]) + } + + /// Returns the number of masks. + fn len(&self) -> usize { + self.0.len() + } + + /// Adds the given pattern to the given bucket. The bucket should be a + /// power of 2 <= 2^7. + fn add(&mut self, bucket: u8, pat: &[u8]) { + for (i, mask) in self.0.iter_mut().enumerate() { + mask.add(bucket, pat[i]); + } + } + + /// Finds the fingerprints that are in the given haystack block. i.e., this + /// returns C as described in the module documentation. + /// + /// More specifically, for i in 0..16 and j in 0..8, C[i][j] == 1 if and + /// only if `haystack_block[i]` corresponds to a fingerprint that is part + /// of a pattern in bucket `j`. + #[inline(always)] + fn members1(&self, haystack_block: u8x16) -> u8x16 { + let masklo = u8x16::splat(0xF); + let hlo = haystack_block & masklo; + let hhi = (haystack_block >> 4) & masklo; + + self.0[0].lo.shuffle_bytes(hlo) & self.0[0].hi.shuffle_bytes(hhi) + } + + /// Like members1, but computes C for the first and second bytes in the + /// fingerprint. 
+ #[inline(always)] + fn members2(&self, haystack_block: u8x16) -> (u8x16, u8x16) { + let masklo = u8x16::splat(0xF); + let hlo = haystack_block & masklo; + let hhi = (haystack_block >> 4) & masklo; + + let res0 = self.0[0].lo.shuffle_bytes(hlo) + & self.0[0].hi.shuffle_bytes(hhi); + let res1 = self.0[1].lo.shuffle_bytes(hlo) + & self.0[1].hi.shuffle_bytes(hhi); + (res0, res1) + } + + /// Like members1, but computes C for the first, second and third bytes in + /// the fingerprint. + #[inline(always)] + fn members3(&self, haystack_block: u8x16) -> (u8x16, u8x16, u8x16) { + let masklo = u8x16::splat(0xF); + let hlo = haystack_block & masklo; + let hhi = (haystack_block >> 4) & masklo; + + let res0 = self.0[0].lo.shuffle_bytes(hlo) + & self.0[0].hi.shuffle_bytes(hhi); + let res1 = self.0[1].lo.shuffle_bytes(hlo) + & self.0[1].hi.shuffle_bytes(hhi); + let res2 = self.0[2].lo.shuffle_bytes(hlo) + & self.0[2].hi.shuffle_bytes(hhi); + (res0, res1, res2) + } +} + +impl Mask { + /// Create a new mask with no members. + fn new() -> Mask { + Mask { + lo: u8x16::splat(0), + hi: u8x16::splat(0), + } + } + + /// Adds the given byte to the given bucket. + fn add(&mut self, bucket: u8, byte: u8) { + // Split our byte into two nybbles, and add each nybble to our + // mask. + let byte_lo = (byte & 0xF) as u32; + let byte_hi = (byte >> 4) as u32; + + let lo = self.lo.extract(byte_lo); + self.lo = self.lo.replace(byte_lo, ((1 << bucket) as u8) | lo); + + let hi = self.hi.extract(byte_hi); + self.hi = self.hi.replace(byte_hi, ((1 << bucket) as u8) | hi); + } +} + +/// UnsafeLoad permits loading data into a SIMD vector without bounds checks. +/// +/// Ideally, this would be part of the `simd` crate, or even better, we could +/// figure out how to do it without `unsafe` at all. +trait UnsafeLoad { + type Elem; + + /// load_unchecked creates a new SIMD vector from the elements in `slice` + /// starting at `offset`. `slice` must have at least the number of elements + /// required to fill a SIMD vector. + unsafe fn load_unchecked(slice: &[Self::Elem], offset: usize) -> Self; +} + +impl UnsafeLoad for u8x16 { + type Elem = u8; + + unsafe fn load_unchecked(slice: &[u8], offset: usize) -> u8x16 { + // TODO: Can we just do pointer casting here? I don't think so, since + // this could be an unaligned load? Help me. + let mut x = u8x16::splat(0); + ptr::copy_nonoverlapping( + slice.get_unchecked(offset), + &mut x as *mut u8x16 as *mut u8, + 16); + x + } +} + +/// Slow single-substring search use for naive brute force matching. 
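+///
+/// This is O(len(pattern) * len(haystack)) in the worst case, but it is only
+/// invoked on haystacks not much longer than a single block, or on the final
+/// partial block left over after the main loop, so the cost stays small in
+/// practice.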
+#[cold]
+pub fn find_slow(pattern: &[u8], haystack: &[u8]) -> Option<usize> {
+    if pattern.len() > haystack.len() {
+        return None;
+    }
+    for i in 0..(haystack.len() - pattern.len() + 1) {
+        if pattern == &haystack[i..i + pattern.len()] {
+            return Some(i);
+        }
+    }
+    None
+}
diff --git a/src/simd_fallback/mod.rs b/src/simd_fallback/mod.rs
new file mode 100644
index 0000000000..b7ce4b188a
--- /dev/null
+++ b/src/simd_fallback/mod.rs
@@ -0,0 +1 @@
+pub mod teddy128;
diff --git a/src/simd_fallback/teddy128.rs b/src/simd_fallback/teddy128.rs
new file mode 100644
index 0000000000..8ffae702a5
--- /dev/null
+++ b/src/simd_fallback/teddy128.rs
@@ -0,0 +1,19 @@
+use syntax;
+
+#[derive(Debug, Clone)]
+pub struct Teddy(());
+
+#[derive(Debug, Clone)]
+pub struct Match {
+    pub pat: usize,
+    pub start: usize,
+    pub end: usize,
+}
+
+impl Teddy {
+    pub fn new(_pats: &syntax::Literals) -> Option<Teddy> { None }
+    pub fn patterns(&self) -> &[Vec<u8>] { &[] }
+    pub fn len(&self) -> usize { 0 }
+    pub fn approximate_size(&self) -> usize { 0 }
+    pub fn find(&self, _haystack: &[u8]) -> Option<Match> { None }
+}