From c12c28be23b0a1e6518ff1930694237d60539862 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Fri, 22 Apr 2016 21:17:02 -0400
Subject: [PATCH] Fix #204.

The DFA handles word boundaries by tagging each state with an `is_word`
flag that lets us determine whether the next byte in the haystack should
cause a word boundary instruction to match. We were mishandling how this
tagging happened for start states. In particular, the tag was not used as
part of the index into the start state cache, so a lookup could wind up
choosing an incorrect but previously computed start state with the wrong
flags set. This leads to incorrect matches.

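We fix this by generating the index from the flags that actually matter:
the start/end and start_line/end_line empty flags plus the state's
`is_word` flag, instead of the word boundary empty flags.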
---
 src/dfa.rs             | 26 +++++++++++++-------------
 tests/api.rs           |  5 +++++
 tests/macros.rs        | 11 +++++++++++
 tests/regression.rs    |  8 ++++++++
 tests/word_boundary.rs |  1 +
 5 files changed, 38 insertions(+), 13 deletions(-)
diff --git a/src/dfa.rs b/src/dfa.rs
index 55abf9f0e5..22d141d9b9 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -1319,7 +1319,19 @@ impl<'a> Fsm<'a> {
         empty_flags: EmptyFlags,
         state_flags: StateFlags,
     ) -> Option<StatePtr> {
-        let flagi = empty_flags.as_index();
+        // Compute an index into our cache of start states based on the set
+        // of empty/state flags set at the current position in the input. We
+        // don't use every flag since not all flags matter. For example, since
+        // matches are delayed by one byte, start states can never be match
+        // states.
+        let flagi = {
+            (((empty_flags.start as u8) << 0) |
+             ((empty_flags.end as u8) << 1) |
+             ((empty_flags.start_line as u8) << 2) |
+             ((empty_flags.end_line as u8) << 3) |
+             ((state_flags.is_word() as u8) << 4))
+            as usize
+        };
         match self.cache.start_states[flagi] {
             STATE_UNKNOWN => {}
             STATE_DEAD => return Some(STATE_DEAD),
@@ -1592,18 +1604,6 @@ impl Transitions {
     }
 }
 
-impl EmptyFlags {
-    fn as_index(&self) -> usize {
-        (((self.start as u8) << 0) |
-         ((self.end as u8) << 1) |
-         ((self.start_line as u8) << 2) |
-         ((self.end_line as u8) << 3) |
-         ((self.word_boundary as u8) << 4) |
-         ((self.not_word_boundary as u8) << 5))
-        as usize
-    }
-}
-
 impl StateFlags {
     fn is_match(&self) -> bool {
         self.0 & 0b0000000_1 > 0
diff --git a/tests/api.rs b/tests/api.rs
index 275157bf07..e6c3a27a8d 100644
--- a/tests/api.rs
+++ b/tests/api.rs
@@ -234,3 +234,8 @@ expand!(expand9, r"(?P<a>\w+)\s+(?P<b>\d+)",
         "abc 123", " $b $a ", " 123 abc ");
 expand!(expand10, r"(?P<a>\w+)\s+(?P<b>\d+)",
         "abc 123", "$bz$az", "");
+
+split!(split1, r"\s+", "a b\nc\td\n\t e",
+       &[t!("a"), t!("b"), t!("c"), t!("d"), t!("e")]);
+split!(split2, r"\b", "a b c",
+       &[t!(""), t!("a"), t!(" "), t!("b"), t!(" "), t!("c")]);
diff --git a/tests/macros.rs b/tests/macros.rs
index c3b79e2221..f9e8912630 100644
--- a/tests/macros.rs
+++ b/tests/macros.rs
@@ -114,3 +114,14 @@ macro_rules! nomatset {
         }
     }
 }
+
+macro_rules! split {
+    ($name:ident, $re:expr, $text:expr, $expected:expr) => {
+        #[test]
+        fn $name() {
+            let re = regex!($re);
+            let splitted: Vec<_> = re.split(t!($text)).collect();
+            assert_eq!($expected, &*splitted);
+        }
+    }
+}
diff --git a/tests/regression.rs b/tests/regression.rs
index 913a6baefb..e694dd01b9 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -46,6 +46,14 @@ mat!(uni_case_lower_nocase_flag, u!(r"(?i)\p{Ll}+"), "ΛΘΓΔα", Some((0, 10))
 // See: https://github.com/rust-lang-nursery/regex/issues/191
 mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3)));
 
+// burntsushi was bad and didn't create an issue for this bug.
 mat!(anchored_prefix1, r"^a\S", "a ", None);
 mat!(anchored_prefix2, r"^a\S", "foo boo a ", None);
 mat!(anchored_prefix3, r"^-[a-z]", "r-f", None);
+
+// See: https://github.com/rust-lang-nursery/regex/issues/204
+split!(split_on_word_boundary, r"\b", r"Should this (work?)",
+       &[t!(""), t!("Should"), t!(" "), t!("this"),
+         t!(" ("), t!("work"), t!("?)")]);
+matiter!(word_boundary_dfa, r"\b", "a b c",
+         (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
diff --git a/tests/word_boundary.rs b/tests/word_boundary.rs
index 3c27eaddfa..7fe97a2974 100644
--- a/tests/word_boundary.rs
+++ b/tests/word_boundary.rs
@@ -40,6 +40,7 @@ matiter!(wb37, r"^^^^^\b.$$$$$", "x", (0, 1));
 matiter!(wb38, r"^^^^^\b$$$$$", "x");
 matiter!(wb39, r"^^^^^\b\b\b.\b\b\b$$$$$", "x", (0, 1));
 matiter!(wb40, r"\b.+\b", "$$abc$$", (2, 5));
+matiter!(wb41, r"\b", "a b c", (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
 
 matiter!(nb1, r"\Bfoo\B", "n foo xfoox that", (7, 10));
 matiter!(nb2, r"a\B", "faoa x", (1, 2));