Skip to content

Commit

Permalink
Fix #204.
Browse files Browse the repository at this point in the history
The DFA handles word boundaries by tagging each state with an `is_word`
flag that lets us determine whether the next byte in the haystack should
cause a word boundary instruction to match. We were mishandling how this
tagging happened for start states. In particular, the tag was not part of
the index into the start state cache, so a lookup could wind up choosing
an incorrect but previously computed start state with the wrong flags set.
This leads to incorrect matches.

We fix this by generating the cache index from the right flags: the start/end
and line-anchor empty flags together with the state's `is_word` flag.
  • Loading branch information
BurntSushi committed Apr 23, 2016
1 parent 767f939 commit c12c28b
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 13 deletions.
26 changes: 13 additions & 13 deletions src/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1319,7 +1319,19 @@ impl<'a> Fsm<'a> {
empty_flags: EmptyFlags,
state_flags: StateFlags,
) -> Option<StatePtr> {
let flagi = empty_flags.as_index();
// Compute an index into our cache of start states based on the set
// of empty/state flags set at the current position in the input. We
// don't use every flag since not all flags matter. For example, since
// matches are delayed by one byte, start states can never be match
// states.
let flagi = {
(((empty_flags.start as u8) << 0) |
((empty_flags.end as u8) << 1) |
((empty_flags.start_line as u8) << 2) |
((empty_flags.end_line as u8) << 3) |
((state_flags.is_word() as u8) << 4))
as usize
};
match self.cache.start_states[flagi] {
STATE_UNKNOWN => {}
STATE_DEAD => return Some(STATE_DEAD),
Expand Down Expand Up @@ -1592,18 +1604,6 @@ impl Transitions {
}
}

impl EmptyFlags {
    /// Packs the six flags into one integer, one bit per flag.
    ///
    /// Bit layout, least significant first: `start`, `end`, `start_line`,
    /// `end_line`, `word_boundary`, `not_word_boundary`.
    fn as_index(&self) -> usize {
        // Text anchors occupy bits 0 and 1.
        let text_anchors = (self.start as u8) | ((self.end as u8) << 1);
        // Line anchors occupy bits 2 and 3.
        let line_anchors =
            ((self.start_line as u8) << 2) | ((self.end_line as u8) << 3);
        // Word-boundary assertions occupy bits 4 and 5.
        let word_assertions = ((self.word_boundary as u8) << 4)
            | ((self.not_word_boundary as u8) << 5);
        usize::from(text_anchors | line_anchors | word_assertions)
    }
}

impl StateFlags {
fn is_match(&self) -> bool {
self.0 & 0b0000000_1 > 0
Expand Down
5 changes: 5 additions & 0 deletions tests/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,8 @@ expand!(expand9, r"(?P<a>\w+)\s+(?P<b>\d+)",
"abc 123", " $b $a ", " 123 abc ");
expand!(expand10, r"(?P<a>\w+)\s+(?P<b>\d+)",
"abc 123", "$bz$az", "");

// Splitting on runs of whitespace (spaces, newlines and tabs, including
// mixed runs) is expected to yield only the five single-letter pieces.
split!(split1, r"\s+", "a b\nc\td\n\t e",
&[t!("a"), t!("b"), t!("c"), t!("d"), t!("e")]);
// Splitting on the empty-width `\b` assertion: the expected pieces start
// with an empty string and keep each single-space separator as its own item.
split!(split2, r"\b", "a b c",
&[t!(""), t!("a"), t!(" "), t!("b"), t!(" "), t!("c")]);
11 changes: 11 additions & 0 deletions tests/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,14 @@ macro_rules! nomatset {
}
}
}

// Declares a `#[test]` function named `$name` that compiles `$re` with
// `regex!`, splits `t!($text)` on it, collects the pieces into a `Vec`, and
// asserts that they are equal to `$expected`.
macro_rules! split {
    ($name:ident, $re:expr, $text:expr, $expected:expr) => {
        #[test]
        fn $name() {
            let pattern = regex!($re);
            let pieces = pattern.split(t!($text)).collect::<Vec<_>>();
            // `$expected` stays on the left so a failure report keeps the
            // same left/right orientation.
            assert_eq!($expected, &*pieces);
        }
    }
}
8 changes: 8 additions & 0 deletions tests/regression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ mat!(uni_case_lower_nocase_flag, u!(r"(?i)\p{Ll}+"), "ΛΘΓΔα", Some((0, 10))
// See: https://github.com/rust-lang-nursery/regex/issues/191
mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3)));

// burntsushi was bad and didn't create an issue for this bug.
mat!(anchored_prefix1, r"^a\S", "a ", None);
mat!(anchored_prefix2, r"^a\S", "foo boo a ", None);
mat!(anchored_prefix3, r"^-[a-z]", "r-f", None);

// See: https://github.com/rust-lang-nursery/regex/issues/204
split!(split_on_word_boundary, r"\b", r"Should this (work?)",
&[t!(""), t!("Should"), t!(" "), t!("this"),
t!(" ("), t!("work"), t!("?)")]);
matiter!(word_boundary_dfa, r"\b", "a b c",
(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
1 change: 1 addition & 0 deletions tests/word_boundary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ matiter!(wb37, r"^^^^^\b.$$$$$", "x", (0, 1));
matiter!(wb38, r"^^^^^\b$$$$$", "x");
matiter!(wb39, r"^^^^^\b\b\b.\b\b\b$$$$$", "x", (0, 1));
matiter!(wb40, r"\b.+\b", "$$abc$$", (2, 5));
matiter!(wb41, r"\b", "a b c", (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));

matiter!(nb1, r"\Bfoo\B", "n foo xfoox that", (7, 10));
matiter!(nb2, r"a\B", "faoa x", (1, 2));
Expand Down

0 comments on commit c12c28b

Please sign in to comment.