Skip to content

Commit

Permalink
grep-regex: add fast path for -w/--word-regexp
Browse files Browse the repository at this point in the history
Previously, ripgrep would always defer to the regex engine's capturing
matches in order to implement word matching. Namely, ripgrep would
determine the correct match offsets via a capturing group, since the
word regex is itself generated from the user supplied regex.

Unfortunately, the regex engine's capturing mode is still fairly slow,
so this commit adds a fast path to avoid capturing mode in the vast
majority of cases. See comments in the code for details.
  • Loading branch information
BurntSushi committed Feb 17, 2020
1 parent 6a0e014 commit cd8ec38
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 4 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ Performance improvements:
of ` `.
* PERF:
Improve literal detection when the `-w/--word-regexp` flag is used.
* PERF:
Improve overall performance of the `-w/--word-regexp` flag.

Feature enhancements:

Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions grep-regex/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ license = "Unlicense/MIT"

[dependencies]
aho-corasick = "0.7.3"
bstr = "0.2.10"
grep-matcher = { version = "0.1.2", path = "../grep-matcher" }
log = "0.4.5"
regex = "1.1"
Expand Down
101 changes: 97 additions & 4 deletions grep-regex/src/word.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ use matcher::RegexCaptures;
pub struct WordMatcher {
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
regex: Regex,
/// The original regex supplied by the user, which we use in a fast path
/// to try and detect matches before deferring to slower engines.
original: Regex,
/// A map from capture group name to capture group index.
names: HashMap<String, usize>,
/// A reusable buffer for finding the match location of the inner group.
Expand All @@ -28,6 +31,7 @@ impl Clone for WordMatcher {
// usings `locs` to hit the fast path.
WordMatcher {
regex: self.regex.clone(),
original: self.original.clone(),
names: self.names.clone(),
locs: Arc::new(CachedThreadLocal::new()),
}
Expand All @@ -41,8 +45,13 @@ impl WordMatcher {
/// The given options are used to construct the regular expression
/// internally.
pub fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
let original = expr.with_pattern(|pat| {
format!("^(?:{})$", pat)
})?.regex()?;
let word_expr = expr.with_pattern(|pat| {
format!(r"(?:(?m:^)|\W)({})(?:(?m:$)|\W)", pat)
let pat = format!(r"(?:(?-m:^)|\W)({})(?:(?-m:$)|\W)", pat);
debug!("word regex: {:?}", pat);
pat
})?;
let regex = word_expr.regex()?;
let locs = Arc::new(CachedThreadLocal::new());
Expand All @@ -53,13 +62,65 @@ impl WordMatcher {
names.insert(name.to_string(), i.checked_sub(1).unwrap());
}
}
Ok(WordMatcher { regex, names, locs })
Ok(WordMatcher { regex, original, names, locs })
}

/// Return the underlying regex used by this matcher.
pub fn regex(&self) -> &Regex {
&self.regex
}

/// Attempt to do a fast confirmation of a word match that covers a subset
/// (but hopefully a big subset) of most cases. Ok(Some(..)) is returned
/// when a match is found. Ok(None) is returned when there is definitively
/// no match. Err(()) is returned when this routine could not detect
/// whether there was a match or not.
fn fast_find(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<Match>, ()> {
// This is a bit hairy. The whole point here is to avoid running an
// NFA simulation in the regex engine. Remember, our word regex looks
// like this:
//
// (^|\W)(<original regex>)($|\W)
// where ^ and $ have multiline mode DISABLED
//
// What we want are the match offsets of <original regex>. So in the
// easy/common case, the original regex will be sandwiched between
// two codepoints that are in the \W class. So our approach here is to
// look for a match of the overall word regexp, strip the \W ends and
// then check whether the original regex matches what's left. If so,
// then we are guaranteed a correct match.
//
// This only works though if we know that the match is sandwiched
// between two \W codepoints. This only occurs when neither ^ nor $
// match. This in turn only occurs when the match is at either the
// beginning or end of the haystack. In either of those cases, we
// declare defeat and defer to the slower implementation.
//
// The reason why we cannot handle the ^/$ cases here is because we
// can't assume anything about the original pattern. (Try commenting
// out the checks for ^/$ below and run the tests to see examples.)
let mut cand = match self.regex.find_at(haystack, at) {
None => return Ok(None),
Some(m) => Match::new(m.start(), m.end()),
};
if cand.start() == 0 || cand.end() == haystack.len() {
return Err(());
}
let (_, slen) = bstr::decode_utf8(&haystack[cand]);
let (_, elen) = bstr::decode_last_utf8(&haystack[cand]);
cand = cand
.with_start(cand.start() + slen)
.with_end(cand.end() - elen);
if self.original.is_match(&haystack[cand]) {
Ok(Some(cand))
} else {
Err(())
}
}
}

impl Matcher for WordMatcher {
Expand All @@ -76,6 +137,16 @@ impl Matcher for WordMatcher {
// of `0`. We *could* use `find_at` here and then trim the match after
// the fact, but that's a bit harder to get right, and it's not clear
// if it's worth it.
//
// OK, well, it turns out that it is worth it! But it is quite tricky.
// See `fast_find` for details. Effectively, this lets us skip running
// the NFA simulation in the regex engine in the vast majority of
// cases. However, the NFA simulation is required for full correctness.
match self.fast_find(haystack, at) {
Ok(Some(m)) => return Ok(Some(m)),
Ok(None) => return Ok(None),
Err(()) => {}
}

let cell = self.locs.get_or(|| {
RefCell::new(self.regex.capture_locations())
Expand Down Expand Up @@ -152,9 +223,31 @@ mod tests {

assert_eq!(Some((0, 3)), find(r"foo", "foo☃"));
assert_eq!(None, find(r"foo", "fooб"));
// assert_eq!(Some((0, 3)), find(r"foo", "fooб"));

// See: https://github.com/BurntSushi/ripgrep/issues/389
assert_eq!(Some((0, 4)), find(r"foo5", "foo5"));
assert_eq!(None, find(r"foo", "foo5"));

assert_eq!(Some((1, 4)), find(r"foo", "!foo!"));
assert_eq!(Some((1, 5)), find(r"foo!", "!foo!"));
assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!"));

assert_eq!(Some((0, 3)), find(r"foo", "foo\n"));
assert_eq!(Some((1, 4)), find(r"foo", "!foo!\n"));
assert_eq!(Some((1, 5)), find(r"foo!", "!foo!\n"));
assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!\n"));

assert_eq!(Some((1, 6)), find(r"!?foo!?", "!!foo!!"));
assert_eq!(Some((0, 5)), find(r"!?foo!?", "!foo!"));
assert_eq!(Some((2, 5)), find(r"!?foo!?", "a!foo!a"));

assert_eq!(Some((2, 7)), find(r"!?foo!?", "##!foo!\n"));
assert_eq!(Some((3, 7)), find(r"f?oo!?", "##\nfoo!##"));
assert_eq!(Some((2, 5)), find(r"(?-u)foo[^a]*", "#!foo☃aaa"));
}

// See: https://github.com/BurntSushi/ripgrep/issues/389
#[test]
fn regression_dash() {
assert_eq!(Some((0, 2)), find(r"-2", "-2"));
}

Expand Down

0 comments on commit cd8ec38

Please sign in to comment.