Skip to content

Commit

Permalink
regex: make CRLF hack more robust
Browse files Browse the repository at this point in the history
This commit improves the CRLF hack to be more robust. In particular, in
addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end
of a match if and only if the regex has an ending line anchor required for
a match. This doesn't quite make the hack 100% correct, but should fix most
use cases in practice. An example of a regex that will still be incorrect
is `foo|bar$`, since the analysis isn't quite sophisticated enough to
determine that a `\r` can be safely stripped from any match. Even if we
fix that, regexes like `foo\r|bar$` still won't be handled correctly. Alas,
more work on this front should really be focused on enabling this in the
regex engine itself.

The specific cause of this bug was that grep-searcher was sneakily
stripping CRLF from matching lines when it really shouldn't have. We remove
that code now, and instead rely on better match semantics provided at a
lower level.

Fixes #1095
  • Loading branch information
BurntSushi committed Jan 26, 2019
1 parent e99b6bd commit 9d70311
Show file tree
Hide file tree
Showing 7 changed files with 267 additions and 18 deletions.
44 changes: 43 additions & 1 deletion grep-printer/src/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -817,7 +817,8 @@ impl<'a> SubMatches<'a> {

#[cfg(test)]
mod tests {
use grep_regex::RegexMatcher;
use grep_regex::{RegexMatcher, RegexMatcherBuilder};
use grep_matcher::LineTerminator;
use grep_searcher::SearcherBuilder;

use super::{JSON, JSONBuilder};
Expand Down Expand Up @@ -918,4 +919,45 @@ and exhibited clearly, with a label attached.\
assert_eq!(got.lines().count(), 2);
assert!(got.contains("begin") && got.contains("end"));
}

#[test]
fn missing_crlf() {
let haystack = "test\r\n".as_bytes();

let matcher = RegexMatcherBuilder::new()
.build("test")
.unwrap();
let mut printer = JSONBuilder::new()
.build(vec![]);
SearcherBuilder::new()
.build()
.search_reader(&matcher, haystack, printer.sink(&matcher))
.unwrap();
let got = printer_contents(&mut printer);
assert_eq!(got.lines().count(), 3);
assert!(
got.lines().nth(1).unwrap().contains(r"test\r\n"),
r"missing 'test\r\n' in '{}'",
got.lines().nth(1).unwrap(),
);

let matcher = RegexMatcherBuilder::new()
.crlf(true)
.build("test")
.unwrap();
let mut printer = JSONBuilder::new()
.build(vec![]);
SearcherBuilder::new()
.line_terminator(LineTerminator::crlf())
.build()
.search_reader(&matcher, haystack, printer.sink(&matcher))
.unwrap();
let got = printer_contents(&mut printer);
assert_eq!(got.lines().count(), 3);
assert!(
got.lines().nth(1).unwrap().contains(r"test\r\n"),
r"missing 'test\r\n' in '{}'",
got.lines().nth(1).unwrap(),
);
}
}
2 changes: 1 addition & 1 deletion grep-regex/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@ license = "Unlicense/MIT"
log = "0.4.5"
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
regex = "1.1"
regex-syntax = "0.6.4"
regex-syntax = "0.6.5"
thread_local = "0.3.6"
utf8-ranges = "1.0.1"
8 changes: 8 additions & 0 deletions grep-regex/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,14 @@ impl ConfiguredHIR {
non_matching_bytes(&self.expr)
}

/// Returns true if and only if this regex needs to have its match offsets
/// tweaked because of CRLF support. Specifically, this occurs when the
/// CRLF hack is enabled and the regex is line anchored at the end. In
/// this case, matches that end with a `\r` have the `\r` stripped.
pub fn needs_crlf_stripped(&self) -> bool {
self.config.crlf && self.expr.is_line_anchored_end()
}

/// Builds a regular expression from this HIR expression.
pub fn regex(&self) -> Result<Regex, Error> {
self.pattern_to_regex(&self.expr.to_string())
Expand Down
100 changes: 100 additions & 0 deletions grep-regex/src/crlf.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,105 @@
use std::collections::HashMap;

use grep_matcher::{Match, Matcher, NoError};
use regex::bytes::Regex;
use regex_syntax::hir::{self, Hir, HirKind};

use config::ConfiguredHIR;
use error::Error;
use matcher::RegexCaptures;

/// A matcher for implementing "word match" semantics.
#[derive(Clone, Debug)]
pub struct CRLFMatcher {
/// The regex.
regex: Regex,
/// A map from capture group name to capture group index.
names: HashMap<String, usize>,
}

impl CRLFMatcher {
/// Create a new matcher from the given pattern that strips `\r` from the
/// end of every match.
///
/// This panics if the given expression doesn't need its CRLF stripped.
pub fn new(expr: &ConfiguredHIR) -> Result<CRLFMatcher, Error> {
assert!(expr.needs_crlf_stripped());

let regex = expr.regex()?;
let mut names = HashMap::new();
for (i, optional_name) in regex.capture_names().enumerate() {
if let Some(name) = optional_name {
names.insert(name.to_string(), i.checked_sub(1).unwrap());
}
}
Ok(CRLFMatcher { regex, names })
}
}

impl Matcher for CRLFMatcher {
type Captures = RegexCaptures;
type Error = NoError;

fn find_at(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<Match>, NoError> {
let m = match self.regex.find_at(haystack, at) {
None => return Ok(None),
Some(m) => Match::new(m.start(), m.end()),
};
Ok(Some(adjust_match(haystack, m)))
}

fn new_captures(&self) -> Result<RegexCaptures, NoError> {
Ok(RegexCaptures::new(self.regex.capture_locations()))
}

fn capture_count(&self) -> usize {
self.regex.captures_len().checked_sub(1).unwrap()
}

fn capture_index(&self, name: &str) -> Option<usize> {
self.names.get(name).map(|i| *i)
}

fn captures_at(
&self,
haystack: &[u8],
at: usize,
caps: &mut RegexCaptures,
) -> Result<bool, NoError> {
caps.strip_crlf(false);
let r = self.regex.captures_read_at(caps.locations(), haystack, at);
if !r.is_some() {
return Ok(false);
}

// If the end of our match includes a `\r`, then strip it from all
// capture groups ending at the same location.
let end = caps.locations().get(0).unwrap().1;
if end > 0 && haystack.get(end - 1) == Some(&b'\r') {
caps.strip_crlf(true);
}
Ok(true)
}

// We specifically do not implement other methods like find_iter or
// captures_iter. Namely, the iter methods are guaranteed to be correct
// by virtue of implementing find_at and captures_at above.
}

/// If the given match ends with a `\r`, then return a new match that ends
/// immediately before the `\r`.
pub fn adjust_match(haystack: &[u8], m: Match) -> Match {
if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') {
m.with_end(m.end() - 1)
} else {
m
}
}

/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`.
///
/// This does not preserve the exact semantics of the given expression,
Expand Down
Loading

0 comments on commit 9d70311

Please sign in to comment.