Skip to content

Commit

Permalink
grep: add "perfect" smart case detection
Browse files Browse the repository at this point in the history
This commit removes the previous smart case detection logic and replaces
it with detection based on the regex AST. This particular AST is a faithful
representation of the concrete syntax, which lets us be very precise in
how we handle it.

Closes #851
  • Loading branch information
BurntSushi committed Mar 14, 2018
1 parent a12868c commit 6d9bc5c
Show file tree
Hide file tree
Showing 4 changed files with 203 additions and 49 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ Bug fixes:
Support backslash escapes in globs.
* [BUG #832](https://github.com/BurntSushi/ripgrep/issues/832):
Clarify usage instructions for `-f/--file` flag.
* [BUG #851](https://github.com/BurntSushi/ripgrep/issues/851):
Fix `-S/--smart-case` detection once and for all.
* [BUG #852](https://github.com/BurntSushi/ripgrep/issues/852):
Be robust with respect to `ENOMEM` errors returned by `mmap`.
* [BUG #853](https://github.com/BurntSushi/ripgrep/issues/853):
Expand Down
1 change: 1 addition & 0 deletions grep/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ pub use search::{Grep, GrepBuilder, Iter, Match};
mod literals;
mod nonl;
mod search;
mod smart_case;
mod word_boundary;

/// Result is a convenient type alias that fixes the type of the error to
Expand Down
58 changes: 9 additions & 49 deletions grep/src/search.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
use memchr::{memchr, memrchr};
use syntax::ParserBuilder;
use syntax::hir::Hir;
use regex::bytes::{Regex, RegexBuilder};

use literals::LiteralSets;
use nonl;
use syntax::ParserBuilder;
use syntax::hir::Hir;
use smart_case::Cased;
use word_boundary::strip_unicode_word_boundaries;
use Result;

Expand Down Expand Up @@ -205,7 +206,11 @@ impl GrepBuilder {
if !self.opts.case_smart {
return Ok(false);
}
Ok(!has_uppercase_literal(&self.pattern))
let cased = match Cased::from_pattern(&self.pattern) {
None => return Ok(false),
Some(cased) => cased,
};
Ok(cased.any_literal && !cased.any_uppercase)
}
}

Expand Down Expand Up @@ -311,44 +316,15 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
}
}

/// Determine whether the pattern contains an uppercase character which should
/// negate the effect of the smart-case option.
///
/// Ideally we would be able to check the AST in order to correctly handle
/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
/// cased), but we don't currently have that option. For now, our 'good enough'
/// solution is to simply perform a semi-naïve scan of the input pattern and
/// ignore all characters following a '\'. The ExprBuilder will handle any
/// actual errors, and this at least lets us support the most common cases,
/// like 'foo\w' and 'foo\S', in an intuitive manner.
fn has_uppercase_literal(pattern: &str) -> bool {
let mut chars = pattern.chars();
while let Some(c) = chars.next() {
if c == '\\' {
chars.next();
} else if c.is_uppercase() {
return true;
}
}
false
}

#[cfg(test)]
mod tests {
#![allow(unused_imports)]

use memchr::{memchr, memrchr};
use regex::bytes::Regex;

use super::{GrepBuilder, Match, has_uppercase_literal};
use super::{GrepBuilder, Match};

static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");

#[allow(dead_code)]
fn s(bytes: &[u8]) -> String {
String::from_utf8(bytes.to_vec()).unwrap()
}

fn find_lines(pat: &str, haystack: &[u8]) -> Vec<Match> {
let re = Regex::new(pat).unwrap();
let mut lines = vec![];
Expand Down Expand Up @@ -377,20 +353,4 @@ mod tests {
assert_eq!(expected.len(), got.len());
assert_eq!(expected, got);
}

#[test]
fn pattern_case() {
assert_eq!(has_uppercase_literal(&"".to_string()), false);
assert_eq!(has_uppercase_literal(&"foo".to_string()), false);
assert_eq!(has_uppercase_literal(&"Foo".to_string()), true);
assert_eq!(has_uppercase_literal(&"foO".to_string()), true);
assert_eq!(has_uppercase_literal(&"foo\\\\".to_string()), false);
assert_eq!(has_uppercase_literal(&"foo\\w".to_string()), false);
assert_eq!(has_uppercase_literal(&"foo\\S".to_string()), false);
assert_eq!(has_uppercase_literal(&"foo\\p{Ll}".to_string()), true);
assert_eq!(has_uppercase_literal(&"foo[a-z]".to_string()), false);
assert_eq!(has_uppercase_literal(&"foo[A-Z]".to_string()), true);
assert_eq!(has_uppercase_literal(&"foo[\\S\\t]".to_string()), false);
assert_eq!(has_uppercase_literal(&"foo\\\\S".to_string()), true);
}
}
191 changes: 191 additions & 0 deletions grep/src/smart_case.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
use syntax::ast::{self, Ast};
use syntax::ast::parse::Parser;

/// The results of analyzing a regex for cased literals.
#[derive(Clone, Debug, Default)]
pub struct Cased {
/// True if and only if a literal uppercase character occurs in the regex.
///
/// A regex like `\pL` contains no uppercase literals, even though `L`
/// is uppercase and the `\pL` class contains uppercase characters.
pub any_uppercase: bool,
/// True if and only if the regex contains any literal at all. A regex like
/// `\pL` has this set to false.
pub any_literal: bool,
}

impl Cased {
/// Returns a `Cased` value by doing analysis on the AST of `pattern`.
///
/// If `pattern` is not a valid regular expression, then `None` is
/// returned.
pub fn from_pattern(pattern: &str) -> Option<Cased> {
Parser::new()
.parse(pattern)
.map(|ast| Cased::from_ast(&ast))
.ok()
}

fn from_ast(ast: &Ast) -> Cased {
let mut cased = Cased::default();
cased.from_ast_impl(ast);
cased
}

fn from_ast_impl(&mut self, ast: &Ast) {
if self.done() {
return;
}
match *ast {
Ast::Empty(_)
| Ast::Flags(_)
| Ast::Dot(_)
| Ast::Assertion(_)
| Ast::Class(ast::Class::Unicode(_))
| Ast::Class(ast::Class::Perl(_)) => {}
Ast::Literal(ref x) => {
self.from_ast_literal(x);
}
Ast::Class(ast::Class::Bracketed(ref x)) => {
self.from_ast_class_set(&x.kind);
}
Ast::Repetition(ref x) => {
self.from_ast_impl(&x.ast);
}
Ast::Group(ref x) => {
self.from_ast_impl(&x.ast);
}
Ast::Alternation(ref alt) => {
for x in &alt.asts {
self.from_ast_impl(x);
}
}
Ast::Concat(ref alt) => {
for x in &alt.asts {
self.from_ast_impl(x);
}
}
}
}

fn from_ast_class_set(&mut self, ast: &ast::ClassSet) {
if self.done() {
return;
}
match *ast {
ast::ClassSet::Item(ref item) => {
self.from_ast_class_set_item(item);
}
ast::ClassSet::BinaryOp(ref x) => {
self.from_ast_class_set(&x.lhs);
self.from_ast_class_set(&x.rhs);
}
}
}

fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) {
if self.done() {
return;
}
match *ast {
ast::ClassSetItem::Empty(_)
| ast::ClassSetItem::Ascii(_)
| ast::ClassSetItem::Unicode(_)
| ast::ClassSetItem::Perl(_) => {}
ast::ClassSetItem::Literal(ref x) => {
self.from_ast_literal(x);
}
ast::ClassSetItem::Range(ref x) => {
self.from_ast_literal(&x.start);
self.from_ast_literal(&x.end);
}
ast::ClassSetItem::Bracketed(ref x) => {
self.from_ast_class_set(&x.kind);
}
ast::ClassSetItem::Union(ref union) => {
for x in &union.items {
self.from_ast_class_set_item(x);
}
}
}
}

fn from_ast_literal(&mut self, ast: &ast::Literal) {
self.any_literal = true;
self.any_uppercase = self.any_uppercase || ast.c.is_uppercase();
}

/// Returns true if and only if the attributes can never change no matter
/// what other AST it might see.
fn done(&self) -> bool {
self.any_uppercase && self.any_literal
}
}

#[cfg(test)]
mod tests {
use super::*;

fn cased(pattern: &str) -> Cased {
Cased::from_pattern(pattern).unwrap()
}

#[test]
fn various() {
let x = cased("");
assert!(!x.any_uppercase);
assert!(!x.any_literal);

let x = cased("foo");
assert!(!x.any_uppercase);
assert!(x.any_literal);

let x = cased("Foo");
assert!(x.any_uppercase);
assert!(x.any_literal);

let x = cased("foO");
assert!(x.any_uppercase);
assert!(x.any_literal);

let x = cased(r"foo\\");
assert!(!x.any_uppercase);
assert!(x.any_literal);

let x = cased(r"foo\w");
assert!(!x.any_uppercase);
assert!(x.any_literal);

let x = cased(r"foo\S");
assert!(!x.any_uppercase);
assert!(x.any_literal);

let x = cased(r"foo\p{Ll}");
assert!(!x.any_uppercase);
assert!(x.any_literal);

let x = cased(r"foo[a-z]");
assert!(!x.any_uppercase);
assert!(x.any_literal);

let x = cased(r"foo[A-Z]");
assert!(x.any_uppercase);
assert!(x.any_literal);

let x = cased(r"foo[\S\t]");
assert!(!x.any_uppercase);
assert!(x.any_literal);

let x = cased(r"foo\\S");
assert!(x.any_uppercase);
assert!(x.any_literal);

let x = cased(r"\p{Ll}");
assert!(!x.any_uppercase);
assert!(!x.any_literal);

let x = cased(r"aBc\w");
assert!(x.any_uppercase);
assert!(x.any_literal);
}
}

0 comments on commit 6d9bc5c

Please sign in to comment.