Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor perf improvements and code touch ups. #222

Merged
merged 3 commits into from
May 1, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions bench/src/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,23 @@ pub use ffi::tcl::Regex;
// Due to macro scoping rules, this definition applies only to the modules
// defined below. Effectively, it allows us to use the same tests for both
// native and dynamic regexes.
#[cfg(not(feature = "re-rust-bytes"))]
#[cfg(not(feature = "re-rust-plugin"))]
macro_rules! regex {
// Usage: regex!(pattern)
//
// Expands to `::Regex::new(pattern).unwrap()`, so a pattern that cannot be
// built panics at the call site instead of surfacing an error value.
($re:expr) => { ::Regex::new($re).unwrap() }
}

#[cfg(feature = "re-rust-bytes")]
#[cfg(not(feature = "re-rust-plugin"))]
macro_rules! regex {
// Usage: regex!(pattern)
//
// Variant selected when the `re-rust-bytes` feature is enabled: the regex is
// built through `regex::bytes::RegexBuilder` and the result is unwrapped, so
// a pattern that cannot be built panics at the call site.
($re:expr) => {{
// Always enable the Unicode flag for byte based regexes.
// Really, this should have been enabled by default. *sigh*
use regex::bytes::RegexBuilder;
RegexBuilder::new($re).unicode(true).compile().unwrap()
}}
}

// Usage: text!(haystack)
//
// Builds a ::Text from an owned string.
Expand Down
19 changes: 0 additions & 19 deletions bench/src/sherlock.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,32 +101,17 @@ sherlock!(everything_greedy_nl, r"(?s).*", 1);
// How fast can we match every letter? This also defeats any clever prefix
// tricks.
#[cfg(not(feature = "re-tcl"))]
#[cfg(not(feature = "re-rust-bytes"))]
sherlock!(letters, r"\p{L}", 447160);
#[cfg(not(feature = "re-tcl"))]
#[cfg(feature = "re-rust-bytes")]
sherlock!(letters, r"(?u)\p{L}", 447160);

#[cfg(not(feature = "re-tcl"))]
#[cfg(not(feature = "re-rust-bytes"))]
sherlock!(letters_upper, r"\p{Lu}", 14180);
#[cfg(not(feature = "re-tcl"))]
#[cfg(feature = "re-rust-bytes")]
sherlock!(letters_upper, r"(?u)\p{Lu}", 14180);

#[cfg(not(feature = "re-tcl"))]
#[cfg(not(feature = "re-rust-bytes"))]
sherlock!(letters_lower, r"\p{Ll}", 432980);
#[cfg(not(feature = "re-tcl"))]
#[cfg(feature = "re-rust-bytes")]
sherlock!(letters_lower, r"(?u)\p{Ll}", 432980);

// Similarly, for words.
#[cfg(not(feature = "re-rust-bytes"))]
#[cfg(not(feature = "re-re2"))]
sherlock!(words, r"\w+", 109214);
#[cfg(feature = "re-rust-bytes")]
sherlock!(words, r"(?u)\w+", 109214);
#[cfg(feature = "re-re2")]
sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?

Expand Down Expand Up @@ -195,8 +180,4 @@ sherlock!(ing_suffix, r"[a-zA-Z]+ing", 2824);
//
// Onig does surprisingly well on this benchmark and yet does quite poorly on
// the ing_suffix benchmark. That one has me stumped.
//
// Interestingly, this is slower in the rust-bytes benchmark, presumably
// because scanning for one of the bytes in the Unicode *unaware* `\s` ends
// up being slower than avoiding the prefix scan at all.
sherlock!(ing_suffix_limited_space, r"\s[a-zA-Z]{0,12}ing\s", 2081);
97 changes: 51 additions & 46 deletions regex-syntax/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,13 +114,7 @@ impl Parser {
'{' => try!(self.parse_counted_repeat()),
'[' => match self.maybe_parse_ascii() {
None => try!(self.parse_class()),
Some(cls) => {
Build::Expr(if self.flags.unicode {
Expr::Class(cls)
} else {
Expr::ClassBytes(cls.to_byte_class())
})
}
Some(cls) => Build::Expr(Expr::Class(cls)),
},
'^' => {
if self.flags.multi {
Expand Down Expand Up @@ -224,11 +218,7 @@ impl Parser {
}
'd'|'s'|'w'|'D'|'S'|'W' => {
self.bump();
Ok(Build::Expr(if self.flags.unicode {
Expr::Class(self.parse_perl_class(c))
} else {
Expr::ClassBytes(self.parse_perl_class(c).to_byte_class())
}))
Ok(Build::Expr(Expr::Class(self.parse_perl_class(c))))
}
c => Err(self.err(ErrorKind::UnrecognizedEscape(c))),
}
Expand Down Expand Up @@ -1328,16 +1318,28 @@ mod tests {
ByteClass::new(ranges)
}

fn asciid() -> ByteClass {
ascii_class("digit").unwrap().to_byte_class()
fn asciid() -> CharClass {
ascii_class("digit").unwrap()
}

/// Test helper: the character class that `ascii_class` yields for the name
/// `"space"`.
///
/// Panics if `ascii_class("space")` does not produce a class.
fn asciis() -> CharClass {
    let space = ascii_class("space");
    space.unwrap()
}

/// Test helper: the character class that `ascii_class` yields for the name
/// `"word"`.
///
/// Panics if `ascii_class("word")` does not produce a class.
fn asciiw() -> CharClass {
    let word = ascii_class("word");
    word.unwrap()
}

fn asciis() -> ByteClass {
ascii_class("space").unwrap().to_byte_class()
fn asciid_bytes() -> ByteClass {
asciid().to_byte_class()
}

fn asciiw() -> ByteClass {
ascii_class("word").unwrap().to_byte_class()
fn asciis_bytes() -> ByteClass {
asciis().to_byte_class()
}

/// Test helper: the class returned by `asciiw()`, converted with
/// `to_byte_class()`.
fn asciiw_bytes() -> ByteClass {
    let word = asciiw();
    word.to_byte_class()
}

#[test]
Expand Down Expand Up @@ -1905,79 +1907,79 @@ mod tests {
#[test]
fn escape_perl_d() {
assert_eq!(p(r"\d"), Expr::Class(class(PERLD)));
assert_eq!(pb(r"(?-u)\d"), Expr::ClassBytes(asciid()));
assert_eq!(pb(r"(?-u)\d"), Expr::Class(asciid()));
}

#[test]
fn escape_perl_s() {
assert_eq!(p(r"\s"), Expr::Class(class(PERLS)));
assert_eq!(pb(r"(?-u)\s"), Expr::ClassBytes(asciis()));
assert_eq!(pb(r"(?-u)\s"), Expr::Class(asciis()));
}

#[test]
fn escape_perl_w() {
assert_eq!(p(r"\w"), Expr::Class(class(PERLW)));
assert_eq!(pb(r"(?-u)\w"), Expr::ClassBytes(asciiw()));
assert_eq!(pb(r"(?-u)\w"), Expr::Class(asciiw()));
}

#[test]
fn escape_perl_d_negate() {
assert_eq!(p(r"\D"), Expr::Class(class(PERLD).negate()));
assert_eq!(pb(r"(?-u)\D"), Expr::ClassBytes(asciid().negate()));
assert_eq!(pb(r"(?-u)\D"), Expr::Class(asciid().negate()));
}

#[test]
fn escape_perl_s_negate() {
assert_eq!(p(r"\S"), Expr::Class(class(PERLS).negate()));
assert_eq!(pb(r"(?-u)\S"), Expr::ClassBytes(asciis().negate()));
assert_eq!(pb(r"(?-u)\S"), Expr::Class(asciis().negate()));
}

#[test]
fn escape_perl_w_negate() {
assert_eq!(p(r"\W"), Expr::Class(class(PERLW).negate()));
assert_eq!(pb(r"(?-u)\W"), Expr::ClassBytes(asciiw().negate()));
assert_eq!(pb(r"(?-u)\W"), Expr::Class(asciiw().negate()));
}

#[test]
fn escape_perl_d_case_fold() {
assert_eq!(p(r"(?i)\d"), Expr::Class(class(PERLD).case_fold()));
assert_eq!(pb(r"(?i-u)\d"), Expr::ClassBytes(asciid().case_fold()));
assert_eq!(pb(r"(?i-u)\d"), Expr::Class(asciid().case_fold()));
}

#[test]
fn escape_perl_s_case_fold() {
assert_eq!(p(r"(?i)\s"), Expr::Class(class(PERLS).case_fold()));
assert_eq!(pb(r"(?i-u)\s"), Expr::ClassBytes(asciis().case_fold()));
assert_eq!(pb(r"(?i-u)\s"), Expr::Class(asciis().case_fold()));
}

#[test]
fn escape_perl_w_case_fold() {
assert_eq!(p(r"(?i)\w"), Expr::Class(class(PERLW).case_fold()));
assert_eq!(pb(r"(?i-u)\w"), Expr::ClassBytes(asciiw().case_fold()));
assert_eq!(pb(r"(?i-u)\w"), Expr::Class(asciiw().case_fold()));
}

#[test]
fn escape_perl_d_case_fold_negate() {
assert_eq!(p(r"(?i)\D"),
Expr::Class(class(PERLD).case_fold().negate()));
let bytes = asciid().case_fold().negate();
assert_eq!(pb(r"(?i-u)\D"), Expr::ClassBytes(bytes));
assert_eq!(pb(r"(?i-u)\D"), Expr::Class(bytes));
}

#[test]
fn escape_perl_s_case_fold_negate() {
assert_eq!(p(r"(?i)\S"),
Expr::Class(class(PERLS).case_fold().negate()));
let bytes = asciis().case_fold().negate();
assert_eq!(pb(r"(?i-u)\S"), Expr::ClassBytes(bytes));
assert_eq!(pb(r"(?i-u)\S"), Expr::Class(bytes));
}

#[test]
fn escape_perl_w_case_fold_negate() {
assert_eq!(p(r"(?i)\W"),
Expr::Class(class(PERLW).case_fold().negate()));
let bytes = asciiw().case_fold().negate();
assert_eq!(pb(r"(?i-u)\W"), Expr::ClassBytes(bytes));
assert_eq!(pb(r"(?i-u)\W"), Expr::Class(bytes));
}

#[test]
Expand Down Expand Up @@ -2039,11 +2041,11 @@ mod tests {
assert_eq!(p(r"[^\w]"), Expr::Class(class(PERLW).negate()));
assert_eq!(p(r"[^\s]"), Expr::Class(class(PERLS).negate()));

let bytes = asciid().negate();
let bytes = asciid_bytes().negate();
assert_eq!(pb(r"(?-u)[^\d]"), Expr::ClassBytes(bytes));
let bytes = asciiw().negate();
let bytes = asciiw_bytes().negate();
assert_eq!(pb(r"(?-u)[^\w]"), Expr::ClassBytes(bytes));
let bytes = asciis().negate();
let bytes = asciis_bytes().negate();
assert_eq!(pb(r"(?-u)[^\s]"), Expr::ClassBytes(bytes));
}

Expand All @@ -2053,17 +2055,18 @@ mod tests {
assert_eq!(p(r"[^\W]"), Expr::Class(class(PERLW)));
assert_eq!(p(r"[^\S]"), Expr::Class(class(PERLS)));

assert_eq!(pb(r"(?-u)[^\D]"), Expr::ClassBytes(asciid()));
assert_eq!(pb(r"(?-u)[^\W]"), Expr::ClassBytes(asciiw()));
assert_eq!(pb(r"(?-u)[^\S]"), Expr::ClassBytes(asciis()));
assert_eq!(pb(r"(?-u)[^\D]"), Expr::ClassBytes(asciid_bytes()));
assert_eq!(pb(r"(?-u)[^\W]"), Expr::ClassBytes(asciiw_bytes()));
assert_eq!(pb(r"(?-u)[^\S]"), Expr::ClassBytes(asciis_bytes()));
}

#[test]
fn class_singleton_class_casei() {
assert_eq!(p(r"(?i)[\d]"), Expr::Class(class(PERLD).case_fold()));
assert_eq!(p(r"(?i)[\p{Yi}]"), Expr::Class(class(YI).case_fold()));

assert_eq!(pb(r"(?i-u)[\d]"), Expr::ClassBytes(asciid().case_fold()));
assert_eq!(pb(r"(?i-u)[\d]"),
Expr::ClassBytes(asciid_bytes().case_fold()));
}

#[test]
Expand All @@ -2075,11 +2078,11 @@ mod tests {
assert_eq!(p(r"(?i)[^\s]"),
Expr::Class(class(PERLS).case_fold().negate()));

let bytes = asciid().case_fold().negate();
let bytes = asciid_bytes().case_fold().negate();
assert_eq!(pb(r"(?i-u)[^\d]"), Expr::ClassBytes(bytes));
let bytes = asciiw().case_fold().negate();
let bytes = asciiw_bytes().case_fold().negate();
assert_eq!(pb(r"(?i-u)[^\w]"), Expr::ClassBytes(bytes));
let bytes = asciis().case_fold().negate();
let bytes = asciis_bytes().case_fold().negate();
assert_eq!(pb(r"(?i-u)[^\s]"), Expr::ClassBytes(bytes));
}

Expand All @@ -2089,9 +2092,12 @@ mod tests {
assert_eq!(p(r"(?i)[^\W]"), Expr::Class(class(PERLW).case_fold()));
assert_eq!(p(r"(?i)[^\S]"), Expr::Class(class(PERLS).case_fold()));

assert_eq!(pb(r"(?i-u)[^\D]"), Expr::ClassBytes(asciid().case_fold()));
assert_eq!(pb(r"(?i-u)[^\W]"), Expr::ClassBytes(asciiw().case_fold()));
assert_eq!(pb(r"(?i-u)[^\S]"), Expr::ClassBytes(asciis().case_fold()));
assert_eq!(pb(r"(?i-u)[^\D]"),
Expr::ClassBytes(asciid_bytes().case_fold()));
assert_eq!(pb(r"(?i-u)[^\W]"),
Expr::ClassBytes(asciiw_bytes().case_fold()));
assert_eq!(pb(r"(?i-u)[^\S]"),
Expr::ClassBytes(asciis_bytes().case_fold()));
}

#[test]
Expand Down Expand Up @@ -2184,8 +2190,7 @@ mod tests {
assert_eq!(p("[:upper:]"), Expr::Class(class(UPPER)));
assert_eq!(p("[[:upper:]]"), Expr::Class(class(UPPER)));

assert_eq!(pb("(?-u)[:upper:]"),
Expr::ClassBytes(class(UPPER).to_byte_class()));
assert_eq!(pb("(?-u)[:upper:]"), Expr::Class(class(UPPER)));
assert_eq!(pb("(?-u)[[:upper:]]"),
Expr::ClassBytes(class(UPPER).to_byte_class()));
}
Expand Down Expand Up @@ -2233,7 +2238,7 @@ mod tests {
Expr::Class(class(UPPER).case_fold()));

assert_eq!(pb("(?i-u)[:upper:]"),
Expr::ClassBytes(class(UPPER).to_byte_class().case_fold()));
Expr::Class(class(UPPER).case_fold()));
assert_eq!(pb("(?i-u)[[:upper:]]"),
Expr::ClassBytes(class(UPPER).to_byte_class().case_fold()));
}
Expand Down
Loading