From 4e3a10737603194cbcecf0b4ccaaaadec30bdf7a Mon Sep 17 00:00:00 2001 From: Matthew Krupcale Date: Sat, 24 Mar 2018 15:03:22 -0400 Subject: [PATCH] bench: add boost This commit adds a new `re-boost` feature that enables benchmarking Boost's regex implementation. Closes #459 --- bench/Cargo.toml | 3 ++- bench/build.rs | 9 +++++++++ bench/compile | 2 +- bench/run | 5 ++++- bench/src/bench.rs | 8 ++++++-- bench/src/ffi/mod.rs | 5 ++++- bench/src/ffi/stdcpp.cpp | 41 +++++++++++++++++++++++++--------------- bench/src/main.rs | 9 ++++++++- bench/src/misc.rs | 2 ++ bench/src/sherlock.rs | 29 +++++++++++++++++++++++----- 10 files changed, 86 insertions(+), 27 deletions(-) diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 1515ed4be8..82496ceb66 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -47,8 +47,9 @@ re-pcre1 = ["libpcre-sys"] re-pcre2 = [] re-onig = ["onig"] re-stdcpp = [] -re-re2 = [] libcxx = [] +re-boost = [] +re-re2 = [] re-dphobos = [] re-dphobos-dmd = ["re-dphobos"] re-dphobos-dmd-ct = ["re-dphobos-dmd"] diff --git a/bench/build.rs b/bench/build.rs index e71f4e1543..6b2d755b6a 100644 --- a/bench/build.rs +++ b/bench/build.rs @@ -35,6 +35,15 @@ fn main() { .compile("libcstdcpp.a"); } } + if env::var("CARGO_FEATURE_RE_BOOST").is_ok() { + // Boost.Regex is a C++ library, so we need to compile our shim layer (the stdcpp shim built with USE_BOOST defined). + cc::Build::new() + .cpp(true) + .file("src/ffi/stdcpp.cpp") + .define("USE_BOOST", None) + .compile("libcboost.a"); + println!("cargo:rustc-link-lib=boost_regex"); + } if env::var("CARGO_FEATURE_RE_RE2").is_ok() { // RE2 is a C++ library, so we need to compile our shim layer.
cc::Build::new() diff --git a/bench/compile b/bench/compile index 05219341a0..3a8d22ffe2 100755 --- a/bench/compile +++ b/bench/compile @@ -2,5 +2,5 @@ exec cargo build \ --release \ - --features 're-stdcpp re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \ + --features 're-stdcpp re-boost re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \ "$@" diff --git a/bench/run b/bench/run index f1d9f3db6c..800a4d5aff 100755 --- a/bench/run +++ b/bench/run @@ -1,7 +1,7 @@ #!/bin/bash usage() { - echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | re2 | onig | tcl ]" >&2 + echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | boost | re2 | onig | tcl ]" >&2 exit 1 } @@ -36,6 +36,9 @@ case $which in stdcpp-libcxx) exec cargo bench --bench bench --features 're-stdcpp libcxx' "$@" ;; + boost) + exec cargo bench --bench bench --features re-boost "$@" + ;; re2) exec cargo bench --bench bench --features re-re2 "$@" ;; diff --git a/bench/src/bench.rs b/bench/src/bench.rs index 803a1336ad..6cb56db850 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -28,14 +28,16 @@ extern crate regex; extern crate regex_syntax; extern crate test; - #[cfg(feature = "re-onig")] pub use ffi::onig::Regex; #[cfg(feature = "re-pcre1")] pub use ffi::pcre1::Regex; #[cfg(feature = "re-pcre2")] pub use ffi::pcre2::Regex; -#[cfg(feature = "re-stdcpp")] +#[cfg(any( + feature = "re-stdcpp", + feature = "re-boost", + ))] pub use ffi::stdcpp::Regex; #[cfg(feature = "re-re2")] pub use ffi::re2::Regex; @@ -93,6 +95,7 @@ macro_rules! 
text { feature = "re-pcre1", feature = "re-pcre2", feature = "re-stdcpp", + feature = "re-boost", feature = "re-re2", feature = "re-dphobos", feature = "re-rust", @@ -111,6 +114,7 @@ type Text = Vec<u8>; feature = "re-pcre1", feature = "re-pcre2", feature = "re-stdcpp", + feature = "re-boost", feature = "re-re2", feature = "re-dphobos", feature = "re-rust", diff --git a/bench/src/ffi/mod.rs b/bench/src/ffi/mod.rs index c2033f6313..e9733715ef 100644 --- a/bench/src/ffi/mod.rs +++ b/bench/src/ffi/mod.rs @@ -20,7 +20,10 @@ pub mod onig; pub mod pcre1; #[cfg(feature = "re-pcre2")] pub mod pcre2; -#[cfg(feature = "re-stdcpp")] +#[cfg(any( + feature = "re-stdcpp", + feature = "re-boost", + ))] pub mod stdcpp; #[cfg(feature = "re-re2")] pub mod re2; diff --git a/bench/src/ffi/stdcpp.cpp b/bench/src/ffi/stdcpp.cpp index 68d6bfb14f..d5abc9cdae 100644 --- a/bench/src/ffi/stdcpp.cpp +++ b/bench/src/ffi/stdcpp.cpp @@ -1,6 +1,17 @@ +#ifdef USE_BOOST +#include <boost/regex.hpp> +#else #include <regex> +#endif extern "C" { + +#ifdef USE_BOOST + namespace regex_ns = boost; +#else + namespace regex_ns = std; +#endif + typedef void stdcpp_regexp; typedef struct stdcpp_string { @@ -9,34 +20,34 @@ extern "C" { } stdcpp_string; stdcpp_regexp* stdcpp_regexp_new(stdcpp_string pat) { - return reinterpret_cast<stdcpp_regexp*>(new std::regex(pat.text, - pat.len, - std::regex::optimize)); + return reinterpret_cast<stdcpp_regexp*>(new regex_ns::regex(pat.text, + pat.len, + regex_ns::regex::optimize)); } void stdcpp_regexp_free(stdcpp_regexp *re) { - delete reinterpret_cast<std::regex*>(re); + delete reinterpret_cast<regex_ns::regex*>(re); } bool stdcpp_regexp_match(stdcpp_regexp *re, stdcpp_string text, int startpos, int endpos) { - std::regex cpp_re(*reinterpret_cast<std::regex*>(re)); - return std::regex_search(text.text + startpos, text.text + endpos, - cpp_re); + regex_ns::regex cpp_re(*reinterpret_cast<regex_ns::regex*>(re)); + return regex_ns::regex_search(text.text + startpos, text.text + endpos, + cpp_re); } bool stdcpp_regexp_find(stdcpp_regexp *re, stdcpp_string text, int startpos, int endpos, int
*match_start, int *match_end) { - std::regex cpp_re(*reinterpret_cast<std::regex*>(re)); - std::cmatch result; - bool matched; - matched = std::regex_search(text.text + startpos, text.text + endpos, - result, cpp_re); - if (matched) { + regex_ns::regex cpp_re(*reinterpret_cast<regex_ns::regex*>(re)); + regex_ns::cmatch result; + bool matched; + matched = regex_ns::regex_search(text.text + startpos, text.text + endpos, + result, cpp_re); + if (matched) { *match_start = result[0].first - text.text; *match_end = *match_start + result.length(0); - } - return matched; + } + return matched; } } diff --git a/bench/src/main.rs b/bench/src/main.rs index 11e2864425..e4dc7c933b 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -135,8 +135,15 @@ fn count_pcre2(pat: &str, haystack: &str) -> usize { Regex::new(pat).unwrap().find_iter(haystack).count() } +#[cfg(not(any( + feature = "re-stdcpp", + feature = "re-boost", + )))] nada!("re-stdcpp", count_stdcpp); -#[cfg(feature = "re-stdcpp")] +#[cfg(any( + feature = "re-stdcpp", + feature = "re-boost", + ))] fn count_stdcpp(pat: &str, haystack: &str) -> usize { use ffi::stdcpp::Regex; Regex::new(pat).unwrap().find_iter(haystack).count() diff --git a/bench/src/misc.rs b/bench/src/misc.rs index a9c09f3821..ad516e23e4 100644 --- a/bench/src/misc.rs +++ b/bench/src/misc.rs @@ -46,7 +46,9 @@ bench_match!(match_class_in_range, "[ac]", { }); #[cfg(not(feature = "re-rust-bytes"))] +// std C++ does not support unicode character classes #[cfg(not(feature = "re-stdcpp"))] +#[cfg(not(feature = "re-boost"))] #[cfg(not(feature = "re-tcl"))] bench_match!(match_class_unicode, r"\p{L}", { format!("{}a", repeat("☃5☃5").take(20).collect::<String>()) diff --git a/bench/src/sherlock.rs b/bench/src/sherlock.rs index 8f66799dc2..c54c9b772e 100644 --- a/bench/src/sherlock.rs +++ b/bench/src/sherlock.rs @@ -106,10 +106,14 @@ sherlock!(the_whitespace, r"the\s+\w+", 5410); #[cfg(not(feature = "re-pcre1"))] #[cfg(not(feature = "re-pcre2"))] #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))] #[cfg(not(feature = "re-tcl"))] sherlock!(everything_greedy, r".*", 13053); // std::regex . does not match \r -#[cfg(feature = "re-stdcpp")] +#[cfg(any( + feature = "re-stdcpp", + feature = "re-boost", + ))] sherlock!(everything_greedy, r"[^\n]*", 13053); #[cfg(not(feature = "re-dphobos"))] #[cfg(not(feature = "re-onig"))] @@ -122,24 +126,34 @@ sherlock!(everything_greedy_nl, r"(?s).*", 1); // How fast can we match every letter? This also defeats any clever prefix // tricks. +// std C++ does not support unicode character classes #[cfg(not(feature = "re-stdcpp"))] +#[cfg(not(feature = "re-boost"))] #[cfg(not(feature = "re-tcl"))] sherlock!(letters, r"\p{L}", 447160); +// std C++ does not support unicode character classes #[cfg(not(feature = "re-stdcpp"))] +#[cfg(not(feature = "re-boost"))] #[cfg(not(feature = "re-tcl"))] sherlock!(letters_upper, r"\p{Lu}", 14180); +// std C++ does not support unicode character classes #[cfg(not(feature = "re-stdcpp"))] +#[cfg(not(feature = "re-boost"))] #[cfg(not(feature = "re-tcl"))] sherlock!(letters_lower, r"\p{Ll}", 432980); // Similarly, for words. -#[cfg(not(feature = "re-re2"))] #[cfg(not(feature = "re-stdcpp"))] +#[cfg(not(feature = "re-boost"))] +#[cfg(not(feature = "re-re2"))] sherlock!(words, r"\w+", 109214); -#[cfg(feature = "re-re2")] -#[cfg(feature = "re-stdcpp")] +#[cfg(any( + feature = "re-stdcpp", + feature = "re-boost", + feature = "re-re2", + ))] sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here? // Find complete words before Holmes. The `\w` defeats any prefix @@ -162,6 +176,7 @@ sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7); #[cfg(not(feature = "re-pcre1"))] #[cfg(not(feature = "re-pcre2"))] #[cfg(not(feature = "re-stdcpp"))] +#[cfg(not(feature = "re-boost"))] #[cfg(not(feature = "re-tcl"))] sherlock!( holmes_coword_watson, @@ -178,13 +193,17 @@ sherlock!(quotes, r#"["'][^"']{0,30}[?!.]["']"#, 767); // lazy DFA the entire way. 
// std C++ does not support multiline until C++17 nor the inline modifier syntax #[cfg(not(feature = "re-stdcpp"))] +#[cfg(not(feature = "re-boost"))] #[cfg(not(feature = "re-dphobos"))] sherlock!( line_boundary_sherlock_holmes, r"(?m)^Sherlock Holmes|Sherlock Holmes$", 34); // D matches both \r\n and \n as EOL -#[cfg(feature = "re-dphobos")] +#[cfg(any( + feature = "re-boost", + feature = "re-dphobos", + ))] sherlock!( line_boundary_sherlock_holmes, r"(?m)^Sherlock Holmes|Sherlock Holmes$",