Skip to content

Commit

Permalink
Add support for C++ boost::regex to benchmarks
Browse files Browse the repository at this point in the history
This is very similar to the `std::regex` benchmark implementation since Boost.Regex and `std::regex` have very similar APIs and regex grammar support. As such, it uses the `stdcpp` Rust and C FFIs to reduce code duplication.

 * bench/Cargo.toml: add `re-boost` feature
 * bench/build.rs: add `cboost` library to bench build. This uses a compiler preprocessor definition to indicate whether or not to use Boost when compiling the `stdcpp` FFI.
 * bench/compile: add `re-boost` feature to bench compile script
 * bench/run: add `re-boost` feature to bench run script
 * bench/src/bench.rs: use `ffi::stdcpp::Regex`, define its `text!` macro, and `Text` type for feature `re-boost`
 * bench/src/ffi/mod.rs: declare `stdcpp` module for `re-boost` feature
 * bench/src/ffi/stdcpp.cpp: implement C API using C++ `boost::regex`. Because the Boost.Regex API is very similar to the `std::regex` API, the shim differs only in the included header and the namespace it uses.
 * bench/src/main.rs: add boost to bench main
 * bench/src/misc.rs:
    - do not run `match_class_unicode` benchmark for `re-boost` feature because `boost::regex` ECMAScript grammar does not support unicode character classes
 * bench/src/sherlock.rs:
    - do not run `letters`, `letters_upper`, and `letters_lower` benchmarks for `re-boost` feature because `boost::regex` ECMAScript grammar does not support unicode character classes
    - use a different regex for `everything_greedy` benchmark because `boost::regex` '.' does not match '\r'
    - `words` benchmark for `boost::regex` matches RE2 test result, so use that test for `re-boost` feature as well. Also fixes conditional compilation issue for `re-stdcpp`.
    - do not run `holmes_coword_watson` benchmark for `re-boost` feature because Boost.Regex implementation currently seems to have exponential behavior here
  • Loading branch information
mkrupcale committed Mar 24, 2018
1 parent abc30a8 commit 6c5158e
Show file tree
Hide file tree
Showing 10 changed files with 86 additions and 27 deletions.
3 changes: 2 additions & 1 deletion bench/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ re-pcre1 = ["libpcre-sys"]
re-pcre2 = []
re-onig = ["onig"]
re-stdcpp = []
re-re2 = []
libcxx = []
re-boost = []
re-re2 = []
re-dphobos = []
re-dphobos-dmd = ["re-dphobos"]
re-dphobos-dmd-ct = ["re-dphobos-dmd"]
Expand Down
9 changes: 9 additions & 0 deletions bench/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@ fn main() {
.compile("libcstdcpp.a");
}
}
if env::var("CARGO_FEATURE_RE_BOOST").is_ok() {
// Boost.Regex is a C++ library, so we need to compile our shim layer.
cc::Build::new()
.cpp(true)
.file("src/ffi/stdcpp.cpp")
.define("USE_BOOST", None)
.compile("libcboost.a");
println!("cargo:rustc-link-lib=boost_regex");
}
if env::var("CARGO_FEATURE_RE_RE2").is_ok() {
// RE2 is a C++ library, so we need to compile our shim layer.
cc::Build::new()
Expand Down
2 changes: 1 addition & 1 deletion bench/compile
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

exec cargo build \
--release \
--features 're-stdcpp re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
--features 're-stdcpp re-boost re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
"$@"
5 changes: 4 additions & 1 deletion bench/run
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

usage() {
echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | re2 | onig | tcl ]" >&2
echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | boost | re2 | onig | tcl ]" >&2
exit 1
}

Expand Down Expand Up @@ -36,6 +36,9 @@ case $which in
stdcpp-libcxx)
exec cargo bench --bench bench --features 're-stdcpp libcxx' "$@"
;;
boost)
exec cargo bench --bench bench --features re-boost "$@"
;;
re2)
exec cargo bench --bench bench --features re-re2 "$@"
;;
Expand Down
8 changes: 6 additions & 2 deletions bench/src/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,16 @@ extern crate regex;
extern crate regex_syntax;
extern crate test;


#[cfg(feature = "re-onig")]
pub use ffi::onig::Regex;
#[cfg(feature = "re-pcre1")]
pub use ffi::pcre1::Regex;
#[cfg(feature = "re-pcre2")]
pub use ffi::pcre2::Regex;
#[cfg(feature = "re-stdcpp")]
#[cfg(any(
feature = "re-stdcpp",
feature = "re-boost",
))]
pub use ffi::stdcpp::Regex;
#[cfg(feature = "re-re2")]
pub use ffi::re2::Regex;
Expand Down Expand Up @@ -93,6 +95,7 @@ macro_rules! text {
feature = "re-pcre1",
feature = "re-pcre2",
feature = "re-stdcpp",
feature = "re-boost",
feature = "re-re2",
feature = "re-dphobos",
feature = "re-rust",
Expand All @@ -111,6 +114,7 @@ type Text = Vec<u8>;
feature = "re-pcre1",
feature = "re-pcre2",
feature = "re-stdcpp",
feature = "re-boost",
feature = "re-re2",
feature = "re-dphobos",
feature = "re-rust",
Expand Down
5 changes: 4 additions & 1 deletion bench/src/ffi/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ pub mod onig;
pub mod pcre1;
#[cfg(feature = "re-pcre2")]
pub mod pcre2;
#[cfg(feature = "re-stdcpp")]
#[cfg(any(
feature = "re-stdcpp",
feature = "re-boost",
))]
pub mod stdcpp;
#[cfg(feature = "re-re2")]
pub mod re2;
Expand Down
41 changes: 26 additions & 15 deletions bench/src/ffi/stdcpp.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
#ifdef USE_BOOST
#include <boost/regex.hpp>
#else
#include <regex>
#endif

extern "C" {

#ifdef USE_BOOST
namespace regex_ns = boost;
#else
namespace regex_ns = std;
#endif

typedef void stdcpp_regexp;

typedef struct stdcpp_string {
Expand All @@ -9,34 +20,34 @@ extern "C" {
} stdcpp_string;

stdcpp_regexp* stdcpp_regexp_new(stdcpp_string pat) {
return reinterpret_cast<stdcpp_regexp*>(new std::regex(pat.text,
pat.len,
std::regex::optimize));
return reinterpret_cast<stdcpp_regexp*>(new regex_ns::regex(pat.text,
pat.len,
regex_ns::regex::optimize));
}

void stdcpp_regexp_free(stdcpp_regexp *re) {
delete reinterpret_cast<std::regex*>(re);
delete reinterpret_cast<regex_ns::regex*>(re);
}

bool stdcpp_regexp_match(stdcpp_regexp *re, stdcpp_string text,
int startpos, int endpos) {
std::regex cpp_re(*reinterpret_cast<std::regex*>(re));
return std::regex_search(text.text + startpos, text.text + endpos,
cpp_re);
regex_ns::regex cpp_re(*reinterpret_cast<regex_ns::regex*>(re));
return regex_ns::regex_search(text.text + startpos, text.text + endpos,
cpp_re);
}

bool stdcpp_regexp_find(stdcpp_regexp *re, stdcpp_string text,
int startpos, int endpos,
int *match_start, int *match_end) {
std::regex cpp_re(*reinterpret_cast<std::regex*>(re));
std::cmatch result;
bool matched;
matched = std::regex_search(text.text + startpos, text.text + endpos,
result, cpp_re);
if (matched) {
regex_ns::regex cpp_re(*reinterpret_cast<regex_ns::regex*>(re));
regex_ns::cmatch result;
bool matched;
matched = regex_ns::regex_search(text.text + startpos, text.text + endpos,
result, cpp_re);
if (matched) {
*match_start = result[0].first - text.text;
*match_end = *match_start + result.length(0);
}
return matched;
}
return matched;
}
}
9 changes: 8 additions & 1 deletion bench/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,15 @@ fn count_pcre2(pat: &str, haystack: &str) -> usize {
Regex::new(pat).unwrap().find_iter(haystack).count()
}

#[cfg(not(any(
feature = "re-stdcpp",
feature = "re-boost",
)))]
nada!("re-stdcpp", count_stdcpp);
#[cfg(feature = "re-stdcpp")]
#[cfg(any(
feature = "re-stdcpp",
feature = "re-boost",
))]
fn count_stdcpp(pat: &str, haystack: &str) -> usize {
use ffi::stdcpp::Regex;
Regex::new(pat).unwrap().find_iter(haystack).count()
Expand Down
2 changes: 2 additions & 0 deletions bench/src/misc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ bench_match!(match_class_in_range, "[ac]", {
});

#[cfg(not(feature = "re-rust-bytes"))]
// std C++ and Boost.Regex (ECMAScript grammar) do not support Unicode character classes
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
bench_match!(match_class_unicode, r"\p{L}", {
format!("{}a", repeat("☃5☃5").take(20).collect::<String>())
Expand Down
29 changes: 24 additions & 5 deletions bench/src/sherlock.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,14 @@ sherlock!(the_whitespace, r"the\s+\w+", 5410);
#[cfg(not(feature = "re-pcre1"))]
#[cfg(not(feature = "re-pcre2"))]
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(everything_greedy, r".*", 13053);
// In std::regex and boost::regex, '.' does not match \r
#[cfg(feature = "re-stdcpp")]
#[cfg(any(
feature = "re-stdcpp",
feature = "re-boost",
))]
sherlock!(everything_greedy, r"[^\n]*", 13053);
#[cfg(not(feature = "re-dphobos"))]
#[cfg(not(feature = "re-onig"))]
Expand All @@ -122,24 +126,34 @@ sherlock!(everything_greedy_nl, r"(?s).*", 1);

// How fast can we match every letter? This also defeats any clever prefix
// tricks.
// std C++ and Boost.Regex (ECMAScript grammar) do not support Unicode character classes
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(letters, r"\p{L}", 447160);

// std C++ and Boost.Regex (ECMAScript grammar) do not support Unicode character classes
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(letters_upper, r"\p{Lu}", 14180);

// std C++ and Boost.Regex (ECMAScript grammar) do not support Unicode character classes
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(letters_lower, r"\p{Ll}", 432980);

// Similarly, for words.
#[cfg(not(feature = "re-re2"))]
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-re2"))]
sherlock!(words, r"\w+", 109214);
#[cfg(feature = "re-re2")]
#[cfg(feature = "re-stdcpp")]
#[cfg(any(
feature = "re-stdcpp",
feature = "re-boost",
feature = "re-re2",
))]
sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?

// Find complete words before Holmes. The `\w` defeats any prefix
Expand All @@ -162,6 +176,7 @@ sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7);
#[cfg(not(feature = "re-pcre1"))]
#[cfg(not(feature = "re-pcre2"))]
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-tcl"))]
sherlock!(
holmes_coword_watson,
Expand All @@ -178,13 +193,17 @@ sherlock!(quotes, r#"["'][^"']{0,30}[?!.]["']"#, 767);
// lazy DFA the entire way.
// std C++ does not support multiline until C++17 nor the inline modifier syntax
#[cfg(not(feature = "re-stdcpp"))]
#[cfg(not(feature = "re-boost"))]
#[cfg(not(feature = "re-dphobos"))]
sherlock!(
line_boundary_sherlock_holmes,
r"(?m)^Sherlock Holmes|Sherlock Holmes$",
34);
// D matches both \r\n and \n as EOL
#[cfg(feature = "re-dphobos")]
#[cfg(any(
feature = "re-boost",
feature = "re-dphobos",
))]
sherlock!(
line_boundary_sherlock_holmes,
r"(?m)^Sherlock Holmes|Sherlock Holmes$",
Expand Down

0 comments on commit 6c5158e

Please sign in to comment.