From 4e3a10737603194cbcecf0b4ccaaaadec30bdf7a Mon Sep 17 00:00:00 2001
From: Matthew Krupcale <mkrupcale@matthewkrupcale.com>
Date: Sat, 24 Mar 2018 15:03:22 -0400
Subject: [PATCH] bench: add boost

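This commit adds a new `re-boost` feature that enables benchmarking
Boost's regex implementation. It reuses the existing std C++ shim
(`bench/src/ffi/stdcpp.cpp`), compiled with `USE_BOOST` defined and
linked against `boost_regex`.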

Closes #459
---
 bench/Cargo.toml         |  3 ++-
 bench/build.rs           |  9 +++++++++
 bench/compile            |  2 +-
 bench/run                |  5 ++++-
 bench/src/bench.rs       |  8 ++++++--
 bench/src/ffi/mod.rs     |  5 ++++-
 bench/src/ffi/stdcpp.cpp | 41 +++++++++++++++++++++++++---------------
 bench/src/main.rs        |  9 ++++++++-
 bench/src/misc.rs        |  2 ++
 bench/src/sherlock.rs    | 29 +++++++++++++++++++++++-----
 10 files changed, 86 insertions(+), 27 deletions(-)
diff --git a/bench/Cargo.toml b/bench/Cargo.toml
index 1515ed4be8..82496ceb66 100644
--- a/bench/Cargo.toml
+++ b/bench/Cargo.toml
@@ -47,8 +47,9 @@ re-pcre1 = ["libpcre-sys"]
 re-pcre2 = []
 re-onig = ["onig"]
 re-stdcpp = []
-re-re2 = []
 libcxx = []
+re-boost = []
+re-re2 = []
 re-dphobos = []
 re-dphobos-dmd = ["re-dphobos"]
 re-dphobos-dmd-ct = ["re-dphobos-dmd"]
diff --git a/bench/build.rs b/bench/build.rs
index e71f4e1543..6b2d755b6a 100644
--- a/bench/build.rs
+++ b/bench/build.rs
@@ -35,6 +35,15 @@ fn main() {
                 .compile("libcstdcpp.a");
         }
     }
+    if env::var("CARGO_FEATURE_RE_BOOST").is_ok() {
+        // Boost.Regex is a C++ library, so we need to compile our shim layer.
+        cc::Build::new()
+            .cpp(true)
+            .file("src/ffi/stdcpp.cpp")
+            .define("USE_BOOST", None)
+            .compile("libcboost.a");
+        println!("cargo:rustc-link-lib=boost_regex");
+    }
     if env::var("CARGO_FEATURE_RE_RE2").is_ok() {
         // RE2 is a C++ library, so we need to compile our shim layer.
         cc::Build::new()
diff --git a/bench/compile b/bench/compile
index 05219341a0..3a8d22ffe2 100755
--- a/bench/compile
+++ b/bench/compile
@@ -2,5 +2,5 @@
 
 exec cargo build \
   --release \
-  --features 're-stdcpp re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
+  --features 're-stdcpp re-boost re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
   "$@"
diff --git a/bench/run b/bench/run
index f1d9f3db6c..800a4d5aff 100755
--- a/bench/run
+++ b/bench/run
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 usage() {
-  echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | re2 | onig | tcl ]" >&2
+  echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | boost | re2 | onig | tcl ]" >&2
   exit 1
 }
 
@@ -36,6 +36,9 @@ case $which in
   stdcpp-libcxx)
     exec cargo bench --bench bench --features 're-stdcpp libcxx' "$@"
     ;;
+  boost)
+    exec cargo bench --bench bench --features re-boost "$@"
+    ;;
   re2)
     exec cargo bench --bench bench --features re-re2 "$@"
     ;;
diff --git a/bench/src/bench.rs b/bench/src/bench.rs
index 803a1336ad..6cb56db850 100644
--- a/bench/src/bench.rs
+++ b/bench/src/bench.rs
@@ -28,14 +28,16 @@ extern crate regex;
 extern crate regex_syntax;
 extern crate test;
 
-
 #[cfg(feature = "re-onig")]
 pub use ffi::onig::Regex;
 #[cfg(feature = "re-pcre1")]
 pub use ffi::pcre1::Regex;
 #[cfg(feature = "re-pcre2")]
 pub use ffi::pcre2::Regex;
-#[cfg(feature = "re-stdcpp")]
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+  ))]
 pub use ffi::stdcpp::Regex;
 #[cfg(feature = "re-re2")]
 pub use ffi::re2::Regex;
@@ -93,6 +95,7 @@ macro_rules! text {
     feature = "re-pcre1",
     feature = "re-pcre2",
     feature = "re-stdcpp",
+    feature = "re-boost",
     feature = "re-re2",
     feature = "re-dphobos",
     feature = "re-rust",
@@ -111,6 +114,7 @@ type Text = Vec<u8>;
     feature = "re-pcre1",
     feature = "re-pcre2",
     feature = "re-stdcpp",
+    feature = "re-boost",
     feature = "re-re2",
     feature = "re-dphobos",
     feature = "re-rust",
diff --git a/bench/src/ffi/mod.rs b/bench/src/ffi/mod.rs
index c2033f6313..e9733715ef 100644
--- a/bench/src/ffi/mod.rs
+++ b/bench/src/ffi/mod.rs
@@ -20,7 +20,10 @@ pub mod onig;
 pub mod pcre1;
 #[cfg(feature = "re-pcre2")]
 pub mod pcre2;
-#[cfg(feature = "re-stdcpp")]
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+  ))]
 pub mod stdcpp;
 #[cfg(feature = "re-re2")]
 pub mod re2;
diff --git a/bench/src/ffi/stdcpp.cpp b/bench/src/ffi/stdcpp.cpp
index 68d6bfb14f..d5abc9cdae 100644
--- a/bench/src/ffi/stdcpp.cpp
+++ b/bench/src/ffi/stdcpp.cpp
@@ -1,6 +1,17 @@
+#ifdef USE_BOOST
+#include <boost/regex.hpp>
+#else
 #include <regex>
+#endif
 
 extern "C" {
+
+#ifdef USE_BOOST
+    namespace regex_ns = boost;
+#else
+    namespace regex_ns = std;
+#endif
+
     typedef void stdcpp_regexp;
 
     typedef struct stdcpp_string {
@@ -9,34 +20,34 @@ extern "C" {
     } stdcpp_string;
 
     stdcpp_regexp* stdcpp_regexp_new(stdcpp_string pat) {
-        return reinterpret_cast<stdcpp_regexp*>(new std::regex(pat.text,
-							       pat.len,
-							       std::regex::optimize));
+	return reinterpret_cast<stdcpp_regexp*>(new regex_ns::regex(pat.text,
+								    pat.len,
+								    regex_ns::regex::optimize));
     }
 
     void stdcpp_regexp_free(stdcpp_regexp *re) {
-        delete reinterpret_cast<std::regex*>(re);
+	delete reinterpret_cast<regex_ns::regex*>(re);
     }
 
     bool stdcpp_regexp_match(stdcpp_regexp *re, stdcpp_string text,
 			     int startpos, int endpos) {
-	std::regex cpp_re(*reinterpret_cast<std::regex*>(re));
-        return std::regex_search(text.text + startpos, text.text + endpos,
-				 cpp_re);
+	regex_ns::regex cpp_re(*reinterpret_cast<regex_ns::regex*>(re));
+	return regex_ns::regex_search(text.text + startpos, text.text + endpos,
+				      cpp_re);
     }
 
     bool stdcpp_regexp_find(stdcpp_regexp *re, stdcpp_string text,
 			    int startpos, int endpos,
 			    int *match_start, int *match_end) {
-	std::regex cpp_re(*reinterpret_cast<std::regex*>(re));
-	std::cmatch result;
-        bool matched;
-        matched = std::regex_search(text.text + startpos, text.text + endpos,
-				    result, cpp_re);
-        if (matched) {
+	regex_ns::regex cpp_re(*reinterpret_cast<regex_ns::regex*>(re));
+	regex_ns::cmatch result;
+	bool matched;
+	matched = regex_ns::regex_search(text.text + startpos, text.text + endpos,
+					 result, cpp_re);
+	if (matched) {
 	    *match_start = result[0].first - text.text;
 	    *match_end = *match_start + result.length(0);
-        }
-        return matched;
+	}
+	return matched;
     }
 }
diff --git a/bench/src/main.rs b/bench/src/main.rs
index 11e2864425..e4dc7c933b 100644
--- a/bench/src/main.rs
+++ b/bench/src/main.rs
@@ -135,8 +135,15 @@ fn count_pcre2(pat: &str, haystack: &str) -> usize {
     Regex::new(pat).unwrap().find_iter(haystack).count()
 }
 
+#[cfg(not(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+  )))]
 nada!("re-stdcpp", count_stdcpp);
-#[cfg(feature = "re-stdcpp")]
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+  ))]
 fn count_stdcpp(pat: &str, haystack: &str) -> usize {
     use ffi::stdcpp::Regex;
     Regex::new(pat).unwrap().find_iter(haystack).count()
diff --git a/bench/src/misc.rs b/bench/src/misc.rs
index a9c09f3821..ad516e23e4 100644
--- a/bench/src/misc.rs
+++ b/bench/src/misc.rs
@@ -46,7 +46,9 @@ bench_match!(match_class_in_range, "[ac]", {
 });
 
 #[cfg(not(feature = "re-rust-bytes"))]
+// std::regex and boost::regex do not support Unicode character classes
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 bench_match!(match_class_unicode, r"\p{L}", {
     format!("{}a", repeat("☃5☃5").take(20).collect::<String>())
diff --git a/bench/src/sherlock.rs b/bench/src/sherlock.rs
index 8f66799dc2..c54c9b772e 100644
--- a/bench/src/sherlock.rs
+++ b/bench/src/sherlock.rs
@@ -106,10 +106,14 @@ sherlock!(the_whitespace, r"the\s+\w+", 5410);
 #[cfg(not(feature = "re-pcre1"))]
 #[cfg(not(feature = "re-pcre2"))]
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(everything_greedy, r".*", 13053);
 // std::regex . does not match \r
-#[cfg(feature = "re-stdcpp")]
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+  ))]
 sherlock!(everything_greedy, r"[^\n]*", 13053);
 #[cfg(not(feature = "re-dphobos"))]
 #[cfg(not(feature = "re-onig"))]
@@ -122,24 +126,34 @@ sherlock!(everything_greedy_nl, r"(?s).*", 1);
 
 // How fast can we match every letter? This also defeats any clever prefix
 // tricks.
+// std::regex and boost::regex do not support Unicode character classes
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(letters, r"\p{L}", 447160);
 
+// std::regex and boost::regex do not support Unicode character classes
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(letters_upper, r"\p{Lu}", 14180);
 
+// std::regex and boost::regex do not support Unicode character classes
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(letters_lower, r"\p{Ll}", 432980);
 
 // Similarly, for words.
-#[cfg(not(feature = "re-re2"))]
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
+#[cfg(not(feature = "re-re2"))]
 sherlock!(words, r"\w+", 109214);
-#[cfg(feature = "re-re2")]
-#[cfg(feature = "re-stdcpp")]
+#[cfg(any(
+    feature = "re-stdcpp",
+    feature = "re-boost",
+    feature = "re-re2",
+  ))]
 sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?
 
 // Find complete words before Holmes. The `\w` defeats any prefix
@@ -162,6 +176,7 @@ sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7);
 #[cfg(not(feature = "re-pcre1"))]
 #[cfg(not(feature = "re-pcre2"))]
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-tcl"))]
 sherlock!(
     holmes_coword_watson,
@@ -178,13 +193,17 @@ sherlock!(quotes, r#"["'][^"']{0,30}[?!.]["']"#, 767);
 // lazy DFA the entire way.
 // std C++ does not support multiline until C++17 nor the inline modifier syntax
 #[cfg(not(feature = "re-stdcpp"))]
+#[cfg(not(feature = "re-boost"))]
 #[cfg(not(feature = "re-dphobos"))]
 sherlock!(
     line_boundary_sherlock_holmes,
     r"(?m)^Sherlock Holmes|Sherlock Holmes$",
     34);
 // D matches both \r\n and \n as EOL
-#[cfg(feature = "re-dphobos")]
+#[cfg(any(
+    feature = "re-boost",
+    feature = "re-dphobos",
+  ))]
 sherlock!(
     line_boundary_sherlock_holmes,
     r"(?m)^Sherlock Holmes|Sherlock Holmes$",