From ae92ad249e57678f0549c258efbd985c7a8d3edb Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Sat, 12 Jun 2021 20:20:00 -0700 Subject: [PATCH] Add wasm simd support This commit adds simd acceleration support to the `memmem` module. This is added with the freshly-stabilized support from rust-lang/rust#86204. This mostly just cribs off the generic simd support for 128-bit types built for sse, copying bits and pieces of code here and there. Some refactoring happened internally to help reduce duplication where possible. I ran some initial benchmarks with the `memmem/krate/*` regex and a hacked up single-threaded version of criterion. Some [initial comparisons][compare] using Wasmtime as a runtime do indeed show a lot of improvements, but there are some slowdowns as well. [compare]: https://gist.github.com/alexcrichton/6a72e682e7b6d505ade605359fbe3f2d --- .github/workflows/ci.yml | 15 ++++++ bench/src/memmem/imp.rs | 35 ++++++++++---- build.rs | 28 ++++++++--- src/memmem/mod.rs | 85 ++++++++++++++------------------- src/memmem/prefilter/mod.rs | 72 +++++++++++++++------------- src/memmem/prefilter/wasm.rs | 39 +++++++++++++++ src/memmem/prefilter/x86/sse.rs | 15 +----- src/memmem/vector.rs | 33 +++++++++++++ src/memmem/wasm.rs | 75 +++++++++++++++++++++++++++++ 9 files changed, 284 insertions(+), 113 deletions(-) create mode 100644 src/memmem/prefilter/wasm.rs create mode 100644 src/memmem/wasm.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 06321b8..45af901 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,6 +25,7 @@ jobs: - stable - stable-32 - stable-mips + - wasm - beta - nightly - macos @@ -60,6 +61,10 @@ jobs: - build: win-gnu os: windows-2019 rust: stable-x86_64-gnu + - build: wasm + os: ubuntu-18.04 + rust: nightly-x86_64-gnu # waiting for wasm simd to hit stable + wasm: true steps: - name: Checkout repository uses: actions/checkout@v1 @@ -79,6 +84,16 @@ jobs: cargo install --git 
https://github.com/rust-embedded/cross echo "CARGO=cross" >> $GITHUB_ENV echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV + - name: Download Wasmtime + if: matrix.wasm + run: | + rustup target add wasm32-wasi + echo "CARGO_BUILD_TARGET=wasm32-wasi" >> $GITHUB_ENV + echo "RUSTFLAGS=-Ctarget-feature=+simd128" >> $GITHUB_ENV + curl -LO https://github.com/bytecodealliance/wasmtime/releases/download/v0.28.0/wasmtime-v0.28.0-x86_64-linux.tar.xz + tar xvf wasmtime-v0.28.0-x86_64-linux.tar.xz + echo `pwd`/wasmtime-v0.28.0-x86_64-linux >> $GITHUB_PATH + echo "CARGO_TARGET_WASM32_WASI_RUNNER=wasmtime run --enable-simd --" >> $GITHUB_ENV - name: Show command used for Cargo run: | echo "cargo command is: ${{ env.CARGO }}" diff --git a/bench/src/memmem/imp.rs b/bench/src/memmem/imp.rs index a0c0b83..18a264b 100644 --- a/bench/src/memmem/imp.rs +++ b/bench/src/memmem/imp.rs @@ -640,44 +640,47 @@ pub(crate) mod sliceslice { } pub(crate) fn prebuilt(_: &str) -> impl Fn(&str) -> bool + 'static { - unimplemented!("sliceslice only runs on x86") + if true { + unimplemented!("sliceslice only runs on x86") + } + |_| false } pub(crate) fn oneshotiter<'a>( - haystack: &'a str, - needle: &'a str, + _haystack: &'a str, + _needle: &'a str, ) -> impl Iterator + 'static { std::iter::from_fn(move || { unimplemented!("sliceslice only runs on x86") }) } - pub(crate) fn prebuiltiter(needle: &str) -> super::super::NoIter { + pub(crate) fn prebuiltiter(_needle: &str) -> super::super::NoIter { unimplemented!("sliceslice only runs on x86") } } pub(crate) mod rev { - pub(crate) fn oneshot(haystack: &str, needle: &str) -> bool { + pub(crate) fn oneshot(_haystack: &str, _needle: &str) -> bool { unimplemented!("sliceslice does not support reverse searches") } pub(crate) fn prebuilt( - needle: &str, + _needle: &str, ) -> impl Fn(&str) -> bool + 'static { |_| unimplemented!("sliceslice does not support reverse searches") } pub(crate) fn oneshotiter( - haystack: &str, - needle: &str, + _haystack: 
&str, + _needle: &str, ) -> impl Iterator + 'static { std::iter::from_fn(move || { unimplemented!("sliceslice does not support reverse searches") }) } - pub(crate) fn prebuiltiter(needle: &str) -> super::super::NoIter { + pub(crate) fn prebuiltiter(_needle: &str) -> super::super::NoIter { unimplemented!("sliceslice does not support reverse searches") } } @@ -693,9 +696,21 @@ pub(crate) mod libc { } pub(crate) mod fwd { + #[cfg(target_arch = "wasm32")] + extern "C" { + fn memmem( + haystack: *const libc::c_void, + haystack_len: usize, + needle: *const libc::c_void, + needle_len: usize, + ) -> *const libc::c_void; + } + #[cfg(not(target_arch = "wasm32"))] + use libc::memmem; + fn find(haystack: &[u8], needle: &[u8]) -> Option { let p = unsafe { - libc::memmem( + memmem( haystack.as_ptr() as *const libc::c_void, haystack.len(), needle.as_ptr() as *const libc::c_void, diff --git a/build.rs b/build.rs index e07ad6f..584a608 100644 --- a/build.rs +++ b/build.rs @@ -15,15 +15,29 @@ fn main() { // is not a problem. In that case, the fastest option will be chosen at // runtime. fn enable_simd_optimizations() { - if is_env_set("CARGO_CFG_MEMCHR_DISABLE_AUTO_SIMD") - || !target_has_feature("sse2") - { + if is_env_set("CARGO_CFG_MEMCHR_DISABLE_AUTO_SIMD") { return; } - println!("cargo:rustc-cfg=memchr_runtime_simd"); - println!("cargo:rustc-cfg=memchr_runtime_sse2"); - println!("cargo:rustc-cfg=memchr_runtime_sse42"); - println!("cargo:rustc-cfg=memchr_runtime_avx"); + let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); + match &arch[..] 
{ + "x86_64" => { + if !target_has_feature("sse2") { + return; + } + println!("cargo:rustc-cfg=memchr_runtime_simd"); + println!("cargo:rustc-cfg=memchr_runtime_sse2"); + println!("cargo:rustc-cfg=memchr_runtime_sse42"); + println!("cargo:rustc-cfg=memchr_runtime_avx"); + } + "wasm32" | "wasm64" => { + if !target_has_feature("simd128") { + return; + } + println!("cargo:rustc-cfg=memchr_runtime_simd"); + println!("cargo:rustc-cfg=memchr_runtime_wasm128"); + } + _ => {} + } } // This adds a `memchr_libc` cfg if and only if libc can be used, if no other diff --git a/src/memmem/mod.rs b/src/memmem/mod.rs index 0dd6186..5e0e6d3 100644 --- a/src/memmem/mod.rs +++ b/src/memmem/mod.rs @@ -146,16 +146,17 @@ macro_rules! define_memmem_simple_tests { } mod byte_frequencies; -#[cfg(all(target_arch = "x86_64", memchr_runtime_simd))] +#[cfg(memchr_runtime_simd)] mod genericsimd; mod prefilter; mod rabinkarp; mod rarebytes; mod twoway; mod util; -// SIMD is only supported on x86_64 currently. -#[cfg(target_arch = "x86_64")] +#[cfg(memchr_runtime_simd)] mod vector; +#[cfg(all(memchr_runtime_wasm128))] +mod wasm; #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] mod x86; @@ -773,47 +774,47 @@ enum SearcherKind { TwoWay(twoway::Forward), #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] GenericSIMD128(x86::sse::Forward), + #[cfg(memchr_runtime_wasm128)] + GenericSIMD128(wasm::Forward), #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] GenericSIMD256(x86::avx::Forward), } impl<'n> Searcher<'n> { - #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> { use self::SearcherKind::*; let ninfo = NeedleInfo::new(needle); - let prefn = - prefilter::forward(&config.prefilter, &ninfo.rarebytes, needle); - let kind = if needle.len() == 0 { - Empty - } else if needle.len() == 1 { - OneByte(needle[0]) - } else if let Some(fwd) = x86::avx::Forward::new(&ninfo, 
needle) { - GenericSIMD256(fwd) - } else if let Some(fwd) = x86::sse::Forward::new(&ninfo, needle) { - GenericSIMD128(fwd) - } else { - TwoWay(twoway::Forward::new(needle)) + let mk = |kind: SearcherKind| { + let prefn = prefilter::forward( + &config.prefilter, + &ninfo.rarebytes, + needle, + ); + Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind } }; - Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind } - } - - #[cfg(not(all(not(miri), target_arch = "x86_64", memchr_runtime_simd)))] - fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> { - use self::SearcherKind::*; + if needle.len() == 0 { + return mk(Empty); + } + if needle.len() == 1 { + return mk(OneByte(needle[0])); + } + #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] + { + if let Some(fwd) = x86::avx::Forward::new(&ninfo, needle) { + return mk(GenericSIMD256(fwd)); + } else if let Some(fwd) = x86::sse::Forward::new(&ninfo, needle) { + return mk(GenericSIMD128(fwd)); + } + } + #[cfg(all(target_arch = "wasm32", memchr_runtime_simd))] + { + if let Some(fwd) = wasm::Forward::new(&ninfo, needle) { + return mk(GenericSIMD128(fwd)); + } + } - let ninfo = NeedleInfo::new(needle); - let prefn = - prefilter::forward(&config.prefilter, &ninfo.rarebytes, needle); - let kind = if needle.len() == 0 { - Empty - } else if needle.len() == 1 { - OneByte(needle[0]) - } else { - TwoWay(twoway::Forward::new(needle)) - }; - Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind } + mk(TwoWay(twoway::Forward::new(needle))) } /// Return a fresh prefilter state that can be used with this searcher. 
@@ -844,11 +845,7 @@ impl<'n> Searcher<'n> { Empty => Empty, OneByte(b) => OneByte(b), TwoWay(tw) => TwoWay(tw), - #[cfg(all( - not(miri), - target_arch = "x86_64", - memchr_runtime_simd - ))] + #[cfg(all(not(miri), memchr_runtime_simd))] GenericSIMD128(gs) => GenericSIMD128(gs), #[cfg(all( not(miri), @@ -873,11 +870,7 @@ impl<'n> Searcher<'n> { Empty => Empty, OneByte(b) => OneByte(b), TwoWay(tw) => TwoWay(tw), - #[cfg(all( - not(miri), - target_arch = "x86_64", - memchr_runtime_simd - ))] + #[cfg(all(not(miri), memchr_runtime_simd))] GenericSIMD128(gs) => GenericSIMD128(gs), #[cfg(all( not(miri), @@ -921,11 +914,7 @@ impl<'n> Searcher<'n> { self.find_tw(tw, state, haystack, needle) } } - #[cfg(all( - not(miri), - target_arch = "x86_64", - memchr_runtime_simd - ))] + #[cfg(all(not(miri), memchr_runtime_simd))] GenericSIMD128(ref gs) => { // The SIMD matcher can't handle particularly short haystacks, // so we fall back to RK in these cases. diff --git a/src/memmem/prefilter/mod.rs b/src/memmem/prefilter/mod.rs index 2481cfe..da99a94 100644 --- a/src/memmem/prefilter/mod.rs +++ b/src/memmem/prefilter/mod.rs @@ -1,8 +1,10 @@ use crate::memmem::{rarebytes::RareNeedleBytes, NeedleInfo}; mod fallback; -#[cfg(all(target_arch = "x86_64", memchr_runtime_simd))] +#[cfg(memchr_runtime_simd)] mod genericsimd; +#[cfg(all(not(miri), target_arch = "wasm32", memchr_runtime_simd))] +mod wasm; #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] mod x86; @@ -90,6 +92,21 @@ pub(crate) type PrefilterFnTy = unsafe fn( needle: &[u8], ) -> Option; +// If the haystack is too small for SIMD, then just run memchr on the +// rarest byte and be done with it. (It is likely that this code path is +// rarely exercised, since a higher level routine will probably dispatch to +// Rabin-Karp for such a small haystack.) 
+#[cfg(memchr_runtime_simd)] +fn simple_memchr_fallback( + _prestate: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], +) -> Option { + let (rare, _) = ninfo.rarebytes.as_rare_ordered_usize(); + crate::memchr(needle[rare], haystack).map(|i| i.saturating_sub(rare)) +} + impl PrefilterFn { /// Create a new prefilter function from the function pointer given. /// @@ -269,7 +286,6 @@ impl PrefilterState { /// This only applies to x86_64 when runtime SIMD detection is enabled (which /// is the default). In general, we try to use an AVX prefilter, followed by /// SSE and then followed by a generic one based on memchr. -#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] #[inline(always)] pub(crate) fn forward( config: &Prefilter, @@ -280,20 +296,30 @@ pub(crate) fn forward( return None; } - #[cfg(feature = "std")] + #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] { - if cfg!(memchr_runtime_avx) { - if is_x86_feature_detected!("avx2") { - // SAFETY: x86::avx::find only requires the avx2 feature, - // which we've just checked above. - return unsafe { Some(PrefilterFn::new(x86::avx::find)) }; + #[cfg(feature = "std")] + { + if cfg!(memchr_runtime_avx) { + if is_x86_feature_detected!("avx2") { + // SAFETY: x86::avx::find only requires the avx2 feature, + // which we've just checked above. + return unsafe { Some(PrefilterFn::new(x86::avx::find)) }; + } } } + if cfg!(memchr_runtime_sse2) { + // SAFETY: x86::sse::find only requires the sse2 feature, which is + // guaranteed to be available on x86_64. + return unsafe { Some(PrefilterFn::new(x86::sse::find)) }; + } } - if cfg!(memchr_runtime_sse2) { - // SAFETY: x86::sse::find only requires the sse2 feature, which is - // guaranteed to be available on x86_64. 
- return unsafe { Some(PrefilterFn::new(x86::sse::find)) }; + #[cfg(all(not(miri), target_arch = "wasm32", memchr_runtime_simd))] + { + if true { + // SAFETY: `wasm::find` is actually a safe function + return unsafe { Some(PrefilterFn::new(wasm::find)) }; + } } // Check that our rarest byte has a reasonably low rank. The main issue // here is that the fallback prefilter can perform pretty poorly if it's @@ -306,28 +332,6 @@ pub(crate) fn forward( None } -/// Determine which prefilter function, if any, to use. -/// -/// Since SIMD is currently only supported on x86_64, this will just select -/// the fallback prefilter if the rare bytes provided have a low enough rank. -#[cfg(not(all(not(miri), target_arch = "x86_64", memchr_runtime_simd)))] -#[inline(always)] -pub(crate) fn forward( - config: &Prefilter, - rare: &RareNeedleBytes, - needle: &[u8], -) -> Option { - if config.is_none() || needle.len() <= 1 { - return None; - } - let (rare1_rank, _) = rare.as_ranks(needle); - if rare1_rank <= MAX_FALLBACK_RANK { - // SAFETY: fallback::find is safe to call in all environments. - return unsafe { Some(PrefilterFn::new(fallback::find)) }; - } - None -} - /// Return the minimum length of the haystack in which a prefilter should be /// used. If the haystack is below this length, then it's probably not worth /// the overhead of running the prefilter. diff --git a/src/memmem/prefilter/wasm.rs b/src/memmem/prefilter/wasm.rs new file mode 100644 index 0000000..5470c92 --- /dev/null +++ b/src/memmem/prefilter/wasm.rs @@ -0,0 +1,39 @@ +use core::arch::wasm32::v128; + +use crate::memmem::{ + prefilter::{PrefilterFnTy, PrefilterState}, + NeedleInfo, +}; + +// Check that the functions below satisfy the Prefilter function type. +const _: PrefilterFnTy = find; + +/// A `v128`-accelerated candidate finder for single-substring search. 
+#[target_feature(enable = "simd128")] +pub(crate) fn find( + prestate: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], +) -> Option { + unsafe { + super::genericsimd::find::( + prestate, + ninfo, + haystack, + needle, + super::simple_memchr_fallback, + ) + } +} + +#[cfg(all(test, feature = "std"))] +mod tests { + #[test] + #[cfg(not(miri))] + fn prefilter_permutations() { + use crate::memmem::prefilter::tests::PrefilterTest; + // SAFETY: super::find is safe to call for all inputs on wasm32. + unsafe { PrefilterTest::run_all_tests(super::find) }; + } +} diff --git a/src/memmem/prefilter/x86/sse.rs b/src/memmem/prefilter/x86/sse.rs index b11356e..b1c48e1 100644 --- a/src/memmem/prefilter/x86/sse.rs +++ b/src/memmem/prefilter/x86/sse.rs @@ -21,25 +21,12 @@ pub(crate) unsafe fn find( haystack: &[u8], needle: &[u8], ) -> Option { - // If the haystack is too small for SSE2, then just run memchr on the - // rarest byte and be done with it. (It is likely that this code path is - // rarely exercised, since a higher level routine will probably dispatch to - // Rabin-Karp for such a small haystack.) 
- fn simple_memchr_fallback( - _prestate: &mut PrefilterState, - ninfo: &NeedleInfo, - haystack: &[u8], - needle: &[u8], - ) -> Option { - let (rare, _) = ninfo.rarebytes.as_rare_ordered_usize(); - crate::memchr(needle[rare], haystack).map(|i| i.saturating_sub(rare)) - } super::super::genericsimd::find::<__m128i>( prestate, ninfo, haystack, needle, - simple_memchr_fallback, + super::super::simple_memchr_fallback, ) } diff --git a/src/memmem/vector.rs b/src/memmem/vector.rs index a67d3c5..aca7b12 100644 --- a/src/memmem/vector.rs +++ b/src/memmem/vector.rs @@ -96,3 +96,36 @@ mod x86avx { } } } + +#[cfg(target_arch = "wasm32")] +mod x86sse { + use super::Vector; + use core::arch::wasm32::*; + + impl Vector for v128 { + #[inline(always)] + unsafe fn splat(byte: u8) -> v128 { + u8x16_splat(byte) + } + + #[inline(always)] + unsafe fn load_unaligned(data: *const u8) -> v128 { + v128_load(data.cast()) + } + + #[inline(always)] + unsafe fn movemask(self) -> u32 { + u8x16_bitmask(self).into() + } + + #[inline(always)] + unsafe fn cmpeq(self, vector2: Self) -> v128 { + u8x16_eq(self, vector2) + } + + #[inline(always)] + unsafe fn and(self, vector2: Self) -> v128 { + v128_and(self, vector2) + } + } +} diff --git a/src/memmem/wasm.rs b/src/memmem/wasm.rs new file mode 100644 index 0000000..4e3ea98 --- /dev/null +++ b/src/memmem/wasm.rs @@ -0,0 +1,75 @@ +use core::arch::wasm32::v128; + +use crate::memmem::{genericsimd, NeedleInfo}; + +/// A `v128` accelerated vectorized substring search routine that only works on +/// small needles. +#[derive(Clone, Copy, Debug)] +pub(crate) struct Forward(genericsimd::Forward); + +impl Forward { + /// Create a new "generic simd" forward searcher. If one could not be + /// created from the given inputs, then None is returned. 
+ pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option { + if !cfg!(memchr_runtime_simd) { + return None; + } + genericsimd::Forward::new(ninfo, needle).map(Forward) + } + + /// Returns the minimum length of haystack that is needed for this searcher + /// to work. Passing a haystack with a length smaller than this will cause + /// `find` to panic. + #[inline(always)] + pub(crate) fn min_haystack_len(&self) -> usize { + self.0.min_haystack_len::() + } + + #[inline(always)] + pub(crate) fn find( + &self, + haystack: &[u8], + needle: &[u8], + ) -> Option { + self.find_impl(haystack, needle) + } + + /// The implementation of find marked with the appropriate target feature. + #[target_feature(enable = "simd128")] + fn find_impl(&self, haystack: &[u8], needle: &[u8]) -> Option { + unsafe { genericsimd::fwd_find::(&self.0, haystack, needle) } + } +} + +#[cfg(all(test, feature = "std", not(miri)))] +mod tests { + use crate::memmem::{prefilter::PrefilterState, NeedleInfo}; + + fn find( + _: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], + ) -> Option { + super::Forward::new(ninfo, needle).unwrap().find(haystack, needle) + } + + #[test] + fn prefilter_permutations() { + use crate::memmem::prefilter::tests::PrefilterTest; + + unsafe { + PrefilterTest::run_all_tests_filter(find, |t| { + // This substring searcher only works on certain configs, so + // filter our tests such that Forward::new will be guaranteed + // to succeed. (And also remove tests with a haystack that is + // too small.) + let fwd = match super::Forward::new(&t.ninfo, &t.needle) { + None => return false, + Some(fwd) => fwd, + }; + t.haystack.len() >= fwd.min_haystack_len() + }) + } + } +}