From 099d5ed3855956ff992302cac48f2f2f4af613a1 Mon Sep 17 00:00:00 2001
From: "Jorge C. Leitao" <jorgecarleitao@gmail.com>
Date: Mon, 22 Nov 2021 05:20:55 +0000
Subject: [PATCH] Migrated to portable simd

---
 Cargo.toml                            |  4 +--
 src/compute/aggregate/simd/packed.rs  | 32 +++++++++++-----------
 src/compute/comparison/simd/packed.rs | 29 +++++++++++++-------
 src/lib.rs                            |  1 +
 src/types/simd/mod.rs                 |  2 +-
 src/types/simd/packed.rs              | 38 +++++++++++++++------------
 6 files changed, 60 insertions(+), 46 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 13fbf2e05cf..d43579ccd45 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -58,8 +58,6 @@ itertools = { version = "^0.10", optional = true }
 
 base64 = { version = "0.13.0", optional = true }
 
-packed_simd = { version = "0.3", optional = true, package = "packed_simd_2" }
-
 # to write to parquet as a stream
 futures = { version = "0.3", optional = true }
 
@@ -208,7 +206,7 @@ compute = [
 # base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
 io_parquet = ["parquet2", "io_ipc", "base64", "futures"]
 benchmarks = ["rand"]
-simd = ["packed_simd"]
+simd = []
 
 [package.metadata.cargo-all-features]
 allowlist = ["compute", "compute_sort", "compute_hash", "compute_nullif"]
diff --git a/src/compute/aggregate/simd/packed.rs b/src/compute/aggregate/simd/packed.rs
index 5ce69468f4e..4e9336ac08c 100644
--- a/src/compute/aggregate/simd/packed.rs
+++ b/src/compute/aggregate/simd/packed.rs
@@ -14,16 +14,16 @@ macro_rules! simd_sum {
     };
 }
 
-simd_sum!(f32x16, f32, sum);
-simd_sum!(f64x8, f64, sum);
-simd_sum!(u8x64, u8, wrapping_sum);
-simd_sum!(u16x32, u16, wrapping_sum);
-simd_sum!(u32x16, u32, wrapping_sum);
-simd_sum!(u64x8, u64, wrapping_sum);
-simd_sum!(i8x64, i8, wrapping_sum);
-simd_sum!(i16x32, i16, wrapping_sum);
-simd_sum!(i32x16, i32, wrapping_sum);
-simd_sum!(i64x8, i64, wrapping_sum);
+simd_sum!(f32x16, f32, horizontal_sum);
+simd_sum!(f64x8, f64, horizontal_sum);
+simd_sum!(u8x64, u8, horizontal_sum);
+simd_sum!(u16x32, u16, horizontal_sum);
+simd_sum!(u32x16, u32, horizontal_sum);
+simd_sum!(u64x8, u64, horizontal_sum);
+simd_sum!(i8x64, i8, horizontal_sum);
+simd_sum!(i16x32, i16, horizontal_sum);
+simd_sum!(i32x16, i32, horizontal_sum);
+simd_sum!(i64x8, i64, horizontal_sum);
 
 macro_rules! simd_ord_int {
     ($simd:tt, $type:ty) => {
@@ -33,22 +33,22 @@ macro_rules! simd_ord_int {
 
         #[inline]
         fn max_element(self) -> $type {
-            self.max_element()
+            self.horizontal_max()
        }
 
         #[inline]
         fn min_element(self) -> $type {
-            self.min_element()
+            self.horizontal_min()
         }
 
         #[inline]
         fn max(self, x: Self) -> Self {
-            self.max(x)
+            std::cmp::Ord::max(self, x)
         }
 
         #[inline]
         fn min(self, x: Self) -> Self {
-            self.min(x)
+            std::cmp::Ord::min(self, x)
         }
 
         #[inline]
@@ -72,12 +72,12 @@ macro_rules! simd_ord_float {
 
         #[inline]
         fn max_element(self) -> $type {
-            self.max_element()
+            self.horizontal_max()
         }
 
         #[inline]
         fn min_element(self) -> $type {
-            self.min_element()
+            self.horizontal_min()
         }
 
         #[inline]
diff --git a/src/compute/comparison/simd/packed.rs b/src/compute/comparison/simd/packed.rs
index 1c18df437cc..c36c6d52cd7 100644
--- a/src/compute/comparison/simd/packed.rs
+++ b/src/compute/comparison/simd/packed.rs
@@ -1,6 +1,6 @@
 use std::convert::TryInto;
 
-use packed_simd::*;
+use crate::types::simd::*;
 
 use crate::types::{days_ms, months_days_ns};
 
@@ -15,48 +15,48 @@ macro_rules! simd8 {
         impl Simd8Lanes<$type> for $md {
             #[inline]
             fn from_chunk(v: &[$type]) -> Self {
-                <$md>::from_slice_unaligned(v)
+                <$md>::from_slice(v)
             }
 
             #[inline]
             fn from_incomplete_chunk(v: &[$type], remaining: $type) -> Self {
                 let mut a = [remaining; 8];
                 a.iter_mut().zip(v.iter()).for_each(|(a, b)| *a = *b);
-                Self::from_chunk(a.as_ref())
+                Self::from_array(a)
             }
         }
 
         impl Simd8PartialEq for $md {
             #[inline]
             fn eq(self, other: Self) -> u8 {
-                self.eq(other).bitmask()
+                to_bitmask(self.lanes_eq(other))
             }
 
             #[inline]
             fn neq(self, other: Self) -> u8 {
-                self.ne(other).bitmask()
+                to_bitmask(self.lanes_ne(other))
             }
         }
 
         impl Simd8PartialOrd for $md {
             #[inline]
             fn lt_eq(self, other: Self) -> u8 {
-                self.le(other).bitmask()
+                to_bitmask(self.lanes_le(other))
             }
 
             #[inline]
             fn lt(self, other: Self) -> u8 {
-                self.lt(other).bitmask()
+                to_bitmask(self.lanes_lt(other))
             }
 
             #[inline]
             fn gt_eq(self, other: Self) -> u8 {
-                self.ge(other).bitmask()
+                to_bitmask(self.lanes_ge(other))
             }
 
             #[inline]
             fn gt(self, other: Self) -> u8 {
-                self.gt(other).bitmask()
+                to_bitmask(self.lanes_gt(other))
             }
         }
     };
@@ -77,3 +77,14 @@ simd8_native!(days_ms);
 simd8_native_partial_eq!(days_ms);
 simd8_native!(months_days_ns);
 simd8_native_partial_eq!(months_days_ns);
+
+fn to_bitmask<T: std::simd::MaskElement>(mask: std::simd::Mask<T, 8>) -> u8 {
+    mask.test(0) as u8
+        | ((mask.test(1) as u8) << 1)
+        | ((mask.test(2) as u8) << 2)
+        | ((mask.test(3) as u8) << 3)
+        | ((mask.test(4) as u8) << 4)
+        | ((mask.test(5) as u8) << 5)
+        | ((mask.test(6) as u8) << 6)
+        | ((mask.test(7) as u8) << 7)
+}
diff --git a/src/lib.rs b/src/lib.rs
index 608bef98120..f54297e63ce 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,6 +4,7 @@
 // #![allow(clippy::len_without_is_empty)]
 #![cfg_attr(docsrs, feature(doc_cfg))]
+#![cfg_attr(feature = "simd", feature(portable_simd))]
 
 #[macro_use]
 pub mod array;
 
diff --git a/src/types/simd/mod.rs b/src/types/simd/mod.rs
index c0e90e4e710..45321b8f82b 100644
--- a/src/types/simd/mod.rs
+++ b/src/types/simd/mod.rs
@@ -13,7 +13,7 @@ pub trait FromMaskChunk<T> {
 /// # Safety
 /// The `NativeType` and the `NativeSimd` must have possible a matching alignment.
 /// e.g. slicing `&[NativeType]` by `align_of<NativeSimd>()` must be properly aligned/safe.
-pub unsafe trait NativeSimd: Default + Copy {
+pub unsafe trait NativeSimd: Sized + Default + Copy {
     /// Number of lanes
     const LANES: usize;
     /// The [`NativeType`] of this struct. E.g. `f32` for a `NativeSimd = f32x16`.
diff --git a/src/types/simd/packed.rs b/src/types/simd/packed.rs
index 160d47ecc35..0c03327212a 100644
--- a/src/types/simd/packed.rs
+++ b/src/types/simd/packed.rs
@@ -1,8 +1,12 @@
-pub use packed_simd::{
-    f32x16, f64x8, i16x32, i32x16, i64x8, i8x64, m16x32, m32x16, m64x8, m8x64, u16x32, u32x16,
-    u64x8, u8x64,
+pub use std::simd::{
+    f32x16, f32x8, f64x8, i16x32, i16x8, i32x16, i32x8, i64x8, i8x64, i8x8, mask32x16 as m32x16,
+    mask64x8 as m64x8, mask8x64 as m8x64, u16x32, u16x8, u32x16, u32x8, u64x8, u8x64, u8x8,
 };
 
+/// Vector of 32 16-bit masks
+#[allow(non_camel_case_types)]
+pub type m16x32 = std::simd::Mask<i16, 32>;
+
 use super::*;
 
 macro_rules! simd {
@@ -20,7 +24,7 @@ macro_rules! simd {
 
             #[inline]
             fn from_chunk(v: &[$type]) -> Self {
-                <$name>::from_slice_unaligned(v)
+                <$name>::from_slice(v)
             }
 
             #[inline]
@@ -67,28 +71,28 @@ chunk_macro!(u64, u8, u64x8, m64x8, from_chunk_u8);
 
 #[inline]
 fn from_chunk_u8(chunk: u8) -> m64x8 {
-    let idx = u64x8::new(1, 2, 4, 8, 16, 32, 64, 128);
+    let idx = u64x8::from_array([1, 2, 4, 8, 16, 32, 64, 128]);
     let vecmask = u64x8::splat(chunk as u64);
 
-    (idx & vecmask).eq(idx)
+    (idx & vecmask).lanes_eq(idx)
 }
 
 #[inline]
 fn from_chunk_u16(chunk: u16) -> m32x16 {
-    let idx = u32x16::new(
+    let idx = u32x16::from_array([
         1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
-    );
+    ]);
     let vecmask = u32x16::splat(chunk as u32);
 
-    (idx & vecmask).eq(idx)
+    (idx & vecmask).lanes_eq(idx)
 }
 
 #[inline]
 fn from_chunk_u32(chunk: u32) -> m16x32 {
-    let idx = u16x32::new(
+    let idx = u16x32::from_array([
         1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 1, 2, 4,
         8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
-    );
+    ]);
     let left = u16x32::from_chunk(&[
         1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -105,16 +109,16 @@ fn from_chunk_u32(chunk: u32) -> m16x32 {
     let vecmask1 = u16x32::splat(a1);
     let vecmask2 = u16x32::splat(a2);
 
-    (idx & left & vecmask1).eq(idx) | (idx & right & vecmask2).eq(idx)
+    (idx & left & vecmask1).lanes_eq(idx) | (idx & right & vecmask2).lanes_eq(idx)
 }
 
 #[inline]
 fn from_chunk_u64(chunk: u64) -> m8x64 {
-    let idx = u8x64::new(
+    let idx = u8x64::from_array([
         1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
         1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
         1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
-    );
+    ]);
     let idxs = [
         u8x64::from_chunk(&[
             1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -162,7 +166,7 @@ fn from_chunk_u64(chunk: u64) -> m8x64 {
     let mut result = m8x64::default();
 
     for i in 0..8 {
-        result |= (idxs[i] & u8x64::splat(a[i])).eq(idx)
+        result |= (idxs[i] & u8x64::splat(a[i])).lanes_eq(idx)
     }
 
     result
@@ -177,7 +181,7 @@ mod tests {
         let a = 0b00000001000000010000000100000001u32;
         let a = from_chunk_u32(a);
         for i in 0..32 {
-            assert_eq!(a.extract(i), i % 8 == 0)
+            assert_eq!(a.test(i), i % 8 == 0)
         }
     }
 
@@ -186,7 +190,7 @@ mod tests {
         let a = 0b0000000100000001000000010000000100000001000000010000000100000001u64;
         let a = from_chunk_u64(a);
         for i in 0..64 {
-            assert_eq!(a.extract(i), i % 8 == 0)
+            assert_eq!(a.test(i), i % 8 == 0)
         }
     }
 }