From 099d5ed3855956ff992302cac48f2f2f4af613a1 Mon Sep 17 00:00:00 2001
From: "Jorge C. Leitao" <jorgecarleitao@gmail.com>
Date: Mon, 22 Nov 2021 05:20:55 +0000
Subject: [PATCH] Migrated to portable simd

---
 Cargo.toml                            |  4 +--
 src/compute/aggregate/simd/packed.rs  | 32 +++++++++++-----------
 src/compute/comparison/simd/packed.rs | 29 +++++++++++++-------
 src/lib.rs                            |  1 +
 src/types/simd/mod.rs                 |  2 +-
 src/types/simd/packed.rs              | 38 +++++++++++++++------------
 6 files changed, 60 insertions(+), 46 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 13fbf2e05cf..d43579ccd45 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -58,8 +58,6 @@ itertools = { version = "^0.10", optional = true }
 
 base64 = { version = "0.13.0", optional = true }
 
-packed_simd = { version = "0.3", optional = true, package = "packed_simd_2" }
-
 # to write to parquet as a stream
 futures = { version = "0.3", optional = true }
 
@@ -208,7 +206,7 @@ compute = [
 # base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
 io_parquet = ["parquet2", "io_ipc", "base64", "futures"]
 benchmarks = ["rand"]
-simd = ["packed_simd"]
+simd = []
 
 [package.metadata.cargo-all-features]
 allowlist = ["compute", "compute_sort", "compute_hash", "compute_nullif"]
diff --git a/src/compute/aggregate/simd/packed.rs b/src/compute/aggregate/simd/packed.rs
index 5ce69468f4e..4e9336ac08c 100644
--- a/src/compute/aggregate/simd/packed.rs
+++ b/src/compute/aggregate/simd/packed.rs
@@ -14,16 +14,16 @@ macro_rules! simd_sum {
     };
 }
 
-simd_sum!(f32x16, f32, sum);
-simd_sum!(f64x8, f64, sum);
-simd_sum!(u8x64, u8, wrapping_sum);
-simd_sum!(u16x32, u16, wrapping_sum);
-simd_sum!(u32x16, u32, wrapping_sum);
-simd_sum!(u64x8, u64, wrapping_sum);
-simd_sum!(i8x64, i8, wrapping_sum);
-simd_sum!(i16x32, i16, wrapping_sum);
-simd_sum!(i32x16, i32, wrapping_sum);
-simd_sum!(i64x8, i64, wrapping_sum);
+simd_sum!(f32x16, f32, horizontal_sum);
+simd_sum!(f64x8, f64, horizontal_sum);
+simd_sum!(u8x64, u8, horizontal_sum);
+simd_sum!(u16x32, u16, horizontal_sum);
+simd_sum!(u32x16, u32, horizontal_sum);
+simd_sum!(u64x8, u64, horizontal_sum);
+simd_sum!(i8x64, i8, horizontal_sum);
+simd_sum!(i16x32, i16, horizontal_sum);
+simd_sum!(i32x16, i32, horizontal_sum);
+simd_sum!(i64x8, i64, horizontal_sum);
 
 macro_rules! simd_ord_int {
     ($simd:tt, $type:ty) => {
@@ -33,22 +33,22 @@ macro_rules! simd_ord_int {
 
         #[inline]
         fn max_element(self) -> $type {
-            self.max_element()
+            self.horizontal_max()
        }
 
         #[inline]
         fn min_element(self) -> $type {
-            self.min_element()
+            self.horizontal_min()
         }
 
         #[inline]
         fn max(self, x: Self) -> Self {
-            self.max(x)
+            std::cmp::Ord::max(self, x)
         }
 
         #[inline]
         fn min(self, x: Self) -> Self {
-            self.min(x)
+            std::cmp::Ord::min(self, x)
         }
 
         #[inline]
@@ -72,12 +72,12 @@ macro_rules! simd_ord_float {
 
         #[inline]
         fn max_element(self) -> $type {
-            self.max_element()
+            self.horizontal_max()
         }
 
         #[inline]
         fn min_element(self) -> $type {
-            self.min_element()
+            self.horizontal_min()
         }
 
         #[inline]
diff --git a/src/compute/comparison/simd/packed.rs b/src/compute/comparison/simd/packed.rs
index 1c18df437cc..c36c6d52cd7 100644
--- a/src/compute/comparison/simd/packed.rs
+++ b/src/compute/comparison/simd/packed.rs
@@ -1,6 +1,6 @@
 use std::convert::TryInto;
 
-use packed_simd::*;
+use crate::types::simd::*;
 
 use crate::types::{days_ms, months_days_ns};
 
@@ -15,48 +15,48 @@ macro_rules! simd8 {
         impl Simd8Lanes<$type> for $md {
             #[inline]
             fn from_chunk(v: &[$type]) -> Self {
-                <$md>::from_slice_unaligned(v)
+                <$md>::from_slice(v)
             }
 
             #[inline]
             fn from_incomplete_chunk(v: &[$type], remaining: $type) -> Self {
                 let mut a = [remaining; 8];
                 a.iter_mut().zip(v.iter()).for_each(|(a, b)| *a = *b);
-                Self::from_chunk(a.as_ref())
+                Self::from_array(a)
             }
         }
 
         impl Simd8PartialEq for $md {
             #[inline]
             fn eq(self, other: Self) -> u8 {
-                self.eq(other).bitmask()
+                to_bitmask(self.lanes_eq(other))
             }
 
             #[inline]
             fn neq(self, other: Self) -> u8 {
-                self.ne(other).bitmask()
+                to_bitmask(self.lanes_ne(other))
             }
         }
 
         impl Simd8PartialOrd for $md {
             #[inline]
             fn lt_eq(self, other: Self) -> u8 {
-                self.le(other).bitmask()
+                to_bitmask(self.lanes_le(other))
             }
 
             #[inline]
             fn lt(self, other: Self) -> u8 {
-                self.lt(other).bitmask()
+                to_bitmask(self.lanes_lt(other))
             }
 
             #[inline]
             fn gt_eq(self, other: Self) -> u8 {
-                self.ge(other).bitmask()
+                to_bitmask(self.lanes_ge(other))
             }
 
             #[inline]
             fn gt(self, other: Self) -> u8 {
-                self.gt(other).bitmask()
+                to_bitmask(self.lanes_gt(other))
             }
         }
     };
@@ -77,3 +77,14 @@ simd8_native!(days_ms);
 simd8_native_partial_eq!(days_ms);
 simd8_native!(months_days_ns);
 simd8_native_partial_eq!(months_days_ns);
+
+fn to_bitmask<T: std::simd::MaskElement>(mask: std::simd::Mask<T, 8>) -> u8 {
+    mask.test(0) as u8
+        | ((mask.test(1) as u8) << 1)
+        | ((mask.test(2) as u8) << 2)
+        | ((mask.test(3) as u8) << 3)
+        | ((mask.test(4) as u8) << 4)
+        | ((mask.test(5) as u8) << 5)
+        | ((mask.test(6) as u8) << 6)
+        | ((mask.test(7) as u8) << 7)
+}
diff --git a/src/lib.rs b/src/lib.rs
index 608bef98120..f54297e63ce 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,6 +4,7 @@
 // #![allow(clippy::len_without_is_empty)]
 #![cfg_attr(docsrs, feature(doc_cfg))]
+#![cfg_attr(feature = "simd", feature(portable_simd))]
 
 #[macro_use]
 pub mod array;
 
diff --git a/src/types/simd/mod.rs b/src/types/simd/mod.rs
index c0e90e4e710..45321b8f82b 100644
--- a/src/types/simd/mod.rs
+++ b/src/types/simd/mod.rs
@@ -13,7 +13,7 @@ pub trait FromMaskChunk<T> {
 /// # Safety
 /// The `NativeType` and the `NativeSimd` must have possible a matching alignment.
 /// e.g. slicing `&[NativeType]` by `align_of<NativeSimd>()` must be properly aligned/safe.
-pub unsafe trait NativeSimd: Default + Copy {
+pub unsafe trait NativeSimd: Sized + Default + Copy {
     /// Number of lanes
     const LANES: usize;
     /// The [`NativeType`] of this struct. E.g. `f32` for a `NativeSimd = f32x16`.
diff --git a/src/types/simd/packed.rs b/src/types/simd/packed.rs
index 160d47ecc35..0c03327212a 100644
--- a/src/types/simd/packed.rs
+++ b/src/types/simd/packed.rs
@@ -1,8 +1,12 @@
-pub use packed_simd::{
-    f32x16, f64x8, i16x32, i32x16, i64x8, i8x64, m16x32, m32x16, m64x8, m8x64, u16x32, u32x16,
-    u64x8, u8x64,
+pub use std::simd::{
+    f32x16, f32x8, f64x8, i16x32, i16x8, i32x16, i32x8, i64x8, i8x64, i8x8, mask32x16 as m32x16,
+    mask64x8 as m64x8, mask8x64 as m8x64, u16x32, u16x8, u32x16, u32x8, u64x8, u8x64, u8x8,
 };
 
+/// Vector of 32 16-bit masks
+#[allow(non_camel_case_types)]
+pub type m16x32 = std::simd::Mask<i16, 32>;
+
 use super::*;
 
 macro_rules! simd {
@@ -20,7 +24,7 @@ macro_rules! simd {
 
             #[inline]
             fn from_chunk(v: &[$type]) -> Self {
-                <$name>::from_slice_unaligned(v)
+                <$name>::from_slice(v)
             }
 
             #[inline]
@@ -67,28 +71,28 @@ chunk_macro!(u64, u8, u64x8, m64x8, from_chunk_u8);
 
 #[inline]
 fn from_chunk_u8(chunk: u8) -> m64x8 {
-    let idx = u64x8::new(1, 2, 4, 8, 16, 32, 64, 128);
+    let idx = u64x8::from_array([1, 2, 4, 8, 16, 32, 64, 128]);
     let vecmask = u64x8::splat(chunk as u64);
 
-    (idx & vecmask).eq(idx)
+    (idx & vecmask).lanes_eq(idx)
 }
 
 #[inline]
 fn from_chunk_u16(chunk: u16) -> m32x16 {
-    let idx = u32x16::new(
+    let idx = u32x16::from_array([
         1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
-    );
+    ]);
     let vecmask = u32x16::splat(chunk as u32);
 
-    (idx & vecmask).eq(idx)
+    (idx & vecmask).lanes_eq(idx)
 }
 
 #[inline]
 fn from_chunk_u32(chunk: u32) -> m16x32 {
-    let idx = u16x32::new(
+    let idx = u16x32::from_array([
         1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 1, 2, 4,
         8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
-    );
+    ]);
     let left = u16x32::from_chunk(&[
         1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -105,16 +109,16 @@ fn from_chunk_u32(chunk: u32) -> m16x32 {
     let vecmask1 = u16x32::splat(a1);
     let vecmask2 = u16x32::splat(a2);
 
-    (idx & left & vecmask1).eq(idx) | (idx & right & vecmask2).eq(idx)
+    (idx & left & vecmask1).lanes_eq(idx) | (idx & right & vecmask2).lanes_eq(idx)
 }
 
 #[inline]
 fn from_chunk_u64(chunk: u64) -> m8x64 {
-    let idx = u8x64::new(
+    let idx = u8x64::from_array([
         1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
         1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
         1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
-    );
+    ]);
     let idxs = [
         u8x64::from_chunk(&[
             1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -162,7 +166,7 @@ fn from_chunk_u64(chunk: u64) -> m8x64 {
     let mut result = m8x64::default();
 
     for i in 0..8 {
-        result |= (idxs[i] & u8x64::splat(a[i])).eq(idx)
+        result |= (idxs[i] & u8x64::splat(a[i])).lanes_eq(idx)
     }
 
     result
@@ -177,7 +181,7 @@ mod tests {
         let a = 0b00000001000000010000000100000001u32;
         let a = from_chunk_u32(a);
         for i in 0..32 {
-            assert_eq!(a.extract(i), i % 8 == 0)
+            assert_eq!(a.test(i), i % 8 == 0)
         }
     }
 
@@ -186,7 +190,7 @@ mod tests {
         let a = 0b0000000100000001000000010000000100000001000000010000000100000001u64;
         let a = from_chunk_u64(a);
         for i in 0..64 {
-            assert_eq!(a.extract(i), i % 8 == 0)
+            assert_eq!(a.test(i), i % 8 == 0)
         }
     }
 }