Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Migrated to portable simd
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Jan 9, 2022
1 parent 1c78fe9 commit e1e7af8
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 47 deletions.
4 changes: 1 addition & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ itertools = { version = "^0.10", optional = true }

base64 = { version = "0.13.0", optional = true }

packed_simd = { version = "0.3", optional = true, package = "packed_simd_2" }

# to write to parquet as a stream
futures = { version = "0.3", optional = true }

Expand Down Expand Up @@ -208,7 +206,7 @@ compute = [
# base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
io_parquet = ["parquet2", "io_ipc", "base64", "futures"]
benchmarks = ["rand"]
simd = ["packed_simd"]
simd = []

[package.metadata.cargo-all-features]
allowlist = ["compute", "compute_sort", "compute_hash", "compute_nullif"]
Expand Down
32 changes: 16 additions & 16 deletions src/compute/aggregate/simd/packed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,16 @@ macro_rules! simd_sum {
};
}

simd_sum!(f32x16, f32, sum);
simd_sum!(f64x8, f64, sum);
simd_sum!(u8x64, u8, wrapping_sum);
simd_sum!(u16x32, u16, wrapping_sum);
simd_sum!(u32x16, u32, wrapping_sum);
simd_sum!(u64x8, u64, wrapping_sum);
simd_sum!(i8x64, i8, wrapping_sum);
simd_sum!(i16x32, i16, wrapping_sum);
simd_sum!(i32x16, i32, wrapping_sum);
simd_sum!(i64x8, i64, wrapping_sum);
simd_sum!(f32x16, f32, horizontal_sum);
simd_sum!(f64x8, f64, horizontal_sum);
simd_sum!(u8x64, u8, horizontal_sum);
simd_sum!(u16x32, u16, horizontal_sum);
simd_sum!(u32x16, u32, horizontal_sum);
simd_sum!(u64x8, u64, horizontal_sum);
simd_sum!(i8x64, i8, horizontal_sum);
simd_sum!(i16x32, i16, horizontal_sum);
simd_sum!(i32x16, i32, horizontal_sum);
simd_sum!(i64x8, i64, horizontal_sum);

macro_rules! simd_ord_int {
($simd:tt, $type:ty) => {
Expand All @@ -33,22 +33,22 @@ macro_rules! simd_ord_int {

#[inline]
fn max_element(self) -> $type {
self.max_element()
self.horizontal_max()
}

#[inline]
fn min_element(self) -> $type {
self.min_element()
self.horizontal_min()
}

#[inline]
fn max(self, x: Self) -> Self {
self.max(x)
std::cmp::Ord::max(self, x)
}

#[inline]
fn min(self, x: Self) -> Self {
self.min(x)
std::cmp::Ord::min(self, x)
}

#[inline]
Expand All @@ -72,12 +72,12 @@ macro_rules! simd_ord_float {

#[inline]
fn max_element(self) -> $type {
self.max_element()
self.horizontal_max()
}

#[inline]
fn min_element(self) -> $type {
self.min_element()
self.horizontal_min()
}

#[inline]
Expand Down
31 changes: 21 additions & 10 deletions src/compute/comparison/simd/packed.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use std::convert::TryInto;

use super::{set, Simd8, Simd8Lanes};
use crate::types::simd::*;

use packed_simd::*;
use super::{set, Simd8, Simd8Lanes};

macro_rules! simd8 {
($type:ty, $md:ty) => {
Expand All @@ -13,44 +13,44 @@ macro_rules! simd8 {
impl Simd8Lanes<$type> for $md {
#[inline]
fn from_chunk(v: &[$type]) -> Self {
<$md>::from_slice_unaligned(v)
<$md>::from_slice(v)
}

#[inline]
fn from_incomplete_chunk(v: &[$type], remaining: $type) -> Self {
let mut a = [remaining; 8];
a.iter_mut().zip(v.iter()).for_each(|(a, b)| *a = *b);
Self::from_chunk(a.as_ref())
Self::from_array(a)
}

#[inline]
fn eq(self, other: Self) -> u8 {
self.eq(other).bitmask()
to_bitmask(self.lanes_eq(other))
}

#[inline]
fn neq(self, other: Self) -> u8 {
self.ne(other).bitmask()
to_bitmask(self.lanes_ne(other))
}

#[inline]
fn lt_eq(self, other: Self) -> u8 {
self.le(other).bitmask()
to_bitmask(self.lanes_le(other))
}

#[inline]
fn lt(self, other: Self) -> u8 {
self.lt(other).bitmask()
to_bitmask(self.lanes_lt(other))
}

#[inline]
fn gt_eq(self, other: Self) -> u8 {
self.ge(other).bitmask()
to_bitmask(self.lanes_ge(other))
}

#[inline]
fn gt(self, other: Self) -> u8 {
self.gt(other).bitmask()
to_bitmask(self.lanes_gt(other))
}
}
};
Expand All @@ -67,3 +67,14 @@ simd8!(i64, i64x8);
simd8_native!(i128);
simd8!(f32, f32x8);
simd8!(f64, f64x8);

/// Collapses an 8-lane SIMD mask into a `u8` bitmask.
///
/// Lane `i` of `mask` is mapped to bit `i` of the result, so lane 0
/// becomes the least significant bit. Replaces the `bitmask()` method
/// that `packed_simd` provided, since this nightly's `std::simd::Mask`
/// has no equivalent yet.
fn to_bitmask<T: std::simd::MaskElement>(mask: std::simd::Mask<T, 8>) -> u8 {
    let mut bits = 0u8;
    for lane in 0..8 {
        bits |= (mask.test(lane) as u8) << lane;
    }
    bits
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
//
#![allow(clippy::len_without_is_empty)]
#![cfg_attr(docsrs, feature(doc_cfg))]
#![cfg_attr(feature = "simd", feature(portable_simd))]

#[macro_use]
pub mod array;
Expand Down
2 changes: 1 addition & 1 deletion src/types/simd/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ pub trait FromMaskChunk<T> {
/// # Safety
/// It must be possible for the `NativeType` and the `NativeSimd` to have a matching alignment.
/// e.g. slicing `&[NativeType]` by `align_of<NativeSimd>()` must be properly aligned/safe.
pub unsafe trait NativeSimd: Default + Copy {
pub unsafe trait NativeSimd: Sized + Default + Copy {
/// Number of lanes
const LANES: usize;
/// The [`NativeType`] of this struct. E.g. `f32` for a `NativeSimd = f32x16`.
Expand Down
38 changes: 21 additions & 17 deletions src/types/simd/packed.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
pub use packed_simd::{
f32x16, f64x8, i16x32, i32x16, i64x8, i8x64, m16x32, m32x16, m64x8, m8x64, u16x32, u32x16,
u64x8, u8x64,
pub use std::simd::{
f32x16, f32x8, f64x8, i16x32, i16x8, i32x16, i32x8, i64x8, i8x64, i8x8, mask32x16 as m32x16,
mask64x8 as m64x8, mask8x64 as m8x64, u16x32, u16x8, u32x16, u32x8, u64x8, u8x64, u8x8,
};

/// Vector of 32 16-bit masks
#[allow(non_camel_case_types)]
pub type m16x32 = std::simd::Mask<i16, 32>;

use super::*;

macro_rules! simd {
Expand All @@ -20,7 +24,7 @@ macro_rules! simd {

#[inline]
fn from_chunk(v: &[$type]) -> Self {
<$name>::from_slice_unaligned(v)
<$name>::from_slice(v)
}

#[inline]
Expand Down Expand Up @@ -67,28 +71,28 @@ chunk_macro!(u64, u8, u64x8, m64x8, from_chunk_u8);

#[inline]
fn from_chunk_u8(chunk: u8) -> m64x8 {
let idx = u64x8::new(1, 2, 4, 8, 16, 32, 64, 128);
let idx = u64x8::from_array([1, 2, 4, 8, 16, 32, 64, 128]);
let vecmask = u64x8::splat(chunk as u64);

(idx & vecmask).eq(idx)
(idx & vecmask).lanes_eq(idx)
}

#[inline]
fn from_chunk_u16(chunk: u16) -> m32x16 {
let idx = u32x16::new(
let idx = u32x16::from_array([
1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
);
]);
let vecmask = u32x16::splat(chunk as u32);

(idx & vecmask).eq(idx)
(idx & vecmask).lanes_eq(idx)
}

#[inline]
fn from_chunk_u32(chunk: u32) -> m16x32 {
let idx = u16x32::new(
let idx = u16x32::from_array([
1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 1, 2, 4, 8,
16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
);
]);
let left = u16x32::from_chunk(&[
1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Expand All @@ -105,16 +109,16 @@ fn from_chunk_u32(chunk: u32) -> m16x32 {
let vecmask1 = u16x32::splat(a1);
let vecmask2 = u16x32::splat(a2);

(idx & left & vecmask1).eq(idx) | (idx & right & vecmask2).eq(idx)
(idx & left & vecmask1).lanes_eq(idx) | (idx & right & vecmask2).lanes_eq(idx)
}

#[inline]
fn from_chunk_u64(chunk: u64) -> m8x64 {
let idx = u8x64::new(
let idx = u8x64::from_array([
1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1,
2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2,
4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
);
]);
let idxs = [
u8x64::from_chunk(&[
1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Expand Down Expand Up @@ -162,7 +166,7 @@ fn from_chunk_u64(chunk: u64) -> m8x64 {

let mut result = m8x64::default();
for i in 0..8 {
result |= (idxs[i] & u8x64::splat(a[i])).eq(idx)
result |= (idxs[i] & u8x64::splat(a[i])).lanes_eq(idx)
}

result
Expand All @@ -177,7 +181,7 @@ mod tests {
let a = 0b00000001000000010000000100000001u32;
let a = from_chunk_u32(a);
for i in 0..32 {
assert_eq!(a.extract(i), i % 8 == 0)
assert_eq!(a.test(i), i % 8 == 0)
}
}

Expand All @@ -186,7 +190,7 @@ mod tests {
let a = 0b0000000100000001000000010000000100000001000000010000000100000001u64;
let a = from_chunk_u64(a);
for i in 0..64 {
assert_eq!(a.extract(i), i % 8 == 0)
assert_eq!(a.test(i), i % 8 == 0)
}
}
}

0 comments on commit e1e7af8

Please sign in to comment.