From 0a4c5a40416a6a9c55dad298f183e8102f12a309 Mon Sep 17 00:00:00 2001
From: Orson Peters <orsonpeters@gmail.com>
Date: Mon, 18 Mar 2024 15:06:54 +0100
Subject: [PATCH 1/8] add POLARS_DISABLE_AVX512 environment variable

---
 crates/polars-compute/src/filter/primitive.rs |  9 +++----
 crates/polars-utils/src/cpuid.rs              | 24 +++++++++++++++++++
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/crates/polars-compute/src/filter/primitive.rs b/crates/polars-compute/src/filter/primitive.rs
index 10c00afdff1c..65a88adf1438 100644
--- a/crates/polars-compute/src/filter/primitive.rs
+++ b/crates/polars-compute/src/filter/primitive.rs
@@ -1,5 +1,6 @@
 use arrow::bitmap::Bitmap;
 use bytemuck::{cast_slice, cast_vec, Pod};
+use polars_utils::cpuid::is_avx512_enabled;
 
 #[cfg(all(target_arch = "x86_64", feature = "simd"))]
 use super::avx512;
@@ -28,7 +29,7 @@ pub fn filter_values<T: Pod>(values: &[T], mask: &Bitmap) -> Vec<T> {
 
 fn filter_values_u8(values: &[u8], mask: &Bitmap) -> Vec<u8> {
     #[cfg(all(target_arch = "x86_64", feature = "simd"))]
-    if std::arch::is_x86_feature_detected!("avx512vbmi2") {
+    if is_avx512_enabled() && std::arch::is_x86_feature_detected!("avx512vbmi2") {
         return filter_values_generic(values, mask, 64, avx512::filter_u8_avx512vbmi2);
     }
 
@@ -37,7 +38,7 @@ fn filter_values_u8(values: &[u8], mask: &Bitmap) -> Vec<u8> {
 
 fn filter_values_u16(values: &[u16], mask: &Bitmap) -> Vec<u16> {
     #[cfg(all(target_arch = "x86_64", feature = "simd"))]
-    if std::arch::is_x86_feature_detected!("avx512vbmi2") {
+    if is_avx512_enabled() && std::arch::is_x86_feature_detected!("avx512vbmi2") {
         return filter_values_generic(values, mask, 32, avx512::filter_u16_avx512vbmi2);
     }
 
@@ -46,7 +47,7 @@ fn filter_values_u16(values: &[u16], mask: &Bitmap) -> Vec<u16> {
 
 fn filter_values_u32(values: &[u32], mask: &Bitmap) -> Vec<u32> {
     #[cfg(all(target_arch = "x86_64", feature = "simd"))]
-    if std::arch::is_x86_feature_detected!("avx512f") {
+    if is_avx512_enabled() {
         return filter_values_generic(values, mask, 16, avx512::filter_u32_avx512f);
     }
 
@@ -55,7 +56,7 @@ fn filter_values_u32(values: &[u32], mask: &Bitmap) -> Vec<u32> {
 
 fn filter_values_u64(values: &[u64], mask: &Bitmap) -> Vec<u64> {
     #[cfg(all(target_arch = "x86_64", feature = "simd"))]
-    if std::arch::is_x86_feature_detected!("avx512f") {
+    if is_avx512_enabled() {
        return filter_values_generic(values, mask, 8, avx512::filter_u64_avx512f);
     }
 
diff --git a/crates/polars-utils/src/cpuid.rs b/crates/polars-utils/src/cpuid.rs
index 71eda848a878..f7642e3e574c 100644
--- a/crates/polars-utils/src/cpuid.rs
+++ b/crates/polars-utils/src/cpuid.rs
@@ -37,3 +37,27 @@ pub fn has_fast_bmi2() -> bool {
 
     false
 }
+
+#[inline]
+pub fn is_avx512_enabled() -> bool {
+    #[cfg(target_arch = "x86_64")]
+    {
+        static CACHE: OnceLock<bool> = OnceLock::new();
+        return *CACHE.get_or_init(|| {
+            if !std::arch::is_x86_feature_detected!("avx512f") {
+                return false;
+            }
+
+            if std::env::var("POLARS_DISABLE_AVX512")
+                .map(|var| var == "1")
+                .unwrap_or(false)
+            {
+                return false;
+            }
+
+            true
+        });
+    }
+
+    false
+}

From 20fbb5d1779b80f4aa90411859926c4fd7e4d15b Mon Sep 17 00:00:00 2001
From: Orson Peters <orsonpeters@gmail.com>
Date: Mon, 18 Mar 2024 15:22:00 +0100
Subject: [PATCH 2/8] add avx-512

---
 crates/polars-compute/src/comparisons/mod.rs  |  11 --
 .../polars-compute/src/comparisons/scalar.rs  |   7 +-
 crates/polars-compute/src/filter/primitive.rs |   2 +
 crates/polars-compute/src/if_then_else/mod.rs |   8 +-
 .../polars-compute/src/if_then_else/simd.rs   | 163 ++++++++++++++++++
 crates/polars-compute/src/lib.rs              |  14 ++
 6 files changed, 189 insertions(+), 16 deletions(-)
 create mode 100644 crates/polars-compute/src/if_then_else/simd.rs

diff --git a/crates/polars-compute/src/comparisons/mod.rs b/crates/polars-compute/src/comparisons/mod.rs
index a0baebad6b7d..c1b7dcb75981 100644
--- a/crates/polars-compute/src/comparisons/mod.rs
+++ b/crates/polars-compute/src/comparisons/mod.rs
@@ -72,17 +72,6 @@ pub trait TotalOrdKernel: Sized + Array {
     }
 }
 
-// Trait to enable the scalar blanket implementation.
-trait NotSimdPrimitive {}
-
-#[cfg(not(feature = "simd"))]
-impl<T> NotSimdPrimitive for T {}
-
-#[cfg(feature = "simd")]
-impl NotSimdPrimitive for u128 {}
-#[cfg(feature = "simd")]
-impl NotSimdPrimitive for i128 {}
-
 mod scalar;
 mod view;
diff --git a/crates/polars-compute/src/comparisons/scalar.rs b/crates/polars-compute/src/comparisons/scalar.rs
index bc338f3816a8..954880d7edf3 100644
--- a/crates/polars-compute/src/comparisons/scalar.rs
+++ b/crates/polars-compute/src/comparisons/scalar.rs
@@ -1,11 +1,12 @@
 use arrow::array::{BinaryArray, BooleanArray, PrimitiveArray, Utf8Array};
 use arrow::bitmap::{self, Bitmap};
-use arrow::types::NativeType;
 use polars_utils::total_ord::{TotalEq, TotalOrd};
 
-use super::{NotSimdPrimitive, TotalOrdKernel};
+use crate::NotSimdPrimitive;
 
-impl<T: NativeType + NotSimdPrimitive + TotalOrd> TotalOrdKernel for PrimitiveArray<T> {
+use super::TotalOrdKernel;
+
+impl<T: NotSimdPrimitive + TotalOrd> TotalOrdKernel for PrimitiveArray<T> {
     type Scalar = T;
 
     fn tot_lt_kernel(&self, other: &Self) -> Bitmap {
diff --git a/crates/polars-compute/src/filter/primitive.rs b/crates/polars-compute/src/filter/primitive.rs
index 65a88adf1438..d7b8bc233011 100644
--- a/crates/polars-compute/src/filter/primitive.rs
+++ b/crates/polars-compute/src/filter/primitive.rs
@@ -1,5 +1,7 @@
 use arrow::bitmap::Bitmap;
 use bytemuck::{cast_slice, cast_vec, Pod};
+
+#[cfg(target_arch = "x86_64")]
 use polars_utils::cpuid::is_avx512_enabled;
 
 #[cfg(all(target_arch = "x86_64", feature = "simd"))]
diff --git a/crates/polars-compute/src/if_then_else/mod.rs b/crates/polars-compute/src/if_then_else/mod.rs
index 3f704ca1d5f4..0f01e393bf2e 100644
--- a/crates/polars-compute/src/if_then_else/mod.rs
+++ b/crates/polars-compute/src/if_then_else/mod.rs
@@ -4,13 +4,16 @@
 use arrow::array::{Array, PrimitiveArray};
 use arrow::bitmap::utils::{align_bitslice_start_u8, SlicesIterator};
 use arrow::bitmap::{self, Bitmap};
 use arrow::datatypes::ArrowDataType;
-use arrow::types::NativeType;
 use polars_utils::slice::load_padded_le_u64;
 
+use crate::NotSimdPrimitive;
+
 mod array;
 mod boolean;
 mod list;
 mod scalar;
+#[cfg(feature = "simd")]
+mod simd;
 mod view;
 
@@ -35,7 +38,7 @@ pub trait IfThenElseKernel: Sized + Array {
     ) -> Self;
 }
 
-impl<T: NativeType> IfThenElseKernel for PrimitiveArray<T> {
+impl<T: NotSimdPrimitive> IfThenElseKernel for PrimitiveArray<T> {
     type Scalar<'a> = T;
 
     fn if_then_else(mask: &Bitmap, if_true: &Self, if_false: &Self) -> Self {
@@ -98,6 +101,7 @@ impl<T: NotSimdPrimitive> IfThenElseKernel for PrimitiveArray<T> {
     }
 }
 
+
 fn if_then_else_validity(
     mask: &Bitmap,
     if_true: Option<&Bitmap>,
diff --git a/crates/polars-compute/src/if_then_else/simd.rs b/crates/polars-compute/src/if_then_else/simd.rs
new file mode 100644
index 000000000000..38aa93543eef
--- /dev/null
+++ b/crates/polars-compute/src/if_then_else/simd.rs
@@ -0,0 +1,163 @@
+use std::mem::MaybeUninit;
+use std::simd::{Mask, Simd, SimdElement};
+
+use arrow::array::PrimitiveArray;
+use arrow::bitmap::Bitmap;
+use arrow::datatypes::ArrowDataType;
+
+#[cfg(target_arch = "x86_64")]
+use polars_utils::cpuid::is_avx512_enabled;
+
+use super::{
+    if_then_else_loop, if_then_else_loop_broadcast_both, if_then_else_loop_broadcast_false,
+    if_then_else_validity, scalar, IfThenElseKernel,
+};
+
+fn select_simd_64<T: SimdElement>(
+    mask: u64,
+    if_true: Simd<T, 64>,
+    if_false: Simd<T, 64>,
+    out: &mut [MaybeUninit<T>; 64],
+) {
+    let mv = Mask::<<T as SimdElement>::Mask, 64>::from_bitmask(mask);
+    let ret = mv.select(if_true, if_false);
+    unsafe {
+        let src = ret.as_array().as_ptr() as *const MaybeUninit<T>;
+        core::ptr::copy_nonoverlapping(src, out.as_mut_ptr(), 64);
+    }
+}
+
+fn if_then_else_simd_64<T: SimdElement>(
+    mask: u64,
+    if_true: &[T; 64],
+    if_false: &[T; 64],
+    out: &mut [MaybeUninit<T>; 64],
+) {
+    select_simd_64(
+        mask,
+        Simd::from_slice(if_true),
+        Simd::from_slice(if_false),
+        out,
+    )
+}
+
+fn if_then_else_broadcast_false_simd_64<T: SimdElement>(
+    mask: u64,
+    if_true: &[T; 64],
+    if_false: T,
+    out: &mut [MaybeUninit<T>; 64],
+) {
+    select_simd_64(mask, Simd::from_slice(if_true), Simd::splat(if_false), out)
+}
+
+fn if_then_else_broadcast_both_simd_64<T: SimdElement>(
+    mask: u64,
+    if_true: T,
+    if_false: T,
+    out: &mut [MaybeUninit<T>; 64],
+) {
+    select_simd_64(mask, Simd::splat(if_true), Simd::splat(if_false), out)
+}
+
+// The AVX-512 kernels get autogenerated by inlining the routine into the
+// functions with the appropriate target_feature enabled.
+
+/// # Safety
+/// AVX-512BW must be available.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f")]
+#[target_feature(enable = "avx512bw")]
+unsafe fn with_avx512<R>(f: impl FnOnce() -> R) -> R {
+    f()
+}
+
+fn with_avx512_if_available<R>(f: impl FnOnce() -> R) -> R {
+    #[cfg(target_arch = "x86_64")]
+    if is_avx512_enabled() && std::arch::is_x86_feature_detected!("avx512bw") {
+        return unsafe { with_avx512(f) };
+    }
+
+    f()
+}
+
+macro_rules! impl_if_then_else {
+    ($T: ty) => {
+        impl IfThenElseKernel for PrimitiveArray<$T> {
+            type Scalar<'a> = $T;
+
+            fn if_then_else(mask: &Bitmap, if_true: &Self, if_false: &Self) -> Self {
+                let values = with_avx512_if_available(|| {
+                    if_then_else_loop(
+                        mask,
+                        if_true.values(),
+                        if_false.values(),
+                        scalar::if_then_else_scalar_rest,
+                        if_then_else_simd_64,
+                    )
+                });
+                let validity = if_then_else_validity(mask, if_true.validity(), if_false.validity());
+                PrimitiveArray::from_vec(values).with_validity(validity)
+            }
+
+            fn if_then_else_broadcast_true(
+                mask: &Bitmap,
+                if_true: Self::Scalar<'_>,
+                if_false: &Self,
+            ) -> Self {
+                let values = with_avx512_if_available(|| {
+                    if_then_else_loop_broadcast_false(
+                        true,
+                        mask,
+                        if_false.values(),
+                        if_true,
+                        if_then_else_broadcast_false_simd_64,
+                    )
+                });
+                let validity = if_then_else_validity(mask, None, if_false.validity());
+                PrimitiveArray::from_vec(values).with_validity(validity)
+            }
+
+            fn if_then_else_broadcast_false(
+                mask: &Bitmap,
+                if_true: &Self,
+                if_false: Self::Scalar<'_>,
+            ) -> Self {
+                let values = with_avx512_if_available(|| if_then_else_loop_broadcast_false(
+                    false,
+                    mask,
+                    if_true.values(),
+                    if_false,
+                    if_then_else_broadcast_false_simd_64,
+                ));
+                let validity = if_then_else_validity(mask, if_true.validity(), None);
+                PrimitiveArray::from_vec(values).with_validity(validity)
+            }
+
+            fn if_then_else_broadcast_both(
+                _dtype: ArrowDataType,
+                mask: &Bitmap,
+                if_true: Self::Scalar<'_>,
+                if_false: Self::Scalar<'_>,
+            ) -> Self {
+                let values = with_avx512_if_available(|| if_then_else_loop_broadcast_both(
+                    mask,
+                    if_true,
+                    if_false,
+                    if_then_else_broadcast_both_simd_64,
+                ));
+                PrimitiveArray::from_vec(values)
+            }
+        }
+    }
+}
+
+impl_if_then_else!(i8);
+impl_if_then_else!(i16);
+impl_if_then_else!(i32);
+impl_if_then_else!(i64);
+impl_if_then_else!(u8);
+impl_if_then_else!(u16);
+impl_if_then_else!(u32);
+impl_if_then_else!(u64);
+impl_if_then_else!(f32);
+impl_if_then_else!(f64);
\ No newline at end of file
diff --git a/crates/polars-compute/src/lib.rs b/crates/polars-compute/src/lib.rs
index cc477a817739..d21b72118d09 100644
--- a/crates/polars-compute/src/lib.rs
+++ b/crates/polars-compute/src/lib.rs
@@ -5,6 +5,8 @@
     feature(stdarch_x86_avx512)
 )]
 
+use arrow::types::NativeType;
+
 pub mod arithmetic;
 pub mod comparisons;
 pub mod filter;
@@ -12,3 +14,15 @@ pub mod if_then_else;
 pub mod min_max;
 
 pub mod arity;
+
+
+// Trait to enable the scalar blanket implementation.
+pub trait NotSimdPrimitive : NativeType {}
+
+#[cfg(not(feature = "simd"))]
+impl<T> NotSimdPrimitive for T {}
+
+#[cfg(feature = "simd")]
+impl NotSimdPrimitive for u128 {}
+#[cfg(feature = "simd")]
+impl NotSimdPrimitive for i128 {}

From 2b3e744159a2f085fee2d511b0c77ad6bf9b6c55 Mon Sep 17 00:00:00 2001
From: Orson Peters <orsonpeters@gmail.com>
Date: Mon, 18 Mar 2024 16:41:58 +0100
Subject: [PATCH 3/8] made view if-then-else more robust against compiler
 whims

---
 .../polars-compute/src/if_then_else/scalar.rs |  4 +-
 .../polars-compute/src/if_then_else/simd.rs   | 51 +++++++++++++------
 .../polars-compute/src/if_then_else/view.rs   | 23 +++++++--
 3 files changed, 55 insertions(+), 23 deletions(-)

diff --git a/crates/polars-compute/src/if_then_else/scalar.rs b/crates/polars-compute/src/if_then_else/scalar.rs
index 2e1d6b396ddb..310da8cfb121 100644
--- a/crates/polars-compute/src/if_then_else/scalar.rs
+++ b/crates/polars-compute/src/if_then_else/scalar.rs
@@ -6,7 +6,7 @@ pub fn if_then_else_scalar_rest<T: Copy>(
     if_false: &[T],
     out: &mut [MaybeUninit<T>],
 ) {
-    assert!(if_true.len() <= out.len()); // Removes bounds checks in inner loop.
+    assert!(if_true.len() == out.len()); // Removes bounds checks in inner loop.
     let true_it = if_true.iter().copied();
     let false_it = if_false.iter().copied();
     for (i, (t, f)) in true_it.zip(false_it).enumerate() {
@@ -21,7 +21,7 @@ pub fn if_then_else_broadcast_false_scalar_rest<T: Copy>(
     if_false: T,
     out: &mut [MaybeUninit<T>],
 ) {
-    assert!(if_true.len() <= out.len()); // Removes bounds checks in inner loop.
+    assert!(if_true.len() == out.len()); // Removes bounds checks in inner loop.
     let true_it = if_true.iter().copied();
     for (i, t) in true_it.enumerate() {
         let src = if (mask >> i) & 1 != 0 { t } else { if_false };
diff --git a/crates/polars-compute/src/if_then_else/simd.rs b/crates/polars-compute/src/if_then_else/simd.rs
index 38aa93543eef..1819df6f3526 100644
--- a/crates/polars-compute/src/if_then_else/simd.rs
+++ b/crates/polars-compute/src/if_then_else/simd.rs
@@ -4,7 +4,6 @@ use std::simd::{Mask, Simd, SimdElement};
 use arrow::array::PrimitiveArray;
 use arrow::bitmap::Bitmap;
 use arrow::datatypes::ArrowDataType;
-
 #[cfg(target_arch = "x86_64")]
 use polars_utils::cpuid::is_avx512_enabled;
 
@@ -92,7 +91,11 @@ macro_rules! impl_if_then_else {
                         if_true.values(),
                         if_false.values(),
                         scalar::if_then_else_scalar_rest,
+                        // Auto-generated SIMD was slower on ARM.
+                        #[cfg(target_arch = "x86_64")]
                         if_then_else_simd_64,
+                        #[cfg(not(target_arch = "x86_64"))]
+                        scalar::if_then_else_scalar_64,
                     )
                 });
                 let validity = if_then_else_validity(mask, if_true.validity(), if_false.validity());
@@ -110,7 +113,11 @@ macro_rules! impl_if_then_else {
                         mask,
                         if_false.values(),
                         if_true,
+                        // Auto-generated SIMD was slower on ARM.
+                        #[cfg(target_arch = "x86_64")]
                         if_then_else_broadcast_false_simd_64,
+                        #[cfg(not(target_arch = "x86_64"))]
+                        scalar::if_then_else_broadcast_false_scalar_64,
                     )
                 });
                 let validity = if_then_else_validity(mask, None, if_false.validity());
@@ -122,13 +129,19 @@ macro_rules! impl_if_then_else {
                 if_true: &Self,
                 if_false: Self::Scalar<'_>,
             ) -> Self {
-                let values = with_avx512_if_available(|| if_then_else_loop_broadcast_false(
-                    false,
-                    mask,
-                    if_true.values(),
-                    if_false,
-                    if_then_else_broadcast_false_simd_64,
-                ));
+                let values = with_avx512_if_available(|| {
+                    if_then_else_loop_broadcast_false(
+                        false,
+                        mask,
+                        if_true.values(),
+                        if_false,
+                        // Auto-generated SIMD was slower on ARM.
+                        #[cfg(target_arch = "x86_64")]
+                        if_then_else_broadcast_false_simd_64,
+                        #[cfg(not(target_arch = "x86_64"))]
+                        scalar::if_then_else_broadcast_false_scalar_64,
+                    )
+                });
                 let validity = if_then_else_validity(mask, if_true.validity(), None);
                 PrimitiveArray::from_vec(values).with_validity(validity)
             }
@@ -139,16 +152,22 @@ macro_rules! impl_if_then_else {
                 if_true: Self::Scalar<'_>,
                 if_false: Self::Scalar<'_>,
             ) -> Self {
-                let values = with_avx512_if_available(|| if_then_else_loop_broadcast_both(
-                    mask,
-                    if_true,
-                    if_false,
-                    if_then_else_broadcast_both_simd_64,
-                ));
+                let values = with_avx512_if_available(|| {
+                    if_then_else_loop_broadcast_both(
+                        mask,
+                        if_true,
+                        if_false,
+                        // Auto-generated SIMD was slower on ARM.
+                        #[cfg(target_arch = "x86_64")]
+                        if_then_else_broadcast_both_simd_64,
+                        #[cfg(not(target_arch = "x86_64"))]
+                        scalar::if_then_else_broadcast_both_scalar_64,
+                    )
+                });
                 PrimitiveArray::from_vec(values)
             }
         }
-    }
+    };
 }
 
 impl_if_then_else!(i8);
@@ -160,4 +179,4 @@ impl_if_then_else!(u16);
 impl_if_then_else!(u32);
 impl_if_then_else!(u64);
 impl_if_then_else!(f32);
-impl_if_then_else!(f64);
\ No newline at end of file
+impl_if_then_else!(f64);
diff --git a/crates/polars-compute/src/if_then_else/view.rs b/crates/polars-compute/src/if_then_else/view.rs
index 5b1100153b03..64e9eba3eb8b 100644
--- a/crates/polars-compute/src/if_then_else/view.rs
+++ b/crates/polars-compute/src/if_then_else/view.rs
@@ -7,9 +7,7 @@ use arrow::buffer::Buffer;
 use arrow::datatypes::ArrowDataType;
 
 use super::IfThenElseKernel;
-use crate::if_then_else::scalar::{
-    if_then_else_broadcast_both_scalar_64, if_then_else_broadcast_false_scalar_64,
-};
+use crate::if_then_else::scalar::if_then_else_broadcast_both_scalar_64;
 
 // Makes a buffer and a set of views into that buffer from a set of strings.
 // Does not allocate a buffer if not necessary.
@@ -87,7 +85,7 @@ impl IfThenElseKernel for BinaryViewArray {
             mask,
             if_false.views(),
             true_view,
-            if_then_else_broadcast_false_scalar_64,
+            if_then_else_broadcast_false_view_64,
         );
 
         let validity = super::if_then_else_validity(mask, None, if_false.validity());
@@ -120,7 +118,7 @@ impl IfThenElseKernel for BinaryViewArray {
             mask,
             if_true.views(),
             false_view,
-            if_then_else_broadcast_false_scalar_64,
+            if_then_else_broadcast_false_view_64,
         );
 
         let validity = super::if_then_else_validity(mask, if_true.validity(), None);
@@ -242,3 +240,18 @@ pub fn if_then_else_view_64(
 ) {
     if_then_else_view_rest(mask, if_true, if_false, out, false_buffer_idx_offset)
 }
+
+// Using the scalar variant of this works, but was slower, we want to select a source pointer and
+// then copy it. Using this version for the integers results in branches.
+pub fn if_then_else_broadcast_false_view_64(
+    mask: u64,
+    if_true: &[View; 64],
+    if_false: View,
+    out: &mut [MaybeUninit<View>; 64],
+) {
+    assert!(if_true.len() == out.len()); // Removes bounds checks in inner loop.
+    for (i, t) in if_true.iter().enumerate() {
+        let src = if (mask >> i) & 1 != 0 { t } else { &if_false };
+        out[i] = MaybeUninit::new(*src);
+    }
+}

From 1f77c9135d403dcadee7f58f3d85a9bfa63f4b18 Mon Sep 17 00:00:00 2001
From: "orsonpeters@gmail.com" <orsonpeters@gmail.com>
Date: Mon, 18 Mar 2024 16:27:11 +0000
Subject: [PATCH 4/8] remove AVX-512, no difference

---
 .../polars-compute/src/if_then_else/simd.rs | 37 ++++++---------------
 1 file changed, 8 insertions(+), 29 deletions(-)

diff --git a/crates/polars-compute/src/if_then_else/simd.rs b/crates/polars-compute/src/if_then_else/simd.rs
index 1819df6f3526..0958a8e2f0a9 100644
--- a/crates/polars-compute/src/if_then_else/simd.rs
+++ b/crates/polars-compute/src/if_then_else/simd.rs
@@ -58,34 +58,13 @@ fn if_then_else_broadcast_both_simd_64<T: SimdElement>(
     select_simd_64(mask, Simd::splat(if_true), Simd::splat(if_false), out)
 }
 
-// The AVX-512 kernels get autogenerated by inlining the routine into the
-// functions with the appropriate target_feature enabled.
-
-/// # Safety
-/// AVX-512BW must be available.
-#[cfg(target_arch = "x86_64")]
-#[target_feature(enable = "avx512f")]
-#[target_feature(enable = "avx512bw")]
-unsafe fn with_avx512<R>(f: impl FnOnce() -> R) -> R {
-    f()
-}
-
-fn with_avx512_if_available<R>(f: impl FnOnce() -> R) -> R {
-    #[cfg(target_arch = "x86_64")]
-    if is_avx512_enabled() && std::arch::is_x86_feature_detected!("avx512bw") {
-        return unsafe { with_avx512(f) };
-    }
-
-    f()
-}
-
 macro_rules! impl_if_then_else {
     ($T: ty) => {
         impl IfThenElseKernel for PrimitiveArray<$T> {
             type Scalar<'a> = $T;
 
             fn if_then_else(mask: &Bitmap, if_true: &Self, if_false: &Self) -> Self {
-                let values = with_avx512_if_available(|| {
+                let values =
                     if_then_else_loop(
                         mask,
                         if_true.values(),
@@ -97,7 +76,7 @@ macro_rules! impl_if_then_else {
                         #[cfg(not(target_arch = "x86_64"))]
                         scalar::if_then_else_scalar_64,
                     )
-                });
+                    ;
                 let validity = if_then_else_validity(mask, if_true.validity(), if_false.validity());
                 PrimitiveArray::from_vec(values).with_validity(validity)
             }
@@ -107,7 +86,7 @@ macro_rules! impl_if_then_else {
                 if_true: Self::Scalar<'_>,
                 if_false: &Self,
             ) -> Self {
-                let values = with_avx512_if_available(|| {
+                let values =
                     if_then_else_loop_broadcast_false(
                         true,
                         mask,
@@ -119,7 +98,7 @@ macro_rules! impl_if_then_else {
                         #[cfg(not(target_arch = "x86_64"))]
                         scalar::if_then_else_broadcast_false_scalar_64,
                     )
-                });
+                    ;
                 let validity = if_then_else_validity(mask, None, if_false.validity());
                 PrimitiveArray::from_vec(values).with_validity(validity)
             }
@@ -129,7 +108,7 @@ macro_rules! impl_if_then_else {
                 if_true: &Self,
                 if_false: Self::Scalar<'_>,
             ) -> Self {
-                let values = with_avx512_if_available(|| {
+                let values =
                     if_then_else_loop_broadcast_false(
                         false,
                         mask,
@@ -141,7 +120,7 @@ macro_rules! impl_if_then_else {
                        #[cfg(not(target_arch = "x86_64"))]
                         scalar::if_then_else_broadcast_false_scalar_64,
                     )
-                });
+                    ;
                 let validity = if_then_else_validity(mask, if_true.validity(), None);
                 PrimitiveArray::from_vec(values).with_validity(validity)
             }
@@ -152,7 +131,7 @@ macro_rules! impl_if_then_else {
                 if_true: Self::Scalar<'_>,
                 if_false: Self::Scalar<'_>,
             ) -> Self {
-                let values = with_avx512_if_available(|| {
+                let values =
                     if_then_else_loop_broadcast_both(
                         mask,
                         if_true,
@@ -163,7 +142,7 @@ macro_rules! impl_if_then_else {
                         #[cfg(not(target_arch = "x86_64"))]
                         scalar::if_then_else_broadcast_both_scalar_64,
                     )
-                });
+                    ;
                 PrimitiveArray::from_vec(values)
             }
         }

From 2eb2821f3ca4e2f38c6cabffcd03d5b1ac58d1ac Mon Sep 17 00:00:00 2001
From: "orsonpeters@gmail.com" <orsonpeters@gmail.com>
Date: Mon, 18 Mar 2024 16:27:37 +0000
Subject: [PATCH 5/8] copy view after select

---
 crates/polars-compute/src/if_then_else/view.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/crates/polars-compute/src/if_then_else/view.rs b/crates/polars-compute/src/if_then_else/view.rs
index 64e9eba3eb8b..b304534064ff 100644
--- a/crates/polars-compute/src/if_then_else/view.rs
+++ b/crates/polars-compute/src/if_then_else/view.rs
@@ -213,13 +213,14 @@ pub fn if_then_else_view_rest(
     false_buffer_idx_offset: u32,
 ) {
     assert!(if_true.len() <= out.len()); // Removes bounds checks in inner loop.
-    let true_it = if_true.iter().copied();
-    let false_it = if_false.iter().copied();
+    let true_it = if_true.iter();
+    let false_it = if_false.iter();
     for (i, (t, f)) in true_it.zip(false_it).enumerate() {
         // Written like this, this loop *should* be branchless.
         // Unfortunately we're still dependent on the compiler.
         let m = (mask >> i) & 1 != 0;
-        let mut v = if m { t } else { f };
+        let src = if m { t } else { f };
+        let mut v = *src;
         let offset = if m | (v.length <= 12) {
             // Yes, | instead of || is intentional.
             0

From ba03a724d6728bc0f522ddf5f3a389d610bca82a Mon Sep 17 00:00:00 2001
From: Orson Peters <orsonpeters@gmail.com>
Date: Mon, 18 Mar 2024 17:31:55 +0100
Subject: [PATCH 6/8] fmt

---
 .../polars-compute/src/comparisons/scalar.rs  |  3 +-
 crates/polars-compute/src/filter/primitive.rs |  1 -
 crates/polars-compute/src/if_then_else/mod.rs |  1 -
 .../polars-compute/src/if_then_else/simd.rs   | 96 +++++++++----------
 crates/polars-compute/src/lib.rs              |  3 +-
 5 files changed, 45 insertions(+), 59 deletions(-)

diff --git a/crates/polars-compute/src/comparisons/scalar.rs b/crates/polars-compute/src/comparisons/scalar.rs
index 954880d7edf3..f2f245ed34c5 100644
--- a/crates/polars-compute/src/comparisons/scalar.rs
+++ b/crates/polars-compute/src/comparisons/scalar.rs
@@ -2,9 +2,8 @@ use arrow::array::{BinaryArray, BooleanArray, PrimitiveArray, Utf8Array};
 use arrow::bitmap::{self, Bitmap};
 use polars_utils::total_ord::{TotalEq, TotalOrd};
 
-use crate::NotSimdPrimitive;
-
 use super::TotalOrdKernel;
+use crate::NotSimdPrimitive;
 
 impl<T: NotSimdPrimitive + TotalOrd> TotalOrdKernel for PrimitiveArray<T> {
     type Scalar = T;
diff --git a/crates/polars-compute/src/filter/primitive.rs b/crates/polars-compute/src/filter/primitive.rs
index d7b8bc233011..c4c7a5d5cfea 100644
--- a/crates/polars-compute/src/filter/primitive.rs
+++ b/crates/polars-compute/src/filter/primitive.rs
@@ -1,6 +1,5 @@
 use arrow::bitmap::Bitmap;
 use bytemuck::{cast_slice, cast_vec, Pod};
-
 #[cfg(target_arch = "x86_64")]
 use polars_utils::cpuid::is_avx512_enabled;
 
diff --git a/crates/polars-compute/src/if_then_else/mod.rs b/crates/polars-compute/src/if_then_else/mod.rs
index 0f01e393bf2e..f912473594ba 100644
--- a/crates/polars-compute/src/if_then_else/mod.rs
+++ b/crates/polars-compute/src/if_then_else/mod.rs
@@ -101,7 +101,6 @@ impl<T: NotSimdPrimitive> IfThenElseKernel for PrimitiveArray<T> {
     }
 }
 
-
 fn if_then_else_validity(
     mask: &Bitmap,
     if_true: Option<&Bitmap>,
diff --git a/crates/polars-compute/src/if_then_else/simd.rs b/crates/polars-compute/src/if_then_else/simd.rs
index 0958a8e2f0a9..411f1e497f3f 100644
--- a/crates/polars-compute/src/if_then_else/simd.rs
+++ b/crates/polars-compute/src/if_then_else/simd.rs
@@ -4,8 +4,6 @@ use std::simd::{Mask, Simd, SimdElement};
 use arrow::array::PrimitiveArray;
 use arrow::bitmap::Bitmap;
 use arrow::datatypes::ArrowDataType;
-#[cfg(target_arch = "x86_64")]
-use polars_utils::cpuid::is_avx512_enabled;
 
 use super::{
     if_then_else_loop, if_then_else_loop_broadcast_both, if_then_else_loop_broadcast_false,
@@ -62,19 +60,17 @@ macro_rules! impl_if_then_else {
             type Scalar<'a> = $T;
 
             fn if_then_else(mask: &Bitmap, if_true: &Self, if_false: &Self) -> Self {
-                let values =
-                    if_then_else_loop(
-                        mask,
-                        if_true.values(),
-                        if_false.values(),
-                        scalar::if_then_else_scalar_rest,
-                        // Auto-generated SIMD was slower on ARM.
-                        #[cfg(target_arch = "x86_64")]
-                        if_then_else_simd_64,
-                        #[cfg(not(target_arch = "x86_64"))]
-                        scalar::if_then_else_scalar_64,
-                    )
-                    ;
+                let values = if_then_else_loop(
+                    mask,
+                    if_true.values(),
+                    if_false.values(),
+                    scalar::if_then_else_scalar_rest,
+                    // Auto-generated SIMD was slower on ARM.
+                    #[cfg(target_arch = "x86_64")]
+                    if_then_else_simd_64,
+                    #[cfg(not(target_arch = "x86_64"))]
+                    scalar::if_then_else_scalar_64,
+                );
                 let validity = if_then_else_validity(mask, if_true.validity(), if_false.validity());
                 PrimitiveArray::from_vec(values).with_validity(validity)
             }
@@ -86,19 +82,17 @@ macro_rules! impl_if_then_else {
                 if_true: Self::Scalar<'_>,
                 if_false: &Self,
             ) -> Self {
-                let values =
-                    if_then_else_loop_broadcast_false(
-                        true,
-                        mask,
-                        if_false.values(),
-                        if_true,
-                        // Auto-generated SIMD was slower on ARM.
-                        #[cfg(target_arch = "x86_64")]
-                        if_then_else_broadcast_false_simd_64,
-                        #[cfg(not(target_arch = "x86_64"))]
-                        scalar::if_then_else_broadcast_false_scalar_64,
-                    )
-                    ;
+                let values = if_then_else_loop_broadcast_false(
+                    true,
+                    mask,
+                    if_false.values(),
+                    if_true,
+                    // Auto-generated SIMD was slower on ARM.
+                    #[cfg(target_arch = "x86_64")]
+                    if_then_else_broadcast_false_simd_64,
+                    #[cfg(not(target_arch = "x86_64"))]
+                    scalar::if_then_else_broadcast_false_scalar_64,
+                );
                 let validity = if_then_else_validity(mask, None, if_false.validity());
                 PrimitiveArray::from_vec(values).with_validity(validity)
             }
@@ -108,19 +102,17 @@ macro_rules! impl_if_then_else {
                 if_true: &Self,
                 if_false: Self::Scalar<'_>,
             ) -> Self {
-                let values =
-                    if_then_else_loop_broadcast_false(
-                        false,
-                        mask,
-                        if_true.values(),
-                        if_false,
-                        // Auto-generated SIMD was slower on ARM.
-                        #[cfg(target_arch = "x86_64")]
-                        if_then_else_broadcast_false_simd_64,
-                        #[cfg(not(target_arch = "x86_64"))]
-                        scalar::if_then_else_broadcast_false_scalar_64,
-                    )
-                    ;
+                let values = if_then_else_loop_broadcast_false(
+                    false,
+                    mask,
+                    if_true.values(),
+                    if_false,
+                    // Auto-generated SIMD was slower on ARM.
+                    #[cfg(target_arch = "x86_64")]
+                    if_then_else_broadcast_false_simd_64,
+                    #[cfg(not(target_arch = "x86_64"))]
+                    scalar::if_then_else_broadcast_false_scalar_64,
+                );
                 let validity = if_then_else_validity(mask, if_true.validity(), None);
                 PrimitiveArray::from_vec(values).with_validity(validity)
             }
@@ -131,18 +123,16 @@ macro_rules! impl_if_then_else {
                 if_true: Self::Scalar<'_>,
                 if_false: Self::Scalar<'_>,
             ) -> Self {
-                let values =
-                    if_then_else_loop_broadcast_both(
-                        mask,
-                        if_true,
-                        if_false,
-                        // Auto-generated SIMD was slower on ARM.
-                        #[cfg(target_arch = "x86_64")]
-                        if_then_else_broadcast_both_simd_64,
-                        #[cfg(not(target_arch = "x86_64"))]
-                        scalar::if_then_else_broadcast_both_scalar_64,
-                    )
-                    ;
+                let values = if_then_else_loop_broadcast_both(
+                    mask,
+                    if_true,
+                    if_false,
+                    // Auto-generated SIMD was slower on ARM.
+                    #[cfg(target_arch = "x86_64")]
+                    if_then_else_broadcast_both_simd_64,
+                    #[cfg(not(target_arch = "x86_64"))]
+                    scalar::if_then_else_broadcast_both_scalar_64,
+                );
                 PrimitiveArray::from_vec(values)
             }
         }
diff --git a/crates/polars-compute/src/lib.rs b/crates/polars-compute/src/lib.rs
index d21b72118d09..bb9ca6647245 100644
--- a/crates/polars-compute/src/lib.rs
+++ b/crates/polars-compute/src/lib.rs
@@ -15,9 +15,8 @@ pub mod if_then_else;
 pub mod min_max;
 
 pub mod arity;
-
 
 // Trait to enable the scalar blanket implementation.
-pub trait NotSimdPrimitive : NativeType {}
+pub trait NotSimdPrimitive: NativeType {}
 
 #[cfg(not(feature = "simd"))]
 impl<T> NotSimdPrimitive for T {}

From 3c9389f53088e465d985492c962ab18c3972464a Mon Sep 17 00:00:00 2001
From: Orson Peters <orsonpeters@gmail.com>
Date: Mon, 18 Mar 2024 17:35:49 +0100
Subject: [PATCH 7/8] silence unused warning on arm

---
 crates/polars-compute/src/if_then_else/simd.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/crates/polars-compute/src/if_then_else/simd.rs b/crates/polars-compute/src/if_then_else/simd.rs
index 411f1e497f3f..f17719711eb0 100644
--- a/crates/polars-compute/src/if_then_else/simd.rs
+++ b/crates/polars-compute/src/if_then_else/simd.rs
@@ -10,6 +10,7 @@ use super::{
     if_then_else_validity, scalar, IfThenElseKernel,
 };
 
+#[cfg(target_arch = "x86_64")]
 fn select_simd_64<T: SimdElement>(
     mask: u64,
     if_true: Simd<T, 64>,
@@ -24,6 +25,7 @@ fn select_simd_64<T: SimdElement>(
     }
 }
 
+#[cfg(target_arch = "x86_64")]
 fn if_then_else_simd_64<T: SimdElement>(
     mask: u64,
     if_true: &[T; 64],
@@ -38,6 +40,7 @@ fn if_then_else_simd_64<T: SimdElement>(
     )
 }
 
+#[cfg(target_arch = "x86_64")]
 fn if_then_else_broadcast_false_simd_64<T: SimdElement>(
     mask: u64,
     if_true: &[T; 64],
@@ -47,6 +50,7 @@ fn if_then_else_broadcast_false_simd_64<T: SimdElement>(
     select_simd_64(mask, Simd::from_slice(if_true), Simd::splat(if_false), out)
 }
 
+#[cfg(target_arch = "x86_64")]
 fn if_then_else_broadcast_both_simd_64<T: SimdElement>(
     mask: u64,
     if_true: T,

From 379bd53366a2e0ca3f1d0e11a20b2c6ed5ac1d88 Mon Sep 17 00:00:00 2001
From: Orson Peters <orsonpeters@gmail.com>
Date: Mon, 18 Mar 2024 17:48:30 +0100
Subject: [PATCH 8/8] fix conditional errors/warnings

---
 crates/polars-compute/src/filter/primitive.rs | 2 +-
 crates/polars-compute/src/lib.rs              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/polars-compute/src/filter/primitive.rs b/crates/polars-compute/src/filter/primitive.rs
index c4c7a5d5cfea..9cc542b60978 100644
--- a/crates/polars-compute/src/filter/primitive.rs
+++ b/crates/polars-compute/src/filter/primitive.rs
@@ -1,6 +1,6 @@
 use arrow::bitmap::Bitmap;
 use bytemuck::{cast_slice, cast_vec, Pod};
-#[cfg(target_arch = "x86_64")]
+#[cfg(all(target_arch = "x86_64", feature = "simd"))]
 use polars_utils::cpuid::is_avx512_enabled;
 
 #[cfg(all(target_arch = "x86_64", feature = "simd"))]
diff --git a/crates/polars-compute/src/lib.rs b/crates/polars-compute/src/lib.rs
index bb9ca6647245..0911243e3b4d 100644
--- a/crates/polars-compute/src/lib.rs
+++ b/crates/polars-compute/src/lib.rs
@@ -19,7 +19,7 @@ pub mod arity;
 pub trait NotSimdPrimitive: NativeType {}
 
 #[cfg(not(feature = "simd"))]
-impl<T> NotSimdPrimitive for T {}
+impl<T: NativeType> NotSimdPrimitive for T {}
 
 #[cfg(feature = "simd")]
 impl NotSimdPrimitive for u128 {}
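
For readers trying the series locally, the standalone sketch below (not part of the patches; avx512_allowed and the main driver are illustrative names, not the crate's API) restates the runtime gate that PATCH 1 adds as polars_utils::cpuid::is_avx512_enabled: AVX-512 kernels are dispatched only when the CPU reports avx512f and the POLARS_DISABLE_AVX512=1 opt-out is not set. It uses only the standard library and, like the patch, caches the answer in a OnceLock so feature detection and the environment lookup happen once.

    use std::sync::OnceLock;

    // Minimal sketch of the gate from PATCH 1 (illustrative names, standard library only).
    fn avx512_allowed() -> bool {
        // Detection and the environment lookup run once; later calls hit the cache.
        static CACHE: OnceLock<bool> = OnceLock::new();
        *CACHE.get_or_init(|| {
            // Opt-out escape hatch: POLARS_DISABLE_AVX512=1 forces the non-AVX-512 paths.
            let user_disabled = std::env::var("POLARS_DISABLE_AVX512")
                .map(|v| v == "1")
                .unwrap_or(false);

            // Only x86_64 CPUs can report AVX-512 support.
            #[cfg(target_arch = "x86_64")]
            let cpu_ok = std::arch::is_x86_feature_detected!("avx512f");
            #[cfg(not(target_arch = "x86_64"))]
            let cpu_ok = false;

            cpu_ok && !user_disabled
        })
    }

    fn main() {
        // Running with POLARS_DISABLE_AVX512=1 prints `false` even on AVX-512 hardware.
        println!("AVX-512 kernels allowed: {}", avx512_allowed());
    }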