-
Notifications
You must be signed in to change notification settings - Fork 70
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement Neon SIMD #39
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,12 @@ cfg_if::cfg_if! { | |
#[derive(Clone, Copy, Debug)] | ||
#[repr(transparent)] | ||
pub struct f32x4(v128); | ||
} else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { | ||
use core::arch::aarch64::*; | ||
|
||
#[derive(Clone, Copy, Debug)] | ||
#[repr(C, align(16))] | ||
pub struct f32x4(float32x4_t); | ||
} else { | ||
#[derive(Clone, Copy, Debug)] | ||
#[repr(C, align(16))] | ||
|
@@ -40,34 +46,46 @@ impl f32x4 { | |
} | ||
|
||
pub fn max(self, rhs: Self) -> Self { | ||
// These technically don't have the same semantics for NaN and 0, but it | ||
// doesn't seem to matter as Skia does it the same way. | ||
cfg_if::cfg_if! { | ||
if #[cfg(all(feature = "simd", target_feature = "sse2"))] { | ||
Self(unsafe { _mm_max_ps(self.0, rhs.0) }) | ||
} else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { | ||
Self(f32x4_max(self.0, rhs.0)) | ||
Self(f32x4_pmax(self.0, rhs.0)) | ||
} else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { | ||
unsafe { | ||
Self(vmaxq_f32(self.0, rhs.0)) | ||
} | ||
} else { | ||
Self([ | ||
self.0[0].max(rhs.0[0]), | ||
self.0[1].max(rhs.0[1]), | ||
self.0[2].max(rhs.0[2]), | ||
self.0[3].max(rhs.0[3]), | ||
super::pmax(self.0[0], rhs.0[0]), | ||
super::pmax(self.0[1], rhs.0[1]), | ||
super::pmax(self.0[2], rhs.0[2]), | ||
super::pmax(self.0[3], rhs.0[3]), | ||
]) | ||
} | ||
} | ||
} | ||
|
||
pub fn min(self, rhs: Self) -> Self { | ||
// These technically don't have the same semantics for NaN and 0, but it | ||
// doesn't seem to matter as Skia does it the same way. | ||
cfg_if::cfg_if! { | ||
if #[cfg(all(feature = "simd", target_feature = "sse2"))] { | ||
Self(unsafe { _mm_min_ps(self.0, rhs.0) }) | ||
} else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { | ||
Self(f32x4_min(self.0, rhs.0)) | ||
Self(f32x4_pmin(self.0, rhs.0)) | ||
} else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { | ||
unsafe { | ||
Self(vminq_f32(self.0, rhs.0)) | ||
} | ||
} else { | ||
Self([ | ||
self.0[0].min(rhs.0[0]), | ||
self.0[1].min(rhs.0[1]), | ||
self.0[2].min(rhs.0[2]), | ||
self.0[3].min(rhs.0[3]), | ||
super::pmin(self.0[0], rhs.0[0]), | ||
super::pmin(self.0[1], rhs.0[1]), | ||
super::pmin(self.0[2], rhs.0[2]), | ||
super::pmin(self.0[3], rhs.0[3]), | ||
]) | ||
} | ||
} | ||
|
@@ -95,6 +113,10 @@ impl core::ops::Add for f32x4 { | |
Self(unsafe { _mm_add_ps(self.0, rhs.0) }) | ||
} else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { | ||
Self(f32x4_add(self.0, rhs.0)) | ||
} else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we still need the `target_feature = "neon"` check here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, we had this discussion before. I believe I want to keep this for the eventual follow-up PR that brings 32-bit ARM into the equation (which doesn't have neon by default). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh yes. Verbose, but fine. |
||
unsafe { | ||
Self(vaddq_f32(self.0, rhs.0)) | ||
} | ||
} else { | ||
Self([ | ||
self.0[0] + rhs.0[0], | ||
|
@@ -122,6 +144,10 @@ impl core::ops::Sub for f32x4 { | |
Self(unsafe { _mm_sub_ps(self.0, rhs.0) }) | ||
} else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { | ||
Self(f32x4_sub(self.0, rhs.0)) | ||
} else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { | ||
unsafe { | ||
Self(vsubq_f32(self.0, rhs.0)) | ||
} | ||
} else { | ||
Self([ | ||
self.0[0] - rhs.0[0], | ||
|
@@ -143,6 +169,10 @@ impl core::ops::Mul for f32x4 { | |
Self(unsafe { _mm_mul_ps(self.0, rhs.0) }) | ||
} else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { | ||
Self(f32x4_mul(self.0, rhs.0)) | ||
} else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { | ||
unsafe { | ||
Self(vmulq_f32(self.0, rhs.0)) | ||
} | ||
} else { | ||
Self([ | ||
self.0[0] * rhs.0[0], | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@CryZe Is this a no_std variant?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No. As I noticed back when I opened this PR, there are a bunch of platform differences when it comes to NaN, but Skia doesn't seem to care about them. Rust's default max is "unnecessarily slow" in that it handles NaN (and, I believe, other edge cases), so I ported the pmin/pmax logic from wasm (which is also meant to ignore NaN and just do the fastest thing possible) so that it can be used in the fallback code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you sure? It seems it's just an intrinsic: https://doc.rust-lang.org/stable/src/core/num/f32.rs.html#749
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I compiled this list back then that really shows how much of a mess min and max are: https://gist.github.com/CryZe/30cc76f4629cb0846d5a9b8d13144649
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it just calls into fmaxf, which is C's overcomplicated function: https://rust.godbolt.org/z/KKz7875sr
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, I will look into it in more detail. We can leave it as is for now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://docs.rs/libm/latest/src/libm/math/fmaxf.rs.html#11
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, I see. We technically shouldn't have NaN values to begin with, so a custom version should be fine.