-
Notifications
You must be signed in to change notification settings - Fork 82
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
extern "platform-intrinsics" float functions often call libm #76
Comments
(I wonder if any of the Hi, I'm [accidentally] a
Basically, floats have been kept out of The only reason that we don't just put more libm stuff directly into core is that it might be less optimal than the system's local libm, so we want to try to link to the system's libm when it's available. However, none of libm is SIMD aware in the first place. The local libm will never beat our code because it can't even do the operation. So in the case of SIMD we can just always use our version of a given function. most of the time Because if there's an op we can't implement ourselves for all platforms then that'll be trouble. |
A lot of discussion happened about this. We concluded this is not easily fixable in one go. Some things that came up:
rust-lang/rust#64609 is one of the partial blockers to many solutions. |
I have updated this with a recent version. Most dramatic change:
diff --git a/src/unopt/simd_libm.asm b/src/unopt/simd_libm.asm
index dc84af2..91ad1d5 100644
--- a/src/unopt/simd_libm.asm
+++ b/src/unopt/simd_libm.asm
@@ -1,9 +1,9 @@
-; 2021-02-16
+; 2021-04-16
; https://github.com/rust-lang/stdsimd/issues/76
example::f32x4::mul_add:
- pushq %r14
- pushq %rbx
- subq $88, %rsp
- movq %rdi, %r14
movaps (%rsi), %xmm0
- movaps %xmm0, 48(%rsp)
- movaps (%rdx), %xmm1
- movaps %xmm1, 16(%rsp)
- movaps (%rcx), %xmm3
- movaps %xmm3, 32(%rsp)
- shufps $231, %xmm0, %xmm0
- shufps $231, %xmm1, %xmm1
- movaps %xmm3, %xmm2
- shufps $231, %xmm3, %xmm2
- movq fmaf@GOTPCREL(%rip), %rbx
- callq *%rbx
- movaps %xmm0, (%rsp)
- movaps 48(%rsp), %xmm0
- movhlps %xmm0, %xmm0
- movaps 16(%rsp), %xmm1
- movhlps %xmm1, %xmm1
- movaps 32(%rsp), %xmm2
- movhlps %xmm2, %xmm2
- callq *%rbx
- unpcklps (%rsp), %xmm0
- movaps %xmm0, (%rsp)
- movaps 48(%rsp), %xmm0
- movaps 16(%rsp), %xmm1
- movaps 32(%rsp), %xmm2
- callq *%rbx
- movaps %xmm0, 64(%rsp)
- movaps 48(%rsp), %xmm0
- shufps $229, %xmm0, %xmm0
- movaps 16(%rsp), %xmm1
- shufps $229, %xmm1, %xmm1
- movaps 32(%rsp), %xmm2
- shufps $229, %xmm2, %xmm2
- callq *%rbx
- movaps 64(%rsp), %xmm1
- unpcklps %xmm0, %xmm1
- unpcklpd (%rsp), %xmm1
- movaps %xmm1, (%r14)
- movq %r14, %rax
- addq $88, %rsp
- popq %rbx
- popq %r14
+ mulps (%rdx), %xmm0
+ movq %rdi, %rax
+ addps (%rcx), %xmm0
+ movaps %xmm0, (%rdi)
retq
|
Regressed again, presumably due to rust-lang/rust#84274:
example::f32x4::mul_add:
; Body of example::f32x4::mul_add as emitted without a vector FMA lowering:
; the 4-lane operation is scalarized into four indirect calls to libm's fmaf,
; one per lane, and the four scalar results are re-packed into one vector.
; Register roles: %rdi = output pointer, %rsi/%rdx/%rcx = pointers to the three
; 4 x f32 inputs, %r14 = saved output pointer, %rbx = address of fmaf.
; Prologue: %r14 and %rbx are callee-saved, so they are pushed before reuse.
pushq %r14
pushq %rbx
; 88 bytes of scratch. With the two pushes and the return address this totals
; 112 bytes, so %rsp is 16-byte aligned for the calls and movaps spills below
; (assuming the usual SysV alignment on entry).
subq $88, %rsp
; Keep the output pointer in a register that survives the calls.
movq %rdi, %r14
; Load each input vector and spill a copy: 48(%rsp) = first input,
; 16(%rsp) = second input, 32(%rsp) = third input. The spills are needed
; because xmm registers are not preserved across the calls.
movaps (%rsi), %xmm0
movaps %xmm0, 48(%rsp)
movaps (%rdx), %xmm1
movaps %xmm1, 16(%rsp)
movaps (%rcx), %xmm3
movaps %xmm3, 32(%rsp)
; Lane 3: immediate 255 (0xFF) selects element 3 for every position, which
; puts lane 3 of each input into the scalar (lowest) slot of xmm0/xmm1/xmm2.
shufps $255, %xmm0, %xmm0
shufps $255, %xmm1, %xmm1
movaps %xmm3, %xmm2
shufps $255, %xmm3, %xmm2
; Fetch fmaf's address from the GOT once and reuse it for all four calls.
movq fmaf@GOTPCREL(%rip), %rbx
callq *%rbx
; 0(%rsp) = lane-3 result (in the low element).
movaps %xmm0, (%rsp)
; Lane 2: movhlps copies the high 64 bits onto the low 64 bits, so element 2
; becomes the scalar argument.
movaps 48(%rsp), %xmm0
movhlps %xmm0, %xmm0
movaps 16(%rsp), %xmm1
movhlps %xmm1, %xmm1
movaps 32(%rsp), %xmm2
movhlps %xmm2, %xmm2
callq *%rbx
; Interleave: xmm0 low elements become [lane-2 result, lane-3 result];
; stash that pair at 0(%rsp).
unpcklps (%rsp), %xmm0
movaps %xmm0, (%rsp)
; Lane 0: the reloaded vectors already have element 0 in the scalar slot.
movaps 48(%rsp), %xmm0
movaps 16(%rsp), %xmm1
movaps 32(%rsp), %xmm2
callq *%rbx
; 64(%rsp) = lane-0 result.
movaps %xmm0, 64(%rsp)
; Lane 1: immediate 85 (0x55) selects element 1 for every position.
movaps 48(%rsp), %xmm0
shufps $85, %xmm0, %xmm0
movaps 16(%rsp), %xmm1
shufps $85, %xmm1, %xmm1
movaps 32(%rsp), %xmm2
shufps $85, %xmm2, %xmm2
callq *%rbx
; Re-pack: xmm1 low half = [lane-0 result, lane-1 result], then unpcklpd
; appends the [lane-2, lane-3] pair saved at 0(%rsp) as the high half.
movaps 64(%rsp), %xmm1
unpcklps %xmm0, %xmm1
unpcklpd (%rsp), %xmm1
; Store the assembled vector through the output pointer and return that
; pointer in %rax.
movaps %xmm1, (%r14)
movq %r14, %rax
; Epilogue: release the scratch area and restore the callee-saved registers
; in reverse push order.
addq $88, %rsp
popq %rbx
popq %r14
retq |
I tried this code (Godbolt):
Rust Code
Instead, this happened: Mostly compiled to calls to libm!
When sufficient vector features are enabled, these do compile to vectorized assembly instructions. However, the problem is that compilation without those features enabled means code that depends on libm... which is not allowed in `core`. We are going to have to either solve this or push our implementation of `SimdF32` and `SimdF64` mostly into `std`, not `core`.
Notable winners on x64: `simd_fsqrt`, `simd_fabs` become vector instructions just fine. I'm worried about them on x86_32 or Arm architectures, though.
Meta
`rustc --version --verbose`:
x86 Assembly
AArch64 Assembly
The text was updated successfully, but these errors were encountered: