-
Notifications
You must be signed in to change notification settings - Fork 13.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
f32::min()/max() not inlined #42423
Comments
We should reimplement fn fmax(a: f32, b: f32) -> f32 {
if b.is_nan() || b <= a { a } else { b }
}
fn fmin(a: f32, b: f32) -> f32 {
if b.is_nan() || b >= a { a } else { b }
} The resulting ASM looks like #[no_mangle] #[inline(never)] pub fn better(x: f32) -> f32 {
fmin(fmax(x, 0.), 1.)
}
=>
better:
.cfi_startproc
xorps xmm1, xmm1
cmpless xmm1, xmm0
andps xmm0, xmm1
minss xmm0, dword ptr [rip + .LCPI0_0]
ret I compared the equivalent C program __attribute__((noinline))
float clamp(float x) {
return fmin(fmax(x, 0), 1);
} In clamp:
sub rsp, 8
vxorpd xmm1, xmm1, xmm1
vcvtss2sd xmm0, xmm0, xmm0
call fmax
vmovsd xmm1, QWORD PTR .LC1[rip]
call fmin
add rsp, 8
vcvtsd2ss xmm0, xmm0, xmm0
ret while in clamp:
vcvtss2sd xmm0, xmm0, xmm0
vmaxsd xmm0, xmm0, QWORD PTR .LC0[rip]
vminsd xmm0, xmm0, QWORD PTR .LC1[rip]
vcvtsd2ss xmm0, xmm0, xmm0
ret The result is similar in clang 4.Without clamp: # @clamp
vxorps xmm1, xmm1, xmm1
vmaxss xmm1, xmm1, xmm0
vcmpunordss xmm0, xmm0, xmm0
vandnps xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero
vminss xmm2, xmm1, xmm0
vcmpunordss xmm0, xmm0, xmm0
vandnps xmm2, xmm0, xmm2
vandps xmm0, xmm0, xmm1
vorps xmm0, xmm2, xmm0
ret With clamp: # @clamp
vxorps xmm1, xmm1, xmm1
vmaxss xmm0, xmm0, xmm1
vminss xmm0, xmm0, dword ptr [rip + .LCPI0_0]
ret ICC 17 refuses to inline To be precise, the optimization is done with the Test of equivalence and showing considerate speed improvement: Program#![feature(test)]
#![feature(dec2flt)]
extern crate test;
extern crate core;
use test::{Bencher, black_box};
use core::num::dec2flt::rawfp::RawFloat;
use std::f32;
fn fmax(a: f32, b: f32) -> f32 {
if b.is_nan() || b <= a { a } else { b }
}
fn fmin(a: f32, b: f32) -> f32 {
if b.is_nan() || b >= a { a } else { b }
}
const NUMBERS: &[f32] = &[
f32::INFINITY,
f32::MAX,
10.0,
1.0,
0.1,
f32::MIN_POSITIVE,
0.0,
-0.0,
-f32::MIN_POSITIVE,
-0.1,
-1.0
-10.0,
-f32::MAX,
-f32::INFINITY,
f32::NAN,
];
#[test]
fn test_equivalent() {
for a in NUMBERS {
for b in NUMBERS {
let c = fmax(*a, *b);
let d = a.max(*b);
assert_eq!(c.integer_decode(), d.integer_decode(), "max({:?}, {:?}) => wrong:{:?}, correct:{:?}", a, b, c, d);
let e = fmin(*a, *b);
let f = a.min(*b);
assert_eq!(e.integer_decode(), f.integer_decode(), "min({:?}, {:?}) => wrong:{:?}, correct:{:?}", a, b, e, f);
}
}
}
#[bench]
fn test_max_std(bencher: &mut Bencher) {
bencher.iter(|| {
for a in NUMBERS {
for b in NUMBERS {
black_box(a.max(*b));
}
}
})
}
#[bench]
fn test_max_new(bencher: &mut Bencher) {
bencher.iter(|| {
for a in NUMBERS {
for b in NUMBERS {
black_box(fmax(*a, *b));
}
}
})
}
#[bench]
fn test_min_std(bencher: &mut Bencher) {
bencher.iter(|| {
for a in NUMBERS {
for b in NUMBERS {
black_box(a.min(*b));
}
}
})
}
#[bench]
fn test_min_new(bencher: &mut Bencher) {
bencher.iter(|| {
for a in NUMBERS {
for b in NUMBERS {
black_box(fmin(*a, *b));
}
}
})
} $ ./1 --test --bench
running 5 tests
test test_equivalent ... ok
test test_max_new ... bench: 66 ns/iter (+/- 19)
test test_max_std ... bench: 504 ns/iter (+/- 64)
test test_min_new ... bench: 68 ns/iter (+/- 30)
test test_min_std ... bench: 509 ns/iter (+/- 80)
test result: ok. 1 passed; 0 failed; 0 ignored; 4 measured; 0 filtered out |
There is an LLVM intrinsic Check program// -Copt-level=3 -Ctarget-cpu=native -Cpanic=abort
#![crate_type="lib"]
#![feature(link_llvm_intrinsics)]
extern {
#[link_name = "llvm.minnum.f32"]
pub fn minnum_f32(a: f32, b: f32) -> f32;
#[link_name = "llvm.maxnum.f32"]
pub fn maxnum_f32(a: f32, b: f32) -> f32;
}
pub trait MinMaxTest: Sized {
fn min_llvm(self, other: Self) -> Self;
fn min_rust(self, other: Self) -> Self;
fn max_llvm(self, other: Self) -> Self;
fn max_rust(self, other: Self) -> Self;
}
impl MinMaxTest for f32 {
/*
x86_64:
_<f32 as 2::MinMaxTest>::min_llvm::hd76590e5233a75ba:
pushq %rbp
movq %rsp, %rbp
vminss %xmm1, %xmm0, %xmm2
vcmpunordss %xmm1, %xmm1, %xmm1
vblendvps %xmm1, %xmm0, %xmm2, %xmm0
popq %rbp
retq
aarch64:
_<f32 as 2::MinMaxTest>::min_llvm::hd76590e5233a75ba:
fminnm s0, s1, s0
ret
*/
fn min_llvm(self, other: Self) -> Self {
unsafe { minnum_f32(other, self) }
}
/*
x86_64:
_<f32 as 2::MinMaxTest>::min_rust::h6a6994ac4726b5ad:
pushq %rbp
movq %rsp, %rbp
vminss %xmm1, %xmm0, %xmm2
vcmpunordss %xmm1, %xmm1, %xmm1
vblendvps %xmm1, %xmm0, %xmm2, %xmm0
popq %rbp
retq
aarch64:
_<f32 as 2::MinMaxTest>::min_rust::h6a6994ac4726b5ad:
fcmp s1, s0
fccmp s1, s1, #1, le
fcsel s0, s0, s1, vs
ret
*/
fn min_rust(self, other: Self) -> Self {
if other.is_nan() || other > self { self } else { other }
}
/*
x86_64:
_<f32 as 2::MinMaxTest>::max_llvm::h8b7b369f1c3662de:
pushq %rbp
movq %rsp, %rbp
vmaxss %xmm1, %xmm0, %xmm2
vcmpunordss %xmm1, %xmm1, %xmm1
vblendvps %xmm1, %xmm0, %xmm2, %xmm0
popq %rbp
retq
aarch64:
_<f32 as 2::MinMaxTest>::max_llvm::h8b7b369f1c3662de:
fmaxnm s0, s1, s0
ret
*/
fn max_llvm(self, other: Self) -> Self {
unsafe { maxnum_f32(other, self) }
}
/*
x86_64:
_<f32 as 2::MinMaxTest>::max_rust::hb29df4916b2f44f3:
pushq %rbp
movq %rsp, %rbp
vmaxss %xmm1, %xmm0, %xmm2
vcmpunordss %xmm1, %xmm1, %xmm1
vblendvps %xmm1, %xmm0, %xmm2, %xmm0
popq %rbp
retq
aarch64:
_<f32 as 2::MinMaxTest>::max_rust::hb29df4916b2f44f3:
fcmp s1, s0
fccmp s1, s1, #1, pl
fcsel s0, s0, s1, vs
ret
*/
fn max_rust(self, other: Self) -> Self {
if other.is_nan() || other < self { self } else { other }
}
}
/*
x86_64:
__2::clamp01_rfc1961::hf9aa1066c6c8cd7e:
pushq %rbp
movq %rsp, %rbp
vxorps %xmm1, %xmm1, %xmm1
vmaxss %xmm0, %xmm1, %xmm0
vmovss LCPI4_0(%rip), %xmm1
vminss %xmm0, %xmm1, %xmm0
popq %rbp
retq
aarch64:
__2::clamp01_rfc1961::hf9aa1066c6c8cd7e:
movi.2d v1, #0000000000000000
fmax s0, s0, s1
fmov s1, #1.00000000
fmin s0, s0, s1
ret
*/
pub fn clamp01_rfc1961(mut x: f32) -> f32 {
if x < 0.0 { x = 0.0; }
if x > 1.0 { x = 1.0; }
x
}
/*
x86_64:
__2::clamp01_llvm::h30ee56b2e9fb6a19:
pushq %rbp
movq %rsp, %rbp
vxorps %xmm1, %xmm1, %xmm1
vmaxss %xmm0, %xmm1, %xmm1
vcmpunordss %xmm0, %xmm0, %xmm0
vandnps %xmm1, %xmm0, %xmm0
vmovss LCPI5_0(%rip), %xmm1
vminss %xmm0, %xmm1, %xmm2
vcmpunordss %xmm0, %xmm0, %xmm0
vandnps %xmm2, %xmm0, %xmm2
vandps %xmm1, %xmm0, %xmm0
vorps %xmm0, %xmm2, %xmm0
popq %rbp
retq
aarch64:
__2::clamp01_llvm::h30ee56b2e9fb6a19:
movi.2d v1, #0000000000000000
fmaxnm s0, s0, s1
fmov s1, #1.00000000
fminnm s0, s0, s1
ret
*/
pub fn clamp01_llvm(x: f32) -> f32 {
x.max_llvm(0.0).min_llvm(1.0)
}
/*
x86_64:
__2::clamp01_rust::h324077e72816ec64:
pushq %rbp
movq %rsp, %rbp
vxorps %xmm1, %xmm1, %xmm1
vmaxss %xmm1, %xmm0, %xmm0
vminss LCPI6_0(%rip), %xmm0, %xmm0
popq %rbp
retq
aarch64:
__2::clamp01_rust::h324077e72816ec64:
movi.2d v1, #0000000000000000
fmaxnm s0, s0, s1
fmov s1, #1.00000000
fminnm s0, s0, s1
ret
*/
pub fn clamp01_rust(x: f32) -> f32 {
x.max_rust(0.0).min_rust(1.0)
}
/*
x86_64:
__2::clamp01_std::h5df114f63023c56b:
pushq %rbp
movq %rsp, %rbp
vxorps %xmm1, %xmm1, %xmm1
callq _fmaxf
vmovss LCPI7_0(%rip), %xmm1
popq %rbp
jmp _fminf
aarch64:
__2::clamp01_std::h5df114f63023c56b:
stp x29, x30, [sp, #-16]!
mov x29, sp
movi.2d v1, #0000000000000000
bl _fmaxf
fmov s1, #1.00000000
ldp x29, x30, [sp], #16
b _fminf
*/
pub fn clamp01_std(x: f32) -> f32 {
x.max(0.0).min(1.0)
} |
Re-implement float min/max in rust This also adds the relevant implementations into libcore. See #42423
Seems fixed. Closing. bad:
xorps %xmm1, %xmm1
cmpless %xmm0, %xmm1
andps %xmm1, %xmm0
minss .LCPI0_0(%rip), %xmm0
retq
good:
xorps %xmm1, %xmm1
cmpnless %xmm0, %xmm1
minss .LCPI1_0(%rip), %xmm0
andnps %xmm0, %xmm1
movaps %xmm1, %xmm0
retq |
compiles to (with optimizations):
but I'd expect it to be inlined and generate code that is exactly the same as
if x > 0. {x} else {0.}
, which compiles tomaxss
inline. I've checked that even LTO doesn't inline the calls.The text was updated successfully, but these errors were encountered: