Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

f32::min()/max() not inlined #42423

Closed
kornelski opened this issue Jun 4, 2017 · 3 comments
Closed

f32::min()/max() not inlined #42423

kornelski opened this issue Jun 4, 2017 · 3 comments

Comments

@kornelski
Copy link
Contributor

kornelski commented Jun 4, 2017

fn test(x: f32) -> f32 {
    x.max(0.)
}

compiles to (with optimizations):

jmp	fmaxf@PLT

but I'd expect it to be inlined and generate code that is exactly the same as if x > 0. {x} else {0.}, which compiles to maxss inline. I've checked that even LTO doesn't inline the calls.

@kennytm
Copy link
Member

kennytm commented Jun 4, 2017

We should reimplement fNN::max and min to not rely on cmath?

fn fmax(a: f32, b: f32) -> f32 {
    if b.is_nan() || b <= a { a } else { b }
}
fn fmin(a: f32, b: f32) -> f32 {
    if b.is_nan() || b >= a { a } else { b }
}

The resulting ASM looks like

#[no_mangle] #[inline(never)] pub fn better(x: f32) -> f32 {
    fmin(fmax(x, 0.), 1.)
}

=>

better:
	.cfi_startproc
	xorps	xmm1, xmm1
	cmpless	xmm1, xmm0
	andps	xmm0, xmm1
	minss	xmm0, dword ptr [rip + .LCPI0_0]
	ret

I compared the equivalent C program

__attribute__((noinline))
float clamp(float x) {
    return fmin(fmax(x, 0), 1);
}

In gcc-7.1 -x c -std=c11 -O3 -march=native this compiled to

clamp:
        sub     rsp, 8
        vxorpd  xmm1, xmm1, xmm1
        vcvtss2sd       xmm0, xmm0, xmm0
        call    fmax
        vmovsd  xmm1, QWORD PTR .LC1[rip]
        call    fmin
        add     rsp, 8
        vcvtsd2ss       xmm0, xmm0, xmm0
        ret

while in gcc-7.1 -x c -std=c11 -O3 -march=native -ffast-math this becomes

clamp:
        vcvtss2sd       xmm0, xmm0, xmm0
        vmaxsd  xmm0, xmm0, QWORD PTR .LC0[rip]
        vminsd  xmm0, xmm0, QWORD PTR .LC1[rip]
        vcvtsd2ss       xmm0, xmm0, xmm0
        ret
The result is similar in clang 4.

Without -ffast-math

clamp:                                  # @clamp
        vxorps  xmm1, xmm1, xmm1
        vmaxss  xmm1, xmm1, xmm0
        vcmpunordss     xmm0, xmm0, xmm0
        vandnps xmm0, xmm0, xmm1
        vmovss  xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero
        vminss  xmm2, xmm1, xmm0
        vcmpunordss     xmm0, xmm0, xmm0
        vandnps xmm2, xmm0, xmm2
        vandps  xmm0, xmm0, xmm1
        vorps   xmm0, xmm2, xmm0
        ret

With -ffast-math

clamp:                                  # @clamp
        vxorps  xmm1, xmm1, xmm1
        vmaxss  xmm0, xmm0, xmm1
        vminss  xmm0, xmm0, dword ptr [rip + .LCPI0_0]
        ret

ICC 17 refuses to inline fmin or fmax with or without -ffast-math.

To be precise, the optimization is done with the -ffinite-math-only flag, but I don't see any problem in the Rust code when ±∞, ±0 or quiet NaNs are involved. -fno-signaling-nans does nothing so I don't think signaling NaNs is a concern for GCC.


Test of equivalence and showing considerate speed improvement:

Program
#![feature(test)]
#![feature(dec2flt)]

extern crate test;
extern crate core;

use test::{Bencher, black_box};
use core::num::dec2flt::rawfp::RawFloat;
use std::f32;

fn fmax(a: f32, b: f32) -> f32 {
    if b.is_nan() || b <= a { a } else { b }
}
fn fmin(a: f32, b: f32) -> f32 {
    if b.is_nan() || b >= a { a } else { b }
}

const NUMBERS: &[f32] = &[
    f32::INFINITY,
    f32::MAX,
    10.0,
    1.0,
    0.1,
    f32::MIN_POSITIVE,
    0.0,
    -0.0,
    -f32::MIN_POSITIVE,
    -0.1,
    -1.0
    -10.0,
    -f32::MAX,
    -f32::INFINITY,
    f32::NAN,
];

#[test]
fn test_equivalent() {
    for a in NUMBERS {
        for b in NUMBERS {
            let c = fmax(*a, *b);
            let d = a.max(*b);
            assert_eq!(c.integer_decode(), d.integer_decode(), "max({:?}, {:?}) => wrong:{:?}, correct:{:?}", a, b, c, d);

            let e = fmin(*a, *b);
            let f = a.min(*b);
            assert_eq!(e.integer_decode(), f.integer_decode(), "min({:?}, {:?}) => wrong:{:?}, correct:{:?}", a, b, e, f);
        }
    }
}

#[bench]
fn test_max_std(bencher: &mut Bencher) {
    bencher.iter(|| {
        for a in NUMBERS {
            for b in NUMBERS {
                black_box(a.max(*b));
            }
        }
    })
}

#[bench]
fn test_max_new(bencher: &mut Bencher) {
    bencher.iter(|| {
        for a in NUMBERS {
            for b in NUMBERS {
                black_box(fmax(*a, *b));
            }
        }
    })
}

#[bench]
fn test_min_std(bencher: &mut Bencher) {
    bencher.iter(|| {
        for a in NUMBERS {
            for b in NUMBERS {
                black_box(a.min(*b));
            }
        }
    })
}

#[bench]
fn test_min_new(bencher: &mut Bencher) {
    bencher.iter(|| {
        for a in NUMBERS {
            for b in NUMBERS {
                black_box(fmin(*a, *b));
            }
        }
    })
}
$ ./1 --test --bench

running 5 tests
test test_equivalent ... ok
test test_max_new ... bench:          66 ns/iter (+/- 19)
test test_max_std ... bench:         504 ns/iter (+/- 64)
test test_min_new ... bench:          68 ns/iter (+/- 30)
test test_min_std ... bench:         509 ns/iter (+/- 80)

test result: ok. 1 passed; 0 failed; 0 ignored; 4 measured; 0 filtered out

@kennytm
Copy link
Member

kennytm commented Jun 4, 2017

There is an LLVM intrinsic llvm.minnum.fNN. When compared with the pure Rust implementation, the generated assembly is generally worse on x86_64 and i686, and better on ARM and AArch64 (Both won't call fmaxf/fminf unless there is no hard-float support). I haven't checked other targets.

Check program
// -Copt-level=3 -Ctarget-cpu=native -Cpanic=abort 

#![crate_type="lib"]
#![feature(link_llvm_intrinsics)]

extern {
    #[link_name = "llvm.minnum.f32"]
    pub fn minnum_f32(a: f32, b: f32) -> f32;
    #[link_name = "llvm.maxnum.f32"]
    pub fn maxnum_f32(a: f32, b: f32) -> f32;
}

pub trait MinMaxTest: Sized {
    fn min_llvm(self, other: Self) -> Self;
    fn min_rust(self, other: Self) -> Self;
    fn max_llvm(self, other: Self) -> Self;
    fn max_rust(self, other: Self) -> Self;
}

impl MinMaxTest for f32 {
    /*
    x86_64:
        _<f32 as 2::MinMaxTest>::min_llvm::hd76590e5233a75ba:
            pushq	%rbp
            movq	%rsp, %rbp
            vminss	%xmm1, %xmm0, %xmm2
            vcmpunordss	%xmm1, %xmm1, %xmm1
            vblendvps	%xmm1, %xmm0, %xmm2, %xmm0
            popq	%rbp
            retq
    
    aarch64:
        _<f32 as 2::MinMaxTest>::min_llvm::hd76590e5233a75ba:
            fminnm	s0, s1, s0
            ret
    */
    fn min_llvm(self, other: Self) -> Self {
        unsafe { minnum_f32(other, self) }     
    }

    /*
    x86_64:
        _<f32 as 2::MinMaxTest>::min_rust::h6a6994ac4726b5ad:
            pushq	%rbp
            movq	%rsp, %rbp
            vminss	%xmm1, %xmm0, %xmm2
            vcmpunordss	%xmm1, %xmm1, %xmm1
            vblendvps	%xmm1, %xmm0, %xmm2, %xmm0
            popq	%rbp
            retq
    
    aarch64:
        _<f32 as 2::MinMaxTest>::min_rust::h6a6994ac4726b5ad:
            fcmp	s1, s0
            fccmp	s1, s1, #1, le
            fcsel	s0, s0, s1, vs
            ret
    */
    fn min_rust(self, other: Self) -> Self {
        if other.is_nan() || other > self { self } else { other }
    }

    /*
    x86_64:
        _<f32 as 2::MinMaxTest>::max_llvm::h8b7b369f1c3662de:
            pushq	%rbp
            movq	%rsp, %rbp
            vmaxss	%xmm1, %xmm0, %xmm2
            vcmpunordss	%xmm1, %xmm1, %xmm1
            vblendvps	%xmm1, %xmm0, %xmm2, %xmm0
            popq	%rbp
            retq

    aarch64:
        _<f32 as 2::MinMaxTest>::max_llvm::h8b7b369f1c3662de:
            fmaxnm	s0, s1, s0
            ret
    */
    fn max_llvm(self, other: Self) -> Self {
        unsafe { maxnum_f32(other, self) }
    }

    /*
    x86_64:
        _<f32 as 2::MinMaxTest>::max_rust::hb29df4916b2f44f3:
            pushq	%rbp
            movq	%rsp, %rbp
            vmaxss	%xmm1, %xmm0, %xmm2
            vcmpunordss	%xmm1, %xmm1, %xmm1
            vblendvps	%xmm1, %xmm0, %xmm2, %xmm0
            popq	%rbp
            retq

    aarch64:
        _<f32 as 2::MinMaxTest>::max_rust::hb29df4916b2f44f3:
            fcmp	s1, s0
            fccmp	s1, s1, #1, pl
            fcsel	s0, s0, s1, vs
            ret
    */
    fn max_rust(self, other: Self) -> Self {
        if other.is_nan() || other < self { self } else { other }
    }

}

/*
x86_64:
    __2::clamp01_rfc1961::hf9aa1066c6c8cd7e:
        pushq	%rbp
        movq	%rsp, %rbp
        vxorps	%xmm1, %xmm1, %xmm1
        vmaxss	%xmm0, %xmm1, %xmm0
        vmovss	LCPI4_0(%rip), %xmm1
        vminss	%xmm0, %xmm1, %xmm0
        popq	%rbp
        retq

aarch64:
    __2::clamp01_rfc1961::hf9aa1066c6c8cd7e:
        movi.2d	v1, #0000000000000000
        fmax	s0, s0, s1
        fmov	s1, #1.00000000
        fmin	s0, s0, s1
        ret
*/
pub fn clamp01_rfc1961(mut x: f32) -> f32 {
    if x < 0.0 { x = 0.0; }
    if x > 1.0 { x = 1.0; }
    x
}

/*
x86_64:
    __2::clamp01_llvm::h30ee56b2e9fb6a19:
        pushq	%rbp
        movq	%rsp, %rbp
        vxorps	%xmm1, %xmm1, %xmm1
        vmaxss	%xmm0, %xmm1, %xmm1
        vcmpunordss	%xmm0, %xmm0, %xmm0
        vandnps	%xmm1, %xmm0, %xmm0
        vmovss	LCPI5_0(%rip), %xmm1
        vminss	%xmm0, %xmm1, %xmm2
        vcmpunordss	%xmm0, %xmm0, %xmm0
        vandnps	%xmm2, %xmm0, %xmm2
        vandps	%xmm1, %xmm0, %xmm0
        vorps	%xmm0, %xmm2, %xmm0
        popq	%rbp
        retq

aarch64:
    __2::clamp01_llvm::h30ee56b2e9fb6a19:
        movi.2d	v1, #0000000000000000
        fmaxnm	s0, s0, s1
        fmov	s1, #1.00000000
        fminnm	s0, s0, s1
        ret
*/
pub fn clamp01_llvm(x: f32) -> f32 {
    x.max_llvm(0.0).min_llvm(1.0)
}

/*
x86_64:
    __2::clamp01_rust::h324077e72816ec64:
        pushq	%rbp
        movq	%rsp, %rbp
        vxorps	%xmm1, %xmm1, %xmm1
        vmaxss	%xmm1, %xmm0, %xmm0
        vminss	LCPI6_0(%rip), %xmm0, %xmm0
        popq	%rbp
        retq

aarch64:
    __2::clamp01_rust::h324077e72816ec64:
        movi.2d	v1, #0000000000000000
        fmaxnm	s0, s0, s1
        fmov	s1, #1.00000000
        fminnm	s0, s0, s1
        ret
*/
pub fn clamp01_rust(x: f32) -> f32 {
    x.max_rust(0.0).min_rust(1.0)
}

/*
x86_64:
    __2::clamp01_std::h5df114f63023c56b:
        pushq	%rbp
        movq	%rsp, %rbp
        vxorps	%xmm1, %xmm1, %xmm1
        callq	_fmaxf
        vmovss	LCPI7_0(%rip), %xmm1
        popq	%rbp
        jmp	_fminf

aarch64:
    __2::clamp01_std::h5df114f63023c56b:
        stp	x29, x30, [sp, #-16]!
        mov	 x29, sp
        movi.2d	v1, #0000000000000000
        bl	_fmaxf
        fmov	s1, #1.00000000
        ldp	x29, x30, [sp], #16
        b	_fminf
*/
pub fn clamp01_std(x: f32) -> f32 {
    x.max(0.0).min(1.0)
}

nagisa added a commit to nagisa/rust that referenced this issue Jun 14, 2017
bors added a commit that referenced this issue Jun 16, 2017
Re-implement float min/max in rust

This also adds the relevant implementations into libcore.

See #42423
@Mark-Simulacrum
Copy link
Member

Seems fixed. Closing.

bad:
	xorps	%xmm1, %xmm1
	cmpless	%xmm0, %xmm1
	andps	%xmm1, %xmm0
	minss	.LCPI0_0(%rip), %xmm0
	retq

good:
	xorps	%xmm1, %xmm1
	cmpnless	%xmm0, %xmm1
	minss	.LCPI1_0(%rip), %xmm0
	andnps	%xmm0, %xmm1
	movaps	%xmm1, %xmm0
	retq

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants