Big performance problem with closed intervals looping #45222

Open · leonardo-m opened this issue Oct 11, 2017 · 25 comments
Labels
C-enhancement Category: An issue proposing an enhancement or a PR with one. C-optimization Category: An issue highlighting optimization opportunities or PRs implementing such I-slow Issue: Problems and improvements with respect to performance of generated code. T-compiler Relevant to the compiler team, which will review and decide on the PR/issue.

Comments

leonardo-m commented Oct 11, 2017

In my code I've essentially stopped using loops with `...` (intervals closed on the right, recently written with the syntax `..=`) because they cause performance problems. Here is a simple example that shows the problem:

#![feature(inclusive_range_syntax)]
#![allow(private_no_mangle_fns)]

#[inline(never)]
#[no_mangle]
fn foo1(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0 .. n {
        for j in (0 .. n + 1).rev() {
            count += j;
        }
    }
    count
}

#[inline(never)]
#[no_mangle]
fn foo2(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0 .. n {
        for j in (0 ..= n).rev() {
            count += j;
        }
    }
    count
}

fn main() {
    let n: u64 = std::env::args().nth(1).unwrap().parse().unwrap();
    let what: u32 = std::env::args().nth(2).unwrap().parse().unwrap();

    match what {
        1 => println!("{}", foo1(n)),
        2 => println!("{}", foo2(n)),
        _ => panic!(),
    }
}

Compiled with the latest Nightly:

rustc 1.22.0-nightly (d6d711dd8 2017-10-10)
binary: rustc
commit-hash: d6d711dd8f7ad5885294b8e1f0009a23dc1f8b1f
commit-date: 2017-10-10
host: x86_64-pc-windows-gnu
release: 1.22.0-nightly
LLVM version: 4.0

Compiled with:
rustc -O test.rs

Running it calling foo1 takes 0.02 seconds:

...>elaps test 100000 1
500005000000000

Running it calling foo2 takes about 13.65 seconds:


...>elaps test 100000 2
500005000000000
The asm I am seeing using `--emit asm`:
foo1:
	testq	%rcx, %rcx
	je	.LBB5_1
	movq	%rcx, %r8
	imulq	%r8, %r8
	leaq	-1(%rcx), %rdx
	movq	%rcx, %rax
	mulq	%rdx
	shldq	$63, %rax, %rdx
	subq	%rdx, %r8
	cmpq	$3, %rcx
	jbe	.LBB5_3
	movq	%rcx, %rdx
	andq	$-4, %rdx
	je	.LBB5_3
	movd	%r8, %xmm0
	pshufd	$68, %xmm0, %xmm2
	leaq	-4(%rdx), %r9
	movl	%r9d, %eax
	shrl	$2, %eax
	incl	%eax
	andq	$3, %rax
	je	.LBB5_8
	cmpq	$-1, %rcx
	pxor	%xmm0, %xmm0
	pxor	%xmm3, %xmm3
	je	.LBB5_11
	movdqa	%xmm2, %xmm3
.LBB5_11:
	negq	%rax
	xorl	%r10d, %r10d
	pxor	%xmm1, %xmm1
	.p2align	4, 0x90
.LBB5_12:
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	addq	$4, %r10
	incq	%rax
	jne	.LBB5_12
	jmp	.LBB5_13
.LBB5_3:
	xorl	%eax, %eax
	xorl	%edx, %edx
.LBB5_4:
	xorl	%r9d, %r9d
	cmpq	$-1, %rcx
	cmoveq	%r9, %r8
	.p2align	4, 0x90
.LBB5_5:
	incq	%rdx
	addq	%r8, %rax
	cmpq	%rcx, %rdx
	jb	.LBB5_5
.LBB5_19:
	retq
.LBB5_1:
	xorl	%eax, %eax
	retq
.LBB5_8:
	xorl	%r10d, %r10d
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
.LBB5_13:
	cmpq	$12, %r9
	jb	.LBB5_18
	cmpq	$-1, %rcx
	pxor	%xmm3, %xmm3
	je	.LBB5_16
	movdqa	%xmm2, %xmm3
.LBB5_16:
	movq	%rdx, %rax
	subq	%r10, %rax
	.p2align	4, 0x90
.LBB5_17:
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	addq	$-16, %rax
	jne	.LBB5_17
.LBB5_18:
	paddq	%xmm1, %xmm0
	pshufd	$78, %xmm0, %xmm1
	paddq	%xmm0, %xmm1
	movd	%xmm1, %rax
	cmpq	%rcx, %rdx
	jne	.LBB5_4
	jmp	.LBB5_19



foo2:
	pushq	%rsi
	pushq	%rdi
	pushq	%rbx
	testq	%rcx, %rcx
	je	.LBB6_1
	testb	$1, %cl
	jne	.LBB6_4
	xorl	%eax, %eax
	xorl	%r8d, %r8d
	cmpq	$1, %rcx
	jne	.LBB6_11
	jmp	.LBB6_23
.LBB6_1:
	xorl	%eax, %eax
	jmp	.LBB6_23
.LBB6_4:
	xorl	%r8d, %r8d
	movq	$-1, %r9
	xorl	%r10d, %r10d
	movq	%rcx, %r11
	xorl	%eax, %eax
	jmp	.LBB6_5
	.p2align	4, 0x90
.LBB6_8:
	addq	%r11, %rax
	movq	%rdi, %r10
	movq	%rdx, %r11
.LBB6_5:
	cmpq	%r11, %r10
	movl	$1, %esi
	cmovbq	%r9, %rsi
	cmoveq	%r8, %rsi
	testq	%rsi, %rsi
	movl	$1, %edi
	movl	$0, %edx
	je	.LBB6_8
	cmpq	$-1, %rsi
	jne	.LBB6_9
	leaq	-1(%r11), %rdx
	movq	%r10, %rdi
	jmp	.LBB6_8
.LBB6_9:
	movl	$1, %r8d
	cmpq	$1, %rcx
	je	.LBB6_23
.LBB6_11:
	xorl	%r9d, %r9d
	movq	$-1, %r10
	.p2align	4, 0x90
.LBB6_12:
	xorl	%r11d, %r11d
	movq	%rcx, %rdx
	jmp	.LBB6_13
	.p2align	4, 0x90
.LBB6_16:
	addq	%rdx, %rax
	movq	%rbx, %r11
	movq	%rsi, %rdx
.LBB6_13:
	cmpq	%rdx, %r11
	movl	$1, %edi
	cmovbq	%r10, %rdi
	cmoveq	%r9, %rdi
	testq	%rdi, %rdi
	movl	$1, %ebx
	movl	$0, %esi
	je	.LBB6_16
	cmpq	$-1, %rdi
	jne	.LBB6_17
	leaq	-1(%rdx), %rsi
	movq	%r11, %rbx
	jmp	.LBB6_16
	.p2align	4, 0x90
.LBB6_17:
	addq	$2, %r8
	xorl	%r11d, %r11d
	movq	%rcx, %rdx
	jmp	.LBB6_18
	.p2align	4, 0x90
.LBB6_21:
	addq	%rdx, %rax
	movq	%rbx, %r11
	movq	%rsi, %rdx
.LBB6_18:
	cmpq	%rdx, %r11
	movl	$1, %edi
	cmovbq	%r10, %rdi
	cmoveq	%r9, %rdi
	testq	%rdi, %rdi
	movl	$1, %ebx
	movl	$0, %esi
	je	.LBB6_21
	cmpq	$-1, %rdi
	jne	.LBB6_22
	leaq	-1(%rdx), %rsi
	movq	%r11, %rbx
	jmp	.LBB6_21
	.p2align	4, 0x90
.LBB6_22:
	cmpq	%rcx, %r8
	jb	.LBB6_12
.LBB6_23:
	popq	%rbx
	popq	%rdi
	popq	%rsi
	retq
ExpHP (Contributor) commented Oct 12, 2017

Picking the low-hanging fruit for minimization:

#![feature(inclusive_range_syntax)]
#![allow(private_no_mangle_fns)]

#[inline(never)]
#[no_mangle]
fn triangle_exc(n: u64) -> u64 {
    let mut count = 0;
    for j in (0 .. n + 1) {
        count += j;
    }
    count
}

#[inline(never)]
#[no_mangle]
fn triangle_inc(n: u64) -> u64 {
    let mut count = 0;
    for j in 0 ..= n {
        count += j;
    }
    count
}

fn main() {
    let n: u64 = std::env::args().nth(1).unwrap().parse().unwrap();

    println!("{}", triangle_exc(n));
    println!("{}", triangle_inc(n));
}

Good:

//-----------------------------------------
       │     0000000000007300 <triangle_exc>:
       │     triangle_exc():
       │       inc    %rdi
       │     ↓ je     8c
       │       cmp    $0x3,%rdi
       │     ↓ jbe    7b
       │       mov    %rdi,%rcx
       │       and    $0xfffffffffffffffc,%rcx
       │     ↓ je     7b
       │       lea    -0x4(%rcx),%rax
       │       mov    %eax,%esi
       │       shr    $0x2,%esi
       │       inc    %esi
       │       and    $0x3,%rsi
       │     ↓ je     8f
       │       neg    %rsi
       │       mov    $0x1,%edx
       │       movq   %rdx,%xmm0
       │       pslldq $0x8,%xmm0
       │       pxor   %xmm2,%xmm2
       │       xor    %edx,%edx
       │       movdqa _fini+0x44,%xmm3
       │       movdqa _fini+0x54,%xmm4
       │       pxor   %xmm1,%xmm1
       │       data16 nopw %cs:0x0(%rax,%rax,1)
       │ 60:   paddq  %xmm0,%xmm2
       │       paddq  %xmm0,%xmm1
       │       paddq  %xmm3,%xmm1
       │       add    $0x4,%rdx
       │       paddq  %xmm4,%xmm0
       │       inc    %rsi
       │     ↑ jne    60
       │ 7b:   xor    %eax,%eax
       │       xor    %ecx,%ecx
       │       nop
       │ 80:   add    %rcx,%rax
       │       inc    %rcx
       │       cmp    %rdi,%rcx
       │     ↑ jb     80
       │ 8b: ← retq
       │ 8c:   xor    %eax,%eax
       │     ← retq
       │ 8f:   xor    %edx,%edx
       │       mov    $0x1,%esi
       │       movq   %rsi,%xmm0
       │       pslldq $0x8,%xmm0
       │       pxor   %xmm2,%xmm2
       │       pxor   %xmm1,%xmm1
       │ a8:   cmp    $0xc,%rax
       │       movdqa %xmm2,%xmm6
       │     ↓ jb     106
       │       mov    %rcx,%rax
       │       sub    %rdx,%rax
       │       movdqa _fini+0x64,%xmm3
       │       movdqa _fini+0x74,%xmm4
       │       movdqa _fini+0x84,%xmm5
       │ d0:   paddq  %xmm0,%xmm2
       │       paddq  %xmm0,%xmm1
 20.00 │       movdqa %xmm0,%xmm6
 10.00 │       paddq  %xmm6,%xmm6
       │       paddq  %xmm6,%xmm1
 10.00 │       paddq  %xmm2,%xmm6
 30.00 │       paddq  %xmm0,%xmm6
       │       paddq  %xmm0,%xmm1
 10.00 │       paddq  %xmm3,%xmm6
       │       paddq  %xmm4,%xmm1
       │       paddq  %xmm5,%xmm0
 20.00 │       movdqa %xmm6,%xmm2
       │     ↑ jne    d0
       │106:   paddq  %xmm1,%xmm6
       │       pshufd $0x4e,%xmm6,%xmm0
       │       paddq  %xmm6,%xmm0
       │       movq   %xmm0,%rax
       │       cmp    %rcx,%rdi
       │     ↑ jne    80
       │     ↑ jmpq   8b

Bad:

       │    0000000000007430 <triangle_inc>:
       │    triangle_inc():
       │      xor    %r8d,%r8d
       │      mov    $0xffffffffffffffff,%r9
       │      xor    %r10d,%r10d
       │      xor    %eax,%eax
       │    ↓ jmp    29
       │      data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
       │20:   add    %r10,%rax
  1.79 │      mov    %rdx,%rdi
  2.98 │      mov    %rsi,%r10
 17.86 │29:   cmp    %rdi,%r10
       │      mov    $0x1,%ecx
  7.14 │      cmovb  %r9,%rcx
 16.07 │      cmove  %r8,%rcx
  3.57 │      test   %rcx,%rcx
  1.79 │      mov    $0x0,%edx
  1.79 │      mov    $0x1,%esi
 14.29 │    ↑ je     20
       │      cmp    $0xffffffffffffffff,%rcx
       │    ↓ jne    57
  2.98 │      lea    0x1(%r10),%rsi
  3.57 │      mov    %rdi,%rdx
 26.19 │    ↑ jmp    20
       │57: ← retq

Lost some kind of fast path?

arthurprs (Contributor) commented:

Apparently LLVM is much happier optimizing the open interval with SIMD.

TimNN added the C-enhancement and I-slow labels on Oct 17, 2017
kennytm added the T-compiler label on Jan 28, 2018
kennytm (Member) commented Jan 28, 2018

Referring to the example in #45222 (comment), the first function is recognized by LLVM's loop-vectorize pass, while the second isn't.

$ rustc +nightly --crate-type staticlib -C debuginfo=1 -C codegen-units=1 -C opt-level=3 -C panic=abort -C target-cpu=native -C remark=loop-vectorize 1.rs

note: optimization analysis for loop-vectorize at 1.rs:9:0: loop not vectorized: loop control flow is not understood by vectorizer

note: optimization missed for loop-vectorize at 1.rs:9:0: loop not vectorized

After vectorization the `..` loop becomes just `a * (a + 1) / 2`, which explains the huge time difference.
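
A minimal sketch of that closed form (my reconstruction for illustration, not code from the thread; `u128` keeps the intermediate product from overflowing, where LLVM instead reasons modulo 2^64):

```rust
// The inner `..` loop sums 0..=n (the triangle number n*(n+1)/2), and the
// outer loop repeats that sum n times, so foo1 collapses to pure arithmetic.
fn foo1_closed_form(n: u64) -> u64 {
    let triangle = ((n as u128 * (n as u128 + 1)) / 2) as u64; // sum of 0..=n
    n.wrapping_mul(triangle)
}
```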

If we black-box the `j` in `count += j` when benchmarking, the result becomes more realistic (a 2.6× slowdown rather than 680×):

$ time ./1 100000 1
500005000000000

real	0m5.680s
user	0m5.645s
sys	0m0.012s

$ time ./1 100000 2
500005000000000

real	0m15.088s
user	0m15.015s
sys	0m0.026s
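
For reference, a sketch of the black-boxed variant (using today's `std::hint::black_box`; at the time this would have been the unstable `test::black_box`):

```rust
use std::hint::black_box;

#[inline(never)]
fn foo2_blackboxed(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0..n {
        for j in (0..=n).rev() {
            count += black_box(j); // defeats the closed-form rewrite
        }
    }
    count
}
```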

We may see if upgrading to LLVM 6 can help this case.

Also note that the two pieces of code are not equivalent: the `n + 1` version cannot properly handle the case `n == u64::max_value()` (although that case is rare).
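
A small illustration of that boundary case (my addition; `wrapping_add` is used so the demo doesn't panic under debug assertions):

```rust
fn boundary_demo() {
    let n = u64::MAX;
    // The exclusive form needs n + 1, which wraps to 0, giving the empty range 0..0:
    assert_eq!((0..n.wrapping_add(1)).next(), None);
    // The inclusive form still covers every value, including u64::MAX itself:
    assert_eq!((0..=n).next(), Some(0));
    assert_eq!((0..=n).next_back(), Some(u64::MAX));
}
```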

kennytm (Member) commented Jan 30, 2018

I've checked again using the LLVM 6 build. The performance is improved, but there is still a large gap (2.6× → 2.0×).

Vectorization still isn't applied to `0 ..= n` with the naked `j` (i.e. without black-boxing).

Timings
$ rustc +nightly -C codegen-units=1 -C opt-level=3 -C target-cpu=native 1.rs

$ time ./1 30000 2
13500450000000

real	0m5.389s
user	0m5.136s
sys	0m0.012s

$ time ./1 30000 1
13500450000000

real	0m2.195s
user	0m2.192s
sys	0m0.000s

$ rustc +38bd38147d2fa21f8a684b019fc0763adf8fd436 -C codegen-units=1 -C opt-level=3 -C target-cpu=native 1.rs

$ time ./1 30000 2
13500450000000

real	0m4.445s
user	0m4.332s
sys	0m0.000s

$ time ./1 30000 1
13500450000000

real	0m2.330s
user	0m2.184s
sys	0m0.016s

scottmcm (Member) commented Feb 5, 2018

This might just be the classic external-iteration-is-sometimes-slower problem. Note that even `(0..=n).sum()` currently generates unfortunate code.

A fix for internal iteration is up at #48012.

ollie27 (Member) commented Feb 5, 2018

I believe this can be fixed by adding an extra field to `RangeInclusive`, like this:

struct FixedRangeInclusive {
    start: u64,
    end: u64,
    done: bool,
}

fn fixed_range_inclusive(start: u64, end: u64) -> FixedRangeInclusive {
    FixedRangeInclusive {
        start,
        end,
        done: false,
    }
}

impl Iterator for FixedRangeInclusive {
    type Item = u64;
    fn next(&mut self) -> Option<Self::Item> {
        if !self.done {
            if self.start == self.end {
                // Mark the iterator exhausted *before* yielding the final
                // value, so `start` never has to step past `end`.
                self.done = true;
            }
            let new = self.start.wrapping_add(1);
            Some(std::mem::replace(&mut self.start, new))
        } else {
            None
        }
    }
}

Check out the assembly on the playground.
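
A quick usage check of that sketch (my addition, not part of the original comment), exercising the `end == u64::MAX` case that motivates the `done` flag:

```rust
fn main() {
    assert_eq!(fixed_range_inclusive(0, 10).sum::<u64>(), 55);
    // The flag lets the iterator yield u64::MAX itself and then stop, which
    // `start .. end + 1` cannot express without overflowing:
    let mut it = fixed_range_inclusive(u64::MAX - 1, u64::MAX);
    assert_eq!(it.next(), Some(u64::MAX - 1));
    assert_eq!(it.next(), Some(u64::MAX));
    assert_eq!(it.next(), None);
}
```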

kennytm (Member) commented Feb 5, 2018

The current two-field RangeInclusive follows from rust-lang/rfcs#1980. The done field was the original design in RFC 1192 but was changed due to rust-lang/rfcs#1192 (comment).

ollie27 (Member) commented Feb 5, 2018

I'm aware that `RangeInclusive` has gone through many different designs, but the current design was clearly not chosen with performance in mind. Ideally, of course, `RangeInclusive` wouldn't have any public fields, so that these kinds of changes could be made easily.

ExpHP (Contributor) commented Feb 5, 2018

Of course ideally RangeInclusive wouldn't have any public fields so these kind of changes can be made easily.

This would be jarring, given that `Range` does have public data members.

It is unfortunate. Were this not the case I could almost picture something like this:

pub struct RangeInclusive<T> {
    // NOTE: not pub
    start: T, // actually, these should probably be ManuallyDrop<T> 
    end: T,   // or union MaybeUninit<T> { value: T, empty: () }
    done: bool,
}

impl<T> RangeInclusive<T> {
    // Expose an API that matches the functionality of the enum type
    #[inline] pub fn new(start: T, end: T) -> Self { ... }
    #[inline] pub fn new_done() -> Self { ... }
    #[inline] pub fn endpoints(&self) -> Option<(&T, &T)> { ... }
    #[inline] pub fn endpoints_mut(&mut self) -> Option<(&mut T, &mut T)> { ... }
    #[inline] pub fn into_endpoints(self) -> Option<(T, T)> { ... }
}

and ISTM (note: I haven't tested) that this should optimize just as well as the three-field struct, since it IS the three-field struct (just with statically enforced usage patterns). But I suppose that, even then, it would seem questionable to have a standard-library type that simulates an enum (rather than being one) solely for performance reasons.


Edit: I misread somewhat and thought that the enum was the current proposal.

scottmcm (Member) commented Feb 6, 2018

@leonardo-m Can you share some non-trivial examples where the current form is a problem? #48012 will make it so that the example here is fine if written the easier way, `count += (0 ..= n).sum()`.

For simple things like sums of iota, it's easy to get suboptimal codegen from all kinds of iterators. For example, using `for x in (0..3).chain(3..x)` to add things, instead of folding the same iterator, has exactly the problem raised here: https://godbolt.org/g/tYt7TX

Edit: #48057 has also improved things in recent nightlies, though it's still not perfect.
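
For concreteness, the internal-iteration rewrite of the original `foo2` might look like this (a sketch; `sum` is driven by `try_fold`, which #48012 overrides for `RangeInclusive`):

```rust
#[no_mangle]
fn foo2_internal(n: u64) -> u64 {
    let mut count = 0u64;
    for _ in 0..n {
        // Internal iteration: the range drives the loop itself instead of
        // being stepped from outside via repeated next() calls.
        count += (0..=n).sum::<u64>();
    }
    count
}
```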

kennytm added a commit to kennytm/rust that referenced this issue Feb 6, 2018

Override try_[r]fold for RangeInclusive

Because the last item needs special handling, it seems that LLVM has trouble canonicalizing the loops in external iteration. With the override, it becomes obvious that the start == end case exits the loop (as opposed to the one *after* that exiting the loop, as happens in external iteration).
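
The shape of that override, sketched over explicit bounds (the real impl lives inside `RangeInclusive`, uses its private fields, and tracks exhaustion; the names here are mine):

```rust
fn try_fold_inclusive<B, E>(
    mut start: u64,
    end: u64,
    init: B,
    mut f: impl FnMut(B, u64) -> Result<B, E>,
) -> Result<B, E> {
    let mut accum = init;
    // A plain `start < end` loop that LLVM canonicalizes easily...
    while start < end {
        let n = start;
        start += 1;
        accum = f(accum, n)?;
    }
    // ...with the start == end case handled once, outside the loop.
    if start == end {
        accum = f(accum, end)?;
    }
    Ok(accum)
}
```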

Demo adapted from rust-lang#45222
```rust
#[no_mangle]
pub fn foo3r(n: u64) -> u64 {
    let mut count = 0;
    (0..n).for_each(|_| {
        (0 ..= n).rev().for_each(|j| {
            count += j;
        })
    });
    count
}
```

<details>
 <summary>Current nightly ASM, 100 lines (https://play.rust-lang.org/?gist=f5674c702c6e2045c3aab5d03763e5f6&version=nightly&mode=release)</summary>

```asm
foo3r:
	pushq	%rbx
.Lcfi0:
.Lcfi1:
	testq	%rdi, %rdi
	je	.LBB0_1
	testb	$1, %dil
	jne	.LBB0_4
	xorl	%eax, %eax
	xorl	%r8d, %r8d
	cmpq	$1, %rdi
	jne	.LBB0_11
	jmp	.LBB0_23
.LBB0_1:
	xorl	%eax, %eax
	popq	%rbx
	retq
.LBB0_4:
	xorl	%r8d, %r8d
	movq	$-1, %r9
	xorl	%eax, %eax
	movq	%rdi, %r11
	xorl	%r10d, %r10d
	jmp	.LBB0_5
.LBB0_8:
	addq	%r11, %rax
	movq	%rsi, %r11
	movq	%rdx, %r10
.LBB0_5:
	cmpq	%r11, %r10
	movl	$1, %ecx
	cmovbq	%r9, %rcx
	cmoveq	%r8, %rcx
	testq	%rcx, %rcx
	movl	$0, %esi
	movl	$1, %edx
	je	.LBB0_8
	cmpq	$-1, %rcx
	jne	.LBB0_9
	leaq	-1(%r11), %rsi
	movq	%r10, %rdx
	jmp	.LBB0_8
.LBB0_9:
	movl	$1, %r8d
	cmpq	$1, %rdi
	je	.LBB0_23
.LBB0_11:
	xorl	%r9d, %r9d
	movq	$-1, %r10
.LBB0_12:
	movq	%rdi, %rsi
	xorl	%r11d, %r11d
	jmp	.LBB0_13
.LBB0_16:
	addq	%rsi, %rax
	movq	%rcx, %rsi
	movq	%rbx, %r11
.LBB0_13:
	cmpq	%rsi, %r11
	movl	$1, %edx
	cmovbq	%r10, %rdx
	cmoveq	%r9, %rdx
	testq	%rdx, %rdx
	movl	$0, %ecx
	movl	$1, %ebx
	je	.LBB0_16
	cmpq	$-1, %rdx
	jne	.LBB0_17
	leaq	-1(%rsi), %rcx
	movq	%r11, %rbx
	jmp	.LBB0_16
.LBB0_17:
	movq	%rdi, %rcx
	xorl	%r11d, %r11d
	jmp	.LBB0_18
.LBB0_21:
	addq	%rcx, %rax
	movq	%rsi, %rcx
	movq	%rbx, %r11
.LBB0_18:
	cmpq	%rcx, %r11
	movl	$1, %edx
	cmovbq	%r10, %rdx
	cmoveq	%r9, %rdx
	testq	%rdx, %rdx
	movl	$0, %esi
	movl	$1, %ebx
	je	.LBB0_21
	cmpq	$-1, %rdx
	jne	.LBB0_22
	leaq	-1(%rcx), %rsi
	movq	%r11, %rbx
	jmp	.LBB0_21
.LBB0_22:
	addq	$2, %r8
	cmpq	%rdi, %r8
	jne	.LBB0_12
.LBB0_23:
	popq	%rbx
	retq
.Lfunc_end0:
```
</details><br>

With this PR:
```asm
foo3r:
	test	rcx, rcx
	je	.LBB3_1
	lea	r8, [rcx - 1]
	lea	rdx, [rcx - 2]
	mov	rax, r8
	mul	rdx
	shld	rdx, rax, 63
	imul	r8, r8
	add	r8, rcx
	sub	r8, rdx
	imul	r8, rcx
	mov	rax, r8
	ret
.LBB3_1:
	xor	r8d, r8d
	mov	rax, r8
	ret
```
bors added a commit that referenced this issue Feb 8, 2018
Simplify RangeInclusive::next[_back]

`match`ing on an `Option<Ordering>` seems to cause some confusion for LLVM; switching to just using comparison operators removes a few jumps from the simple `for` loops I was trying.

cc #45222 #28237 (comment)
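
Roughly, the change replaces a `match` on `self.start.partial_cmp(&self.end)` with direct comparisons; a hypothetical standalone version of the simplified step (not the exact library code) over the then-current two-field representation:

```rust
fn next_inclusive(start: &mut u64, end: &mut u64) -> Option<u64> {
    if *start <= *end {
        let n = *start;
        if *start < *end {
            *start += 1; // ordinary step
        } else {
            // Yielding the last element: make the range empty (start > end),
            // mirroring the library's replace_one/replace_zero trick.
            *start = 1;
            *end = 0;
        }
        Some(n)
    } else {
        None
    }
}
```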

Example:
```rust
#[no_mangle]
pub fn coresum(x: std::ops::RangeInclusive<u64>) -> u64 {
    let mut sum = 0;
    for i in x {
        sum += i ^ (i-1);
    }
    sum
}
```
Today:
```asm
coresum:
    xor r8d, r8d
    mov r9, -1
    xor eax, eax
    jmp .LBB0_1
.LBB0_4:
    lea rcx, [rdi - 1]
    xor rcx, rdi
    add rax, rcx
    mov rsi, rdx
    mov rdi, r10
.LBB0_1:
    cmp rdi, rsi
    mov ecx, 1
    cmovb   rcx, r9
    cmove   rcx, r8
    test    rcx, rcx
    mov edx, 0
    mov r10d, 1
    je  .LBB0_4         // 1
    cmp rcx, -1
    jne .LBB0_5         // 2
    lea r10, [rdi + 1]
    mov rdx, rsi
    jmp .LBB0_4         // 3
.LBB0_5:
    ret
```
With this PR:
```asm
coresum:
	cmp	rcx, rdx
	jbe	.LBB0_2
	xor	eax, eax
	ret
.LBB0_2:
	xor	r8d, r8d
	mov	r9d, 1
	xor	eax, eax
	.p2align	4, 0x90
.LBB0_3:
	lea	r10, [rcx + 1]
	cmp	rcx, rdx
	cmovae	rdx, r8
	cmovae	r10, r9
	lea	r11, [rcx - 1]
	xor	r11, rcx
	add	rax, r11
	mov	rcx, r10
	cmp	r10, rdx
	jbe	.LBB0_3         // Just this
	ret
```

<details><summary>Though using internal iteration (`.map(|i| i ^ (i-1)).sum()`) is still shorter to type, and lets the compiler unroll it</summary>

```asm
coresum_inner:
.Lcfi0:
.seh_proc coresum_inner
	sub	rsp, 168
.Lcfi1:
	.seh_stackalloc 168
	vmovdqa	xmmword ptr [rsp + 144], xmm15
.Lcfi2:
	.seh_savexmm 15, 144
	vmovdqa	xmmword ptr [rsp + 128], xmm14
.Lcfi3:
	.seh_savexmm 14, 128
	vmovdqa	xmmword ptr [rsp + 112], xmm13
.Lcfi4:
	.seh_savexmm 13, 112
	vmovdqa	xmmword ptr [rsp + 96], xmm12
.Lcfi5:
	.seh_savexmm 12, 96
	vmovdqa	xmmword ptr [rsp + 80], xmm11
.Lcfi6:
	.seh_savexmm 11, 80
	vmovdqa	xmmword ptr [rsp + 64], xmm10
.Lcfi7:
	.seh_savexmm 10, 64
	vmovdqa	xmmword ptr [rsp + 48], xmm9
.Lcfi8:
	.seh_savexmm 9, 48
	vmovdqa	xmmword ptr [rsp + 32], xmm8
.Lcfi9:
	.seh_savexmm 8, 32
	vmovdqa	xmmword ptr [rsp + 16], xmm7
.Lcfi10:
	.seh_savexmm 7, 16
	vmovdqa	xmmword ptr [rsp], xmm6
.Lcfi11:
	.seh_savexmm 6, 0
.Lcfi12:
	.seh_endprologue
	cmp	rdx, rcx
	jae	.LBB1_2
	xor	eax, eax
	jmp	.LBB1_13
.LBB1_2:
	mov	r8, rdx
	sub	r8, rcx
	jbe	.LBB1_3
	cmp	r8, 7
	jbe	.LBB1_5
	mov	rax, r8
	and	rax, -8
	mov	r9, r8
	and	r9, -8
	je	.LBB1_5
	add	rax, rcx
	vmovq	xmm0, rcx
	vpshufd	xmm0, xmm0, 68
	mov	ecx, 1
	vmovq	xmm1, rcx
	vpslldq	xmm1, xmm1, 8
	vpaddq	xmm1, xmm0, xmm1
	vpxor	xmm0, xmm0, xmm0
	vpcmpeqd	xmm11, xmm11, xmm11
	vmovdqa	xmm12, xmmword ptr [rip + __xmm@00000000000000010000000000000001]
	vmovdqa	xmm13, xmmword ptr [rip + __xmm@00000000000000030000000000000003]
	vmovdqa	xmm14, xmmword ptr [rip + __xmm@00000000000000050000000000000005]
	vmovdqa	xmm15, xmmword ptr [rip + __xmm@00000000000000080000000000000008]
	mov	rcx, r9
	vpxor	xmm4, xmm4, xmm4
	vpxor	xmm5, xmm5, xmm5
	vpxor	xmm6, xmm6, xmm6
	.p2align	4, 0x90
.LBB1_9:
	vpaddq	xmm7, xmm1, xmmword ptr [rip + __xmm@00000000000000020000000000000002]
	vpaddq	xmm9, xmm1, xmmword ptr [rip + __xmm@00000000000000040000000000000004]
	vpaddq	xmm10, xmm1, xmmword ptr [rip + __xmm@00000000000000060000000000000006]
	vpaddq	xmm8, xmm1, xmm12
	vpxor	xmm7, xmm8, xmm7
	vpaddq	xmm2, xmm1, xmm13
	vpxor	xmm8, xmm2, xmm9
	vpaddq	xmm3, xmm1, xmm14
	vpxor	xmm3, xmm3, xmm10
	vpaddq	xmm2, xmm1, xmm11
	vpxor	xmm2, xmm2, xmm1
	vpaddq	xmm0, xmm2, xmm0
	vpaddq	xmm4, xmm7, xmm4
	vpaddq	xmm5, xmm8, xmm5
	vpaddq	xmm6, xmm3, xmm6
	vpaddq	xmm1, xmm1, xmm15
	add	rcx, -8
	jne	.LBB1_9
	vpaddq	xmm0, xmm4, xmm0
	vpaddq	xmm0, xmm5, xmm0
	vpaddq	xmm0, xmm6, xmm0
	vpshufd	xmm1, xmm0, 78
	vpaddq	xmm0, xmm0, xmm1
	vmovq	r10, xmm0
	cmp	r8, r9
	jne	.LBB1_6
	jmp	.LBB1_11
.LBB1_3:
	xor	r10d, r10d
	jmp	.LBB1_12
.LBB1_5:
	xor	r10d, r10d
	mov	rax, rcx
	.p2align	4, 0x90
.LBB1_6:
	lea	rcx, [rax - 1]
	xor	rcx, rax
	inc	rax
	add	r10, rcx
	cmp	rdx, rax
	jne	.LBB1_6
.LBB1_11:
	mov	rcx, rdx
.LBB1_12:
	lea	rax, [rcx - 1]
	xor	rax, rcx
	add	rax, r10
.LBB1_13:
	vmovaps	xmm6, xmmword ptr [rsp]
	vmovaps	xmm7, xmmword ptr [rsp + 16]
	vmovaps	xmm8, xmmword ptr [rsp + 32]
	vmovaps	xmm9, xmmword ptr [rsp + 48]
	vmovaps	xmm10, xmmword ptr [rsp + 64]
	vmovaps	xmm11, xmmword ptr [rsp + 80]
	vmovaps	xmm12, xmmword ptr [rsp + 96]
	vmovaps	xmm13, xmmword ptr [rsp + 112]
	vmovaps	xmm14, xmmword ptr [rsp + 128]
	vmovaps	xmm15, xmmword ptr [rsp + 144]
	add	rsp, 168
	ret
	.seh_handlerdata
	.section	.text,"xr",one_only,coresum_inner
.Lcfi13:
	.seh_endproc
```

</details>
kennytm (Member) commented May 1, 2018

For the record: enabling Polly (#50044) doesn't fix the issue.

Stargateur (Contributor) commented May 11, 2018

@ollie27 Just a note: if you are going to use a bool, your example always performs two tests per iteration. Something like this might speed things up, because it only tests twice for the last two values:

impl Iterator for FixedRangeInclusive {
    type Item = u64;
    fn next(&mut self) -> Option<Self::Item> {
        if self.start < self.end {
            let new = self.start.wrapping_add(1);
            Some(std::mem::replace(&mut self.start, new))
        } else if !self.done {
            self.done = true;
            Some(self.start)
        } else {
            None
        }
    }
}
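
A quick check of this variant (my addition), assuming the `FixedRangeInclusive` struct and constructor from the earlier comment:

```rust
fn main() {
    // Values below `end` take only the cheap `start < end` test; just the
    // final value pays for the extra `done` test.
    let v: Vec<u64> = fixed_range_inclusive(3, 6).collect();
    assert_eq!(v, [3, 4, 5, 6]);
    assert_eq!(fixed_range_inclusive(7, 7).count(), 1);
}
```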

kennytm added a commit to kennytm/rust that referenced this issue Jul 13, 2018
bors added a commit that referenced this issue Jul 13, 2018
Change RangeInclusive to a three-field struct.

Fix #45222.

This PR also reverts #48012 (i.e. removed the `try_fold`/`try_rfold` specialization for `RangeInclusive`) because LLVM no longer has trouble recognizing a RangeInclusive loop.
leonardo-m (Author) commented:

To answer issue #56516, here is a first example of the performance problem; better examples may follow. Code example from:
http://ericniebler.com/2014/04/27/range-comprehensions/

fn triples() -> impl Iterator<Item=(u32, u32, u32)> {
    (1 ..).flat_map(|z| (1 .. z + 1)
                        .flat_map(move |x| (x .. z + 1u32)
                                           .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                                           .map(move |y| (x, y, z))))
}

fn main() {
    let result: u32 = triples().take(3_000).map(|(x, y, z)| x + y + z).sum();
    println!("{}", result); // 10650478, about 2.8 seconds.
}

If I replace the open intervals with closed ones:

fn triples() -> impl Iterator<Item=(u32, u32, u32)> {
    (1 ..).flat_map(|z| (1u32 ..= z)
                        .flat_map(move |x| (x ..= z)
                                           .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                                           .map(move |y| (x, y, z))))
}

fn main() {
    let result: u32 = triples().take(3_000).map(|(x, y, z)| x + y + z).sum();
    println!("{}", result);
}

For the second version I am seeing a run-time of about 6 seconds.

leonardo-m (Author) commented:

Lots of discussion here:

https://old.reddit.com/r/rust/comments/ab7hsi/comparing_pythagorean_triples_in_c_d_and_rust/

frol (Contributor) commented Feb 28, 2019

#58122 has significantly improved the situation!

My benchmark, based on @leonardo-m's snippet:

#![feature(test)]
extern crate test;

use test::Bencher;

fn triples_exclusive() -> impl Iterator<Item = (u32, u32, u32)> {
    (1u32..).flat_map(|z| {
        (1..z + 1).flat_map(move |x| {
            (x..z + 1)
                .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                .map(move |y| (x, y, z))
        })
    })
}

fn triples_inclusive() -> impl Iterator<Item = (u32, u32, u32)> {
    (1u32..).flat_map(|z| {
        (1..=z).flat_map(move |x| {
            (x..=z)
                .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                .map(move |y| (x, y, z))
        })
    })
}

#[bench]
fn range_exclusive(b: &mut Bencher) {
    b.iter(|| triples_exclusive().take(1_000).map(|(x, y, z)| x + y + z).sum::<u32>());
}

#[bench]
fn range_inclusive(b: &mut Bencher) {
    b.iter(|| triples_inclusive().take(1_000).map(|(x, y, z)| x + y + z).sum::<u32>());
}

Here are the results on my laptop:

Run 1:

test range_exclusive ... bench: 169,132,205 ns/iter (+/- 9,048,718)
test range_inclusive ... bench: 173,425,958 ns/iter (+/- 16,198,459)

Run 2:

test range_exclusive ... bench: 172,896,754 ns/iter (+/- 12,204,477)
test range_inclusive ... bench: 174,383,382 ns/iter (+/- 12,032,995)

Run 3:

test range_exclusive ... bench: 169,798,685 ns/iter (+/- 11,155,761)
test range_inclusive ... bench: 174,589,161 ns/iter (+/- 13,358,378)

Inclusive ranges are still consistently slower than the exclusive variant. Some might say that's not significant, but avoiding them may end up as a "clever" optimization trick like so many in the C++ world. Also, in my opinion, the existence of this performance difference goes against the "zero-cost" claims, so I think this issue should be kept open.

/cc @matthieu-m

leonardo-m (Author) commented:

The percentage performance difference I see in my code is considerably larger than the 1-2% difference shown above, so better benchmarks are necessary.

matthieu-m (Contributor) commented:

@frol

I agree that the performance drop between exclusive and inclusive is annoying and should ideally be eliminated if possible; however, I don't see it as a failure of zero-overhead abstraction.

If you were coding the inclusive range with a `for` loop, you would also need to handle either the starting or the ending bound specially, leading to different codegen than for exclusive ranges.

For me there are two issues:

  • The codegen issue, as mentioned; there may be ways to massage the Rust code to optimize better, or something to be done on the LLVM side to teach it to split loops when a loop check moves monotonically.
  • The API decision that Range and RangeInclusive directly implement Iterator instead of implementing IntoIterator; in the latter case, it would be possible to dynamically choose between exclusive iteration (if either bound can be moved further) or full iteration (if not), which LLVM may optimize better. A sketch follows this list.
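
A sketch of that IntoIterator idea (all names here are hypothetical; this is not the std API): the bounds stay a plain pair, and the iterator chosen at loop start only needs a flag in the one case where the end cannot be bumped.

```rust
use std::ops::Range;

struct Inclusive { start: u64, end: u64 }

enum InclusiveIter {
    // Fast path: `end` can be bumped, so iterate a plain half-open range.
    Exclusive(Range<u64>),
    // Slow path: end == u64::MAX, so fall back to a done-flag iterator.
    Flagged { next: u64, done: bool },
}

impl IntoIterator for Inclusive {
    type Item = u64;
    type IntoIter = InclusiveIter;
    fn into_iter(self) -> InclusiveIter {
        match self.end.checked_add(1) {
            Some(end) => InclusiveIter::Exclusive(self.start..end),
            None => InclusiveIter::Flagged { next: self.start, done: false },
        }
    }
}

impl Iterator for InclusiveIter {
    type Item = u64;
    fn next(&mut self) -> Option<u64> {
        match self {
            InclusiveIter::Exclusive(r) => r.next(),
            InclusiveIter::Flagged { next, done } => {
                if *done { return None; }
                if *next == u64::MAX { *done = true; }
                let n = *next;
                *next = next.wrapping_add(1);
                Some(n)
            }
        }
    }
}
```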

The API is water under the bridge for stability reasons, so we need to concentrate on the codegen issues. I think the next step should be involving folks knowledgeable about LLVM, who could understand why the optimizer fails so hard and either point to a specific pattern to avoid or improve LLVM so it doesn't.

I won't be following this up, as I have other projects, so I invite anybody interested to pick up the flag and carry on.


@leonardo-m

For these benchmarks specifically? In my benchmarks I was seeing less than 1% between exclusive and inclusive, which is consistent with what is reported here.

Note that using external iteration (a for loop) may not optimize as well as using internal iteration.

Stargateur (Contributor) commented:

@frol

Also, in my opinion, the existence of this performance difference goes against "zero-cost" claims

I disagree: `0..5 != 0..=4`. The zero-cost claim is about the difference versus what you would write by hand, e.g. manually looping a `u8` over the inclusive range from 0 to 255, as sketched below.
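
A sketch of that manual baseline (my addition): iterating a `u8` over all 256 values without `..=` forces either a flag or a wider counter, e.g.:

```rust
fn sum_all_u8() -> u32 {
    let mut sum = 0u32;
    let mut i = 0u8;
    loop {
        sum += u32::from(i);
        if i == u8::MAX {
            break; // `i < 256` is inexpressible for a u8, so test the max explicitly
        }
        i += 1;
    }
    sum // 32640
}
```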

saethlin (Member) commented:

👋 It's been more than a year since the last comment on this issue.

The original example has been well-optimized since inclusive ranges were stabilized. They now look like this: https://godbolt.org/z/oh9WbbacE

example::triangle_exc:
        cmp     rdi, -1
        je      .LBB0_1
        lea     rcx, [rdi - 1]
        mov     rax, rdi
        mul     rcx
        shld    rdx, rax, 63
        add     rdx, rdi
        mov     rax, rdx
        ret
.LBB0_1:
        xor     edx, edx
        mov     rax, rdx
        ret

example::triangle_inc:
        xor     eax, eax
        xor     ecx, ecx
.LBB1_1:
        mov     rdx, rcx
        cmp     rcx, rdi
        adc     rcx, 0
        add     rax, rdx
        cmp     rdx, rdi
        jae     .LBB1_3
        cmp     rcx, rdi
        jbe     .LBB1_1
.LBB1_3:
        ret

Eric Niebler's Pythagorean triples benchmark also behaves differently than reported here. Run with the current nightly on a recent x86_64 CPU with the default release profile, there is a consistent (small) preference for inclusive ranges:

test range_exclusive ... bench: 109,594,481 ns/iter (+/- 581,839)
test range_inclusive ... bench: 108,711,065 ns/iter (+/- 502,620)

But the situation is muddied by adding any kind of optimization flag. Here's `codegen-units = 1`:

test range_exclusive ... bench: 109,135,628 ns/iter (+/- 556,099)
test range_inclusive ... bench: 160,119,738 ns/iter (+/- 180,715)

And here's lto = true:

test range_exclusive ... bench: 161,099,657 ns/iter (+/- 156,865)
test range_inclusive ... bench: 136,806,833 ns/iter (+/- 422,545)

Timings are similarly unstable with the latest stable, 1.56.

The basic result holds up across architectures too; here's the default release profile on an aarch64 laptop:

test range_exclusive ... bench: 429,999,840 ns/iter (+/- 8,341)
test range_inclusive ... bench: 388,264,759 ns/iter (+/- 7,984)

Overall I think different benchmarks are needed here. If anyone has complaints about inclusive-range performance, I would like to see a benchmark that demonstrates a consistent regression on a recent compiler.

All these were run on the current nightly (which has NewPM enabled by default):

binary: rustc
commit-hash: 91b931926fd49fc97d1e39f2b8206abf1d77ce7d
commit-date: 2021-10-23
host: x86_64-unknown-linux-gnu
release: 1.58.0-nightly
LLVM version: 13.0.0

Stargateur (Contributor) commented Dec 29, 2021

The following Stack Overflow question shows a big performance hit from using RangeInclusive:

From:

use num_integer::Roots;

fn main() {
    let v = 984067;

    for i in 1..=v {
        ramanujan(i)
    }
}

fn ramanujan(m: i32) {
    let maxcube = m.cbrt();
    let mut res1 = 0;
    let mut res2 = 0;
    let mut _res3 = 0;
    let mut _res4 = 0;
    
    for i in 1..=maxcube {
        for j in 1..=maxcube {
            if i * i * i + j * j * j == m {
                res1 = i;
                res2 = j;
                break;
            }
        }
    }
    
    for k in 1..=maxcube {
        for l in 1..=maxcube {
            if k == res1 || k == res2 || l == res1 || l == res2 {
                continue;
            }
            if k * k * k + l * l * l == m {
                _res3 = k;
                _res4 = l;
                break;
            }
        }
    }
}

To:

use num_integer::Roots;

fn main() {
    let v = 984067;
    for i in 1..=v {
        ramanujan(i)
    }
}

fn ramanujan(m: i32) {
    let maxcube = m.cbrt() + 1;
    let mut res1 = 0;
    let mut res2 = 0;
    let mut res3 = 0;
    let mut res4 = 0;
    
    for i in 1..maxcube {
        for j in 1..maxcube {
            if i * i * i + j * j * j == m {
                res1 = i;
                res2 = j;
                break;
            }
        }
    }
    
    for k in 1..maxcube {
        for l in 1..maxcube {
            if k == res1 || k == res2 || l == res1 || l == res2 {
                continue;
            }
            if k * k * k + l * l * l == m {
                res3 = k;
                res4 = l;
                break;
            }
        }
    }
}

Result:

rustc 1.57.0 (f1edd0429 2021-11-29)
From: 0.01s user 0.01s system 0% cpu 18.108 total
To: 0.00s user 0.01s system 0% cpu 3.549 total

rustc 1.59.0-nightly (efec54529 2021-12-04)
From: 0.01s user 0.00s system 0% cpu 17.993 total
To: 0.00s user 0.01s system 0% cpu 3.494 total

I don't know whether this is exactly related to this issue, but since this issue is more or less "problems with RangeInclusive perf", tell me if I should open a separate issue for it instead.

saethlin (Member) commented Dec 29, 2021

I minimized the above example into:

pub fn example(m: i32, max: i32) -> i32 {
    for i in 1..(max + 1) {
        if i * i == m { 
            return i;
        }   
    }   
    0   
}

pub fn example_inc(m: i32, max: i32) -> i32 {
    for i in 1..=max {
        if i * i == m { 
            return i;
        }   
    }   
    0   
}

Godbolt link: https://godbolt.org/z/nqTveYvzx
(updated shortly after posting to make the two functions semantically identical)

I see a 2x runtime difference between the two, using this program to execute them, commenting out the one I don't want:

#![feature(test)]
extern crate test;
use test::bench::black_box;

use num_integer::Roots;

fn main() {
    let v = 984067;

    for i in 1..=v {
        black_box(example(i, i.cbrt()));
    }

    for i in 1..=v {
        black_box(example_inc(i, i.cbrt()));
    }
}

Stargateur (Contributor) commented Dec 29, 2021

use std::ops::ControlFlow;

pub fn example_try_fold(m: i32, max: i32) -> i32 {
  match (1..=max).try_fold(0, |acc, i: i32| {
      if i * i == m { 
          ControlFlow::Break(i)
      }
      else {
        ControlFlow::Continue(acc)
      }
  }) {
      ControlFlow::Break(i) | ControlFlow::Continue(i) => i,
  }
}

produces more or less the same asm as your example().

leonardo-m (Author) commented:

LLVM is sometimes able to optimize well enough when there is a single inclusive loop. The real problem appears with two or three nested inclusive loops: in that case LLVM generates inefficient loops. So I've mostly banned inclusive loops from my Rust code.

jorgeperezlara commented:

I have experienced this first hand while writing a benchmark (see my Reddit post).

May I suggest an unsafe inclusive range for this kind of thing? I could use exclusive ranges, of course, but I'd want the option to manually disable these checks for the case where the integer maximum is reached (as with most of Rust's safety abstractions).

kennytm (Member) commented Jun 26, 2024

I think #123741 can fix this once and for all (by making `RangeInclusive::into_iter()` return a structure optimized for iteration). No need to introduce an unsafe range.
