The new step_by is too much slow #43064

leonardo-m · 2017-07-05T08:58:20Z

As I explained in #42534 the new step_by can't be used:

#![feature(iterator_step_by)]

#[inline(never)]
pub fn first_foo(mut a: i32, b: i32) -> i32 {
    let mut tot = 0;

    while a < b {
        tot += a;
        a += 3;
    }
    tot
}

#[inline(never)]
pub fn second_foo(x: std::ops::Range<i32>) -> i32 {
    let mut tot = 0;

    #[allow(deprecated)]
    for a in x.step_by(3) {
        tot += a;
    }
    tot
}

fn main() {
    println!("{}", first_foo(0, 1000));
    println!("{}", first_foo(0, 100000));
    println!("{}", second_foo(0 .. 1000));
    println!("{}", second_foo(0 .. 100000));
}

Compiling with:
rustc -O --emit asm test.rs

It gives:

_ZN4test9first_foo17he3a0bddb42457919E:
    testl   %ecx, %ecx
    jle .LBB0_1
    xorl    %edx, %edx
    xorl    %eax, %eax
    .p2align    4, 0x90
.LBB0_4:
    addl    %edx, %eax
    addl    $3, %edx
    cmpl    %ecx, %edx
    jl  .LBB0_4
    jmp .LBB0_2
.LBB0_1:
    xorl    %eax, %eax
.LBB0_2:
    retq

_ZN4test10second_foo17hd5d21559ba84e6d1E:
    movq    %rcx, %r9
    shrq    $32, %r9
    xorl    %r8d, %r8d
    xorl    %r10d, %r10d
    xorl    %eax, %eax
    testb   $1, %r10b
    je  .LBB1_11
    jmp .LBB1_2
    .p2align    4, 0x90
.LBB1_7:
    shrq    $32, %rdx
    addl    %eax, %edx
    movb    $1, %r10b
    movl    %edx, %eax
    testb   $1, %r10b
    je  .LBB1_11
.LBB1_2:
    cmpl    %r9d, %ecx
    jge .LBB1_5
    leal    1(%rcx), %edx
    cmpl    %r9d, %edx
    jge .LBB1_4
    leal    2(%rcx), %edx
    cmpl    %r9d, %edx
    jge .LBB1_9
    addl    $3, %ecx
    shlq    $32, %rdx
    movl    $1, %r10d
    jmp .LBB1_6
    .p2align    4, 0x90
.LBB1_11:
    movq    %rcx, %r10
    shlq    $32, %r10
    xorl    %edx, %edx
    cmpl    %r9d, %ecx
    setl    %dl
    cmovgeq %r8, %r10
    addl    %edx, %ecx
    jmp .LBB1_6
    .p2align    4, 0x90
.LBB1_4:
    movl    %edx, %ecx
    jmp .LBB1_5
.LBB1_9:
    movl    %edx, %ecx
    .p2align    4, 0x90
.LBB1_5:
    xorl    %edx, %edx
    xorl    %r10d, %r10d
.LBB1_6:
    orq %r10, %rdx
    testl   %edx, %edx
    jne .LBB1_7
    retq

In a function like second_foo() the old step_by gave something like:

_ZN5test210second_foo17h5b0eb3168418bdd1E:
    movq    %rcx, %rdx
    shrq    $32, %rdx
    xorl    %eax, %eax
    cmpl    %edx, %ecx
    jge .LBB1_3
    xorl    %eax, %eax
    .p2align    4, 0x90
.LBB1_2:
    addl    %ecx, %eax
    addl    $3, %ecx
    cmovol  %edx, %ecx ; what is this for?
    cmpl    %edx, %ecx
    jl  .LBB1_2
.LBB1_3:
    retq

The text was updated successfully, but these errors were encountered:

Implement O(1)-time Iterator::nth for Range*, and slim the Step trait Fixes #43064. Fixes part of #39975. Fixes items 1 <s>and 3</s> of #42168. CC #27741. I think #42310 and #43012 should not have landed without the `nth` part of this PR, but oh well.

leonardo-m · 2017-07-08T09:54:15Z

Dear bors, before closing down this issue you should show me that the asm after that pull request is good enough. I'll verify it myself tomorrow with the next Nightly.

kennytm · 2017-07-08T10:23:49Z

Technically this is closed by GitHub not bors 😄 Feel free to reopen if it is still not fixed.

Mark-Simulacrum · 2017-07-08T12:06:25Z

Reopening so we track verification.

leonardo-m · 2017-07-09T08:34:04Z

The asm now is:

_ZN4test10second_foo17h387627478d348617E:
	pushq	%r14
	pushq	%rsi
	pushq	%rdi
	pushq	%rbx
	subq	$40, %rsp
	movq	%rcx, %rsi
	movq	%rsi, %rbx
	shrq	$32, %rbx
	xorl	%r14d, %r14d
	xorl	%eax, %eax
	xorl	%edi, %edi
	testb	$1, %al
	jne	.LBB1_3
	jmp	.LBB1_2
	.p2align	4, 0x90
.LBB1_10:
	shrq	$32, %rcx
	addl	%edi, %ecx
	movb	$1, %al
	movl	%ecx, %edi
	testb	$1, %al
	je	.LBB1_2
.LBB1_3:
	movl	$2, %ecx
	callq	_ZN4core3num69_$LT$impl$u20$core..convert..TryFrom$LT$usize$GT$$u20$for$u20$u32$GT$8try_from17h98ee5a7abea3d10bE
	testl	%eax, %eax
	jne	.LBB1_4
	shrq	$32, %rax
	addl	%esi, %eax
	cmpl	%esi, %eax
	jge	.LBB1_6
.LBB1_4:
	movl	%ebx, %esi
	xorl	%ecx, %ecx
	xorl	%edx, %edx
.LBB1_7:
	orq	%rdx, %rcx
	testl	%ecx, %ecx
	jne	.LBB1_10
	jmp	.LBB1_9
	.p2align	4, 0x90
.LBB1_2:
	movq	%rsi, %rax
	shlq	$32, %rax
	xorl	%ecx, %ecx
	cmpl	%ebx, %esi
	setl	%cl
	cmovgeq	%r14, %rax
	addl	%ecx, %esi
	orq	%rax, %rcx
	testl	%ecx, %ecx
	jne	.LBB1_10
	jmp	.LBB1_9
.LBB1_6:
	movq	%rax, %rcx
	shlq	$32, %rcx
	leal	1(%rax), %esi
	xorl	%edx, %edx
	cmpl	%ebx, %eax
	setl	%dl
	cmovgel	%ebx, %esi
	cmovgeq	%r14, %rcx
	jmp	.LBB1_7
.LBB1_9:
	movl	%edi, %eax
	addq	$40, %rsp
	popq	%rbx
	popq	%rdi
	popq	%rsi
	popq	%r14
	retq

Ti me it looks still too much long. And apparently there's a call to a TryFrom in the middle of the loop...

In some of my code I see a further 10% performance decrease after the recent performance decrease caused by allocators.

SimonSapin · 2017-07-09T09:48:38Z

I pretty much can’t read assembly. Is either the before or after code O(n) for step_by(n).next()?

The call to try_from is because StepBy::next calls nth on the underlying iterator with a usize argument. In the case of Range<i32>, nth will call <i32 as Step>::add_usize which will first try to convert usize to u32 while checking for overflow.

I have more changes to this code in #43127

leonardo-m · 2017-07-09T10:01:03Z

Is either the before or after code O(n) for step_by(n).next()?

I think it's now O(1), but the asm is a bit of a mess.

step_by() is part of the basics of for loops in Rust, a system language. So before pull requests about it got merged, someone that knows assembly should carefully review the code.

In the case of Range, nth will call ::add_usize which will first try to convert usize to u32 while checking for overflow.

So having only usize as step_by argument causes problems. Perhaps it's not a good idea. step_by() is meant to be used everywhere, its inner loop will run trillions of times. Perhaps all changes to step_by of the last few weeks should be reverted, and merged again only if and when they are acceptable.

SimonSapin · 2017-07-09T10:03:59Z

before pull requests about it got merged

Doing this kind of experiment and finding problems is the point of having a Nightly channel.

oyvindln · 2017-07-10T13:47:41Z

Perhaps all changes to step_by of the last few weeks should be reverted, and merged again only if and when they are acceptable.

This looks like it's an issue with TryFrom/TryInto rather than step_by itself.

EDIT: The main difference seems to be the overflow checks.

oyvindln · 2017-07-12T20:56:22Z

In addition to the overflow checks, there is a check in the next() function of the StepBy iterator, which checks if any items have been taken from the iterator. This wasn't present in the old version, but is presumably needed in the current (generic) implementation.

Additionally, there is a call to try_from that isn't optimized out, but #43194 should help with that.

The overflow checks are probably the main issue though, so having a specialisation of this trait for ranges would probably be helpful.

oyvindln · 2017-08-09T21:14:03Z

I had a go at adding specializations for the primitive types, however, that triggered #36262, which made the compiler treat range syntax without type annotations (e.g 0...50) as always being i32 rather than inferring the type.

I Also tried to specialize using the Step trait, however I didn't manage to it to work as adding default to type Item = I::Item; caused the compiler to complain about the return type of next and the result of the inner Iterator::next() being different. I'm not sure if I hit a bug, or if i was simply doing something wrong. It's possible that doing it through a trait may also hit #36262, but I didn't get that far.

leonardo-m · 2017-08-13T13:49:16Z

To better show the effects of the new step_by here you can find a test program (that solves two different Euler problems) that uses step_by (run-time 4.40 seconds):
https://gist.github.com/anonymous/05cae75cebb01d452e585effad1e01b8

And the same code with step_by replaced by while loops (run-time 3.81 seconds):
https://gist.github.com/anonymous/9db9fde7fe853c56760f8ef80f037fc3

bluss · 2018-01-07T00:37:54Z

@oyvindln Nice experiment (I'd love to see the code). I think the associated type defaults issue is expected, and it seems best to avoid using associated type specialization for now, even in std.

oyvindln · 2018-01-09T22:24:41Z

@bluss
I had something left over from the first attempt that(specializing range): https://gist.github.com/oyvindln/a543f0d62da990c18a3575950c39ada1

Didn't find any code from trying to specialize using the step trait.

Specialize StepBy<Range(Inclusive)> Part of #51557, related to #43064, #31155 As discussed in the above issues, `step_by` optimizes very badly on ranges which is related to 1. the special casing of the first `StepBy::next()` call 2. the need to do 2 additions of `n - 1` and `1` inside the range's `next()` This PR eliminates both by overriding `next()` to always produce the current element and also step ahead by `n` elements in one go. The generated code is much better, even identical in the case of a `Range` with constant `start` and `end` where `start+step` can't overflow. Without constant bounds it's a bit longer than the manual loop. `RangeInclusive` doesn't optimize as nicely but is still much better than the original asm. Unsigned integers optimize better than signed ones for some reason. See the following two links for a comparison. [godbolt: specialization for ..](https://godbolt.org/g/haHLJr) [godbolt: specialization for ..=](https://godbolt.org/g/ewyMu6) `RangeFrom`, the only other range with an `Iterator` implementation can't be specialized like this without changing behaviour due to overflow. There is no way to save "finished-ness". The approach can not be used in general, because it would produce side effects of the underlying iterator too early. May obsolete #51435, haven't checked.

leonardo-m · 2018-06-25T09:05:02Z

Since few days the step_by is fast enough for my usages, so I close this issue down (I still don't like the design of the new step_by that only works with usize steps, but I guess I can't change that). I leave Issue #45222 open because closed intervals are still too much slow.

leonardo-m mentioned this issue Jul 5, 2017

Delete deprecated & unstable range-specific step_by #43012

Merged

bstrie added I-slow Issue: Problems and improvements with respect to performance of generated code. T-libs-api Relevant to the library API team, which will review and decide on the PR/issue. labels Jul 5, 2017

leonardo-m mentioned this issue Jul 5, 2017

step_by() API change #43063

Closed

SimonSapin mentioned this issue Jul 5, 2017

Implement O(1)-time Iterator::nth for Range*, and slim the Step trait #43077

Merged

bors closed this as completed in #43077 Jul 8, 2017

Mark-Simulacrum reopened this Jul 8, 2017

SimonSapin mentioned this issue Jul 8, 2017

Redesign the std::iter::Step trait, tweak related iterator impls for ranges #43127

Closed

oyvindln mentioned this issue Jul 10, 2017

Tracking issue for TryFrom/TryInto traits #33417

Closed

3 tasks

oyvindln mentioned this issue Jul 12, 2017

Add inline annotations and avoid 128-bit where it's not needed in TryFrom + tests for xsize/x128 #43194

Closed

est31 mentioned this issue Jul 21, 2017

RFC: label-break-value rust-lang/rfcs#2046

Merged

Mark-Simulacrum added the C-enhancement Category: An issue proposing an enhancement or a PR with one. label Jul 26, 2017

quininer mentioned this issue Nov 21, 2017

twice slower then cnewhope quininer/newhope#2

Closed

leonardo-m mentioned this issue Jan 6, 2018

Tracking issue for step_by stabilization #27741

Closed

Emerentius mentioned this issue Jun 16, 2018

Specialize StepBy<Range(Inclusive)> #51601

Merged

leonardo-m closed this as completed Jun 25, 2018

Emerentius mentioned this issue Jul 5, 2018

Allow Pre-Stepping in Step-By? #52065

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

The new step_by is too much slow #43064

The new step_by is too much slow #43064

leonardo-m commented Jul 5, 2017

leonardo-m commented Jul 8, 2017

kennytm commented Jul 8, 2017

Mark-Simulacrum commented Jul 8, 2017

leonardo-m commented Jul 9, 2017 •

edited

Loading

SimonSapin commented Jul 9, 2017

leonardo-m commented Jul 9, 2017 •

edited

Loading

SimonSapin commented Jul 9, 2017

oyvindln commented Jul 10, 2017 •

edited

Loading

oyvindln commented Jul 12, 2017

oyvindln commented Aug 9, 2017

leonardo-m commented Aug 13, 2017

bluss commented Jan 7, 2018 •

edited

Loading

oyvindln commented Jan 9, 2018

leonardo-m commented Jun 25, 2018 •

edited

Loading

The new step_by is too much slow #43064

The new step_by is too much slow #43064

Comments

leonardo-m commented Jul 5, 2017

leonardo-m commented Jul 8, 2017

kennytm commented Jul 8, 2017

Mark-Simulacrum commented Jul 8, 2017

leonardo-m commented Jul 9, 2017 • edited Loading

SimonSapin commented Jul 9, 2017

leonardo-m commented Jul 9, 2017 • edited Loading

SimonSapin commented Jul 9, 2017

oyvindln commented Jul 10, 2017 • edited Loading

oyvindln commented Jul 12, 2017

oyvindln commented Aug 9, 2017

leonardo-m commented Aug 13, 2017

bluss commented Jan 7, 2018 • edited Loading

oyvindln commented Jan 9, 2018

leonardo-m commented Jun 25, 2018 • edited Loading

leonardo-m commented Jul 9, 2017 •

edited

Loading

leonardo-m commented Jul 9, 2017 •

edited

Loading

oyvindln commented Jul 10, 2017 •

edited

Loading

bluss commented Jan 7, 2018 •

edited

Loading

leonardo-m commented Jun 25, 2018 •

edited

Loading