Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

The new step_by is too much slow #43064

Closed
leonardo-m opened this issue Jul 5, 2017 · 14 comments
Closed

The new step_by is too much slow #43064

leonardo-m opened this issue Jul 5, 2017 · 14 comments
Labels
C-enhancement Category: An issue proposing an enhancement or a PR with one. I-slow Issue: Problems and improvements with respect to performance of generated code. T-libs-api Relevant to the library API team, which will review and decide on the PR/issue.

Comments

@leonardo-m
Copy link

As I explained in #42534 the new step_by can't be used:

#![feature(iterator_step_by)]

#[inline(never)]
pub fn first_foo(mut a: i32, b: i32) -> i32 {
    let mut tot = 0;

    while a < b {
        tot += a;
        a += 3;
    }
    tot
}

#[inline(never)]
pub fn second_foo(x: std::ops::Range<i32>) -> i32 {
    let mut tot = 0;

    #[allow(deprecated)]
    for a in x.step_by(3) {
        tot += a;
    }
    tot
}

fn main() {
    println!("{}", first_foo(0, 1000));
    println!("{}", first_foo(0, 100000));
    println!("{}", second_foo(0 .. 1000));
    println!("{}", second_foo(0 .. 100000));
}

Compiling with:
rustc -O --emit asm test.rs

It gives:

_ZN4test9first_foo17he3a0bddb42457919E:
    testl   %ecx, %ecx
    jle .LBB0_1
    xorl    %edx, %edx
    xorl    %eax, %eax
    .p2align    4, 0x90
.LBB0_4:
    addl    %edx, %eax
    addl    $3, %edx
    cmpl    %ecx, %edx
    jl  .LBB0_4
    jmp .LBB0_2
.LBB0_1:
    xorl    %eax, %eax
.LBB0_2:
    retq

_ZN4test10second_foo17hd5d21559ba84e6d1E:
    movq    %rcx, %r9
    shrq    $32, %r9
    xorl    %r8d, %r8d
    xorl    %r10d, %r10d
    xorl    %eax, %eax
    testb   $1, %r10b
    je  .LBB1_11
    jmp .LBB1_2
    .p2align    4, 0x90
.LBB1_7:
    shrq    $32, %rdx
    addl    %eax, %edx
    movb    $1, %r10b
    movl    %edx, %eax
    testb   $1, %r10b
    je  .LBB1_11
.LBB1_2:
    cmpl    %r9d, %ecx
    jge .LBB1_5
    leal    1(%rcx), %edx
    cmpl    %r9d, %edx
    jge .LBB1_4
    leal    2(%rcx), %edx
    cmpl    %r9d, %edx
    jge .LBB1_9
    addl    $3, %ecx
    shlq    $32, %rdx
    movl    $1, %r10d
    jmp .LBB1_6
    .p2align    4, 0x90
.LBB1_11:
    movq    %rcx, %r10
    shlq    $32, %r10
    xorl    %edx, %edx
    cmpl    %r9d, %ecx
    setl    %dl
    cmovgeq %r8, %r10
    addl    %edx, %ecx
    jmp .LBB1_6
    .p2align    4, 0x90
.LBB1_4:
    movl    %edx, %ecx
    jmp .LBB1_5
.LBB1_9:
    movl    %edx, %ecx
    .p2align    4, 0x90
.LBB1_5:
    xorl    %edx, %edx
    xorl    %r10d, %r10d
.LBB1_6:
    orq %r10, %rdx
    testl   %edx, %edx
    jne .LBB1_7
    retq

In a function like second_foo() the old step_by gave something like:

_ZN5test210second_foo17h5b0eb3168418bdd1E:
    movq    %rcx, %rdx
    shrq    $32, %rdx
    xorl    %eax, %eax
    cmpl    %edx, %ecx
    jge .LBB1_3
    xorl    %eax, %eax
    .p2align    4, 0x90
.LBB1_2:
    addl    %ecx, %eax
    addl    $3, %ecx
    cmovol  %edx, %ecx ; what is this for?
    cmpl    %edx, %ecx
    jl  .LBB1_2
.LBB1_3:
    retq
@bstrie bstrie added I-slow Issue: Problems and improvements with respect to performance of generated code. T-libs-api Relevant to the library API team, which will review and decide on the PR/issue. labels Jul 5, 2017
bors added a commit that referenced this issue Jul 8, 2017
Implement O(1)-time Iterator::nth for Range*, and slim the Step trait

Fixes #43064.
Fixes part of #39975.
Fixes items 1 <s>and 3</s> of #42168.
CC #27741.

I think #42310 and #43012 should not have landed without the `nth` part of this PR, but oh well.
@leonardo-m
Copy link
Author

Dear bors, before closing down this issue you should show me that the asm after that pull request is good enough. I'll verify it myself tomorrow with the next Nightly.

@kennytm
Copy link
Member

kennytm commented Jul 8, 2017

Technically this is closed by GitHub not bors 😄 Feel free to reopen if it is still not fixed.

@Mark-Simulacrum
Copy link
Member

Reopening so we track verification.

@leonardo-m
Copy link
Author

leonardo-m commented Jul 9, 2017

The asm now is:

_ZN4test10second_foo17h387627478d348617E:
	pushq	%r14
	pushq	%rsi
	pushq	%rdi
	pushq	%rbx
	subq	$40, %rsp
	movq	%rcx, %rsi
	movq	%rsi, %rbx
	shrq	$32, %rbx
	xorl	%r14d, %r14d
	xorl	%eax, %eax
	xorl	%edi, %edi
	testb	$1, %al
	jne	.LBB1_3
	jmp	.LBB1_2
	.p2align	4, 0x90
.LBB1_10:
	shrq	$32, %rcx
	addl	%edi, %ecx
	movb	$1, %al
	movl	%ecx, %edi
	testb	$1, %al
	je	.LBB1_2
.LBB1_3:
	movl	$2, %ecx
	callq	_ZN4core3num69_$LT$impl$u20$core..convert..TryFrom$LT$usize$GT$$u20$for$u20$u32$GT$8try_from17h98ee5a7abea3d10bE
	testl	%eax, %eax
	jne	.LBB1_4
	shrq	$32, %rax
	addl	%esi, %eax
	cmpl	%esi, %eax
	jge	.LBB1_6
.LBB1_4:
	movl	%ebx, %esi
	xorl	%ecx, %ecx
	xorl	%edx, %edx
.LBB1_7:
	orq	%rdx, %rcx
	testl	%ecx, %ecx
	jne	.LBB1_10
	jmp	.LBB1_9
	.p2align	4, 0x90
.LBB1_2:
	movq	%rsi, %rax
	shlq	$32, %rax
	xorl	%ecx, %ecx
	cmpl	%ebx, %esi
	setl	%cl
	cmovgeq	%r14, %rax
	addl	%ecx, %esi
	orq	%rax, %rcx
	testl	%ecx, %ecx
	jne	.LBB1_10
	jmp	.LBB1_9
.LBB1_6:
	movq	%rax, %rcx
	shlq	$32, %rcx
	leal	1(%rax), %esi
	xorl	%edx, %edx
	cmpl	%ebx, %eax
	setl	%dl
	cmovgel	%ebx, %esi
	cmovgeq	%r14, %rcx
	jmp	.LBB1_7
.LBB1_9:
	movl	%edi, %eax
	addq	$40, %rsp
	popq	%rbx
	popq	%rdi
	popq	%rsi
	popq	%r14
	retq

Ti me it looks still too much long. And apparently there's a call to a TryFrom in the middle of the loop...

In some of my code I see a further 10% performance decrease after the recent performance decrease caused by allocators.

@SimonSapin
Copy link
Contributor

I pretty much can’t read assembly. Is either the before or after code O(n) for step_by(n).next()?

The call to try_from is because StepBy::next calls nth on the underlying iterator with a usize argument. In the case of Range<i32>, nth will call <i32 as Step>::add_usize which will first try to convert usize to u32 while checking for overflow.

I have more changes to this code in #43127

@leonardo-m
Copy link
Author

leonardo-m commented Jul 9, 2017

Is either the before or after code O(n) for step_by(n).next()?

I think it's now O(1), but the asm is a bit of a mess.

step_by() is part of the basics of for loops in Rust, a system language. So before pull requests about it got merged, someone that knows assembly should carefully review the code.

In the case of Range, nth will call ::add_usize which will first try to convert usize to u32 while checking for overflow.

So having only usize as step_by argument causes problems. Perhaps it's not a good idea. step_by() is meant to be used everywhere, its inner loop will run trillions of times. Perhaps all changes to step_by of the last few weeks should be reverted, and merged again only if and when they are acceptable.

@SimonSapin
Copy link
Contributor

before pull requests about it got merged

Doing this kind of experiment and finding problems is the point of having a Nightly channel.

@oyvindln
Copy link
Contributor

oyvindln commented Jul 10, 2017

Perhaps all changes to step_by of the last few weeks should be reverted, and merged again only if and when they are acceptable.

This looks like it's an issue with TryFrom/TryInto rather than step_by itself.

EDIT: The main difference seems to be the overflow checks.

@oyvindln
Copy link
Contributor

In addition to the overflow checks, there is a check in the next() function of the StepBy iterator, which checks if any items have been taken from the iterator. This wasn't present in the old version, but is presumably needed in the current (generic) implementation.

Additionally, there is a call to try_from that isn't optimized out, but #43194 should help with that.

The overflow checks are probably the main issue though, so having a specialisation of this trait for ranges would probably be helpful.

@Mark-Simulacrum Mark-Simulacrum added the C-enhancement Category: An issue proposing an enhancement or a PR with one. label Jul 26, 2017
@oyvindln
Copy link
Contributor

oyvindln commented Aug 9, 2017

I had a go at adding specializations for the primitive types, however, that triggered #36262, which made the compiler treat range syntax without type annotations (e.g 0...50) as always being i32 rather than inferring the type.

I Also tried to specialize using the Step trait, however I didn't manage to it to work as adding default to type Item = I::Item; caused the compiler to complain about the return type of next and the result of the inner Iterator::next() being different. I'm not sure if I hit a bug, or if i was simply doing something wrong. It's possible that doing it through a trait may also hit #36262, but I didn't get that far.

@leonardo-m
Copy link
Author

To better show the effects of the new step_by here you can find a test program (that solves two different Euler problems) that uses step_by (run-time 4.40 seconds):
https://gist.github.com/anonymous/05cae75cebb01d452e585effad1e01b8

And the same code with step_by replaced by while loops (run-time 3.81 seconds):
https://gist.github.com/anonymous/9db9fde7fe853c56760f8ef80f037fc3

@bluss
Copy link
Member

bluss commented Jan 7, 2018

@oyvindln Nice experiment (I'd love to see the code). I think the associated type defaults issue is expected, and it seems best to avoid using associated type specialization for now, even in std.

@oyvindln
Copy link
Contributor

oyvindln commented Jan 9, 2018

@bluss
I had something left over from the first attempt that(specializing range): https://gist.github.com/oyvindln/a543f0d62da990c18a3575950c39ada1

Didn't find any code from trying to specialize using the step trait.

bors added a commit that referenced this issue Jun 21, 2018
Specialize StepBy<Range(Inclusive)>

Part of #51557, related to #43064, #31155

As discussed in the above issues, `step_by` optimizes very badly on ranges which is related to
1. the special casing of the first `StepBy::next()` call
2. the need to do 2 additions of `n - 1` and `1` inside the range's `next()`

This PR eliminates both by overriding `next()` to always produce the current element and also step ahead by `n` elements in one go. The generated code is much better, even identical in the case of a `Range` with constant `start` and `end` where `start+step` can't overflow. Without constant bounds it's a bit longer than the manual loop. `RangeInclusive` doesn't optimize as nicely but is still much better than the original asm.
Unsigned integers optimize better than signed ones for some reason.

See the following two links for a comparison.

[godbolt: specialization for ..](https://godbolt.org/g/haHLJr)
[godbolt: specialization for ..=](https://godbolt.org/g/ewyMu6)

`RangeFrom`, the only other range with an `Iterator` implementation can't be specialized like this without changing behaviour due to overflow. There is no way to save "finished-ness".

The approach can not be used in general, because it would produce side effects of the underlying iterator too early.

May obsolete #51435, haven't checked.
@leonardo-m
Copy link
Author

leonardo-m commented Jun 25, 2018

Since few days the step_by is fast enough for my usages, so I close this issue down (I still don't like the design of the new step_by that only works with usize steps, but I guess I can't change that). I leave Issue #45222 open because closed intervals are still too much slow.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
C-enhancement Category: An issue proposing an enhancement or a PR with one. I-slow Issue: Problems and improvements with respect to performance of generated code. T-libs-api Relevant to the library API team, which will review and decide on the PR/issue.
Projects
None yet
Development

No branches or pull requests

7 participants