-
Notifications
You must be signed in to change notification settings - Fork 13.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve performance of rem_euclid()
for signed integers
#103913
Conversation
such code is copy from https://github.com/rust-lang/rust/blob/master/library/std/src/f32.rs and https://github.com/rust-lang/rust/blob/master/library/std/src/f64.rs using r+rhs.abs() is faster than calc it directly. Bench result: ``` $ cargo bench Compiling div-euclid v0.1.0 (/me/div-euclid) Finished bench [optimized] target(s) in 1.01s Running unittests src/lib.rs (target/release/deps/div_euclid-7a4530ca7817d1ef) running 7 tests test tests::it_works ... ignored test tests::bench_aaabs ... bench: 10,498,793 ns/iter (+/- 104,360) test tests::bench_aadefault ... bench: 11,061,862 ns/iter (+/- 94,107) test tests::bench_abs ... bench: 10,477,193 ns/iter (+/- 81,942) test tests::bench_default ... bench: 10,622,983 ns/iter (+/- 25,119) test tests::bench_zzabs ... bench: 10,481,971 ns/iter (+/- 43,787) test tests::bench_zzdefault ... bench: 11,074,976 ns/iter (+/- 29,633) test result: ok. 0 passed; 0 failed; 1 ignored; 6 measured; 0 filtered out; finished in 19.35s ``` bench code: ``` #![feature(test)] extern crate test; fn rem_euclid(a:i32,rhs:i32)->i32{ let r = a % rhs; if r < 0 { r + rhs.abs() } else { r } } #[cfg(test)] mod tests { use super::*; use test::Bencher; use rand::prelude::*; use rand::rngs::SmallRng; const N:i32=1000; #[test] fn it_works() { let a: i32 = 7; // or any other integer type let b = 4; let d:Vec<i32>=(-N..=N).collect(); let n:Vec<i32>=(-N..0).chain(1..=N).collect(); for i in &d { for j in &n { assert_eq!(i.rem_euclid(*j),rem_euclid(*i,*j)); } } assert_eq!(rem_euclid(a,b), 3); assert_eq!(rem_euclid(-a,b), 1); assert_eq!(rem_euclid(a,-b), 3); assert_eq!(rem_euclid(-a,-b), 1); } #[bench] fn bench_aaabs(b: &mut Bencher) { let mut d:Vec<i32>=(-N..=N).collect(); let mut n:Vec<i32>=(-N..0).chain(1..=N).collect(); let mut rng=SmallRng::from_seed([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,21]); n.shuffle(&mut rng); d.shuffle(&mut rng); n.shuffle(&mut rng); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=rem_euclid(*i,*j); } } res }); } #[bench] fn bench_aadefault(b: &mut Bencher) { let mut d:Vec<i32>=(-N..=N).collect(); let mut n:Vec<i32>=(-N..0).chain(1..=N).collect(); let mut rng=SmallRng::from_seed([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,21]); n.shuffle(&mut rng); d.shuffle(&mut rng); n.shuffle(&mut rng); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=i.rem_euclid(*j); } } res }); } #[bench] fn bench_abs(b: &mut Bencher) { let d:Vec<i32>=(-N..=N).collect(); let n:Vec<i32>=(-N..0).chain(1..=N).collect(); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=rem_euclid(*i,*j); } } res }); } #[bench] fn bench_default(b: &mut Bencher) { let d:Vec<i32>=(-N..=N).collect(); let n:Vec<i32>=(-N..0).chain(1..=N).collect(); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=i.rem_euclid(*j); } } res }); } #[bench] fn bench_zzabs(b: &mut Bencher) { let mut d:Vec<i32>=(-N..=N).collect(); let mut n:Vec<i32>=(-N..0).chain(1..=N).collect(); let mut rng=SmallRng::from_seed([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,21]); d.shuffle(&mut rng); n.shuffle(&mut rng); d.shuffle(&mut rng); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=rem_euclid(*i,*j); } } res }); } #[bench] fn bench_zzdefault(b: &mut Bencher) { let mut d:Vec<i32>=(-N..=N).collect(); let mut n:Vec<i32>=(-N..0).chain(1..=N).collect(); let mut rng=SmallRng::from_seed([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,21]); d.shuffle(&mut rng); n.shuffle(&mut rng); d.shuffle(&mut rng); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=i.rem_euclid(*j); } } res }); } } ```
r? @thomcc (rustbot has picked a reviewer for you, use r? to override) |
Hey! It looks like you've submitted a new PR for the library teams! If this PR contains changes to any Examples of
|
This comment has been minimized.
This comment has been minimized.
benchmark result: ``` $ cargo bench Compiling div-euclid v0.1.0 (/me/div-euclid) Finished bench [optimized] target(s) in 1.01s Running unittests src/lib.rs (target/release/deps/div_euclid-7a4530ca7817d1ef) running 7 tests test tests::it_works ... ignored test tests::bench_aaabs ... bench: 10,498,793 ns/iter (+/- 104,360) test tests::bench_aadefault ... bench: 11,061,862 ns/iter (+/- 94,107) test tests::bench_abs ... bench: 10,477,193 ns/iter (+/- 81,942) test tests::bench_default ... bench: 10,622,983 ns/iter (+/- 25,119) test tests::bench_zzabs ... bench: 10,481,971 ns/iter (+/- 43,787) test tests::bench_zzdefault ... bench: 11,074,976 ns/iter (+/- 29,633) test result: ok. 0 passed; 0 failed; 1 ignored; 6 measured; 0 filtered out; finished in 19.35s ``` benchmark code: ```rust #![feature(test)] extern crate test; #[inline(always)] fn rem_euclid(a:i32,rhs:i32)->i32{ let r = a % rhs; if r < 0 { // if rhs is `integer::MIN`, rhs.wrapping_abs() == rhs.wrapping_abs, // thus r.wrapping_add(rhs.wrapping_abs()) == r.wrapping_add(rhs) == r - rhs, // which suits our need. // otherwise, rhs.wrapping_abs() == -rhs, which won't overflow since r is negative. r.wrapping_add(rhs.wrapping_abs()) } else { r } } #[cfg(test)] mod tests { use super::*; use test::Bencher; use rand::prelude::*; use rand::rngs::SmallRng; const N:i32=1000; #[test] fn it_works() { let a: i32 = 7; // or any other integer type let b = 4; let d:Vec<i32>=(-N..=N).collect(); let n:Vec<i32>=(-N..0).chain(1..=N).collect(); for i in &d { for j in &n { assert_eq!(i.rem_euclid(*j),rem_euclid(*i,*j)); } } assert_eq!(rem_euclid(a,b), 3); assert_eq!(rem_euclid(-a,b), 1); assert_eq!(rem_euclid(a,-b), 3); assert_eq!(rem_euclid(-a,-b), 1); } #[bench] fn bench_aaabs(b: &mut Bencher) { let mut d:Vec<i32>=(-N..=N).collect(); let mut n:Vec<i32>=(-N..0).chain(1..=N).collect(); let mut rng=SmallRng::from_seed([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,21]); n.shuffle(&mut rng); d.shuffle(&mut rng); n.shuffle(&mut rng); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=rem_euclid(*i,*j); } } res }); } #[bench] fn bench_aadefault(b: &mut Bencher) { let mut d:Vec<i32>=(-N..=N).collect(); let mut n:Vec<i32>=(-N..0).chain(1..=N).collect(); let mut rng=SmallRng::from_seed([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,21]); n.shuffle(&mut rng); d.shuffle(&mut rng); n.shuffle(&mut rng); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=i.rem_euclid(*j); } } res }); } #[bench] fn bench_abs(b: &mut Bencher) { let d:Vec<i32>=(-N..=N).collect(); let n:Vec<i32>=(-N..0).chain(1..=N).collect(); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=rem_euclid(*i,*j); } } res }); } #[bench] fn bench_default(b: &mut Bencher) { let d:Vec<i32>=(-N..=N).collect(); let n:Vec<i32>=(-N..0).chain(1..=N).collect(); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=i.rem_euclid(*j); } } res }); } #[bench] fn bench_zzabs(b: &mut Bencher) { let mut d:Vec<i32>=(-N..=N).collect(); let mut n:Vec<i32>=(-N..0).chain(1..=N).collect(); let mut rng=SmallRng::from_seed([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,21]); d.shuffle(&mut rng); n.shuffle(&mut rng); d.shuffle(&mut rng); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=rem_euclid(*i,*j); } } res }); } #[bench] fn bench_zzdefault(b: &mut Bencher) { let mut d:Vec<i32>=(-N..=N).collect(); let mut n:Vec<i32>=(-N..0).chain(1..=N).collect(); let mut rng=SmallRng::from_seed([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,21]); d.shuffle(&mut rng); n.shuffle(&mut rng); d.shuffle(&mut rng); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=i.rem_euclid(*j); } } res }); } } ```
div-euclid
for int
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks good and is definitely a correct change, but i think the comment needs work. I gave a suggestion which you can take or not.
div-euclid
for intrem_euclid()
for signed integers
I was not able to reproduce these benchmark numbers locally. I actually found your version consistently slightly slower. Note that In your benchmark, the outer loop has the same value every iteration -- when checking the version that is hard on the branch predictor, you should be sure to shuffle the groups of fn samples() -> Vec<(i32, i32)> {
let n = 8000;
let mut all = (-n..=n)
.flat_map(|n| (-n..=n).map(move |d| (n, d)))
.filter(|(n, d)| n.checked_rem(*d).is_some())
.collect::<Vec<_>>();
let mut r = SmallRng::from_seed(SEED);
all.shuffle(&mut r);
all
} @rustbot author |
Actually, this is a more decisive win on x86, although it's still slight. Hmm. |
My benchmark seems around ~5% slower on aarch64, but has smaller code size. On x86_64 it's 10% faster with much cleaner/smaller assembly and one fewer potentially-unpredictable branch. I think we should probably take it. Please update the comment, though. I'll trigger a perf run too, just in case this happens to be used in some weird data structure in the compiler. @bors try @rust-timer queue |
Awaiting bors try build completion. @rustbot label: +S-waiting-on-perf |
⌛ Trying commit aafe6db with merge 4e489a0c24dfe15e921c177ef0573acc977c8954... |
☀️ Try build successful - checks-actions |
Queued 4e489a0c24dfe15e921c177ef0573acc977c8954 with parent 6718ea1, future comparison URL. |
The comment is updated. |
Finished benchmarking commit (4e489a0c24dfe15e921c177ef0573acc977c8954): comparison URL. Overall result: no relevant changes - no action neededBenchmarking this pull request likely means that it is perf-sensitive, so we're automatically marking it as not fit for rolling up. While you can manually mark this PR as fit for rollup, we strongly recommend not doing so since this PR may lead to changes in compiler perf. @bors rollup=never Instruction countThis benchmark run did not return any relevant results for this metric. Max RSS (memory usage)This benchmark run did not return any relevant results for this metric. CyclesThis benchmark run did not return any relevant results for this metric. |
@bors r+ |
☀️ Test successful - checks-actions |
Finished benchmarking commit (6284998): comparison URL. Overall result: no relevant changes - no action needed@rustbot label: -perf-regression Instruction countThis benchmark run did not return any relevant results for this metric. Max RSS (memory usage)ResultsThis is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.
CyclesThis benchmark run did not return any relevant results for this metric. |
Improve performance of `rem_euclid()` for signed integers such code is copy from https://github.com/rust-lang/rust/blob/master/library/std/src/f32.rs and https://github.com/rust-lang/rust/blob/master/library/std/src/f64.rs using `r+rhs.abs()` is faster than calc it with an if clause. Bench result: ``` $ cargo bench Compiling div-euclid v0.1.0 (/me/div-euclid) Finished bench [optimized] target(s) in 1.01s Running unittests src/lib.rs (target/release/deps/div_euclid-7a4530ca7817d1ef) running 7 tests test tests::it_works ... ignored test tests::bench_aaabs ... bench: 10,498,793 ns/iter (+/- 104,360) test tests::bench_aadefault ... bench: 11,061,862 ns/iter (+/- 94,107) test tests::bench_abs ... bench: 10,477,193 ns/iter (+/- 81,942) test tests::bench_default ... bench: 10,622,983 ns/iter (+/- 25,119) test tests::bench_zzabs ... bench: 10,481,971 ns/iter (+/- 43,787) test tests::bench_zzdefault ... bench: 11,074,976 ns/iter (+/- 29,633) test result: ok. 0 passed; 0 failed; 1 ignored; 6 measured; 0 filtered out; finished in 19.35s ``` It seems that, default `rem_euclid` triggered a branch prediction, thus `bench_default` is faster than `bench_aadefault` and `bench_aadefault`, which shuffles the order of calculations. but all of them slower than what it was in `f64`'s and `f32`'s `rem_euclid`, thus I submit this PR. bench code: ```rust #![feature(test)] extern crate test; fn rem_euclid(a:i32,rhs:i32)->i32{ let r = a % rhs; if r < 0 { r + rhs.abs() } else { r } } #[cfg(test)] mod tests { use super::*; use test::Bencher; use rand::prelude::*; use rand::rngs::SmallRng; const N:i32=1000; #[test] fn it_works() { let a: i32 = 7; // or any other integer type let b = 4; let d:Vec<i32>=(-N..=N).collect(); let n:Vec<i32>=(-N..0).chain(1..=N).collect(); for i in &d { for j in &n { assert_eq!(i.rem_euclid(*j),rem_euclid(*i,*j)); } } assert_eq!(rem_euclid(a,b), 3); assert_eq!(rem_euclid(-a,b), 1); assert_eq!(rem_euclid(a,-b), 3); assert_eq!(rem_euclid(-a,-b), 1); } #[bench] fn bench_aaabs(b: &mut Bencher) { let mut d:Vec<i32>=(-N..=N).collect(); let mut n:Vec<i32>=(-N..0).chain(1..=N).collect(); let mut rng=SmallRng::from_seed([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,21]); n.shuffle(&mut rng); d.shuffle(&mut rng); n.shuffle(&mut rng); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=rem_euclid(*i,*j); } } res }); } #[bench] fn bench_aadefault(b: &mut Bencher) { let mut d:Vec<i32>=(-N..=N).collect(); let mut n:Vec<i32>=(-N..0).chain(1..=N).collect(); let mut rng=SmallRng::from_seed([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,21]); n.shuffle(&mut rng); d.shuffle(&mut rng); n.shuffle(&mut rng); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=i.rem_euclid(*j); } } res }); } #[bench] fn bench_abs(b: &mut Bencher) { let d:Vec<i32>=(-N..=N).collect(); let n:Vec<i32>=(-N..0).chain(1..=N).collect(); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=rem_euclid(*i,*j); } } res }); } #[bench] fn bench_default(b: &mut Bencher) { let d:Vec<i32>=(-N..=N).collect(); let n:Vec<i32>=(-N..0).chain(1..=N).collect(); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=i.rem_euclid(*j); } } res }); } #[bench] fn bench_zzabs(b: &mut Bencher) { let mut d:Vec<i32>=(-N..=N).collect(); let mut n:Vec<i32>=(-N..0).chain(1..=N).collect(); let mut rng=SmallRng::from_seed([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,21]); d.shuffle(&mut rng); n.shuffle(&mut rng); d.shuffle(&mut rng); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=rem_euclid(*i,*j); } } res }); } #[bench] fn bench_zzdefault(b: &mut Bencher) { let mut d:Vec<i32>=(-N..=N).collect(); let mut n:Vec<i32>=(-N..0).chain(1..=N).collect(); let mut rng=SmallRng::from_seed([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,21]); d.shuffle(&mut rng); n.shuffle(&mut rng); d.shuffle(&mut rng); b.iter(||{ let mut res=0; for i in &d { for j in &n { res+=i.rem_euclid(*j); } } res }); } } ```
such code is copy from
https://github.com/rust-lang/rust/blob/master/library/std/src/f32.rs and
https://github.com/rust-lang/rust/blob/master/library/std/src/f64.rs
using
r+rhs.abs()
is faster than calc it with an if clause. Bench result:It seems that, default
rem_euclid
triggered a branch prediction, thusbench_default
is faster thanbench_aadefault
andbench_aadefault
, which shuffles the order of calculations. but all of them slower than what it was inf64
's andf32
'srem_euclid
, thus I submit this PR.bench code: