Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

uint: faster mul by u64 #125

Merged
merged 16 commits into from
May 7, 2019
Merged
35 changes: 34 additions & 1 deletion uint/benches/bigint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ criterion_group!(
u512_mul,
u512_div,
u512_rem,
u512_mul_u32_vs_u64,
mulmod_u512_vs_biguint_vs_gmp,
conversions,
u512_bit_and,
Expand Down Expand Up @@ -141,7 +142,14 @@ fn u256_mul(c: &mut Criterion) {
black_box(x.overflowing_mul(y).0)
})
},
vec![(U256::max_value(), 1u64), (U256::from(3), u64::max_value())],
vec![
(U256::max_value(), 1u64),
(U256::from(3), u64::max_value()),
(
U256::from_dec_str("21674844646682989462120101885968193938394323990565507610662749").unwrap(),
173,
),
],
),
);
}
Expand Down Expand Up @@ -330,6 +338,31 @@ fn bench_convert_to_gmp(b: &mut Bencher, i: u64) {
});
}

fn u512_mul_u32_vs_u64(c: &mut Criterion) {
let mods = vec![1u32, 42, 10_000_001, u32::max_value()];
c.bench(
"multiply u512 by u32 vs u64",
ParameterizedBenchmark::new("u32", |b, i| bench_u512_mul_u32(b, *i), mods)
.with_function("u64", |b, i| bench_u512_mul_u64(b, u64::from(*i))),
);
}

fn bench_u512_mul_u32(b: &mut Bencher, i: u32) {
let x =
U512::from_str("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap();
b.iter(|| {
black_box(x * i)
});
}

fn bench_u512_mul_u64(b: &mut Bencher, i: u64) {
let x =
U512::from_str("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap();
b.iter(|| {
black_box(x * i)
});
}

fn mulmod_u512_vs_biguint_vs_gmp(c: &mut Criterion) {
let mods = vec![1u64, 42, 10_000_001, u64::max_value()];
c.bench(
Expand Down
180 changes: 94 additions & 86 deletions uint/src/uint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ macro_rules! uint_full_mul_reg {
if $check(me[j], carry) {
let a = me[j];

let (hi, low) = $crate::split_u128(a as u128 * b as u128);
let (hi, low) = Self::split_u128(a as u128 * b as u128);

let overflow = {
let existing_low = &mut ret[i + j];
Expand Down Expand Up @@ -303,13 +303,7 @@ macro_rules! impl_mul_from {
result
}
}
}
}

#[macro_export]
#[doc(hidden)]
macro_rules! impl_mulassign_from {
($name: ident, $other: ident) => {
impl $crate::core_::ops::MulAssign<$other> for $name {
fn mul_assign(&mut self, other: $other) {
let result = *self * other;
Expand All @@ -319,29 +313,57 @@ macro_rules! impl_mulassign_from {
}
}

#[inline(always)]
#[macro_export]
#[doc(hidden)]
pub fn mul_u32(a: (u64, u64), b: u64, carry: u64) -> (u64, u64) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a public function, so we probably need to support it (with a deprecation warning?).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, this is true, but it is only used for construct_uint macro, and doc(hidden) indicate it shouldn't be a part of the public interface.

let upper = b * a.0;
let lower = b * a.1;
macro_rules! impl_mul_for_primitive {
($name: ty, $other: ident) => {
impl $crate::core_::ops::Mul<$other> for $name {
type Output = $name;

let (res1, overflow1) = lower.overflowing_add(upper << 32);
let (res2, overflow2) = res1.overflowing_add(carry);
fn mul(self, other: $other) -> $name {
let (result, carry) = self.overflowing_mul_u64(other as u64);
panic_on_overflow!(carry > 0);
result
}
}

let carry = (upper >> 32) + overflow1 as u64 + overflow2 as u64;
(res2, carry)
}
impl<'a> $crate::core_::ops::Mul<&'a $other> for $name {
type Output = $name;

#[inline(always)]
#[doc(hidden)]
pub fn split(a: u64) -> (u64, u64) {
(a >> 32, a & 0xFFFF_FFFF)
}
fn mul(self, other: &'a $other) -> $name {
let (result, carry) = self.overflowing_mul_u64(*other as u64);
panic_on_overflow!(carry > 0);
result
}
}

#[inline(always)]
#[doc(hidden)]
pub fn split_u128(a: u128) -> (u64, u64) {
((a >> 64) as _, (a & 0xFFFFFFFFFFFFFFFF) as _)
impl<'a> $crate::core_::ops::Mul<&'a $other> for &'a $name {
type Output = $name;

fn mul(self, other: &'a $other) -> $name {
let (result, carry) = self.overflowing_mul_u64(*other as u64);
panic_on_overflow!(carry > 0);
result
}
}

impl<'a> $crate::core_::ops::Mul<$other> for &'a $name {
type Output = $name;

fn mul(self, other: $other) -> $name {
let (result, carry) = self.overflowing_mul_u64(other as u64);
panic_on_overflow!(carry > 0);
result
}
}

impl $crate::core_::ops::MulAssign<$other> for $name {
fn mul_assign(&mut self, other: $other) {
let result = *self * (other as u64);
*self = result
}
}
}
}

#[macro_export]
Expand Down Expand Up @@ -428,8 +450,8 @@ macro_rules! construct_uint {

let mut res = Self::default();
for b in value.bytes().map(|b| b - 48) {
let (r, overflow) = res.overflowing_mul_u32(10);
if overflow {
let (r, overflow) = res.overflowing_mul_u64(10);
if overflow > 0 {
return Err($crate::FromDecStrErr::InvalidLength);
}
let (r, overflow) = r.overflowing_add(b.into());
Expand Down Expand Up @@ -512,6 +534,15 @@ macro_rules! construct_uint {
return true;
}

// Whether this fits u64.
#[inline]
fn fits_word(&self) -> bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like the reverse of https://github.com/paritytech/parity-common/blob/master/uint/src/uint.rs#L215 – maybe we can save some code there? And I wonder if the unroll! is still warranted, and if the comment about inlining is still relevant on modern rustc.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we can save some code there?

There is also a difference in that in it starts from 1nd index, not 0th, so I'm not sure how are you suggesting to reuse it. Initially I wrote self.bits() <= 64, but there was a small (~2%), but consistently measurable difference.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

starts from 1nd index, not 0th

Ah, missed that. Sorry.

Still, we should verify if the unroll! and the inlining advice is needed here too.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made a change to unroll! to work with non-zero indices here, I'm curious if it makes any difference? :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unroll v0.2.2 is out so I checked if it makes a difference (went back to commit 0b3844e) and it looks like unrolling is quite a significant win for U256:

     Running /Users/dvd/dev/parity/parity-common/target/release/deps/bigint-abf18d123360ed8c
u256_mul//(115792089237316195423570985008687907853269984665640564039457584007913129639935, 1)
                        time:   [3.3480 ns 3.3644 ns 3.3843 ns]
                        change: [+36.959% +41.024% +44.814%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 7 outliers among 100 measurements (7.00%)
  3 (3.00%) high mild
  4 (4.00%) high severe
u256_mul//(3, 18446744073709551615)
                        time:   [3.3805 ns 3.3993 ns 3.4202 ns]
                        change: [+38.603% +43.937% +49.933%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 10 outliers among 100 measurements (10.00%)
  4 (4.00%) high mild
  6 (6.00%) high severe
u256_mul//(21674844646682989462120101885968193938394323990565507610662749, 173)
                        time:   [3.4109 ns 3.4509 ns 3.4963 ns]
                        change: [+34.791% +38.484% +41.754%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 9 outliers among 100 measurements (9.00%)
  3 (3.00%) high mild
  6 (6.00%) high severe

u512_mul//(1, 0)     time:   [8.5398 ns 8.5914 ns 8.6427 ns]
                        change: [+12.422% +16.167% +20.151%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 8 outliers among 100 measurements (8.00%)
  8 (8.00%) high severe
u512_mul//(18446744073709551615, 4294967296)
                        time:   [8.4936 ns 8.5363 ns 8.5772 ns]
                        change: [+9.2618% +13.348% +17.802%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 6 outliers among 100 measurements (6.00%)
  3 (3.00%) high mild
  3 (3.00%) high severe
u512_mul//(3759751734479964094783137206182536765532905409829204647089173492, 25097755455336819385...
                        time:   [24.957 ns 25.029 ns 25.111 ns]
                        change: [-4.7843% -1.5953% +1.9514%] (p = 0.40 > 0.05)
                        No change in performance detected.
Found 17 outliers among 100 measurements (17.00%)
  2 (2.00%) low mild
  9 (9.00%) high mild
  6 (6.00%) high severe
u512_mul//(452312848583266388373324160190187140051835877600158453279131187530910662655, 452312848...
                        time:   [25.200 ns 25.326 ns 25.461 ns]
                        change: [-7.0673% -4.0808% -1.6396%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 4 outliers among 100 measurements (4.00%)
  1 (1.00%) high mild
  3 (3.00%) high severe

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks David, will add a commit with unroll 0.2.2.

let &$name(ref arr) = self;
for i in 1..$n_words { if arr[i] != 0 { return false; } }
return true;
}


/// Return the least number of bits needed to represent the number
#[inline]
pub fn bits(&self) -> usize {
Expand Down Expand Up @@ -788,20 +819,35 @@ macro_rules! construct_uint {
}
}

/// Overflowing multiplication by u32.
fn overflowing_mul_u32(self, other: u32) -> (Self, bool) {
let $name(ref arr) = self;
let mut ret = [0u64; $n_words];
let mut carry = 0;
let o = other as u64;
#[inline(always)]
fn mul_u64(a: u64, b: u64, carry: u64) -> (u64, u64) {
let (hi, lo) = Self::split_u128(u128::from(a) * u128::from(b) + u128::from(carry));
(lo, hi)
}

for i in 0..$n_words {
let (res, carry2) = $crate::mul_u32($crate::split(arr[i]), o, carry);
ret[i] = res;
carry = carry2;
#[inline(always)]
fn split(a: u64) -> (u64, u64) {
(a >> 32, a & 0xFFFF_FFFF)
}

#[inline(always)]
fn split_u128(a: u128) -> (u64, u64) {
((a >> 64) as _, (a & 0xFFFFFFFFFFFFFFFF) as _)
}


/// Overflowing multiplication by u64.
/// Returns the result and carry.
fn overflowing_mul_u64(mut self, other: u64) -> (Self, u64) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be API change, no? We went from u32 to u64?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is a private method

let mut carry = 0u64;

for d in self.0.iter_mut() {
let (res, c) = Self::mul_u64(*d, other, carry);
*d = res;
carry = c;
}

($name(ret), carry > 0)
(self, carry)
}

/// Converts from big endian representation bytes in memory.
Expand Down Expand Up @@ -950,56 +996,18 @@ macro_rules! construct_uint {
}
}

// specialization for u32
impl $crate::core_::ops::Mul<u32> for $name {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a breaking API change. I think the reason why it wasn't implemented for all numbers was type interference. Now if you try to write U256::from(2) * 2 it's going to complain, so you need to specify the correct type (brace for fixing the tests in dependent projects).
That said I think it's a good change, the previous behaviour was weird.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's generated via impl_mul_for_primitive now (also for other primitive types)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that's my point. Existing code will break cause there are now multiple (potentially conflicting) implementations of Mul if the type is not strictly specified (i.e. you just write * 2 instead of * 2_u8 or * 2_u32)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a breaking API change.

Indeed. We'll need a 0.7 release for sure.

Copy link
Member Author

@ordian ordian Apr 23, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

type Output = $name;

fn mul(self, other: u32) -> $name {
let (ret, overflow) = self.overflowing_mul_u32(other);
panic_on_overflow!(overflow);
ret
}
}

impl<'a> $crate::core_::ops::Mul<u32> for &'a $name {
type Output = $name;

fn mul(self, other: u32) -> $name {
*self * other
}
}

impl $crate::core_::ops::MulAssign<u32> for $name {
fn mul_assign(&mut self, other: u32) {
let result = *self * other;
*self = result
}
}

// all other impls
impl_mul_from!($name, u8);
impl_mul_from!($name, u16);
impl_mul_from!($name, u64);
impl_mul_from!($name, usize);

impl_mul_from!($name, i8);
impl_mul_from!($name, i16);
impl_mul_from!($name, i64);
impl_mul_from!($name, isize);

impl_mul_from!($name, $name);

impl_mulassign_from!($name, u8);
impl_mulassign_from!($name, u16);
impl_mulassign_from!($name, u64);
impl_mulassign_from!($name, usize);

impl_mulassign_from!($name, i8);
impl_mulassign_from!($name, i16);
impl_mulassign_from!($name, i64);
impl_mulassign_from!($name, isize);

impl_mulassign_from!($name, $name);
impl_mul_for_primitive!($name, u8);
impl_mul_for_primitive!($name, u16);
impl_mul_for_primitive!($name, u32);
impl_mul_for_primitive!($name, u64);
impl_mul_for_primitive!($name, usize);
impl_mul_for_primitive!($name, i8);
impl_mul_for_primitive!($name, i16);
impl_mul_for_primitive!($name, i32);
impl_mul_for_primitive!($name, i64);
impl_mul_for_primitive!($name, isize);

impl<T> $crate::core_::ops::Div<T> for $name where T: Into<$name> {
type Output = $name;
Expand Down