From 71aba933ba61aa60eaf81be074ccb64c610cc768 Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Wed, 18 Dec 2019 18:49:55 -0600 Subject: [PATCH] Replace division implementations with code from the specialized-div-rem crate Puts the asymmetric division behind a feature flag Makes asymmetric-asm a default feature --- Cargo.toml | 5 +- src/int/mod.rs | 16 +- src/int/sdiv.rs | 114 ++---- src/int/specialized_div_rem/asymmetric.rs | 175 ++++++++ src/int/specialized_div_rem/binary_long.rs | 113 +++++ src/int/specialized_div_rem/delegate.rs | 205 +++++++++ src/int/specialized_div_rem/mod.rs | 133 ++++++ src/int/specialized_div_rem/trifecta.rs | 456 +++++++++++++++++++++ src/int/udiv.rs | 264 ++---------- 9 files changed, 1160 insertions(+), 321 deletions(-) create mode 100644 src/int/specialized_div_rem/asymmetric.rs create mode 100644 src/int/specialized_div_rem/binary_long.rs create mode 100644 src/int/specialized_div_rem/delegate.rs create mode 100644 src/int/specialized_div_rem/mod.rs create mode 100644 src/int/specialized_div_rem/trifecta.rs diff --git a/Cargo.toml b/Cargo.toml index 39c4553f..8a4af103 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ cc = { optional = true, version = "1.0" } panic-handler = { path = 'crates/panic-handler' } [features] -default = ["compiler-builtins"] +default = ["compiler-builtins", "asymmetric-asm"] # Enable compilation of C code in compiler-rt, filling in some more optimized # implementations and also filling in unimplemented intrinsics @@ -60,6 +60,9 @@ no-lang-items = [] # Only used in the compiler's build system rustc-dep-of-std = ['compiler-builtins', 'core'] +# Used for faster u128 division on x86_64 +asymmetric-asm = [] + [[example]] name = "intrinsics" required-features = ["compiler-builtins"] diff --git a/src/int/mod.rs b/src/int/mod.rs index 7587bc69..1ab49f5a 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -1,22 +1,12 @@ use core::ops; -macro_rules! hty { - ($ty:ty) => { - <$ty as LargeInt>::HighHalf - }; -} - -macro_rules! os_ty { - ($ty:ty) => { - <$ty as Int>::OtherSign - }; -} - pub mod addsub; pub mod mul; -pub mod sdiv; pub mod shift; + +mod specialized_div_rem; pub mod udiv; +pub mod sdiv; /// Trait for some basic operations on integers pub trait Int: diff --git a/src/int/sdiv.rs b/src/int/sdiv.rs index c9e252cc..f75f2ca9 100644 --- a/src/int/sdiv.rs +++ b/src/int/sdiv.rs @@ -1,101 +1,61 @@ -use int::Int; +use super::specialized_div_rem::*; -trait Div: Int { - /// Returns `a / b` - fn div(self, other: Self) -> Self { - let s_a = self >> (Self::BITS - 1); - let s_b = other >> (Self::BITS - 1); - // NOTE it's OK to overflow here because of the `.unsigned()` below. - // This whole operation is computing the absolute value of the inputs - // So some overflow will happen when dealing with e.g. 
`i64::MIN` - // where the absolute value is `(-i64::MIN) as u64` - let a = (self ^ s_a).wrapping_sub(s_a); - let b = (other ^ s_b).wrapping_sub(s_b); - let s = s_a ^ s_b; - - let r = a.unsigned().aborting_div(b.unsigned()); - (Self::from_unsigned(r) ^ s) - s - } -} - -impl Div for i32 {} -impl Div for i64 {} -impl Div for i128 {} - -trait Mod: Int { - /// Returns `a % b` - fn mod_(self, other: Self) -> Self { - let s = other >> (Self::BITS - 1); - // NOTE(wrapping_sub) see comment in the `div` - let b = (other ^ s).wrapping_sub(s); - let s = self >> (Self::BITS - 1); - let a = (self ^ s).wrapping_sub(s); - - let r = a.unsigned().aborting_rem(b.unsigned()); - (Self::from_unsigned(r) ^ s) - s - } -} - -impl Mod for i32 {} -impl Mod for i64 {} -impl Mod for i128 {} - -trait Divmod: Int { - /// Returns `a / b` and sets `*rem = n % d` - fn divmod(self, other: Self, rem: &mut Self, div: F) -> Self - where - F: Fn(Self, Self) -> Self, - { - let r = div(self, other); - // NOTE won't overflow because it's using the result from the - // previous division - *rem = self - r.wrapping_mul(other); - r - } -} - -impl Divmod for i32 {} -impl Divmod for i64 {} +// NOTE: there are aborts inside the specialized_div_rem functions if division by 0 +// is encountered, however these should be unreachable and optimized away unless +// uses of `std/core::intrinsics::unchecked_div/rem` do not have a 0 check in front +// of them. intrinsics! { #[maybe_use_optimized_c_shim] #[arm_aeabi_alias = __aeabi_idiv] + /// Returns `n / d` pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 { - a.div(b) + i32_div_rem(a, b).0 } #[maybe_use_optimized_c_shim] - pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { - a.div(b) + /// Returns `n % d` + pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { + i32_div_rem(a, b).1 } - #[win64_128bit_abi_hack] - pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { - a.div(b) + #[maybe_use_optimized_c_shim] + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { + let quo_rem = i32_div_rem(a, b); + *rem = quo_rem.1; + quo_rem.0 } #[maybe_use_optimized_c_shim] - pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { - a.mod_(b) + /// Returns `n / d` + pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { + i64_div_rem(a, b).0 } #[maybe_use_optimized_c_shim] + /// Returns `n % d` pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 { - a.mod_(b) + i64_div_rem(a, b).1 } - - #[win64_128bit_abi_hack] - pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { - a.mod_(b) + + #[aapcs_on_arm] + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { + let quo_rem = i64_div_rem(a, b); + *rem = quo_rem.1; + quo_rem.0 } - #[maybe_use_optimized_c_shim] - pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { - a.divmod(b, rem, |a, b| __divsi3(a, b)) + #[win64_128bit_abi_hack] + /// Returns `n / d` + pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { + i128_div_rem(a, b).0 } - - #[aapcs_on_arm] - pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { - a.divmod(b, rem, |a, b| __divdi3(a, b)) + + #[win64_128bit_abi_hack] + /// Returns `n % d` + pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { + i128_div_rem(a, b).1 } } diff --git a/src/int/specialized_div_rem/asymmetric.rs b/src/int/specialized_div_rem/asymmetric.rs new file mode 100644 index 00000000..133d8249 --- /dev/null +++ b/src/int/specialized_div_rem/asymmetric.rs @@ -0,0 +1,175 @@ +macro_rules! 
impl_asymmetric { + ( + $unsigned_name:ident, // name of the unsigned function + $signed_name:ident, // name of the signed function + $half_division:ident, // function for division of a $uX by a $uX + $asymmetric_division:ident, // function for division of a $uD by a $uX + $n_h:expr, // the number of bits in $iH or $uH + $uH:ident, // unsigned integer with half the bit width of $uX + $uX:ident, // unsigned integer with half the bit width of $uD + $uD:ident, // unsigned integer with double the bit width of $uX + $iD:ident, // signed version of $uD + $($unsigned_attr:meta),*; // attributes for the unsigned function + $($signed_attr:meta),* // attributes for the signed function + ) => { + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This is optimized for dividing integers with the same bitwidth as the largest operand in + /// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction + /// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. + $( + #[$unsigned_attr] + )* + pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) { + #[inline(always)] + fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(rhs as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + #[inline(always)] + fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + + let n: u32 = $n_h * 2; + + // Many of these subalgorithms are taken from trifecta.rs, see that for better + // documentation + + let duo_lo = duo as $uX; + let duo_hi = (duo >> n) as $uX; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + if div_hi == 0 { + if div_lo == 0 { + // division by zero + ::abort(); + } + if duo_hi < div_lo { + // plain $uD by $uX division that will fit into $uX + let tmp = unsafe { $asymmetric_division(duo, div_lo) }; + return (tmp.0 as $uD, tmp.1 as $uD) + } else if (div_lo >> $n_h) == 0 { + // Short division of $uD by a $uH. 
+ let div_0 = div_lo as $uH as $uX; + let (quo_hi, rem_3) = $half_division(duo_hi, div_0); + + let duo_mid = + ((duo >> $n_h) as $uH as $uX) + | (rem_3 << $n_h); + let (quo_1, rem_2) = $half_division(duo_mid, div_0); + + let duo_lo = + (duo as $uH as $uX) + | (rem_2 << $n_h); + let (quo_0, rem_1) = $half_division(duo_lo, div_0); + + return ( + (quo_0 as $uD) + | ((quo_1 as $uD) << $n_h) + | ((quo_hi as $uD) << n), + rem_1 as $uD + ) + } else { + // Short division using the $uD by $uX division + let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo); + let tmp = unsafe { + $asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo) + }; + return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD) + } + } + + let duo_lz = duo_hi.leading_zeros(); + let div_lz = div_hi.leading_zeros(); + let rel_leading_sb = div_lz.wrapping_sub(duo_lz); + if rel_leading_sb < $n_h { + // Some x86_64 CPUs have bad `divq` implementations that make putting + // a `mul` or `mul - 1` algorithm here beneficial + let shift = n.wrapping_sub(duo_lz); + let duo_sig_n = (duo >> shift) as $uX; + let div_sig_n = (div >> shift) as $uX; + let mul = $half_division(duo_sig_n, div_sig_n).0; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + let (tmp_lo, carry) = carrying_mul(mul,div_lo); + let (tmp_hi, overflow) = carrying_mul_add(mul,div_hi,carry); + let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); + if ((overflow & 1) != 0) || (duo < tmp) { + return ( + mul.wrapping_sub(1) as $uD, + duo.wrapping_add(div.wrapping_sub(tmp)) + ) + } else { + return ( + mul as $uD, + duo.wrapping_sub(tmp) + ) + } + } else { + // This has been adapted from + // https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn + // adapted from www.hackersdelight.org + + // This is similar to the `mul` or `mul - 1` algorithm in that it uses only more + // significant parts of `duo` and `div` to divide a large integer with a smaller + // division instruction. + let tmp = unsafe { + $asymmetric_division(duo >> 1, ((div << div_lz) >> n) as $uX) + }; + let mut quo = tmp.0 >> ((n - 1) - div_lz); + if quo != 0 { + quo -= 1; + } + // Note that this is a large $uD multiplication being used here + let mut rem = duo - ((quo as $uD) * div); + + if rem >= div { + quo += 1; + rem -= div; + } + return (quo as $uD, rem) + } + } + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This is optimized for dividing integers with the same bitwidth as the largest operand in + /// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction + /// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. 
+ $( + #[$signed_attr] + )* + pub fn $signed_name(duo: $iD, div: $iD) -> ($iD,$iD) { + match (duo < 0, div < 0) { + (false,false) => { + let t = $unsigned_name(duo as $uD,div as $uD); + (t.0 as $iD,t.1 as $iD) + }, + (true,false) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD,div as $uD); + ((t.0 as $iD).wrapping_neg(),(t.1 as $iD).wrapping_neg()) + }, + (false,true) => { + let t = $unsigned_name(duo as $uD,div.wrapping_neg() as $uD); + ((t.0 as $iD).wrapping_neg(),t.1 as $iD) + }, + (true,true) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD,div.wrapping_neg() as $uD); + (t.0 as $iD,(t.1 as $iD).wrapping_neg()) + }, + } + } + } +} diff --git a/src/int/specialized_div_rem/binary_long.rs b/src/int/specialized_div_rem/binary_long.rs new file mode 100644 index 00000000..dc80b75f --- /dev/null +++ b/src/int/specialized_div_rem/binary_long.rs @@ -0,0 +1,113 @@ +macro_rules! impl_binary_long { + ( + $unsigned_name:ident, // name of the unsigned function + $signed_name:ident, // name of the signed function + $n:expr, // the number of bits in a $iX or $uX + $uX:ident, // unsigned integer that will be shifted + $iX:ident, // signed version of $uX + $($unsigned_attr:meta),*; // attributes for the unsigned function + $($signed_attr:meta),* // attributes for the signed function + ) => { + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This uses binary shift long division only, and is designed for CPUs without fast + /// multiplication or division hardware. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. + $( + #[$unsigned_attr] + )* + pub fn $unsigned_name(duo: $uX, div: $uX) -> ($uX,$uX) { + if div == 0 { + // division by zero + ::abort(); + } + + // Full $uX binary long division. Use `leading_zeros` on the first round, + // because we assume that the average usage of division has arguments that + // are random but have a significant number of leading zero bits. Doing + // `leading_zeros` for every round would be very expensive, especially for + // CPUs without a native count leading zeros instruction, but doing it just + // for the first round is advantageous for both performance of the common + // case and for code simplicity. Note that many benchmarks use the full + // `n_d` bits for `duo`, and the benchmarks with several bits less have a + // good performance increase. + + let div_lz = div.leading_zeros(); + let duo_lz = duo.leading_zeros(); + + if div_lz < duo_lz { + return (0, duo) + } + + // Figures out how far `div` should be shifted to align most significant + // bits + let mut shift = div_lz - duo_lz; + let mut duo = duo; + let mut div = div << shift; + let mut quo = 0; + loop { + // There is a way to do this without branching, but requires too many extra + // operations to be faster: + // let sub = duo.wrapping_sub(div); + // let sign_mask = !(((sub as $iD) >> (n_d - 1)) as $uD); + // duo -= div & sign_mask; + // quo |= sign_mask & 1; + let sub = duo.wrapping_sub(div); + if (sub as $iX) >= 0 { + duo = sub; + quo |= 1; + } + + if duo == 0 { + // must have this branch for inputs that are not as random as what is used in + // the benchmarks + return (quo << shift, duo) + } + if shift == 0 { + return (quo, duo) + } + shift -= 1; + div >>= 1; + quo <<= 1; + } + } + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. 
+ /// + /// This uses binary shift long division only, and is designed for CPUs without fast + /// multiplication or division hardware. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. + $( + #[$signed_attr] + )* + pub fn $signed_name(duo: $iX, div: $iX) -> ($iX,$iX) { + match (duo < 0, div < 0) { + (false,false) => { + let t = $unsigned_name(duo as $uX,div as $uX); + (t.0 as $iX,t.1 as $iX) + }, + (true,false) => { + let t = $unsigned_name(duo.wrapping_neg() as $uX,div as $uX); + ((t.0 as $iX).wrapping_neg(),(t.1 as $iX).wrapping_neg()) + }, + (false,true) => { + let t = $unsigned_name(duo as $uX,div.wrapping_neg() as $uX); + ((t.0 as $iX).wrapping_neg(),t.1 as $iX) + }, + (true,true) => { + let t = $unsigned_name(duo.wrapping_neg() as $uX,div.wrapping_neg() as $uX); + (t.0 as $iX,(t.1 as $iX).wrapping_neg()) + }, + } + } + } +} diff --git a/src/int/specialized_div_rem/delegate.rs b/src/int/specialized_div_rem/delegate.rs new file mode 100644 index 00000000..bd6fc117 --- /dev/null +++ b/src/int/specialized_div_rem/delegate.rs @@ -0,0 +1,205 @@ +macro_rules! impl_delegate { + ( + $unsigned_name:ident, // name of the unsigned function + $signed_name:ident, // name of the signed function + $half_division:ident, // function for division of a $uX by a $uX + $n_h:expr, // the number of bits in $iH or $uH + $uH:ident, // unsigned integer with half the bit width of $uX + $uX:ident, // unsigned integer with half the bit width of $uD + $uD:ident, // unsigned integer with double the bit width of $uX + $iD:ident, // signed version of $uD + $($unsigned_attr:meta),*; // attributes for the unsigned function + $($signed_attr:meta),* // attributes for the signed function + ) => { + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This uses binary shift long division, but if it can delegates work to a smaller + /// division. This function is used for CPUs with a register size smaller than the division + /// size, and that do not have fast multiplication or division hardware. For CPUs with a + /// register size equal to the division size, the `_binary_long` functions are probably + /// faster. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. + $( + #[$unsigned_attr] + )* + pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) { + // the number of bits in a $uX + let n = $n_h * 2; + + let duo_lo = duo as $uX; + let duo_hi = (duo >> n) as $uX; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + + match (div_lo == 0, div_hi == 0, duo_hi == 0) { + (true, true, _) => { + // division by zero + ::abort(); + } + (_,false,true) => { + // `duo` < `div` + return (0, duo) + } + (false, true, true) => { + // delegate to smaller division + let tmp = $half_division(duo_lo, div_lo); + return (tmp.0 as $uD, tmp.1 as $uD) + } + (false,true,false) => { + if (div_lo >> $n_h) == 0 { + // Short division of $uD by a $uH, using $uX by $uX division + let div_0 = div_lo as $uH as $uX; + let (quo_hi, rem_3) = $half_division(duo_hi, div_0); + + let duo_mid = + ((duo >> $n_h) as $uH as $uX) + | (rem_3 << $n_h); + let (quo_1, rem_2) = $half_division(duo_mid, div_0); + + let duo_lo = + (duo as $uH as $uX) + | (rem_2 << $n_h); + let (quo_0, rem_1) = $half_division(duo_lo, div_0); + + return ( + (quo_0 as $uD) + | ((quo_1 as $uD) << $n_h) + | ((quo_hi as $uD) << n), + rem_1 as $uD + ) + } else { + // Binary long division of $uD by $uX. 
This is the same as below, but the + // compiler should be able to do the shifts faster. See the other long + // division below for more. + let div_lz = div_lo.leading_zeros() + n; + let duo_lz = duo_hi.leading_zeros(); + + // initial check for `div_lz < duo_lz` not needed here + + let mut shift = div_lz - duo_lz; + let mut duo = duo; + // There are edge cases where `quo` needs to be n + 1 bits before the $uX by + // $uX division can be used, e.x. 0xFFFF_FFF8 / 0x7FFF. The first iteration + // is done before the loop, and the 1 bit it adds to `quo` is added at the + // end. + let mut sub = (div_lo as $uD) << shift; + let initial_shift = shift; + + let quo_edge = if sub <= duo { + duo -= sub; + true + } else { + false + }; + let mut quo_hi: $uX = 0; + loop { + let duo_hi = (duo >> n) as $uX; + if duo_hi == 0 || shift == 0 { + // delegate what remains to $uX by $uX division + let duo_lo = duo as $uX; + let tmp = $half_division(duo_lo, div_lo); + let (quo_lo, rem_lo) = (tmp.0, tmp.1); + return ( + (quo_lo as $uD) + | ((quo_hi as $uD) << shift) + | ((quo_edge as $uD) << initial_shift), + rem_lo as $uD + ) + } + shift -= 1; + quo_hi <<= 1; + sub >>= 1; + if sub <= duo { + duo -= sub; + quo_hi |= 1; + } + } + } + } + (_,false,false) => { + // Full $uD binary long division. Use `leading_zeros` on the first round, + // because we assume that the average usage of division has arguments that + // are random but have a significant number of leading zero bits. Doing + // `leading_zeros` for every round would be very expensive, especially for + // CPUs without a native count leading zeros instruction, but doing it just + // for the first round is advantageous for both performance of the common + // case and for code simplicity. Note that many benchmarks use the full + // `n_d` bits for `duo`, and the benchmarks with several bits less have a + // good performance increase. + + // The "mul or mul - 1" algorithm is not used here, since CPUs without division + // hardware also probably do not have fast multiplication hardware. + + let div_lz = div_hi.leading_zeros(); + let duo_lz = duo_hi.leading_zeros(); + + if div_lz < duo_lz { + return (0, duo) + } + + // Figures out how far `div` should be shifted to align most significant + // bits + let mut shift = div_lz - duo_lz; + let mut duo = duo; + let mut quo = 0; + let mut sub = div << shift; + loop { + if sub <= duo { + // the subtraction will not overflow + duo -= sub; + quo |= 1; + } + let duo_hi = (duo >> n) as $uX; + if duo_hi == 0 || shift == 0 { + return (quo << shift, duo) + } + shift -= 1; + sub >>= 1; + quo <<= 1; + } + } + } + } + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This uses binary shift long division, but if it can delegates work to a smaller + /// division. This function is used for CPUs with a register size smaller than the division + /// size, and that do not have fast multiplication or division hardware. For CPUs with a + /// register size equal to the division size, the `_binary_long` functions are probably + /// faster. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. 
+ $( + #[$signed_attr] + )* + pub fn $signed_name(duo: $iD, div: $iD) -> ($iD,$iD) { + match (duo < 0, div < 0) { + (false,false) => { + let t = $unsigned_name(duo as $uD,div as $uD); + (t.0 as $iD,t.1 as $iD) + }, + (true,false) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD,div as $uD); + ((t.0 as $iD).wrapping_neg(),(t.1 as $iD).wrapping_neg()) + }, + (false,true) => { + let t = $unsigned_name(duo as $uD,div.wrapping_neg() as $uD); + ((t.0 as $iD).wrapping_neg(),t.1 as $iD) + }, + (true,true) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD,div.wrapping_neg() as $uD); + (t.0 as $iD,(t.1 as $iD).wrapping_neg()) + }, + } + } + } +} diff --git a/src/int/specialized_div_rem/mod.rs b/src/int/specialized_div_rem/mod.rs new file mode 100644 index 00000000..9e55cb26 --- /dev/null +++ b/src/int/specialized_div_rem/mod.rs @@ -0,0 +1,133 @@ +// This `specialized_div_rem` module is from version 0.2.0 of the +// `specialized-div-rem` crate (the build system cannot handle +// including it as a regular dependency). + +// The purpose of these macros is to easily change the both the division algorithm used +// for a given integer size and the half division used by that algorithm. For example, +// most architectures do the following: +// +// - `impl_trifecta!` is enabled by feature flags. This impl makes u128 divisions use +// the trifecta algorithm which in turn calls `u64_by_u64_div_rem` for its half +// division. The `u64_by_u64_div_rem` function is defined as simply calling whatever +// division is defined for u64 integers. +// - If the architecture does not supply a 64 bit hardware division instruction, linkers +// will look for functions such as `__udivdi3` which is defined in `compiler-builtins`. This will +// call `u64_div_rem` which is defined by the `impl_delegate!` below. The half division +// is set to be `u32_by_u32_div_rem`. In turn, the `u32_by_u32_div_rem` function is defined +// as whatever is defined for u32 integers. +// - If the architecture does not supply a 32 bit hardware instruction, `__udivsi3` will +// be used which calls `u32_div_rem_binary_long`. The `impl_binary_long!` macro uses +// no half division, so the chain of calls ends there. +// - LLVM supplies 16 bit divisions and smaller +// +// I could replace all `u64_by_u64_div_rem` in the macros by `u64_div_rem` and likewise +// for `u32_by_u32_div_rem`, but that would mean that hardware divisions would never be +// used. +// +// If for some reason benchmarks showed that u128 divisions worked best with a +// subalgorithm that wasn't the same as what is used for u64 divisions, I could add a +// different function like `u64_by_u64_div_rem_alternative` and change the +// `impl_trifecta!` macro to use that half division instead. These functions are mostly +// changed for benchmarking purposes to see what chain of algorithms is fastest. +// +// On x86_64, an asymmetrically sized division is supplied which means that +// `impl_asymmetric!` has two different subalgorithms: `u128_by_u64_div_rem` and +// `u64_by_u64_div_rem`. + +#[macro_use] +mod binary_long; + +#[macro_use] +mod delegate; + +#[cfg(not(target = "x86_64"))] +#[macro_use] +mod trifecta; + +#[cfg(target = "x86_64")] +#[macro_use] +mod asymmetric; + +// `_trifecta` is efficient for large divisions, even when division +// hardware is not availiable at all. 
+ +#[cfg(not(target = "x86_64"))] +impl_trifecta!( + u128_div_rem, + i128_div_rem, + u64_by_u64_div_rem, + 32, + u32, + u64, + u128, + i128, + inline; + inline +); + +/// This function is unsafe, because if the quotient of `duo` and `div` does not +/// fit in a `u64`, a floating point exception is thrown. +#[cfg(all(feature = "asymmetric-asm", target = "x86_64"))] +#[inline] +unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) { + let quo: u64; + let rem: u64; + let duo_lo = duo as u64; + let duo_hi = (duo >> 64) as u64; + asm!("divq $4" + : "={rax}"(quo), "={rdx}"(rem) + : "{rax}"(duo_lo), "{rdx}"(duo_hi), "r"(div) + : "rax", "rdx" + ); + return (quo, rem) +} + +// use `_asymmetric` instead of `_trifecta`, because x86_64 supplies the `divq` instruction +#[cfg(target = "x86_64")] +impl_asymmetric!( + u128_div_rem, + i128_div_rem, + u64_by_u64_div_rem, + u128_by_u64_div_rem, + 32, + u32, + u64, + u128, + i128, + inline; + inline +); + +#[inline] +fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) { + (duo / div, duo % div) +} + +// `_delegate` is most efficient in the 64 bit range +impl_delegate!( + u64_div_rem, + i64_div_rem, + u32_by_u32_div_rem, + 16, + u16, + u32, + u64, + i64, + inline; + inline +); + +#[inline] +fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) { + (duo / div, duo % div) +} + +impl_binary_long!( + u32_div_rem, + i32_div_rem, + 32, + u32, + i32, + inline; + inline +); diff --git a/src/int/specialized_div_rem/trifecta.rs b/src/int/specialized_div_rem/trifecta.rs new file mode 100644 index 00000000..12652b6f --- /dev/null +++ b/src/int/specialized_div_rem/trifecta.rs @@ -0,0 +1,456 @@ +macro_rules! impl_trifecta { + ( + $unsigned_name:ident, // name of the unsigned function + $signed_name:ident, // name of the signed function + $half_division:ident, // function for division of a $uX by a $uX + $n_h:expr, // the number of bits in $iH or $uH + $uH:ident, // unsigned integer with half the bit width of $uX + $uX:ident, // the largest division instruction that this function calls operates on this + $uD:ident, // unsigned integer with double the bit width of $uX + $iD:ident, // signed version of $uD + $($unsigned_attr:meta),*; // attributes for the unsigned function + $($signed_attr:meta),* // attributes for the signed function + ) => { + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This is optimized for division of two integers with bit widths twice as large + /// as the largest hardware integer division supported. Note that some architectures supply + /// a division of an integer larger than register size by a regular sized integer (e.x. + /// x86_64 has a `divq` instruction which can divide a 128 bit integer by a 64 bit integer). + /// In that case, the `_asymmetric` algorithm should be used instead of this one. + /// + /// This is called the trifecta algorithm because it uses three main algorithms: short + /// division for small divisors, the "mul or mul - 1" algorithm for when the divisor is + /// large enough for the quotient to be determined to be one of two values via only one + /// small division, and an undersubtracting long division algorithm for middle cases. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. 
+ $( + #[$unsigned_attr] + )* + pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) { + // wrapping_{} was used everywhere for better performance when compiling with + // `debug-assertions = true` + + // This replicates `carrying_mul` (rust-lang rfc #2417). LLVM correctly optimizes this + // to use a widening multiply to 128 bits. + #[inline(always)] + fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(rhs as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + #[inline(always)] + fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + + // the number of bits in a $uX + let n = $n_h * 2; + + // This should be optimized away because of checks for zero upstream + if div == 0 { + // division by zero + ::abort(); + } + + // Note that throughout this function, `lo` and `hi` refer to the high and low `n` bits + // of a `$uD`, `0` to `3` refer to the 4 `n_h` bit parts of a `$uD`, + // and `mid` refer to the middle two `n_h` parts. + + let div_lz = div.leading_zeros(); + let mut duo_lz = duo.leading_zeros(); + + // the possible ranges of `duo` and `div` at this point: + // `0 <= duo < 2^n_d` + // `1 <= div < 2^n_d` + + // quotient is 0 or 1 branch + if div_lz <= duo_lz { + // The quotient cannot be more than 1. The highest set bit of `duo` needs to be at + // least one place higher than `div` for the quotient to be more than 1. + if duo >= div { + return (1,duo.wrapping_sub(div)) + } else { + return (0,duo) + } + } + + // `_sb` is the number of significant bits (from the ones place to the highest set bit) + // `{2, 2^div_sb} <= duo < 2^n_d` + // `1 <= div < {2^duo_sb, 2^(n_d - 1)}` + // smaller division branch + if duo_lz >= n { + // `duo < 2^n` so it will fit in a $uX. `div` will also fit in a $uX (because of the + // `div_lz <= duo_lz` branch) so no numerical error. + let tmp = $half_division(duo as $uX, div as $uX); + return ( + tmp.0 as $uD, + tmp.1 as $uD + ) + } + + // relative leading significant bits, cannot overflow because of above branches + let rel_leading_sb = div_lz.wrapping_sub(duo_lz); + + // `{2^n, 2^div_sb} <= duo < 2^n_d` + // `1 <= div < {2^duo_sb, 2^(n_d - 1)}` + // short division branch + if div_lz >= (n + $n_h) { + // `1 <= div < {2^duo_sb, 2^n_h}` + let duo_hi = (duo >> n) as $uX; + let div_0 = div as $uH as $uX; + let (quo_hi, rem_3) = $half_division(duo_hi, div_0); + + let duo_mid = + ((duo >> $n_h) as $uH as $uX) + | (rem_3 << $n_h); + let (quo_1, rem_2) = $half_division(duo_mid, div_0); + + let duo_lo = + (duo as $uH as $uX) + | (rem_2 << $n_h); + let (quo_0, rem_1) = $half_division(duo_lo, div_0); + + return ( + (quo_0 as $uD) + | ((quo_1 as $uD) << $n_h) + | ((quo_hi as $uD) << n), + rem_1 as $uD + ) + } + + // `{2^n, 2^div_sb} <= duo < 2^n_d` + // `2^n_h <= div < {2^duo_sb, 2^(n_d - 1)}` + // `mul` or `mul - 1` branch + if rel_leading_sb < $n_h { + // proof that `quo` can only be `mul` or `mul - 1`. + // disclaimer: this is not rigorous + // + // We are trying to find the quotient, `quo`. + // 1. quo = duo / div. (definition) + // + // `shift` is the number of bits not in the higher `n` significant bits of `duo` + // 2. shift = n - duo_lz. (definition) + // 3. duo_sig_n == duo / 2^shift. (definition) + // 4. div_sig_n == div / 2^shift. 
(definition) + // Note: the `_sig_n` (which in my notation usually means that the variable uses the + // most significant `n` bits) on `div_sig_n` is a misnomer, because instead of using + // a shift by `n - div_lz`, what I do is use the `n - duo_lz` shift so that the bits + // of (4) correspond to (3). + // + // this is because of the bits less significant than the sig_n bits that are cut off + // during the bit shift + // 5. duo_sig_n * 2^shift <= duo < (duo_sig_n + 1) * 2^shift. + // 6. div_sig_n * 2^shift <= div < (div_sig_n + 1) * 2^shift. + // + // dividing each bound of (5) by each bound of (6) + // (duo_sig_n * 2^shift) / (div_sig_n * 2^shift) + // (duo_sig_n * 2^shift) / ((div_sig_n + 1) * 2^shift) + // ((duo_sig_n + 1) * 2^shift) / (div_sig_n * 2^shift) + // ((duo_sig_n + 1) * 2^shift) / ((div_sig_n + 1) * 2^shift) + // simplifying each of these four + // duo_sig_n / div_sig_n + // duo_sig_n / (div_sig_n + 1) + // (duo_sig_n + 1) / div_sig_n + // (duo_sig_n + 1) / (div_sig_n + 1) + // taking the smallest and the largest of these as the low and high bounds + // and replacing `duo / div` with `quo` + // 7. duo_sig_n / (div_sig_n + 1) <= quo < (duo_sig_n + 1) / div_sig_n + // + // Here is where the range restraints from the previous branches becomes necessary. + // To help visualize what needs to happen here, is that the top `n` significant + // bits of `duo` are being divided with the corresponding bits of `div`. + // + // The `rel_leading_sb < n_h` conditional on this branch makes sure that `div_sig_n` + // is at least `2^n_h`, and the previous branches make sure that the highest bit of + // `div_sig_n` is not the `2^(n - 1)` bit (which `duo_sig_n` has been shifted up to + // attain). + // 8. `2^(n - 1) <= duo_sig_n < 2^n` + // 9. `2^n_h <= div_sig_n < 2^(n - 1)` + // 10. mul == duo_sig_n / div_sig_n. (definition) + // + // Using the bounds (8) and (9) with the `duo_sig_n / (div_sig_n + 1)` term. Note + // that on the `<` bounds we subtract 1 to make it inclusive. + // (2^n - 1) / (2^n_h + 1) = 2^n_h - 1 + // 2^(n - 1) / 2^(n - 1) = 1 + // The other equations using the both upper bounds or both lower bounds end up with + // values in the middle of this range. + // 11. 1 <= mul <= 2^n_h - 1 + // + // However, all this tells us is that `mul` can fit in a $uH. + // We want to show that `mul - 1 <= quo < mul + 1`. What this and (7) means is that + // the bits cut off by the shift at the beginning can only affect `quo` enough to + // change it between two values. First, I will show that + // `(duo_sig_n + 1) / div_sig_n` is equal to or less than `mul + 1`. We cannot + // simply use algebra here and say that `1 / div_sig_n` is 0, because we are working + // with truncated integer division. If the remainder is `div_sig_n - 1`, then the + // `+ 1` can be enough to cross the boundary and increment the quotient + // (e.x. 127/32 = 3, 128/32 = 4). Instead, we notice that in order for the remainder + // of `(duo_sig_n + x) / div_sig_n` to be wrapped around twice, `x` must be 1 for + // one of the wrap arounds and another `div_sig_n` added on to it to wrap around + // again. This cannot happen even if `div_sig_n` were 1, so + // 12. duo_sig_n / (div_sig_n + 1) <= quo < mul + 1 + // + // Next, we want to show that `duo_sig_n / (div_sig_n + 1)` is more than or equal to + // `mul - 1`. 
Putting all the combinations of bounds from (8) and (9) into (10) and + // `duo_sig_n / (div_sig_n + 1)`, + // (2^n - 1) / 2^n_h = 2^n_h - 1 + // (2^n - 1) / (2^n_h + 1) = 2^n_h - 1 + // + // (2^n - 1) / (2^(n - 1) - 1) = 2 + // (2^n - 1) / (2^(n - 1) - 1 + 1) = 1 + // + // 2^(n - 1) / 2^n_h = 2^(n_h - 1) + // 2^(n - 1) / (2^n_h + 1) = 2^(n_h - 1) - 1 + // + // 2^(n - 1) / (2^(n - 1) - 1) = 1 + // 2^(n - 1) / 2^(n - 1) = 1 + // + // 13. mul - 1 <= quo < mul + 1 + // + // In a lot of division algorithms using smaller divisions to construct a larger + // division, we often encounter a situation where the approximate `mul` value + // calculated from a smaller division is a few increments away from the true `quo` + // value. In those algorithms, they have to do more divisions. Because of the fact + // that our `quo` can only be one of two values, we can try the higher value `mul` + // and see if `duo - (mul*div)` overflows. If it did overflow, then `quo` must be + // `mul - 1`, and because we already calculated `duo - (mul*div)` with wrapping + // operations, we can do simple operations without dividing again to find + // `duo - ((mul - 1)*div) = duo - (mul*div - div) = duo + div - mul*div`. If it + // did not overflow, then `mul` must be the answer, because the remainder of + // `duo - ((mul - 1)*div)` is larger and farther away from overflow than + // `duo - (mul*div)`, which we already found is not overflowing. + // + // Thus, we find the quotient using only an `n` sized divide to find `mul` + // and a `n` by `d_n` sized multiply and comparison to find if `quo * mul > duo`, + // following with adds and subtracts to get the correct values. + let shift = n.wrapping_sub(duo_lz); + let duo_sig_n = (duo >> shift) as $uX; + let div_sig_n = (div >> shift) as $uX; + let mul = $half_division(duo_sig_n, div_sig_n).0; + // inline `n` bit by `n_d` bit multiplication and overflow check (we cannot do + // `mul * div > duo` directly because of possibility of overflow beyond 128 bits) + // duo cannot be more than `2^n_d - 1`, and overflow means a value more than that + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + let (tmp_lo, carry) = carrying_mul(mul,div_lo); + let (tmp_hi, overflow) = carrying_mul_add(mul,div_hi,carry); + let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); + // Note that the overflow cannot be more than 1, otherwise the `div` subtraction + // would not be able to bring the remainder value below 2^n_d - 1, which + // contradicts many things. The `& 1` is here to encourage overflow flag use. + if ((overflow & 1) != 0) || (duo < tmp) { + return ( + mul.wrapping_sub(1) as $uD, + duo.wrapping_add(div.wrapping_sub(tmp)) + ) + } else { + return ( + mul as $uD, + duo.wrapping_sub(tmp) + ) + } + } + + // Undersubtracting long division algorithm. + // Instead of clearing a minimum of 1 bit from `duo` per iteration via + // binary long division, `n_h - 1` bits are cleared per iteration with this algorithm. + // It is a more complicated version of regular long division. Most integer division + // algorithms tend to guess a part of the quotient, and may have a larger quotient than + // the true quotient (which when multiplied by `div` will "oversubtract" the original + // dividend). They then check if the quotient was in fact too large and then have to + // correct it. This long division algorithm has been carefully constructed to always + // underguess the quotient by slim margins. 
This allows different subalgorithms + // to be blindly jumped to without needing an extra correction step. + // + // The only problem is that it will not work + // for many ranges of `duo` and `div`. Fortunately, the short division, + // mul or mul - 1 algorithms, and simple divisions happen to exactly fill these gaps. + // + // For an example, consider the division of 76543210 by 213 and assume that `n_h` + // is equal to two decimal digits (note: we are working with base 10 here for + // readability). + // The first `h_n` part of the divisor (21) is taken and is incremented by + // 1 to prevent oversubtraction. + // In the first step, the first `n` part of duo (7654) is divided by the 22 to make 347. + // We remember that there was 1 extra place not in the `n_h` part of the divisor and + // shift the 347 right by 1, in contrast to a normal long division. The 347 is + // multiplied by the whole divisor to make 73911, and subtracted from duo to finish the + // step. + // 347 + // ________ + // |76543210 + // -73911 + // 2632210 + // Two more steps are taken after this and then duo fits into `n` bits, and then a final + // normal long division step is made. + // 14 + // 443 + // 119 + // 347 + // ________ + // |76543210 + // -73911 + // 2632210 + // -25347 + // 97510 + // -94359 + // 3151 + // The tower at the top is added together to produce the quotient, 359357 (but in the + // actual algorithm, the quotient is progressively added to each step instead of at + // the end). + // In the actual algorithm below, instead of the final normal long division step, one of + // the three other algorithms ("quotient is 0 or 1", "mul or mul - 1", or + // "n sized division") is used. + + let mut duo = duo; + + // the number of lesser significant bits not a part of `div_sig_n_h`. Must be positive. 
+ let div_lesser_places = (n + $n_h).wrapping_sub(div_lz); + + // the most significant `n_h` bits of div + let div_sig_n_h = (div >> div_lesser_places) as $uH; + + // has to be a `$uX` in case of overflow + let div_sig_n_h_add1 = (div_sig_n_h as $uX).wrapping_add(1); + + let mut quo: $uD = 0; + + // `{2^n, 2^(div_sb + n_h)} <= duo < 2^n_d` + // `2^n_h <= div < {2^(duo_sb - n_h), 2^n}` + loop { + let duo_lesser_places = n.wrapping_sub(duo_lz); + let duo_sig_n = (duo >> duo_lesser_places) as $uX; + + if div_lesser_places <= duo_lesser_places { + let mul = $half_division(duo_sig_n, div_sig_n_h_add1).0 as $uD; + let place = duo_lesser_places.wrapping_sub(div_lesser_places); + + //addition to the quotient + quo = quo.wrapping_add(mul << place); + + //subtraction from `duo` + //at least `n_h - 1` bits are cleared from `duo` here + duo = duo.wrapping_sub(div.wrapping_mul(mul) << place); + } else { + //`mul` or `mul - 1` algorithm + let shift = n.wrapping_sub(duo_lz); + let duo_sig_n = (duo >> shift) as $uX; + let div_sig_n = (div >> shift) as $uX; + let mul = $half_division(duo_sig_n, div_sig_n).0; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + + let (tmp_lo, carry) = carrying_mul(mul,div_lo); + // The undersubtracting long division algorithm has already run once, so + // overflow beyond 128 bits is not possible here + let (tmp_hi, _) = carrying_mul_add(mul,div_hi,carry); + let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); + + if duo < tmp { + return ( + // note that this time, the "mul or mul - 1" result is added to `quo` + // to get the final correct quotient + quo.wrapping_add(mul.wrapping_sub(1) as $uD), + duo.wrapping_add( + div.wrapping_sub(tmp) + ) + ) + } else { + return ( + quo.wrapping_add(mul as $uD), + duo.wrapping_sub(tmp) + ) + } + } + + duo_lz = duo.leading_zeros(); + if div_lz <= duo_lz { + //quotient can have 0 or 1 added to it + if div <= duo { + return ( + quo.wrapping_add(1), + duo.wrapping_sub(div) + ) + } else { + return ( + quo, + duo + ) + } + } + + // This can only happen if `div_sd < n` (because of previous "quo = 0 or 1" + // branches), but it is not worth it to unroll further. + if duo_lz >= n { + // simple division and addition + let tmp = $half_division(duo as $uX, div as $uX); + return ( + quo.wrapping_add(tmp.0 as $uD), + tmp.1 as $uD + ) + } + } + } + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This is optimized for division of two integers with bit widths twice as large + /// as the largest hardware integer division supported. Note that some architectures supply + /// a division of an integer larger than register size by a regular sized integer (e.x. + /// x86_64 has a `divq` instruction which can divide a 128 bit integer by a 64 bit integer). + /// In that case, the `_asymmetric` algorithm should be used instead of this one. + /// + /// This is called the trifecta algorithm because it uses three main algorithms: short + /// division for small divisors, the "mul or mul - 1" algorithm for when the divisor is + /// large enough for the quotient to be determined to be one of two values via only one + /// small division, and an underguessing long division algorithm for middle cases. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. 
+ $( + #[$signed_attr] + )* + pub fn $signed_name(duo: $iD, div: $iD) -> ($iD,$iD) { + // Note: there is a way of doing this without any branches, but it is unfortunately + // significantly slower + // let n_d = $n_h * 4; + // let duo_s = duo >> (n_d - 1); + // let div_s = div >> (n_d - 1); + // let duo = (duo ^ duo_s).wrapping_sub(duo_s); + // let div = (div ^ div_s).wrapping_sub(div_s); + // let quo_s = duo_s ^ div_s; + // let rem_s = duo_s; + // let tmp = $unsigned_name(duo as $uD, div as $uD); + // ( + // ((tmp.0 as $iD) ^ quo_s).wrapping_sub(quo_s), + // ((tmp.1 as $iD) ^ rem_s).wrapping_sub(rem_s), + // ) + + match (duo < 0, div < 0) { + (false,false) => { + let t = $unsigned_name(duo as $uD,div as $uD); + (t.0 as $iD,t.1 as $iD) + }, + (true,false) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD,div as $uD); + ((t.0 as $iD).wrapping_neg(),(t.1 as $iD).wrapping_neg()) + }, + (false,true) => { + let t = $unsigned_name(duo as $uD,div.wrapping_neg() as $uD); + ((t.0 as $iD).wrapping_neg(),t.1 as $iD) + }, + (true,true) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD,div.wrapping_neg() as $uD); + (t.0 as $iD,(t.1 as $iD).wrapping_neg()) + }, + } + } + } +} diff --git a/src/int/udiv.rs b/src/int/udiv.rs index b393ac6d..1fd02899 100644 --- a/src/int/udiv.rs +++ b/src/int/udiv.rs @@ -1,270 +1,74 @@ -use int::{Int, LargeInt}; +use super::specialized_div_rem::*; -macro_rules! udivmod_inner { - ($n:expr, $d:expr, $rem:expr, $ty:ty) => {{ - let (n, d, rem) = ($n, $d, $rem); - // NOTE X is unknown, K != 0 - if n.high() == 0 { - if d.high() == 0 { - // 0 X - // --- - // 0 X - - if let Some(rem) = rem { - *rem = <$ty>::from(n.low().aborting_rem(d.low())); - } - return <$ty>::from(n.low().aborting_div(d.low())) - } else { - // 0 X - // --- - // K X - if let Some(rem) = rem { - *rem = n; - } - return 0; - }; - } - - let mut sr; - let mut q; - let mut r; - - if d.low() == 0 { - if d.high() == 0 { - // K X - // --- - // 0 0 - // NOTE This should be unreachable in safe Rust because the program will panic before - // this intrinsic is called - ::abort(); - } - - if n.low() == 0 { - // K 0 - // --- - // K 0 - if let Some(rem) = rem { - *rem = <$ty>::from_parts(0, n.high().aborting_rem(d.high())); - } - return <$ty>::from(n.high().aborting_div(d.high())) - } - - // K K - // --- - // K 0 - - if d.high().is_power_of_two() { - if let Some(rem) = rem { - *rem = <$ty>::from_parts(n.low(), n.high() & (d.high() - 1)); - } - return <$ty>::from(n.high() >> d.high().trailing_zeros()); - } - - sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros()); - - // D > N - if sr > ::BITS - 2 { - if let Some(rem) = rem { - *rem = n; - } - return 0; - } - - sr += 1; - - // 1 <= sr <= ::BITS - 1 - q = n << (<$ty>::BITS - sr); - r = n >> sr; - } else if d.high() == 0 { - // K X - // --- - // 0 K - if d.low().is_power_of_two() { - if let Some(rem) = rem { - *rem = <$ty>::from(n.low() & (d.low() - 1)); - } - - if d.low() == 1 { - return n; - } else { - let sr = d.low().trailing_zeros(); - return n >> sr; - }; - } - - sr = 1 + ::BITS + d.low().leading_zeros() - n.high().leading_zeros(); - - // 2 <= sr <= u64::BITS - 1 - q = n << (<$ty>::BITS - sr); - r = n >> sr; - } else { - // K X - // --- - // K K - sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros()); - - // D > N - if sr > ::BITS - 1 { - if let Some(rem) = rem { - *rem = n; - } - return 0; - } - - sr += 1; - - // 1 <= sr <= ::BITS - q = n << (<$ty>::BITS - sr); - r = n >> sr; - } - - // Not a special case - // q and r are 
initialized with - // q = n << (u64::BITS - sr) - // r = n >> sr - // 1 <= sr <= u64::BITS - 1 - let mut carry = 0; - - // Don't use a range because they may generate references to memcpy in unoptimized code - let mut i = 0; - while i < sr { - i += 1; - - // r:q = ((r:q) << 1) | carry - r = (r << 1) | (q >> (<$ty>::BITS - 1)); - q = (q << 1) | carry as $ty; - - // carry = 0 - // if r >= d { - // r -= d; - // carry = 1; - // } - let s = (d.wrapping_sub(r).wrapping_sub(1)) as os_ty!($ty) >> (<$ty>::BITS - 1); - carry = (s & 1) as hty!($ty); - r -= d & s as $ty; - } - - if let Some(rem) = rem { - *rem = r; - } - (q << 1) | carry as $ty - }} -} +// NOTE: there are aborts inside the specialized_div_rem functions if division by 0 +// is encountered, however these should be unreachable and optimized away unless +// uses of `std/core::intrinsics::unchecked_div/rem` do not have a 0 check in front +// of them. intrinsics! { #[maybe_use_optimized_c_shim] #[arm_aeabi_alias = __aeabi_uidiv] /// Returns `n / d` pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 { - // Special cases - if d == 0 { - // NOTE This should be unreachable in safe Rust because the program will panic before - // this intrinsic is called - ::abort(); - } - - if n == 0 { - return 0; - } - - let mut sr = d.leading_zeros().wrapping_sub(n.leading_zeros()); - - // d > n - if sr > u32::BITS - 1 { - return 0; - } - - // d == 1 - if sr == u32::BITS - 1 { - return n; - } - - sr += 1; - - // 1 <= sr <= u32::BITS - 1 - let mut q = n << (u32::BITS - sr); - let mut r = n >> sr; - - let mut carry = 0; - - // Don't use a range because they may generate references to memcpy in unoptimized code - let mut i = 0; - while i < sr { - i += 1; - - // r:q = ((r:q) << 1) | carry - r = (r << 1) | (q >> (u32::BITS - 1)); - q = (q << 1) | carry; - - // carry = 0; - // if r > d { - // r -= d; - // carry = 1; - // } - - let s = (d.wrapping_sub(r).wrapping_sub(1)) as i32 >> (u32::BITS - 1); - carry = (s & 1) as u32; - r -= d & s as u32; - } - - (q << 1) | carry + u32_div_rem(n, d).0 } - + #[maybe_use_optimized_c_shim] /// Returns `n % d` pub extern "C" fn __umodsi3(n: u32, d: u32) -> u32 { - let q = __udivsi3(n, d); - n - q * d + u32_div_rem(n, d).1 } - + #[maybe_use_optimized_c_shim] /// Returns `n / d` and sets `*rem = n % d` pub extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 { - let q = __udivsi3(n, d); + let quo_rem = u32_div_rem(n, d); if let Some(rem) = rem { - *rem = n - (q * d); + *rem = quo_rem.1; } - q + quo_rem.0 } - + #[maybe_use_optimized_c_shim] /// Returns `n / d` pub extern "C" fn __udivdi3(n: u64, d: u64) -> u64 { - __udivmoddi4(n, d, None) + u64_div_rem(n, d).0 } - + #[maybe_use_optimized_c_shim] /// Returns `n % d` pub extern "C" fn __umoddi3(n: u64, d: u64) -> u64 { - let mut rem = 0; - __udivmoddi4(n, d, Some(&mut rem)); - rem + u64_div_rem(n, d).1 } - + + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 { + let quo_rem = u64_div_rem(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; + } + quo_rem.0 + } + #[win64_128bit_abi_hack] /// Returns `n / d` pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 { - __udivmodti4(n, d, None) + u128_div_rem(n, d).0 } #[win64_128bit_abi_hack] /// Returns `n % d` pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 { - let mut rem = 0; - __udivmodti4(n, d, Some(&mut rem)); - rem - } - - /// Returns `n / d` and sets `*rem = n % d` - pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) 
-> u64 { - udivmod_inner!(n, d, rem, u64) + u128_div_rem(n, d).1 } #[win64_128bit_abi_hack] /// Returns `n / d` and sets `*rem = n % d` - pub extern "C" fn __udivmodti4(n: u128, - d: u128, - rem: Option<&mut u128>) -> u128 { - udivmod_inner!(n, d, rem, u128) + pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 { + let quo_rem = u128_div_rem(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; + } + quo_rem.0 } }
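
The "mul or mul - 1" step that both trifecta.rs and asymmetric.rs rely on can be demonstrated at a smaller scale. The following standalone sketch is not part of the patch: the function name and the u32/u16 widths are chosen purely for illustration, and the asserted preconditions stand in for the branches the real macros take before reaching this step. It estimates the quotient from one u16-by-u16 division of the most significant bits and then corrects downward by at most one, mirroring the bound `mul - 1 <= quo <= mul` derived in the trifecta comments.

// Standalone illustration (not part of the patch) of the "mul or mul - 1" quotient
// estimate documented in trifecta.rs and asymmetric.rs, scaled down so that a u32
// division is approximated with a single u16-by-u16 division. The function name and
// the chosen bit widths are hypothetical and for illustration only.
fn mul_or_mul_minus_1(duo: u32, div: u32) -> (u32, u32) {
    let duo_lz = duo.leading_zeros();
    let div_lz = div.leading_zeros();
    // Preconditions established by the earlier branches of the real algorithm:
    // the quotient can exceed 1, `duo` does not fit in a u16, and the significant
    // bits of `duo` and `div` start within 8 places of each other.
    assert!(div_lz > duo_lz && div_lz - duo_lz < 8 && duo_lz < 16);

    // Align the most significant 16 bits of `duo` and take the matching bits of `div`.
    let shift = 16 - duo_lz;
    let duo_sig = (duo >> shift) as u16;
    let div_sig = (div >> shift) as u16;
    // One small division gives an estimate that is either the true quotient or one too big.
    let mul = (duo_sig / div_sig) as u64;

    let tmp = mul * (div as u64);
    if tmp > duo as u64 {
        // Overestimated by exactly one: correct without dividing again.
        let rem = (duo as u64) + (div as u64) - tmp;
        ((mul - 1) as u32, rem as u32)
    } else {
        (mul as u32, (duo as u64 - tmp) as u32)
    }
}

fn main() {
    // These inputs exercise the "mul - 1" correction path.
    let (duo, div) = (0x89AB_CDEFu32, 0x0123_4567u32);
    assert_eq!(mul_or_mul_minus_1(duo, div), (duo / div, duo % div));
    println!("quotient and remainder: {:?}", mul_or_mul_minus_1(duo, div));
}

The chosen inputs make the single small division overestimate the quotient by one, so the assertion checks both the estimate and the branch that corrects it against the native operators.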