
Commit

Replace division implementations with code from the specialized-div-rem crate

Puts the asymmetric division behind a feature flag

Makes asymmetric-asm a default feature
AaronKutch committed Jan 28, 2020
1 parent 6de4f8f commit 71aba93
Showing 9 changed files with 1,160 additions and 321 deletions.
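Because `asymmetric-asm` becomes a default feature in this commit, opting out follows the usual Cargo pattern. The commands below are ordinary Cargo flags, shown only as a usage sketch and not part of the commit itself:

    # build with the default features, including asymmetric-asm
    cargo build
    # opt out of the asm-accelerated division while keeping compiler-builtins itself
    cargo build --no-default-features --features compiler-builtins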
5 changes: 4 additions & 1 deletion Cargo.toml
@@ -38,7 +38,7 @@ cc = { optional = true, version = "1.0" }
panic-handler = { path = 'crates/panic-handler' }

[features]
default = ["compiler-builtins"]
default = ["compiler-builtins", "asymmetric-asm"]

# Enable compilation of C code in compiler-rt, filling in some more optimized
# implementations and also filling in unimplemented intrinsics
@@ -60,6 +60,9 @@ no-lang-items = []
# Only used in the compiler's build system
rustc-dep-of-std = ['compiler-builtins', 'core']

# Used for faster u128 division on x86_64
asymmetric-asm = []

[[example]]
name = "intrinsics"
required-features = ["compiler-builtins"]
16 changes: 3 additions & 13 deletions src/int/mod.rs
@@ -1,22 +1,12 @@
use core::ops;

macro_rules! hty {
($ty:ty) => {
<$ty as LargeInt>::HighHalf
};
}

macro_rules! os_ty {
($ty:ty) => {
<$ty as Int>::OtherSign
};
}

pub mod addsub;
pub mod mul;
pub mod sdiv;
pub mod shift;

mod specialized_div_rem;
pub mod udiv;
pub mod sdiv;

/// Trait for some basic operations on integers
pub trait Int:
114 changes: 37 additions & 77 deletions src/int/sdiv.rs
@@ -1,101 +1,61 @@
use int::Int;
use super::specialized_div_rem::*;

trait Div: Int {
/// Returns `a / b`
fn div(self, other: Self) -> Self {
let s_a = self >> (Self::BITS - 1);
let s_b = other >> (Self::BITS - 1);
// NOTE it's OK to overflow here because of the `.unsigned()` below.
// This whole operation is computing the absolute value of the inputs
// So some overflow will happen when dealing with e.g. `i64::MIN`
// where the absolute value is `(-i64::MIN) as u64`
let a = (self ^ s_a).wrapping_sub(s_a);
let b = (other ^ s_b).wrapping_sub(s_b);
let s = s_a ^ s_b;

let r = a.unsigned().aborting_div(b.unsigned());
(Self::from_unsigned(r) ^ s) - s
}
}

impl Div for i32 {}
impl Div for i64 {}
impl Div for i128 {}

trait Mod: Int {
/// Returns `a % b`
fn mod_(self, other: Self) -> Self {
let s = other >> (Self::BITS - 1);
// NOTE(wrapping_sub) see comment in the `div`
let b = (other ^ s).wrapping_sub(s);
let s = self >> (Self::BITS - 1);
let a = (self ^ s).wrapping_sub(s);

let r = a.unsigned().aborting_rem(b.unsigned());
(Self::from_unsigned(r) ^ s) - s
}
}

impl Mod for i32 {}
impl Mod for i64 {}
impl Mod for i128 {}

trait Divmod: Int {
/// Returns `a / b` and sets `*rem = n % d`
fn divmod<F>(self, other: Self, rem: &mut Self, div: F) -> Self
where
F: Fn(Self, Self) -> Self,
{
let r = div(self, other);
// NOTE won't overflow because it's using the result from the
// previous division
*rem = self - r.wrapping_mul(other);
r
}
}

impl Divmod for i32 {}
impl Divmod for i64 {}
// NOTE: there are aborts inside the specialized_div_rem functions if division by 0
// is encountered, however these should be unreachable and optimized away unless
// uses of `std/core::intrinsics::unchecked_div/rem` do not have a 0 check in front
// of them.

intrinsics! {
#[maybe_use_optimized_c_shim]
#[arm_aeabi_alias = __aeabi_idiv]
/// Returns `n / d`
pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 {
a.div(b)
i32_div_rem(a, b).0
}

#[maybe_use_optimized_c_shim]
pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
a.div(b)
/// Returns `n % d`
pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
i32_div_rem(a, b).1
}

#[win64_128bit_abi_hack]
pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
a.div(b)
#[maybe_use_optimized_c_shim]
/// Returns `n / d` and sets `*rem = n % d`
pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
let quo_rem = i32_div_rem(a, b);
*rem = quo_rem.1;
quo_rem.0
}

#[maybe_use_optimized_c_shim]
pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
a.mod_(b)
/// Returns `n / d`
pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
i64_div_rem(a, b).0
}

#[maybe_use_optimized_c_shim]
/// Returns `n % d`
pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 {
a.mod_(b)
i64_div_rem(a, b).1
}

#[win64_128bit_abi_hack]
pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
a.mod_(b)

#[aapcs_on_arm]
/// Returns `n / d` and sets `*rem = n % d`
pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
let quo_rem = i64_div_rem(a, b);
*rem = quo_rem.1;
quo_rem.0
}

#[maybe_use_optimized_c_shim]
pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
a.divmod(b, rem, |a, b| __divsi3(a, b))
#[win64_128bit_abi_hack]
/// Returns `n / d`
pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
i128_div_rem(a, b).0
}

#[aapcs_on_arm]
pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
a.divmod(b, rem, |a, b| __divdi3(a, b))

#[win64_128bit_abi_hack]
/// Returns `n % d`
pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
i128_div_rem(a, b).1
}
}
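For reference, the rewritten intrinsics keep the usual C/LLVM semantics: the quotient truncates toward zero and the remainder takes the dividend's sign, so `quo * b + rem == a` always holds. A minimal standalone check of that contract using Rust's built-in operators (an illustration, not code from this commit):

    fn main() {
        for &(a, b) in &[(7i64, 2i64), (-7, 2), (7, -2), (-7, -2)] {
            // the same pairing that __divmoddi4 returns through its two outputs
            let (quo, rem) = (a / b, a % b);
            assert_eq!(quo.wrapping_mul(b).wrapping_add(rem), a); // quo * b + rem == a
            assert!(rem == 0 || (rem < 0) == (a < 0)); // remainder keeps the dividend's sign
        }
    }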
175 changes: 175 additions & 0 deletions src/int/specialized_div_rem/asymmetric.rs
@@ -0,0 +1,175 @@
macro_rules! impl_asymmetric {
(
$unsigned_name:ident, // name of the unsigned function
$signed_name:ident, // name of the signed function
$half_division:ident, // function for division of a $uX by a $uX
$asymmetric_division:ident, // function for division of a $uD by a $uX
$n_h:expr, // the number of bits in $iH or $uH
$uH:ident, // unsigned integer with half the bit width of $uX
$uX:ident, // unsigned integer with half the bit width of $uD
$uD:ident, // unsigned integer with double the bit width of $uX
$iD:ident, // signed version of $uD
$($unsigned_attr:meta),*; // attributes for the unsigned function
$($signed_attr:meta),* // attributes for the signed function
) => {
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
/// tuple.
///
/// This is optimized for dividing integers with the same bitwidth as the largest operand in
/// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction
/// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits.
///
/// # Panics
///
/// When attempting to divide by zero, this function will panic.
$(
#[$unsigned_attr]
)*
pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) {
#[inline(always)]
fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) {
let tmp = (lhs as $uD).wrapping_mul(rhs as $uD);
(tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
}
#[inline(always)]
fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) {
let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD);
(tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
}

let n: u32 = $n_h * 2;

// Many of these subalgorithms are taken from trifecta.rs, see that for better
// documentation

let duo_lo = duo as $uX;
let duo_hi = (duo >> n) as $uX;
let div_lo = div as $uX;
let div_hi = (div >> n) as $uX;
if div_hi == 0 {
if div_lo == 0 {
// division by zero
::abort();
}
if duo_hi < div_lo {
// plain $uD by $uX division that will fit into $uX
let tmp = unsafe { $asymmetric_division(duo, div_lo) };
return (tmp.0 as $uD, tmp.1 as $uD)
} else if (div_lo >> $n_h) == 0 {
// Short division of $uD by a $uH.
let div_0 = div_lo as $uH as $uX;
let (quo_hi, rem_3) = $half_division(duo_hi, div_0);

let duo_mid =
((duo >> $n_h) as $uH as $uX)
| (rem_3 << $n_h);
let (quo_1, rem_2) = $half_division(duo_mid, div_0);

let duo_lo =
(duo as $uH as $uX)
| (rem_2 << $n_h);
let (quo_0, rem_1) = $half_division(duo_lo, div_0);

return (
(quo_0 as $uD)
| ((quo_1 as $uD) << $n_h)
| ((quo_hi as $uD) << n),
rem_1 as $uD
)
} else {
// Short division using the $uD by $uX division
let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo);
let tmp = unsafe {
$asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo)
};
return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD)
}
}

let duo_lz = duo_hi.leading_zeros();
let div_lz = div_hi.leading_zeros();
let rel_leading_sb = div_lz.wrapping_sub(duo_lz);
if rel_leading_sb < $n_h {
// Some x86_64 CPUs have bad `divq` implementations that make putting
// a `mul` or `mul - 1` algorithm here beneficial
let shift = n.wrapping_sub(duo_lz);
let duo_sig_n = (duo >> shift) as $uX;
let div_sig_n = (div >> shift) as $uX;
let mul = $half_division(duo_sig_n, div_sig_n).0;
let div_lo = div as $uX;
let div_hi = (div >> n) as $uX;
let (tmp_lo, carry) = carrying_mul(mul,div_lo);
let (tmp_hi, overflow) = carrying_mul_add(mul,div_hi,carry);
let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
if ((overflow & 1) != 0) || (duo < tmp) {
return (
mul.wrapping_sub(1) as $uD,
duo.wrapping_add(div.wrapping_sub(tmp))
)
} else {
return (
mul as $uD,
duo.wrapping_sub(tmp)
)
}
} else {
// This has been adapted from
// https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn
// adapted from www.hackersdelight.org

// This is similar to the `mul` or `mul - 1` algorithm in that it uses only more
// significant parts of `duo` and `div` to divide a large integer with a smaller
// division instruction.
let tmp = unsafe {
$asymmetric_division(duo >> 1, ((div << div_lz) >> n) as $uX)
};
let mut quo = tmp.0 >> ((n - 1) - div_lz);
if quo != 0 {
quo -= 1;
}
// Note that this is a large $uD multiplication being used here
let mut rem = duo - ((quo as $uD) * div);

if rem >= div {
quo += 1;
rem -= div;
}
return (quo as $uD, rem)
}
}

/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
/// tuple.
///
/// This is optimized for dividing integers with the same bitwidth as the largest operand in
/// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction
/// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits.
///
/// # Panics
///
/// When attempting to divide by zero, this function will panic.
$(
#[$signed_attr]
)*
pub fn $signed_name(duo: $iD, div: $iD) -> ($iD,$iD) {
match (duo < 0, div < 0) {
(false,false) => {
let t = $unsigned_name(duo as $uD,div as $uD);
(t.0 as $iD,t.1 as $iD)
},
(true,false) => {
let t = $unsigned_name(duo.wrapping_neg() as $uD,div as $uD);
((t.0 as $iD).wrapping_neg(),(t.1 as $iD).wrapping_neg())
},
(false,true) => {
let t = $unsigned_name(duo as $uD,div.wrapping_neg() as $uD);
((t.0 as $iD).wrapping_neg(),t.1 as $iD)
},
(true,true) => {
let t = $unsigned_name(duo.wrapping_neg() as $uD,div.wrapping_neg() as $uD);
(t.0 as $iD,(t.1 as $iD).wrapping_neg())
},
}
}
}
}
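To make the metavariable-heavy signed wrapper above concrete, here is the same quadrant-based sign fix-up written out for i64/u64, with the plain built-in unsigned division standing in for the macro-generated unsigned function (an illustration only, not code from this commit):

    fn signed_div_rem(duo: i64, div: i64) -> (i64, i64) {
        // stand-in for the macro-generated unsigned division
        let unsigned = |x: u64, y: u64| (x / y, x % y);
        match (duo < 0, div < 0) {
            (false, false) => {
                let (q, r) = unsigned(duo as u64, div as u64);
                (q as i64, r as i64)
            }
            (true, false) => {
                let (q, r) = unsigned(duo.wrapping_neg() as u64, div as u64);
                ((q as i64).wrapping_neg(), (r as i64).wrapping_neg())
            }
            (false, true) => {
                let (q, r) = unsigned(duo as u64, div.wrapping_neg() as u64);
                ((q as i64).wrapping_neg(), r as i64)
            }
            (true, true) => {
                let (q, r) = unsigned(duo.wrapping_neg() as u64, div.wrapping_neg() as u64);
                (q as i64, (r as i64).wrapping_neg())
            }
        }
    }

    fn main() {
        // quotient truncates toward zero; remainder keeps the dividend's sign
        assert_eq!(signed_div_rem(-7, 2), (-3, -1));
        assert_eq!(signed_div_rem(7, -2), (-3, 1));
    }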