Skip to content

Commit

Permalink
fix division on SPARC
Browse files Browse the repository at this point in the history
  • Loading branch information
AaronKutch committed Nov 20, 2020
1 parent 63ccaf1 commit 78ea036
Show file tree
Hide file tree
Showing 3 changed files with 190 additions and 27 deletions.
130 changes: 130 additions & 0 deletions src/int/specialized_div_rem/delegate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,133 @@ macro_rules! impl_delegate {
}
};
}

/// Returns `duo / div` and sets `*rem = duo % div`.
///
/// This specialization exists because:
/// - The LLVM backend for 32-bit SPARC cannot compile functions that return `(u128, u128)`,
///   so we have to use an old-fashioned `&mut u128` argument to return the remainder.
/// - 64-bit SPARC does not have u64 * u64 => u128 widening multiplication, which makes the
///   delegate algorithm strategy the only reasonably fast way to perform `u128` division.
///
/// The operands are split into 64-bit halves and the work is dispatched on which halves are
/// zero. Wherever the operands allow it the division is handed to `u64_by_u64_div_rem`; the
/// remaining cases use a shift-and-subtract loop that produces the quotient one bit at a time.
///
/// Every returning path writes `*rem` before returning. A zero `div` is passed on to
/// `zero_div_fn`, which is declared as returning `!`.
#[doc(hidden)]
pub fn u128_divide_sparc(duo: u128, div: u128, rem: &mut u128) -> u128 {
// Brings the sibling helpers used below (`zero_div_fn`, `u64_by_u64_div_rem`,
// `u64_normalization_shift`) into scope.
use super::*;
// Low and high 64-bit halves of the dividend and the divisor.
let duo_lo = duo as u64;
let duo_hi = (duo >> 64) as u64;
let div_lo = div as u64;
let div_hi = (div >> 64) as u64;

// Dispatch on (low half of `div` is zero, high half of `div` is zero, high half of `duo` is
// zero).
match (div_lo == 0, div_hi == 0, duo_hi == 0) {
// Both halves of `div` are zero: division by zero.
(true, true, _) => zero_div_fn(),
// `div >= 2^64 > duo`: the quotient is zero and the whole dividend is the remainder.
(_, false, true) => {
*rem = duo;
return 0;
}
// Both operands fit in 64 bits: a single 64-bit division is enough.
(false, true, true) => {
let tmp = u64_by_u64_div_rem(duo_lo, div_lo);
*rem = tmp.1 as u128;
return tmp.0 as u128;
}
// `div` fits in 64 bits but `duo` does not.
(false, true, false) => {
if duo_hi < div_lo {
// Here `duo < div * 2^64`, so the quotient fits in 64 bits (hence the `u64`
// accumulator `quo_lo`).
// NOTE(review): `u64_normalization_shift` is defined elsewhere; presumably it returns the
// difference in leading zeros between its two arguments — confirm against its definition.
let norm_shift = u64_normalization_shift(div_lo, duo_hi, false);
// A `norm_shift` of zero is mapped to 63 rather than 64 so that `1 << shl` below still
// fits in a `u64`.
let shl = if norm_shift == 0 {
64 - 1
} else {
64 - norm_shift
};

// `div` shifted up to the starting position, and the quotient bit that position
// corresponds to. Both are shifted right by one on every iteration.
let mut div: u128 = div << shl;
let mut pow_lo: u64 = 1 << shl;
let mut quo_lo: u64 = 0;
let mut duo = duo;
// Shift-and-subtract. The loop has no separate exit condition; it ends by returning from
// inside once the remaining dividend fits in 64 bits.
loop {
let sub = duo.wrapping_sub(div);
// Keep the subtraction only if the wrapped result, reinterpreted as signed, is
// non-negative (sign bit clear), i.e. the subtraction did not go below zero.
if 0 <= (sub as i128) {
duo = sub;
quo_lo |= pow_lo;
let duo_hi = (duo >> 64) as u64;
if duo_hi == 0 {
// The rest of the dividend fits in 64 bits: finish with one 64-bit division and merge
// its quotient into the bits found so far.
let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
*rem = tmp.1 as u128;
return (quo_lo | tmp.0) as u128;
}
}
div >>= 1;
pow_lo >>= 1;
}
} else if duo_hi == div_lo {
// The high half of `duo` equals the divisor, so it contributes exactly `1 << 64` to the
// quotient with nothing left over; only the low half still needs dividing.
let tmp = u64_by_u64_div_rem(duo as u64, div as u64);
*rem = tmp.1 as u128;
return (1 << 64) | (tmp.0 as u128);
} else {
// `duo_hi > div_lo`: the quotient needs more than 64 bits.
if (div_lo >> 32) == 0 {
// The divisor fits in 32 bits: schoolbook long division in three steps. Each partial
// remainder is below `div_0 < 2^32`, so it can be placed above the next 32 bits of
// `duo` and still fit in a `u64`.
let div_0 = div_lo as u32 as u64;
// Step 1: the high 64 bits of `duo`.
let (quo_hi, rem_3) = u64_by_u64_div_rem(duo_hi, div_0);

// Step 2: previous remainder followed by bits 32..64 of `duo`.
let duo_mid = ((duo >> 32) as u32 as u64) | (rem_3 << 32);
let (quo_1, rem_2) = u64_by_u64_div_rem(duo_mid, div_0);

// Step 3: previous remainder followed by bits 0..32 of `duo` (shadows the outer
// `duo_lo`).
let duo_lo = (duo as u32 as u64) | (rem_2 << 32);
let (quo_0, rem_1) = u64_by_u64_div_rem(duo_lo, div_0);

*rem = rem_1 as u128;
// Reassemble the quotient from its three pieces.
return (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_hi as u128) << 64);
}

// General 64-bit divisor: divide the high half first. Its quotient is the high half of
// the result, and its remainder replaces the high half of the dividend.
let duo_lo = duo as u64;
let tmp = u64_by_u64_div_rem(duo_hi, div_lo);
let quo_hi = tmp.0;
let mut duo = (duo_lo as u128) | ((tmp.1 as u128) << 64);
if duo < div {
// Nothing left to divide: the low half of the quotient is zero.
*rem = duo;
return (quo_hi as u128) << 64;
}

// Shift-and-subtract for the low half of the quotient, starting from the top bit
// position (63). As above, the loop ends by returning once the remaining dividend fits
// in 64 bits.
let mut div: u128 = div << (64 - 1);
let mut pow_lo: u64 = 1 << (64 - 1);
let mut quo_lo: u64 = 0;
loop {
let sub = duo.wrapping_sub(div);
// Keep the subtraction only if it did not go below zero (sign bit clear).
if 0 <= (sub as i128) {
duo = sub;
quo_lo |= pow_lo;
let duo_hi = (duo >> 64) as u64;
if duo_hi == 0 {
// Finish with one 64-bit division, then combine: final quotient bits, the bits found
// by the loop, and the high half computed before the loop.
let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
*rem = tmp.1 as u128;
return (tmp.0) as u128 | (quo_lo as u128) | ((quo_hi as u128) << 64);
}
}
div >>= 1;
pow_lo >>= 1;
}
}
}
// Both `duo` and `div` are at least `2^64`, so the quotient is below `2^64`.
(_, false, false) => {
if duo < div {
*rem = duo;
return 0;
}
// The unshifted divisor is kept for the termination test inside the loop.
let div_original = div;
// NOTE(review): assumes `u64_normalization_shift(duo_hi, div_hi, false)` yields a shift
// below 64 that lines `div` up with `duo` — confirm against the helper's definition.
let shl = u64_normalization_shift(duo_hi, div_hi, false);
let mut duo = duo;
let mut div: u128 = div << shl;
let mut pow_lo: u64 = 1 << shl;
let mut quo_lo: u64 = 0;
// Shift-and-subtract until what is left of `duo` is smaller than the original divisor;
// that leftover is the remainder.
loop {
let sub = duo.wrapping_sub(div);
// Keep the subtraction only if it did not go below zero (sign bit clear).
if 0 <= (sub as i128) {
duo = sub;
quo_lo |= pow_lo;
if duo < div_original {
*rem = duo;
return quo_lo as u128;
}
}
div >>= 1;
pow_lo >>= 1;
}
}
}
}
49 changes: 28 additions & 21 deletions src/int/specialized_div_rem/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ mod binary_long;

#[macro_use]
mod delegate;
pub use self::delegate::u128_divide_sparc;

#[macro_use]
mod trifecta;
Expand All @@ -60,27 +61,31 @@ fn zero_div_fn() -> ! {
unsafe { core::hint::unreachable_unchecked() }
}

// The `B` extension on RISC-V determines if a CLZ assembly instruction exists
#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
const USE_LZ: bool = cfg!(target_feature = "b");

#[cfg(target_arch = "arm")]
const USE_LZ: bool = if cfg!(target_feature = "thumb-mode") {
// ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is supported. This
// is needed to successfully differentiate between targets like `thumbv8.base` and
// `thumbv8.main`.
cfg!(target_feature = "v6t2")
} else {
// Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is supported.
// Technically, ARMv5T was the first to have CLZ, but the "v5t" target feature does not seem to
// work.
cfg!(target_feature = "v5te")
/// `true` when the compilation target is expected to provide a hardware count-leading-zeros
/// instruction, decided entirely at compile time from the target architecture and its enabled
/// target features.
const USE_LZ: bool = {
    let on_arm = cfg!(target_arch = "arm");
    let on_sparc = cfg!(any(target_arch = "sparc", target_arch = "sparc64"));
    let on_riscv = cfg!(any(target_arch = "riscv32", target_arch = "riscv64"));

    if on_arm {
        let thumb = cfg!(target_feature = "thumb-mode");
        if thumb {
            // Thumb targets only have CLZ when the ARMv6T2 instruction set is supported. This
            // check is what tells targets like `thumbv8.base` and `thumbv8.main` apart.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ from ARMv5TE onwards. ARMv5T technically had it
            // first, but the "v5t" target feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if on_sparc {
        // On SPARC, LZD or LZCNT only exists with the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if on_riscv {
        // On RISC-V, the `B` extension determines whether a CLZ instruction exists.
        cfg!(target_feature = "b")
    } else {
        // Every other common target Rust supports should have CLZ instructions.
        true
    }
};

// All other targets Rust supports have CLZ instructions
#[cfg(not(any(target_arch = "arm", target_arch = "riscv32", target_arch = "riscv64")))]
const USE_LZ: bool = true;

impl_normalization_shift!(
u32_normalization_shift,
USE_LZ,
Expand Down Expand Up @@ -115,8 +120,9 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
// faster if the target pointer width is at least 64.
#[cfg(all(
not(any(target_pointer_width = "16", target_pointer_width = "32")),
not(all(not(feature = "no-asm"), target_arch = "x86_64")),
not(any(target_pointer_width = "16", target_pointer_width = "32"))
not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_trifecta!(
u128_div_rem,
Expand All @@ -131,8 +137,9 @@ impl_trifecta!(
// If the pointer width is less than 64, then the target architecture almost certainly does not have
// the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
#[cfg(all(
any(target_pointer_width = "16", target_pointer_width = "32"),
not(all(not(feature = "no-asm"), target_arch = "x86_64")),
any(target_pointer_width = "16", target_pointer_width = "32")
not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_delegate!(
u128_div_rem,
Expand Down
38 changes: 32 additions & 6 deletions src/int/udiv.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub use int::specialized_div_rem::u128_divide_sparc;
use int::specialized_div_rem::*;

intrinsics! {
Expand Down Expand Up @@ -46,25 +47,50 @@ intrinsics! {
quo_rem.0
}

// Note: we use block configuration and not `if cfg!(...)`, because we need to entirely disable
// the existence of `u128_div_rem` to get 32-bit SPARC to compile, see `u128_divide_sparc` docs.

#[win64_128bit_abi_hack]
/// Returns `n / d`
pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 {
u128_div_rem(n, d).0
#[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
u128_div_rem(n, d).0
}
#[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
u128_divide_sparc(n, d, &mut 0)
}
}

#[win64_128bit_abi_hack]
/// Returns `n % d`
pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 {
u128_div_rem(n, d).1
#[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
u128_div_rem(n, d).1
}
#[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
let mut rem = 0;
u128_divide_sparc(n, d, &mut rem);
rem
}
}

#[win64_128bit_abi_hack]
/// Returns `n / d` and sets `*rem = n % d`
pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
let quo_rem = u128_div_rem(n, d);
if let Some(rem) = rem {
*rem = quo_rem.1;
#[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
let quo_rem = u128_div_rem(n, d);
if let Some(rem) = rem {
*rem = quo_rem.1;
}
quo_rem.0
}
#[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
let mut tmp = 0;
let quo = u128_divide_sparc(n, d, &mut tmp);
if let Some(rem) = rem {
*rem = tmp;
}
quo
}
quo_rem.0
}
}

0 comments on commit 78ea036

Please sign in to comment.