From 71aba933ba61aa60eaf81be074ccb64c610cc768 Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Wed, 18 Dec 2019 18:49:55 -0600 Subject: [PATCH] Replace division implementations with code from the specialized-div-rem crate Puts the asymmetric division behind a feature flag Makes asymmetric-asm a default feature --- Cargo.toml | 5 +- src/int/mod.rs | 16 +- src/int/sdiv.rs | 114 ++---- src/int/specialized_div_rem/asymmetric.rs | 175 ++++++++ src/int/specialized_div_rem/binary_long.rs | 113 +++++ src/int/specialized_div_rem/delegate.rs | 205 +++++++++ src/int/specialized_div_rem/mod.rs | 133 ++++++ src/int/specialized_div_rem/trifecta.rs | 456 +++++++++++++++++++++ src/int/udiv.rs | 264 ++---------- 9 files changed, 1160 insertions(+), 321 deletions(-) create mode 100644 src/int/specialized_div_rem/asymmetric.rs create mode 100644 src/int/specialized_div_rem/binary_long.rs create mode 100644 src/int/specialized_div_rem/delegate.rs create mode 100644 src/int/specialized_div_rem/mod.rs create mode 100644 src/int/specialized_div_rem/trifecta.rs diff --git a/Cargo.toml b/Cargo.toml index 39c4553f..8a4af103 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ cc = { optional = true, version = "1.0" } panic-handler = { path = 'crates/panic-handler' } [features] -default = ["compiler-builtins"] +default = ["compiler-builtins", "asymmetric-asm"] # Enable compilation of C code in compiler-rt, filling in some more optimized # implementations and also filling in unimplemented intrinsics @@ -60,6 +60,9 @@ no-lang-items = [] # Only used in the compiler's build system rustc-dep-of-std = ['compiler-builtins', 'core'] +# Used for faster u128 division on x86_64 +asymmetric-asm = [] + [[example]] name = "intrinsics" required-features = ["compiler-builtins"] diff --git a/src/int/mod.rs b/src/int/mod.rs index 7587bc69..1ab49f5a 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -1,22 +1,12 @@ use core::ops; -macro_rules! hty { - ($ty:ty) => { - <$ty as LargeInt>::HighHalf - }; -} - -macro_rules! os_ty { - ($ty:ty) => { - <$ty as Int>::OtherSign - }; -} - pub mod addsub; pub mod mul; -pub mod sdiv; pub mod shift; + +mod specialized_div_rem; pub mod udiv; +pub mod sdiv; /// Trait for some basic operations on integers pub trait Int: diff --git a/src/int/sdiv.rs b/src/int/sdiv.rs index c9e252cc..f75f2ca9 100644 --- a/src/int/sdiv.rs +++ b/src/int/sdiv.rs @@ -1,101 +1,61 @@ -use int::Int; +use super::specialized_div_rem::*; -trait Div: Int { - /// Returns `a / b` - fn div(self, other: Self) -> Self { - let s_a = self >> (Self::BITS - 1); - let s_b = other >> (Self::BITS - 1); - // NOTE it's OK to overflow here because of the `.unsigned()` below. - // This whole operation is computing the absolute value of the inputs - // So some overflow will happen when dealing with e.g. 
`i64::MIN` - // where the absolute value is `(-i64::MIN) as u64` - let a = (self ^ s_a).wrapping_sub(s_a); - let b = (other ^ s_b).wrapping_sub(s_b); - let s = s_a ^ s_b; - - let r = a.unsigned().aborting_div(b.unsigned()); - (Self::from_unsigned(r) ^ s) - s - } -} - -impl Div for i32 {} -impl Div for i64 {} -impl Div for i128 {} - -trait Mod: Int { - /// Returns `a % b` - fn mod_(self, other: Self) -> Self { - let s = other >> (Self::BITS - 1); - // NOTE(wrapping_sub) see comment in the `div` - let b = (other ^ s).wrapping_sub(s); - let s = self >> (Self::BITS - 1); - let a = (self ^ s).wrapping_sub(s); - - let r = a.unsigned().aborting_rem(b.unsigned()); - (Self::from_unsigned(r) ^ s) - s - } -} - -impl Mod for i32 {} -impl Mod for i64 {} -impl Mod for i128 {} - -trait Divmod: Int { - /// Returns `a / b` and sets `*rem = n % d` - fn divmod(self, other: Self, rem: &mut Self, div: F) -> Self - where - F: Fn(Self, Self) -> Self, - { - let r = div(self, other); - // NOTE won't overflow because it's using the result from the - // previous division - *rem = self - r.wrapping_mul(other); - r - } -} - -impl Divmod for i32 {} -impl Divmod for i64 {} +// NOTE: there are aborts inside the specialized_div_rem functions if division by 0 +// is encountered, however these should be unreachable and optimized away unless +// uses of `std/core::intrinsics::unchecked_div/rem` do not have a 0 check in front +// of them. intrinsics! { #[maybe_use_optimized_c_shim] #[arm_aeabi_alias = __aeabi_idiv] + /// Returns `n / d` pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 { - a.div(b) + i32_div_rem(a, b).0 } #[maybe_use_optimized_c_shim] - pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { - a.div(b) + /// Returns `n % d` + pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { + i32_div_rem(a, b).1 } - #[win64_128bit_abi_hack] - pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { - a.div(b) + #[maybe_use_optimized_c_shim] + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { + let quo_rem = i32_div_rem(a, b); + *rem = quo_rem.1; + quo_rem.0 } #[maybe_use_optimized_c_shim] - pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { - a.mod_(b) + /// Returns `n / d` + pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { + i64_div_rem(a, b).0 } #[maybe_use_optimized_c_shim] + /// Returns `n % d` pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 { - a.mod_(b) + i64_div_rem(a, b).1 } - - #[win64_128bit_abi_hack] - pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { - a.mod_(b) + + #[aapcs_on_arm] + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { + let quo_rem = i64_div_rem(a, b); + *rem = quo_rem.1; + quo_rem.0 } - #[maybe_use_optimized_c_shim] - pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { - a.divmod(b, rem, |a, b| __divsi3(a, b)) + #[win64_128bit_abi_hack] + /// Returns `n / d` + pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { + i128_div_rem(a, b).0 } - - #[aapcs_on_arm] - pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { - a.divmod(b, rem, |a, b| __divdi3(a, b)) + + #[win64_128bit_abi_hack] + /// Returns `n % d` + pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { + i128_div_rem(a, b).1 } } diff --git a/src/int/specialized_div_rem/asymmetric.rs b/src/int/specialized_div_rem/asymmetric.rs new file mode 100644 index 00000000..133d8249 --- /dev/null +++ b/src/int/specialized_div_rem/asymmetric.rs @@ -0,0 +1,175 @@ +macro_rules! 
impl_asymmetric { + ( + $unsigned_name:ident, // name of the unsigned function + $signed_name:ident, // name of the signed function + $half_division:ident, // function for division of a $uX by a $uX + $asymmetric_division:ident, // function for division of a $uD by a $uX + $n_h:expr, // the number of bits in $iH or $uH + $uH:ident, // unsigned integer with half the bit width of $uX + $uX:ident, // unsigned integer with half the bit width of $uD + $uD:ident, // unsigned integer with double the bit width of $uX + $iD:ident, // signed version of $uD + $($unsigned_attr:meta),*; // attributes for the unsigned function + $($signed_attr:meta),* // attributes for the signed function + ) => { + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This is optimized for dividing integers with the same bitwidth as the largest operand in + /// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction + /// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. + $( + #[$unsigned_attr] + )* + pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) { + #[inline(always)] + fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(rhs as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + #[inline(always)] + fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + + let n: u32 = $n_h * 2; + + // Many of these subalgorithms are taken from trifecta.rs, see that for better + // documentation + + let duo_lo = duo as $uX; + let duo_hi = (duo >> n) as $uX; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + if div_hi == 0 { + if div_lo == 0 { + // division by zero + ::abort(); + } + if duo_hi < div_lo { + // plain $uD by $uX division that will fit into $uX + let tmp = unsafe { $asymmetric_division(duo, div_lo) }; + return (tmp.0 as $uD, tmp.1 as $uD) + } else if (div_lo >> $n_h) == 0 { + // Short division of $uD by a $uH. 
+ let div_0 = div_lo as $uH as $uX; + let (quo_hi, rem_3) = $half_division(duo_hi, div_0); + + let duo_mid = + ((duo >> $n_h) as $uH as $uX) + | (rem_3 << $n_h); + let (quo_1, rem_2) = $half_division(duo_mid, div_0); + + let duo_lo = + (duo as $uH as $uX) + | (rem_2 << $n_h); + let (quo_0, rem_1) = $half_division(duo_lo, div_0); + + return ( + (quo_0 as $uD) + | ((quo_1 as $uD) << $n_h) + | ((quo_hi as $uD) << n), + rem_1 as $uD + ) + } else { + // Short division using the $uD by $uX division + let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo); + let tmp = unsafe { + $asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo) + }; + return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD) + } + } + + let duo_lz = duo_hi.leading_zeros(); + let div_lz = div_hi.leading_zeros(); + let rel_leading_sb = div_lz.wrapping_sub(duo_lz); + if rel_leading_sb < $n_h { + // Some x86_64 CPUs have bad `divq` implementations that make putting + // a `mul` or `mul - 1` algorithm here beneficial + let shift = n.wrapping_sub(duo_lz); + let duo_sig_n = (duo >> shift) as $uX; + let div_sig_n = (div >> shift) as $uX; + let mul = $half_division(duo_sig_n, div_sig_n).0; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + let (tmp_lo, carry) = carrying_mul(mul,div_lo); + let (tmp_hi, overflow) = carrying_mul_add(mul,div_hi,carry); + let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); + if ((overflow & 1) != 0) || (duo < tmp) { + return ( + mul.wrapping_sub(1) as $uD, + duo.wrapping_add(div.wrapping_sub(tmp)) + ) + } else { + return ( + mul as $uD, + duo.wrapping_sub(tmp) + ) + } + } else { + // This has been adapted from + // https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn + // adapted from www.hackersdelight.org + + // This is similar to the `mul` or `mul - 1` algorithm in that it uses only more + // significant parts of `duo` and `div` to divide a large integer with a smaller + // division instruction. + let tmp = unsafe { + $asymmetric_division(duo >> 1, ((div << div_lz) >> n) as $uX) + }; + let mut quo = tmp.0 >> ((n - 1) - div_lz); + if quo != 0 { + quo -= 1; + } + // Note that this is a large $uD multiplication being used here + let mut rem = duo - ((quo as $uD) * div); + + if rem >= div { + quo += 1; + rem -= div; + } + return (quo as $uD, rem) + } + } + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This is optimized for dividing integers with the same bitwidth as the largest operand in + /// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction + /// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. 
+ $( + #[$signed_attr] + )* + pub fn $signed_name(duo: $iD, div: $iD) -> ($iD,$iD) { + match (duo < 0, div < 0) { + (false,false) => { + let t = $unsigned_name(duo as $uD,div as $uD); + (t.0 as $iD,t.1 as $iD) + }, + (true,false) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD,div as $uD); + ((t.0 as $iD).wrapping_neg(),(t.1 as $iD).wrapping_neg()) + }, + (false,true) => { + let t = $unsigned_name(duo as $uD,div.wrapping_neg() as $uD); + ((t.0 as $iD).wrapping_neg(),t.1 as $iD) + }, + (true,true) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD,div.wrapping_neg() as $uD); + (t.0 as $iD,(t.1 as $iD).wrapping_neg()) + }, + } + } + } +} diff --git a/src/int/specialized_div_rem/binary_long.rs b/src/int/specialized_div_rem/binary_long.rs new file mode 100644 index 00000000..dc80b75f --- /dev/null +++ b/src/int/specialized_div_rem/binary_long.rs @@ -0,0 +1,113 @@ +macro_rules! impl_binary_long { + ( + $unsigned_name:ident, // name of the unsigned function + $signed_name:ident, // name of the signed function + $n:expr, // the number of bits in a $iX or $uX + $uX:ident, // unsigned integer that will be shifted + $iX:ident, // signed version of $uX + $($unsigned_attr:meta),*; // attributes for the unsigned function + $($signed_attr:meta),* // attributes for the signed function + ) => { + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This uses binary shift long division only, and is designed for CPUs without fast + /// multiplication or division hardware. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. + $( + #[$unsigned_attr] + )* + pub fn $unsigned_name(duo: $uX, div: $uX) -> ($uX,$uX) { + if div == 0 { + // division by zero + ::abort(); + } + + // Full $uX binary long division. Use `leading_zeros` on the first round, + // because we assume that the average usage of division has arguments that + // are random but have a significant number of leading zero bits. Doing + // `leading_zeros` for every round would be very expensive, especially for + // CPUs without a native count leading zeros instruction, but doing it just + // for the first round is advantageous for both performance of the common + // case and for code simplicity. Note that many benchmarks use the full + // `n_d` bits for `duo`, and the benchmarks with several bits less have a + // good performance increase. + + let div_lz = div.leading_zeros(); + let duo_lz = duo.leading_zeros(); + + if div_lz < duo_lz { + return (0, duo) + } + + // Figures out how far `div` should be shifted to align most significant + // bits + let mut shift = div_lz - duo_lz; + let mut duo = duo; + let mut div = div << shift; + let mut quo = 0; + loop { + // There is a way to do this without branching, but requires too many extra + // operations to be faster: + // let sub = duo.wrapping_sub(div); + // let sign_mask = !(((sub as $iD) >> (n_d - 1)) as $uD); + // duo -= div & sign_mask; + // quo |= sign_mask & 1; + let sub = duo.wrapping_sub(div); + if (sub as $iX) >= 0 { + duo = sub; + quo |= 1; + } + + if duo == 0 { + // must have this branch for inputs that are not as random as what is used in + // the benchmarks + return (quo << shift, duo) + } + if shift == 0 { + return (quo, duo) + } + shift -= 1; + div >>= 1; + quo <<= 1; + } + } + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. 
+ /// + /// This uses binary shift long division only, and is designed for CPUs without fast + /// multiplication or division hardware. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. + $( + #[$signed_attr] + )* + pub fn $signed_name(duo: $iX, div: $iX) -> ($iX,$iX) { + match (duo < 0, div < 0) { + (false,false) => { + let t = $unsigned_name(duo as $uX,div as $uX); + (t.0 as $iX,t.1 as $iX) + }, + (true,false) => { + let t = $unsigned_name(duo.wrapping_neg() as $uX,div as $uX); + ((t.0 as $iX).wrapping_neg(),(t.1 as $iX).wrapping_neg()) + }, + (false,true) => { + let t = $unsigned_name(duo as $uX,div.wrapping_neg() as $uX); + ((t.0 as $iX).wrapping_neg(),t.1 as $iX) + }, + (true,true) => { + let t = $unsigned_name(duo.wrapping_neg() as $uX,div.wrapping_neg() as $uX); + (t.0 as $iX,(t.1 as $iX).wrapping_neg()) + }, + } + } + } +} diff --git a/src/int/specialized_div_rem/delegate.rs b/src/int/specialized_div_rem/delegate.rs new file mode 100644 index 00000000..bd6fc117 --- /dev/null +++ b/src/int/specialized_div_rem/delegate.rs @@ -0,0 +1,205 @@ +macro_rules! impl_delegate { + ( + $unsigned_name:ident, // name of the unsigned function + $signed_name:ident, // name of the signed function + $half_division:ident, // function for division of a $uX by a $uX + $n_h:expr, // the number of bits in $iH or $uH + $uH:ident, // unsigned integer with half the bit width of $uX + $uX:ident, // unsigned integer with half the bit width of $uD + $uD:ident, // unsigned integer with double the bit width of $uX + $iD:ident, // signed version of $uD + $($unsigned_attr:meta),*; // attributes for the unsigned function + $($signed_attr:meta),* // attributes for the signed function + ) => { + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This uses binary shift long division, but if it can delegates work to a smaller + /// division. This function is used for CPUs with a register size smaller than the division + /// size, and that do not have fast multiplication or division hardware. For CPUs with a + /// register size equal to the division size, the `_binary_long` functions are probably + /// faster. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. + $( + #[$unsigned_attr] + )* + pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) { + // the number of bits in a $uX + let n = $n_h * 2; + + let duo_lo = duo as $uX; + let duo_hi = (duo >> n) as $uX; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + + match (div_lo == 0, div_hi == 0, duo_hi == 0) { + (true, true, _) => { + // division by zero + ::abort(); + } + (_,false,true) => { + // `duo` < `div` + return (0, duo) + } + (false, true, true) => { + // delegate to smaller division + let tmp = $half_division(duo_lo, div_lo); + return (tmp.0 as $uD, tmp.1 as $uD) + } + (false,true,false) => { + if (div_lo >> $n_h) == 0 { + // Short division of $uD by a $uH, using $uX by $uX division + let div_0 = div_lo as $uH as $uX; + let (quo_hi, rem_3) = $half_division(duo_hi, div_0); + + let duo_mid = + ((duo >> $n_h) as $uH as $uX) + | (rem_3 << $n_h); + let (quo_1, rem_2) = $half_division(duo_mid, div_0); + + let duo_lo = + (duo as $uH as $uX) + | (rem_2 << $n_h); + let (quo_0, rem_1) = $half_division(duo_lo, div_0); + + return ( + (quo_0 as $uD) + | ((quo_1 as $uD) << $n_h) + | ((quo_hi as $uD) << n), + rem_1 as $uD + ) + } else { + // Binary long division of $uD by $uX. 
This is the same as below, but the + // compiler should be able to do the shifts faster. See the other long + // division below for more. + let div_lz = div_lo.leading_zeros() + n; + let duo_lz = duo_hi.leading_zeros(); + + // initial check for `div_lz < duo_lz` not needed here + + let mut shift = div_lz - duo_lz; + let mut duo = duo; + // There are edge cases where `quo` needs to be n + 1 bits before the $uX by + // $uX division can be used, e.x. 0xFFFF_FFF8 / 0x7FFF. The first iteration + // is done before the loop, and the 1 bit it adds to `quo` is added at the + // end. + let mut sub = (div_lo as $uD) << shift; + let initial_shift = shift; + + let quo_edge = if sub <= duo { + duo -= sub; + true + } else { + false + }; + let mut quo_hi: $uX = 0; + loop { + let duo_hi = (duo >> n) as $uX; + if duo_hi == 0 || shift == 0 { + // delegate what remains to $uX by $uX division + let duo_lo = duo as $uX; + let tmp = $half_division(duo_lo, div_lo); + let (quo_lo, rem_lo) = (tmp.0, tmp.1); + return ( + (quo_lo as $uD) + | ((quo_hi as $uD) << shift) + | ((quo_edge as $uD) << initial_shift), + rem_lo as $uD + ) + } + shift -= 1; + quo_hi <<= 1; + sub >>= 1; + if sub <= duo { + duo -= sub; + quo_hi |= 1; + } + } + } + } + (_,false,false) => { + // Full $uD binary long division. Use `leading_zeros` on the first round, + // because we assume that the average usage of division has arguments that + // are random but have a significant number of leading zero bits. Doing + // `leading_zeros` for every round would be very expensive, especially for + // CPUs without a native count leading zeros instruction, but doing it just + // for the first round is advantageous for both performance of the common + // case and for code simplicity. Note that many benchmarks use the full + // `n_d` bits for `duo`, and the benchmarks with several bits less have a + // good performance increase. + + // The "mul or mul - 1" algorithm is not used here, since CPUs without division + // hardware also probably do not have fast multiplication hardware. + + let div_lz = div_hi.leading_zeros(); + let duo_lz = duo_hi.leading_zeros(); + + if div_lz < duo_lz { + return (0, duo) + } + + // Figures out how far `div` should be shifted to align most significant + // bits + let mut shift = div_lz - duo_lz; + let mut duo = duo; + let mut quo = 0; + let mut sub = div << shift; + loop { + if sub <= duo { + // the subtraction will not overflow + duo -= sub; + quo |= 1; + } + let duo_hi = (duo >> n) as $uX; + if duo_hi == 0 || shift == 0 { + return (quo << shift, duo) + } + shift -= 1; + sub >>= 1; + quo <<= 1; + } + } + } + } + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This uses binary shift long division, but if it can delegates work to a smaller + /// division. This function is used for CPUs with a register size smaller than the division + /// size, and that do not have fast multiplication or division hardware. For CPUs with a + /// register size equal to the division size, the `_binary_long` functions are probably + /// faster. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. 
+ $( + #[$signed_attr] + )* + pub fn $signed_name(duo: $iD, div: $iD) -> ($iD,$iD) { + match (duo < 0, div < 0) { + (false,false) => { + let t = $unsigned_name(duo as $uD,div as $uD); + (t.0 as $iD,t.1 as $iD) + }, + (true,false) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD,div as $uD); + ((t.0 as $iD).wrapping_neg(),(t.1 as $iD).wrapping_neg()) + }, + (false,true) => { + let t = $unsigned_name(duo as $uD,div.wrapping_neg() as $uD); + ((t.0 as $iD).wrapping_neg(),t.1 as $iD) + }, + (true,true) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD,div.wrapping_neg() as $uD); + (t.0 as $iD,(t.1 as $iD).wrapping_neg()) + }, + } + } + } +} diff --git a/src/int/specialized_div_rem/mod.rs b/src/int/specialized_div_rem/mod.rs new file mode 100644 index 00000000..9e55cb26 --- /dev/null +++ b/src/int/specialized_div_rem/mod.rs @@ -0,0 +1,133 @@ +// This `specialized_div_rem` module is from version 0.2.0 of the +// `specialized-div-rem` crate (the build system cannot handle +// including it as a regular dependency). + +// The purpose of these macros is to easily change the both the division algorithm used +// for a given integer size and the half division used by that algorithm. For example, +// most architectures do the following: +// +// - `impl_trifecta!` is enabled by feature flags. This impl makes u128 divisions use +// the trifecta algorithm which in turn calls `u64_by_u64_div_rem` for its half +// division. The `u64_by_u64_div_rem` function is defined as simply calling whatever +// division is defined for u64 integers. +// - If the architecture does not supply a 64 bit hardware division instruction, linkers +// will look for functions such as `__udivdi3` which is defined in `compiler-builtins`. This will +// call `u64_div_rem` which is defined by the `impl_delegate!` below. The half division +// is set to be `u32_by_u32_div_rem`. In turn, the `u32_by_u32_div_rem` function is defined +// as whatever is defined for u32 integers. +// - If the architecture does not supply a 32 bit hardware instruction, `__udivsi3` will +// be used which calls `u32_div_rem_binary_long`. The `impl_binary_long!` macro uses +// no half division, so the chain of calls ends there. +// - LLVM supplies 16 bit divisions and smaller +// +// I could replace all `u64_by_u64_div_rem` in the macros by `u64_div_rem` and likewise +// for `u32_by_u32_div_rem`, but that would mean that hardware divisions would never be +// used. +// +// If for some reason benchmarks showed that u128 divisions worked best with a +// subalgorithm that wasn't the same as what is used for u64 divisions, I could add a +// different function like `u64_by_u64_div_rem_alternative` and change the +// `impl_trifecta!` macro to use that half division instead. These functions are mostly +// changed for benchmarking purposes to see what chain of algorithms is fastest. +// +// On x86_64, an asymmetrically sized division is supplied which means that +// `impl_asymmetric!` has two different subalgorithms: `u128_by_u64_div_rem` and +// `u64_by_u64_div_rem`. + +#[macro_use] +mod binary_long; + +#[macro_use] +mod delegate; + +#[cfg(not(target = "x86_64"))] +#[macro_use] +mod trifecta; + +#[cfg(target = "x86_64")] +#[macro_use] +mod asymmetric; + +// `_trifecta` is efficient for large divisions, even when division +// hardware is not availiable at all. 
+ +#[cfg(not(target = "x86_64"))] +impl_trifecta!( + u128_div_rem, + i128_div_rem, + u64_by_u64_div_rem, + 32, + u32, + u64, + u128, + i128, + inline; + inline +); + +/// This function is unsafe, because if the quotient of `duo` and `div` does not +/// fit in a `u64`, a floating point exception is thrown. +#[cfg(all(feature = "asymmetric-asm", target = "x86_64"))] +#[inline] +unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) { + let quo: u64; + let rem: u64; + let duo_lo = duo as u64; + let duo_hi = (duo >> 64) as u64; + asm!("divq $4" + : "={rax}"(quo), "={rdx}"(rem) + : "{rax}"(duo_lo), "{rdx}"(duo_hi), "r"(div) + : "rax", "rdx" + ); + return (quo, rem) +} + +// use `_asymmetric` instead of `_trifecta`, because x86_64 supplies the `divq` instruction +#[cfg(target = "x86_64")] +impl_asymmetric!( + u128_div_rem, + i128_div_rem, + u64_by_u64_div_rem, + u128_by_u64_div_rem, + 32, + u32, + u64, + u128, + i128, + inline; + inline +); + +#[inline] +fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) { + (duo / div, duo % div) +} + +// `_delegate` is most efficient in the 64 bit range +impl_delegate!( + u64_div_rem, + i64_div_rem, + u32_by_u32_div_rem, + 16, + u16, + u32, + u64, + i64, + inline; + inline +); + +#[inline] +fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) { + (duo / div, duo % div) +} + +impl_binary_long!( + u32_div_rem, + i32_div_rem, + 32, + u32, + i32, + inline; + inline +); diff --git a/src/int/specialized_div_rem/trifecta.rs b/src/int/specialized_div_rem/trifecta.rs new file mode 100644 index 00000000..12652b6f --- /dev/null +++ b/src/int/specialized_div_rem/trifecta.rs @@ -0,0 +1,456 @@ +macro_rules! impl_trifecta { + ( + $unsigned_name:ident, // name of the unsigned function + $signed_name:ident, // name of the signed function + $half_division:ident, // function for division of a $uX by a $uX + $n_h:expr, // the number of bits in $iH or $uH + $uH:ident, // unsigned integer with half the bit width of $uX + $uX:ident, // the largest division instruction that this function calls operates on this + $uD:ident, // unsigned integer with double the bit width of $uX + $iD:ident, // signed version of $uD + $($unsigned_attr:meta),*; // attributes for the unsigned function + $($signed_attr:meta),* // attributes for the signed function + ) => { + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This is optimized for division of two integers with bit widths twice as large + /// as the largest hardware integer division supported. Note that some architectures supply + /// a division of an integer larger than register size by a regular sized integer (e.x. + /// x86_64 has a `divq` instruction which can divide a 128 bit integer by a 64 bit integer). + /// In that case, the `_asymmetric` algorithm should be used instead of this one. + /// + /// This is called the trifecta algorithm because it uses three main algorithms: short + /// division for small divisors, the "mul or mul - 1" algorithm for when the divisor is + /// large enough for the quotient to be determined to be one of two values via only one + /// small division, and an undersubtracting long division algorithm for middle cases. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. 
+ $( + #[$unsigned_attr] + )* + pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) { + // wrapping_{} was used everywhere for better performance when compiling with + // `debug-assertions = true` + + // This replicates `carrying_mul` (rust-lang rfc #2417). LLVM correctly optimizes this + // to use a widening multiply to 128 bits. + #[inline(always)] + fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(rhs as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + #[inline(always)] + fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + + // the number of bits in a $uX + let n = $n_h * 2; + + // This should be optimized away because of checks for zero upstream + if div == 0 { + // division by zero + ::abort(); + } + + // Note that throughout this function, `lo` and `hi` refer to the high and low `n` bits + // of a `$uD`, `0` to `3` refer to the 4 `n_h` bit parts of a `$uD`, + // and `mid` refer to the middle two `n_h` parts. + + let div_lz = div.leading_zeros(); + let mut duo_lz = duo.leading_zeros(); + + // the possible ranges of `duo` and `div` at this point: + // `0 <= duo < 2^n_d` + // `1 <= div < 2^n_d` + + // quotient is 0 or 1 branch + if div_lz <= duo_lz { + // The quotient cannot be more than 1. The highest set bit of `duo` needs to be at + // least one place higher than `div` for the quotient to be more than 1. + if duo >= div { + return (1,duo.wrapping_sub(div)) + } else { + return (0,duo) + } + } + + // `_sb` is the number of significant bits (from the ones place to the highest set bit) + // `{2, 2^div_sb} <= duo < 2^n_d` + // `1 <= div < {2^duo_sb, 2^(n_d - 1)}` + // smaller division branch + if duo_lz >= n { + // `duo < 2^n` so it will fit in a $uX. `div` will also fit in a $uX (because of the + // `div_lz <= duo_lz` branch) so no numerical error. + let tmp = $half_division(duo as $uX, div as $uX); + return ( + tmp.0 as $uD, + tmp.1 as $uD + ) + } + + // relative leading significant bits, cannot overflow because of above branches + let rel_leading_sb = div_lz.wrapping_sub(duo_lz); + + // `{2^n, 2^div_sb} <= duo < 2^n_d` + // `1 <= div < {2^duo_sb, 2^(n_d - 1)}` + // short division branch + if div_lz >= (n + $n_h) { + // `1 <= div < {2^duo_sb, 2^n_h}` + let duo_hi = (duo >> n) as $uX; + let div_0 = div as $uH as $uX; + let (quo_hi, rem_3) = $half_division(duo_hi, div_0); + + let duo_mid = + ((duo >> $n_h) as $uH as $uX) + | (rem_3 << $n_h); + let (quo_1, rem_2) = $half_division(duo_mid, div_0); + + let duo_lo = + (duo as $uH as $uX) + | (rem_2 << $n_h); + let (quo_0, rem_1) = $half_division(duo_lo, div_0); + + return ( + (quo_0 as $uD) + | ((quo_1 as $uD) << $n_h) + | ((quo_hi as $uD) << n), + rem_1 as $uD + ) + } + + // `{2^n, 2^div_sb} <= duo < 2^n_d` + // `2^n_h <= div < {2^duo_sb, 2^(n_d - 1)}` + // `mul` or `mul - 1` branch + if rel_leading_sb < $n_h { + // proof that `quo` can only be `mul` or `mul - 1`. + // disclaimer: this is not rigorous + // + // We are trying to find the quotient, `quo`. + // 1. quo = duo / div. (definition) + // + // `shift` is the number of bits not in the higher `n` significant bits of `duo` + // 2. shift = n - duo_lz. (definition) + // 3. duo_sig_n == duo / 2^shift. (definition) + // 4. div_sig_n == div / 2^shift. 
(definition) + // Note: the `_sig_n` (which in my notation usually means that the variable uses the + // most significant `n` bits) on `div_sig_n` is a misnomer, because instead of using + // a shift by `n - div_lz`, what I do is use the `n - duo_lz` shift so that the bits + // of (4) correspond to (3). + // + // this is because of the bits less significant than the sig_n bits that are cut off + // during the bit shift + // 5. duo_sig_n * 2^shift <= duo < (duo_sig_n + 1) * 2^shift. + // 6. div_sig_n * 2^shift <= div < (div_sig_n + 1) * 2^shift. + // + // dividing each bound of (5) by each bound of (6) + // (duo_sig_n * 2^shift) / (div_sig_n * 2^shift) + // (duo_sig_n * 2^shift) / ((div_sig_n + 1) * 2^shift) + // ((duo_sig_n + 1) * 2^shift) / (div_sig_n * 2^shift) + // ((duo_sig_n + 1) * 2^shift) / ((div_sig_n + 1) * 2^shift) + // simplifying each of these four + // duo_sig_n / div_sig_n + // duo_sig_n / (div_sig_n + 1) + // (duo_sig_n + 1) / div_sig_n + // (duo_sig_n + 1) / (div_sig_n + 1) + // taking the smallest and the largest of these as the low and high bounds + // and replacing `duo / div` with `quo` + // 7. duo_sig_n / (div_sig_n + 1) <= quo < (duo_sig_n + 1) / div_sig_n + // + // Here is where the range restraints from the previous branches becomes necessary. + // To help visualize what needs to happen here, is that the top `n` significant + // bits of `duo` are being divided with the corresponding bits of `div`. + // + // The `rel_leading_sb < n_h` conditional on this branch makes sure that `div_sig_n` + // is at least `2^n_h`, and the previous branches make sure that the highest bit of + // `div_sig_n` is not the `2^(n - 1)` bit (which `duo_sig_n` has been shifted up to + // attain). + // 8. `2^(n - 1) <= duo_sig_n < 2^n` + // 9. `2^n_h <= div_sig_n < 2^(n - 1)` + // 10. mul == duo_sig_n / div_sig_n. (definition) + // + // Using the bounds (8) and (9) with the `duo_sig_n / (div_sig_n + 1)` term. Note + // that on the `<` bounds we subtract 1 to make it inclusive. + // (2^n - 1) / (2^n_h + 1) = 2^n_h - 1 + // 2^(n - 1) / 2^(n - 1) = 1 + // The other equations using the both upper bounds or both lower bounds end up with + // values in the middle of this range. + // 11. 1 <= mul <= 2^n_h - 1 + // + // However, all this tells us is that `mul` can fit in a $uH. + // We want to show that `mul - 1 <= quo < mul + 1`. What this and (7) means is that + // the bits cut off by the shift at the beginning can only affect `quo` enough to + // change it between two values. First, I will show that + // `(duo_sig_n + 1) / div_sig_n` is equal to or less than `mul + 1`. We cannot + // simply use algebra here and say that `1 / div_sig_n` is 0, because we are working + // with truncated integer division. If the remainder is `div_sig_n - 1`, then the + // `+ 1` can be enough to cross the boundary and increment the quotient + // (e.x. 127/32 = 3, 128/32 = 4). Instead, we notice that in order for the remainder + // of `(duo_sig_n + x) / div_sig_n` to be wrapped around twice, `x` must be 1 for + // one of the wrap arounds and another `div_sig_n` added on to it to wrap around + // again. This cannot happen even if `div_sig_n` were 1, so + // 12. duo_sig_n / (div_sig_n + 1) <= quo < mul + 1 + // + // Next, we want to show that `duo_sig_n / (div_sig_n + 1)` is more than or equal to + // `mul - 1`. 
Putting all the combinations of bounds from (8) and (9) into (10) and + // `duo_sig_n / (div_sig_n + 1)`, + // (2^n - 1) / 2^n_h = 2^n_h - 1 + // (2^n - 1) / (2^n_h + 1) = 2^n_h - 1 + // + // (2^n - 1) / (2^(n - 1) - 1) = 2 + // (2^n - 1) / (2^(n - 1) - 1 + 1) = 1 + // + // 2^(n - 1) / 2^n_h = 2^(n_h - 1) + // 2^(n - 1) / (2^n_h + 1) = 2^(n_h - 1) - 1 + // + // 2^(n - 1) / (2^(n - 1) - 1) = 1 + // 2^(n - 1) / 2^(n - 1) = 1 + // + // 13. mul - 1 <= quo < mul + 1 + // + // In a lot of division algorithms using smaller divisions to construct a larger + // division, we often encounter a situation where the approximate `mul` value + // calculated from a smaller division is a few increments away from the true `quo` + // value. In those algorithms, they have to do more divisions. Because of the fact + // that our `quo` can only be one of two values, we can try the higher value `mul` + // and see if `duo - (mul*div)` overflows. If it did overflow, then `quo` must be + // `mul - 1`, and because we already calculated `duo - (mul*div)` with wrapping + // operations, we can do simple operations without dividing again to find + // `duo - ((mul - 1)*div) = duo - (mul*div - div) = duo + div - mul*div`. If it + // did not overflow, then `mul` must be the answer, because the remainder of + // `duo - ((mul - 1)*div)` is larger and farther away from overflow than + // `duo - (mul*div)`, which we already found is not overflowing. + // + // Thus, we find the quotient using only an `n` sized divide to find `mul` + // and a `n` by `d_n` sized multiply and comparison to find if `quo * mul > duo`, + // following with adds and subtracts to get the correct values. + let shift = n.wrapping_sub(duo_lz); + let duo_sig_n = (duo >> shift) as $uX; + let div_sig_n = (div >> shift) as $uX; + let mul = $half_division(duo_sig_n, div_sig_n).0; + // inline `n` bit by `n_d` bit multiplication and overflow check (we cannot do + // `mul * div > duo` directly because of possibility of overflow beyond 128 bits) + // duo cannot be more than `2^n_d - 1`, and overflow means a value more than that + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + let (tmp_lo, carry) = carrying_mul(mul,div_lo); + let (tmp_hi, overflow) = carrying_mul_add(mul,div_hi,carry); + let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); + // Note that the overflow cannot be more than 1, otherwise the `div` subtraction + // would not be able to bring the remainder value below 2^n_d - 1, which + // contradicts many things. The `& 1` is here to encourage overflow flag use. + if ((overflow & 1) != 0) || (duo < tmp) { + return ( + mul.wrapping_sub(1) as $uD, + duo.wrapping_add(div.wrapping_sub(tmp)) + ) + } else { + return ( + mul as $uD, + duo.wrapping_sub(tmp) + ) + } + } + + // Undersubtracting long division algorithm. + // Instead of clearing a minimum of 1 bit from `duo` per iteration via + // binary long division, `n_h - 1` bits are cleared per iteration with this algorithm. + // It is a more complicated version of regular long division. Most integer division + // algorithms tend to guess a part of the quotient, and may have a larger quotient than + // the true quotient (which when multiplied by `div` will "oversubtract" the original + // dividend). They then check if the quotient was in fact too large and then have to + // correct it. This long division algorithm has been carefully constructed to always + // underguess the quotient by slim margins. 
This allows different subalgorithms + // to be blindly jumped to without needing an extra correction step. + // + // The only problem is that it will not work + // for many ranges of `duo` and `div`. Fortunately, the short division, + // mul or mul - 1 algorithms, and simple divisions happen to exactly fill these gaps. + // + // For an example, consider the division of 76543210 by 213 and assume that `n_h` + // is equal to two decimal digits (note: we are working with base 10 here for + // readability). + // The first `h_n` part of the divisor (21) is taken and is incremented by + // 1 to prevent oversubtraction. + // In the first step, the first `n` part of duo (7654) is divided by the 22 to make 347. + // We remember that there was 1 extra place not in the `n_h` part of the divisor and + // shift the 347 right by 1, in contrast to a normal long division. The 347 is + // multiplied by the whole divisor to make 73911, and subtracted from duo to finish the + // step. + // 347 + // ________ + // |76543210 + // -73911 + // 2632210 + // Two more steps are taken after this and then duo fits into `n` bits, and then a final + // normal long division step is made. + // 14 + // 443 + // 119 + // 347 + // ________ + // |76543210 + // -73911 + // 2632210 + // -25347 + // 97510 + // -94359 + // 3151 + // The tower at the top is added together to produce the quotient, 359357 (but in the + // actual algorithm, the quotient is progressively added to each step instead of at + // the end). + // In the actual algorithm below, instead of the final normal long division step, one of + // the three other algorithms ("quotient is 0 or 1", "mul or mul - 1", or + // "n sized division") is used. + + let mut duo = duo; + + // the number of lesser significant bits not a part of `div_sig_n_h`. Must be positive. 
+ let div_lesser_places = (n + $n_h).wrapping_sub(div_lz); + + // the most significant `n_h` bits of div + let div_sig_n_h = (div >> div_lesser_places) as $uH; + + // has to be a `$uX` in case of overflow + let div_sig_n_h_add1 = (div_sig_n_h as $uX).wrapping_add(1); + + let mut quo: $uD = 0; + + // `{2^n, 2^(div_sb + n_h)} <= duo < 2^n_d` + // `2^n_h <= div < {2^(duo_sb - n_h), 2^n}` + loop { + let duo_lesser_places = n.wrapping_sub(duo_lz); + let duo_sig_n = (duo >> duo_lesser_places) as $uX; + + if div_lesser_places <= duo_lesser_places { + let mul = $half_division(duo_sig_n, div_sig_n_h_add1).0 as $uD; + let place = duo_lesser_places.wrapping_sub(div_lesser_places); + + //addition to the quotient + quo = quo.wrapping_add(mul << place); + + //subtraction from `duo` + //at least `n_h - 1` bits are cleared from `duo` here + duo = duo.wrapping_sub(div.wrapping_mul(mul) << place); + } else { + //`mul` or `mul - 1` algorithm + let shift = n.wrapping_sub(duo_lz); + let duo_sig_n = (duo >> shift) as $uX; + let div_sig_n = (div >> shift) as $uX; + let mul = $half_division(duo_sig_n, div_sig_n).0; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + + let (tmp_lo, carry) = carrying_mul(mul,div_lo); + // The undersubtracting long division algorithm has already run once, so + // overflow beyond 128 bits is not possible here + let (tmp_hi, _) = carrying_mul_add(mul,div_hi,carry); + let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); + + if duo < tmp { + return ( + // note that this time, the "mul or mul - 1" result is added to `quo` + // to get the final correct quotient + quo.wrapping_add(mul.wrapping_sub(1) as $uD), + duo.wrapping_add( + div.wrapping_sub(tmp) + ) + ) + } else { + return ( + quo.wrapping_add(mul as $uD), + duo.wrapping_sub(tmp) + ) + } + } + + duo_lz = duo.leading_zeros(); + if div_lz <= duo_lz { + //quotient can have 0 or 1 added to it + if div <= duo { + return ( + quo.wrapping_add(1), + duo.wrapping_sub(div) + ) + } else { + return ( + quo, + duo + ) + } + } + + // This can only happen if `div_sd < n` (because of previous "quo = 0 or 1" + // branches), but it is not worth it to unroll further. + if duo_lz >= n { + // simple division and addition + let tmp = $half_division(duo as $uX, div as $uX); + return ( + quo.wrapping_add(tmp.0 as $uD), + tmp.1 as $uD + ) + } + } + } + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + /// + /// This is optimized for division of two integers with bit widths twice as large + /// as the largest hardware integer division supported. Note that some architectures supply + /// a division of an integer larger than register size by a regular sized integer (e.x. + /// x86_64 has a `divq` instruction which can divide a 128 bit integer by a 64 bit integer). + /// In that case, the `_asymmetric` algorithm should be used instead of this one. + /// + /// This is called the trifecta algorithm because it uses three main algorithms: short + /// division for small divisors, the "mul or mul - 1" algorithm for when the divisor is + /// large enough for the quotient to be determined to be one of two values via only one + /// small division, and an underguessing long division algorithm for middle cases. + /// + /// # Panics + /// + /// When attempting to divide by zero, this function will panic. 
+ $( + #[$signed_attr] + )* + pub fn $signed_name(duo: $iD, div: $iD) -> ($iD,$iD) { + // Note: there is a way of doing this without any branches, but it is unfortunately + // significantly slower + // let n_d = $n_h * 4; + // let duo_s = duo >> (n_d - 1); + // let div_s = div >> (n_d - 1); + // let duo = (duo ^ duo_s).wrapping_sub(duo_s); + // let div = (div ^ div_s).wrapping_sub(div_s); + // let quo_s = duo_s ^ div_s; + // let rem_s = duo_s; + // let tmp = $unsigned_name(duo as $uD, div as $uD); + // ( + // ((tmp.0 as $iD) ^ quo_s).wrapping_sub(quo_s), + // ((tmp.1 as $iD) ^ rem_s).wrapping_sub(rem_s), + // ) + + match (duo < 0, div < 0) { + (false,false) => { + let t = $unsigned_name(duo as $uD,div as $uD); + (t.0 as $iD,t.1 as $iD) + }, + (true,false) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD,div as $uD); + ((t.0 as $iD).wrapping_neg(),(t.1 as $iD).wrapping_neg()) + }, + (false,true) => { + let t = $unsigned_name(duo as $uD,div.wrapping_neg() as $uD); + ((t.0 as $iD).wrapping_neg(),t.1 as $iD) + }, + (true,true) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD,div.wrapping_neg() as $uD); + (t.0 as $iD,(t.1 as $iD).wrapping_neg()) + }, + } + } + } +} diff --git a/src/int/udiv.rs b/src/int/udiv.rs index b393ac6d..1fd02899 100644 --- a/src/int/udiv.rs +++ b/src/int/udiv.rs @@ -1,270 +1,74 @@ -use int::{Int, LargeInt}; +use super::specialized_div_rem::*; -macro_rules! udivmod_inner { - ($n:expr, $d:expr, $rem:expr, $ty:ty) => {{ - let (n, d, rem) = ($n, $d, $rem); - // NOTE X is unknown, K != 0 - if n.high() == 0 { - if d.high() == 0 { - // 0 X - // --- - // 0 X - - if let Some(rem) = rem { - *rem = <$ty>::from(n.low().aborting_rem(d.low())); - } - return <$ty>::from(n.low().aborting_div(d.low())) - } else { - // 0 X - // --- - // K X - if let Some(rem) = rem { - *rem = n; - } - return 0; - }; - } - - let mut sr; - let mut q; - let mut r; - - if d.low() == 0 { - if d.high() == 0 { - // K X - // --- - // 0 0 - // NOTE This should be unreachable in safe Rust because the program will panic before - // this intrinsic is called - ::abort(); - } - - if n.low() == 0 { - // K 0 - // --- - // K 0 - if let Some(rem) = rem { - *rem = <$ty>::from_parts(0, n.high().aborting_rem(d.high())); - } - return <$ty>::from(n.high().aborting_div(d.high())) - } - - // K K - // --- - // K 0 - - if d.high().is_power_of_two() { - if let Some(rem) = rem { - *rem = <$ty>::from_parts(n.low(), n.high() & (d.high() - 1)); - } - return <$ty>::from(n.high() >> d.high().trailing_zeros()); - } - - sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros()); - - // D > N - if sr > ::BITS - 2 { - if let Some(rem) = rem { - *rem = n; - } - return 0; - } - - sr += 1; - - // 1 <= sr <= ::BITS - 1 - q = n << (<$ty>::BITS - sr); - r = n >> sr; - } else if d.high() == 0 { - // K X - // --- - // 0 K - if d.low().is_power_of_two() { - if let Some(rem) = rem { - *rem = <$ty>::from(n.low() & (d.low() - 1)); - } - - if d.low() == 1 { - return n; - } else { - let sr = d.low().trailing_zeros(); - return n >> sr; - }; - } - - sr = 1 + ::BITS + d.low().leading_zeros() - n.high().leading_zeros(); - - // 2 <= sr <= u64::BITS - 1 - q = n << (<$ty>::BITS - sr); - r = n >> sr; - } else { - // K X - // --- - // K K - sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros()); - - // D > N - if sr > ::BITS - 1 { - if let Some(rem) = rem { - *rem = n; - } - return 0; - } - - sr += 1; - - // 1 <= sr <= ::BITS - q = n << (<$ty>::BITS - sr); - r = n >> sr; - } - - // Not a special case - // q and r are 
initialized with - // q = n << (u64::BITS - sr) - // r = n >> sr - // 1 <= sr <= u64::BITS - 1 - let mut carry = 0; - - // Don't use a range because they may generate references to memcpy in unoptimized code - let mut i = 0; - while i < sr { - i += 1; - - // r:q = ((r:q) << 1) | carry - r = (r << 1) | (q >> (<$ty>::BITS - 1)); - q = (q << 1) | carry as $ty; - - // carry = 0 - // if r >= d { - // r -= d; - // carry = 1; - // } - let s = (d.wrapping_sub(r).wrapping_sub(1)) as os_ty!($ty) >> (<$ty>::BITS - 1); - carry = (s & 1) as hty!($ty); - r -= d & s as $ty; - } - - if let Some(rem) = rem { - *rem = r; - } - (q << 1) | carry as $ty - }} -} +// NOTE: there are aborts inside the specialized_div_rem functions if division by 0 +// is encountered, however these should be unreachable and optimized away unless +// uses of `std/core::intrinsics::unchecked_div/rem` do not have a 0 check in front +// of them. intrinsics! { #[maybe_use_optimized_c_shim] #[arm_aeabi_alias = __aeabi_uidiv] /// Returns `n / d` pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 { - // Special cases - if d == 0 { - // NOTE This should be unreachable in safe Rust because the program will panic before - // this intrinsic is called - ::abort(); - } - - if n == 0 { - return 0; - } - - let mut sr = d.leading_zeros().wrapping_sub(n.leading_zeros()); - - // d > n - if sr > u32::BITS - 1 { - return 0; - } - - // d == 1 - if sr == u32::BITS - 1 { - return n; - } - - sr += 1; - - // 1 <= sr <= u32::BITS - 1 - let mut q = n << (u32::BITS - sr); - let mut r = n >> sr; - - let mut carry = 0; - - // Don't use a range because they may generate references to memcpy in unoptimized code - let mut i = 0; - while i < sr { - i += 1; - - // r:q = ((r:q) << 1) | carry - r = (r << 1) | (q >> (u32::BITS - 1)); - q = (q << 1) | carry; - - // carry = 0; - // if r > d { - // r -= d; - // carry = 1; - // } - - let s = (d.wrapping_sub(r).wrapping_sub(1)) as i32 >> (u32::BITS - 1); - carry = (s & 1) as u32; - r -= d & s as u32; - } - - (q << 1) | carry + u32_div_rem(n, d).0 } - + #[maybe_use_optimized_c_shim] /// Returns `n % d` pub extern "C" fn __umodsi3(n: u32, d: u32) -> u32 { - let q = __udivsi3(n, d); - n - q * d + u32_div_rem(n, d).1 } - + #[maybe_use_optimized_c_shim] /// Returns `n / d` and sets `*rem = n % d` pub extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 { - let q = __udivsi3(n, d); + let quo_rem = u32_div_rem(n, d); if let Some(rem) = rem { - *rem = n - (q * d); + *rem = quo_rem.1; } - q + quo_rem.0 } - + #[maybe_use_optimized_c_shim] /// Returns `n / d` pub extern "C" fn __udivdi3(n: u64, d: u64) -> u64 { - __udivmoddi4(n, d, None) + u64_div_rem(n, d).0 } - + #[maybe_use_optimized_c_shim] /// Returns `n % d` pub extern "C" fn __umoddi3(n: u64, d: u64) -> u64 { - let mut rem = 0; - __udivmoddi4(n, d, Some(&mut rem)); - rem + u64_div_rem(n, d).1 } - + + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 { + let quo_rem = u64_div_rem(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; + } + quo_rem.0 + } + #[win64_128bit_abi_hack] /// Returns `n / d` pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 { - __udivmodti4(n, d, None) + u128_div_rem(n, d).0 } #[win64_128bit_abi_hack] /// Returns `n % d` pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 { - let mut rem = 0; - __udivmodti4(n, d, Some(&mut rem)); - rem - } - - /// Returns `n / d` and sets `*rem = n % d` - pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) 
-> u64 { - udivmod_inner!(n, d, rem, u64) + u128_div_rem(n, d).1 } #[win64_128bit_abi_hack] /// Returns `n / d` and sets `*rem = n % d` - pub extern "C" fn __udivmodti4(n: u128, - d: u128, - rem: Option<&mut u128>) -> u128 { - udivmod_inner!(n, d, rem, u128) + pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 { + let quo_rem = u128_div_rem(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; + } + quo_rem.0 } }
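
The "mul or mul - 1" step that both trifecta.rs and asymmetric.rs rely on can be demonstrated at a smaller scale. The following standalone sketch is not part of the patch: the function name and the u32/u16 widths are chosen purely for illustration, and the asserted preconditions stand in for the branches the real macros take before reaching this step. It estimates the quotient from one u16-by-u16 division of the most significant bits and then corrects downward by at most one, mirroring the bound `mul - 1 <= quo <= mul` derived in the trifecta comments.

// Standalone illustration (not part of the patch) of the "mul or mul - 1" quotient
// estimate documented in trifecta.rs and asymmetric.rs, scaled down so that a u32
// division is approximated with a single u16-by-u16 division. The function name and
// the chosen bit widths are hypothetical and for illustration only.
fn mul_or_mul_minus_1(duo: u32, div: u32) -> (u32, u32) {
    let duo_lz = duo.leading_zeros();
    let div_lz = div.leading_zeros();
    // Preconditions established by the earlier branches of the real algorithm:
    // the quotient can exceed 1, `duo` does not fit in a u16, and the significant
    // bits of `duo` and `div` start within 8 places of each other.
    assert!(div_lz > duo_lz && div_lz - duo_lz < 8 && duo_lz < 16);

    // Align the most significant 16 bits of `duo` and take the matching bits of `div`.
    let shift = 16 - duo_lz;
    let duo_sig = (duo >> shift) as u16;
    let div_sig = (div >> shift) as u16;
    // One small division gives an estimate that is either the true quotient or one too big.
    let mul = (duo_sig / div_sig) as u64;

    let tmp = mul * (div as u64);
    if tmp > duo as u64 {
        // Overestimated by exactly one: correct without dividing again.
        let rem = (duo as u64) + (div as u64) - tmp;
        ((mul - 1) as u32, rem as u32)
    } else {
        (mul as u32, (duo as u64 - tmp) as u32)
    }
}

fn main() {
    // These inputs exercise the "mul - 1" correction path.
    let (duo, div) = (0x89AB_CDEFu32, 0x0123_4567u32);
    assert_eq!(mul_or_mul_minus_1(duo, div), (duo / div, duo % div));
    println!("quotient and remainder: {:?}", mul_or_mul_minus_1(duo, div));
}

The chosen inputs make the single small division overestimate the quotient by one, so the assertion checks both the estimate and the branch that corrects it against the native operators.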