Skip to content

Commit

Permalink
upd
Browse files Browse the repository at this point in the history
  • Loading branch information
wjr-z committed Feb 2, 2024
1 parent 5cc6e00 commit 0a4a2bb
Show file tree
Hide file tree
Showing 7 changed files with 142 additions and 91 deletions.
14 changes: 7 additions & 7 deletions include/wjr/math/add-impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,25 @@ template <
WJR_INTRINSIC_CONSTEXPR_E T addc(T a, T b, type_identity_t<U> c_in, U &c_out);

template <
typename T, typename U,
typename T, typename U = unsigned char,
std::enable_if_t<is_unsigned_integral_v<T> && is_unsigned_integral_v<U>, int> = 0>
WJR_INTRINSIC_CONSTEXPR_E U addc_1(T *dst, const T *src0, size_t n,
type_identity_t<T> src1, U c_in);
type_identity_t<T> src1, U c_in = 0u);

template <
typename T, typename U,
typename T, typename U = unsigned char,
std::enable_if_t<is_unsigned_integral_v<T> && is_unsigned_integral_v<U>, int> = 0>
WJR_INTRINSIC_CONSTEXPR_E U addc_n(T *dst, const T *src0, const T *src1, size_t n,
U c_in);
U c_in = 0u);

template <
typename T, typename U,
typename T, typename U = unsigned char,
std::enable_if_t<is_unsigned_integral_v<T> && is_unsigned_integral_v<U>, int> = 0>
WJR_INTRINSIC_CONSTEXPR_E U addc_s(T *dst, const T *src0, size_t n, const T *src1,
size_t m, U c_in);
size_t m, U c_in = 0u);

WJR_INTRINSIC_CONSTEXPR_E void __addc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
uint64_t hi0, uint64_t lo1, uint64_t hi1);
uint64_t hi0, uint64_t lo1, uint64_t hi1);

} // namespace wjr

Expand Down
2 changes: 1 addition & 1 deletion include/wjr/math/details.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ inline constexpr de_bruijn<uint32_t, 0x077C'B531> de_bruijn32 = {};
inline constexpr de_bruijn<uint64_t, 0x03f7'9d71'b4ca'8b09> de_bruijn64 = {};

using default_stack_allocator =
stack_allocator<6 * 1024, 2 * 1024, 6, 48 * 1024, 16 * 1024, 2>;
stack_allocator<6 * 1024, 2 * 1024, 4, 48 * 1024, 16 * 1024, 2>;
inline constexpr default_stack_allocator stack_alloc = {};

} // namespace math_details
Expand Down
57 changes: 53 additions & 4 deletions include/wjr/math/div.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,8 @@ WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(T *dst, T *rem, const T *src, size_t n,

// reference : GMP
template <typename T>
WJR_NOINLINE WJR_CONSTEXPR20 T sb_div_qr_s(T *dst, T *src, size_t n, T *div, size_t m,
T dinv) {
WJR_NOINLINE WJR_CONSTEXPR20 T sb_div_qr_s(T *dst, T *src, size_t n, const T *div,
size_t m, T dinv) {
using divider = div3by2_divider<T>;
constexpr T mask = std::numeric_limits<T>::max();

Expand All @@ -286,7 +286,7 @@ WJR_NOINLINE WJR_CONSTEXPR20 T sb_div_qr_s(T *dst, T *src, size_t n, T *div, siz

qh = reverse_compare_n(src - m, div, m) >= 0;
if (qh != 0) {
(void)subc_n(src - n, src - n, div, m, 0u);
(void)subc_n(src - m, src - m, div, m);
}

dst += n - m;
Expand Down Expand Up @@ -316,7 +316,7 @@ WJR_NOINLINE WJR_CONSTEXPR20 T sb_div_qr_s(T *dst, T *src, size_t n, T *div, siz
src[0] = n0;

if (WJR_UNLIKELY(cy != 0)) {
n1 += d1 + addc_n(src - m, src - m, div, m + 1, 0u);
n1 += d1 + addc_n(src - m, src - m, div, m + 1);
q--;
}
}
Expand All @@ -329,6 +329,55 @@ WJR_NOINLINE WJR_CONSTEXPR20 T sb_div_qr_s(T *dst, T *src, size_t n, T *div, siz
return qh;
}

inline constexpr size_t dc_div_qr_threshold = toom22_mul_threshold * 2;

template <typename T>
T dc_div4by2_qr(T *dst, T *src, const T *div, size_t m, T dinv, T *stk) {
size_t lo, hi;
T cy, qh, ql;

lo = m >> 1; /* floor(n/2) */
hi = m - lo; /* ceil(n/2) */

if (hi < dc_div_qr_threshold) {
qh = sb_div_qr_s(dst + lo, src + 2 * lo, 2 * hi, div + lo, hi, dinv);
} else {
qh = dc_div4by2_qr(dst + lo, src + 2 * lo, div + lo, hi, dinv, stk);
}

mul_s(stk, dst + lo, hi, div, lo);

cy = subc_n(src + lo, src + lo, stk, m);
if (qh != 0) {
cy += subc_n(src + m, src + m, div, lo);
}

while (cy != 0) {
qh -= subc_1(dst + lo, dst + lo, hi, 1u);
cy -= addc_n(src + lo, src + lo, div, m);
}

if (lo < dc_div_qr_threshold) {
ql = sb_div_qr_s(dst, src + hi, 2 * lo, div + hi, lo, dinv);
} else {
ql = dc_div4by2_qr(dst, src + hi, div + hi, lo, dinv, stk);
}

mul_s(stk, div, hi, dst, lo);

cy = subc_n(src, src, stk, m);
if (ql != 0) {
cy += subc_n(src + lo, src + lo, div, hi);
}

while (cy != 0) {
subc_1(dst, dst, lo, 1u);
cy -= addc_n(src, src, div, m);
}

return qh;
}

template <typename T>
WJR_INTRINSIC_CONSTEXPR20 void div_qr_s(T *dst, T *rem, const T *src, size_t n, T *div,
size_t m) {
Expand Down
Loading

0 comments on commit 0a4a2bb

Please sign in to comment.