Skip to content

Commit

Permalink
upd
Browse files Browse the repository at this point in the history
  • Loading branch information
wjr-z committed Jan 29, 2024
1 parent f780432 commit 5481e6d
Show file tree
Hide file tree
Showing 9 changed files with 112 additions and 46 deletions.
2 changes: 1 addition & 1 deletion include/wjr/math/add-impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ template <
WJR_INTRINSIC_CONSTEXPR_E U addc_s(T *dst, const T *src0, size_t n, const T *src1,
size_t m, U c_in);

WJR_INTRINSIC_CONSTEXPR void __addc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
WJR_INTRINSIC_CONSTEXPR_E void __addc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
uint64_t hi0, uint64_t lo1, uint64_t hi1);

} // namespace wjr
Expand Down
25 changes: 19 additions & 6 deletions include/wjr/math/add.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,12 +201,25 @@ WJR_INTRINSIC_CONSTEXPR_E U addc_s(T *dst, const T *src0, size_t n, const T *src
return c_in;
}

WJR_INTRINSIC_CONSTEXPR void __addc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
uint64_t hi0, uint64_t lo1, uint64_t hi1) {
lo0 += lo1;
hi0 += hi1 + (lo0 < lo1);
al = lo0;
ah = hi0;
/**
 * @brief Portable 128-bit addition: (ah:al) = (hi0:lo0) + (hi1:lo1).
 *
 * All arithmetic is unsigned 64-bit and wraps modulo 2^64; a carry out of the
 * high word is discarded.
 *
 * @param al  receives the low 64 bits of the sum
 * @param ah  receives the high 64 bits of the sum
 * @param lo0 low word of the first operand
 * @param hi0 high word of the first operand
 * @param lo1 low word of the second operand
 * @param hi1 high word of the second operand
 */
WJR_INTRINSIC_CONSTEXPR void __fallback_addc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
                                                 uint64_t hi0, uint64_t lo1,
                                                 uint64_t hi1) {
    const uint64_t sum_lo = lo0 + lo1;
    // The low-word addition wrapped around exactly when the result is smaller
    // than one of its operands.
    const uint64_t carry = (sum_lo < lo0) ? 1u : 0u;
    ah = hi0 + hi1 + carry;
    al = sum_lo;
}

/**
 * @brief 128-bit addition: (ah:al) = (hi0:lo0) + (hi1:lo1), carry-out discarded.
 *
 * Dispatches between the inline-assembly implementation and the portable
 * fallback. The fallback is used when
 *   - the call is being constant-evaluated,
 *   - the compiler can prove `lo0 == 0` or `lo1 == 0` at compile time, or
 *   - no assembly implementation is available (`__ASM_ADDC_128` not defined).
 * NOTE(review): the zero-low-word cases presumably go to the fallback so the
 * optimizer can fold the carry away, which it cannot do through opaque inline
 * asm - confirm with the author.
 */
WJR_INTRINSIC_CONSTEXPR_E void __addc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
                                          uint64_t hi0, uint64_t lo1, uint64_t hi1) {
#if WJR_HAS_BUILTIN(__ASM_ADDC_128)
    // Fast path: a genuine run-time call whose low words are not known zeros.
    if (!is_constant_evaluated() && !(WJR_BUILTIN_CONSTANT_P(lo0 == 0) && lo0 == 0) &&
        !(WJR_BUILTIN_CONSTANT_P(lo1 == 0) && lo1 == 0)) {
        __asm_addc_128(al, ah, lo0, hi0, lo1, hi1);
        return;
    }
#endif
    __fallback_addc_128(al, ah, lo0, hi0, lo1, hi1);
}

} // namespace wjr
Expand Down
16 changes: 8 additions & 8 deletions include/wjr/math/div.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ WJR_CONSTEXPR20 void fallback_div_qr_1_without_shift(T *dst, T &rem, const T *sr
WJR_ASSERT(n >= 1);

T divisor = div.get_divisor();
T value = div.get_value();

T lo, hi, hi1;
T lo, hi;

hi = src[n - 1];

Expand All @@ -37,11 +38,9 @@ WJR_CONSTEXPR20 void fallback_div_qr_1_without_shift(T *dst, T &rem, const T *sr
return;
}

hi1 = hi + 1;

do {
lo = src[n - 1];
dst[n - 1] = div.divide_without_shift(lo, hi, hi1);
dst[n - 1] = div.divide(divisor, value, lo, hi);
} while (--n);

rem = hi;
Expand All @@ -54,25 +53,26 @@ WJR_CONSTEXPR20 void fallback_div_qr_1_with_shift(T *dst, T &rem, const T *src,
WJR_ASSERT(div.get_shift() != 0);
WJR_ASSERT(n >= 1);

T divisor = div.get_divisor();
T value = div.get_value();
unsigned int shift = div.get_shift();

T lo, hi, hi1;
T lo, hi;

T rbp = src[n - 1];
--n;
hi = rbp >> (64 - shift);
hi1 = hi + 1;

if (WJR_LIKELY(n != 0)) {
do {
lo = src[n - 1];
dst[n] = div.divide_without_shift(shld(rbp, lo, shift), hi, hi1);
dst[n] = div.divide(divisor, value, shld(rbp, lo, shift), hi);
rbp = lo;
--n;
} while (WJR_LIKELY(n != 0));
}

dst[0] = div.divide_without_shift(rbp << shift, hi, hi1);
dst[0] = div.divide(divisor, value, rbp << shift, hi);
rem = hi >> shift;

return;
Expand Down
44 changes: 23 additions & 21 deletions include/wjr/math/divider.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,14 @@ class div2by1_divider {

// True exactly when the stored divisor equals 2^63 (only the top bit set).
// NOTE(review): presumably m_divisor is kept normalized (top bit set), which
// would make 2^63 the only possible power of two - confirm against the
// constructor.
constexpr bool is_power_of_two() const { return m_divisor == (1ull << 63); }

// hi1 is hi + 1
WJR_INTRINSIC_CONSTEXPR20 T divide_without_shift(T lo, T &hi, T &hi1) const {
WJR_INTRINSIC_CONSTEXPR20 static T divide(T divisor, T value, T lo, T &hi) {
WJR_ASSERT(value >> (std::numeric_limits<T>::digits - 1));

if (WJR_BUILTIN_CONSTANT_P(lo == 0) && lo == 0) {
return divide_without_shift_lo0(lo, hi, hi1);
return divide_without_shift_lo0(divisor, value, lo, hi);
}

return basic_divide_without_shift(lo, hi, hi1);
return basic_divide_without_shift(divisor, value, lo, hi);
}

WJR_INLINE_CONSTEXPR_E static T reciprocal_word(T d) {
Expand Down Expand Up @@ -162,44 +163,45 @@ class div2by1_divider {
#endif
}

WJR_INTRINSIC_CONSTEXPR20 T basic_divide_without_shift(T lo, T &hi, T &hi1) const {
WJR_ASSERT(hi1 == hi + 1);
WJR_INTRINSIC_CONSTEXPR20 static T basic_divide_without_shift(T divisor, T value,
T lo, T &hi) {
T hi1 = hi + 1;

T rax, rdx;

rax = mul(hi, m_value, rdx);
rax = mul(hi, value, rdx);
__addc_128(rax, rdx, rax, rdx, lo, hi1);

lo -= mullo(rdx, m_divisor);
lo -= mullo(rdx, divisor);

div2by1_adjust(rax, m_divisor, lo, rdx);
div2by1_adjust(rax, divisor, lo, rdx);

if (WJR_UNLIKELY(lo >= m_divisor)) {
if (WJR_UNLIKELY(lo >= divisor)) {
WJR_FORCE_BRANCH_BARRIER();
lo -= m_divisor;
lo -= divisor;
++rdx;
}

hi = lo;
hi1 = hi + 1;
return rdx;
}

WJR_INTRINSIC_CONSTEXPR20 T divide_without_shift_lo0(T lo, T &hi, T &hi1) const {
WJR_ASSERT(hi1 == hi + 1);
WJR_INTRINSIC_CONSTEXPR20 static T divide_without_shift_lo0(T divisor, T value, T lo,
T &hi) {
WJR_ASSERT(lo == 0);

T hi1 = hi + 1;

T rax, rdx;

rax = mul(hi, m_value, rdx);
rax = mul(hi, value, rdx);
rdx += hi1;

lo -= mullo(rdx, m_divisor);
lo -= mullo(rdx, divisor);

div2by1_adjust(rax, m_divisor, lo, rdx);
div2by1_adjust(rax, divisor, lo, rdx);

hi = lo;
hi1 = hi + 1;
return rdx;
}

Expand Down Expand Up @@ -347,9 +349,9 @@ class divexact1_divider {
// Builds a divider from already-computed parts; the three arguments are
// stored verbatim in m_divisor, m_value and m_shift with no validation here.
constexpr divexact1_divider(T divisor, T value, unsigned int shift)
: m_divisor(divisor), m_value(value), m_shift(shift) {}

constexpr T divisor() const { return m_divisor; }
constexpr T value() const { return m_value; }
constexpr unsigned int shift() const { return m_shift; }
// Accessor: returns the stored divisor (m_divisor).
constexpr T get_divisor() const { return m_divisor; }
// Accessor: returns the stored value word (m_value).
constexpr T get_value() const { return m_value; }
// Accessor: returns the stored shift amount (m_shift).
constexpr unsigned int get_shift() const { return m_shift; }

// True exactly when the stored divisor equals 1.
// NOTE(review): presumably m_divisor is held with its factors of two already
// moved into m_shift, so an original power-of-two divisor reduces to 1 -
// confirm against the code that constructs this object.
constexpr bool is_power_of_two() const { return m_divisor == 1; }

Expand Down
2 changes: 1 addition & 1 deletion include/wjr/math/sub-impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ template <typename T, std::enable_if_t<is_unsigned_integral_v<T>, int> = 0>
WJR_INTRINSIC_CONSTEXPR_E ssize_t abs_subc_s(T *dst, const T *src0, size_t n,
const T *src1, size_t m);

WJR_INTRINSIC_CONSTEXPR void __subc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
WJR_INTRINSIC_CONSTEXPR_E void __subc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
uint64_t hi0, uint64_t lo1, uint64_t hi1);

} // namespace wjr
Expand Down
26 changes: 19 additions & 7 deletions include/wjr/math/sub.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,13 +259,25 @@ WJR_INTRINSIC_CONSTEXPR_E ssize_t abs_subc_s(T *dst, const T *src0, size_t n,
return ret;
}

WJR_INTRINSIC_CONSTEXPR void __subc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
uint64_t hi0, uint64_t lo1, uint64_t hi1) {
auto tmp = lo0;
lo0 -= lo1;
hi0 -= hi1 + (lo0 > tmp);
al = lo0;
ah = hi0;
/**
 * @brief Portable 128-bit subtraction: (ah:al) = (hi0:lo0) - (hi1:lo1).
 *
 * All arithmetic is unsigned 64-bit and wraps modulo 2^64; a borrow out of the
 * high word is discarded.
 *
 * @param al  receives the low 64 bits of the difference
 * @param ah  receives the high 64 bits of the difference
 * @param lo0 low word of the minuend
 * @param hi0 high word of the minuend
 * @param lo1 low word of the subtrahend
 * @param hi1 high word of the subtrahend
 */
WJR_INTRINSIC_CONSTEXPR void __fallback_subc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
                                                 uint64_t hi0, uint64_t lo1,
                                                 uint64_t hi1) {
    const uint64_t diff_lo = lo0 - lo1;
    // The low-word subtraction wrapped around exactly when the result is
    // larger than the minuend's low word.
    const uint64_t borrow = (diff_lo > lo0) ? 1u : 0u;
    ah = hi0 - hi1 - borrow;
    al = diff_lo;
}

/**
 * @brief 128-bit subtraction: (ah:al) = (hi0:lo0) - (hi1:lo1), borrow-out discarded.
 *
 * Dispatches between the inline-assembly implementation and the portable
 * fallback. The fallback is used when
 *   - the call is being constant-evaluated,
 *   - the compiler can prove `lo0 == 0` or `lo1 == 0` at compile time, or
 *   - no assembly implementation is available (`__ASM_SUBC_128` not defined).
 * NOTE(review): the zero-low-word cases presumably go to the fallback so the
 * optimizer can simplify the borrow, which it cannot do through opaque inline
 * asm - confirm with the author.
 */
WJR_INTRINSIC_CONSTEXPR_E void __subc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
                                          uint64_t hi0, uint64_t lo1, uint64_t hi1) {
#if WJR_HAS_BUILTIN(__ASM_SUBC_128)
    // Fast path: a genuine run-time call whose low words are not known zeros.
    if (!is_constant_evaluated() && !(WJR_BUILTIN_CONSTANT_P(lo0 == 0) && lo0 == 0) &&
        !(WJR_BUILTIN_CONSTANT_P(lo1 == 0) && lo1 == 0)) {
        __asm_subc_128(al, ah, lo0, hi0, lo1, hi1);
        return;
    }
#endif
    __fallback_subc_128(al, ah, lo0, hi0, lo1, hi1);
}

} // namespace wjr
Expand Down
20 changes: 20 additions & 0 deletions include/wjr/x86/add.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,26 @@ namespace wjr {

#endif

// There are certain issues with optimizing in GCC
#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) && defined(WJR_COMPILER_GCC)
#define WJR_HAS_BUILTIN___ASM_ADDC_128 WJR_HAS_DEF
#endif

#if WJR_HAS_BUILTIN(__ASM_ADDC_128)

// Inline-assembly 128-bit addition: (ah:al) = (hi0:lo0) + (hi1:lo1).
// Emits `add` on the low words followed by `adc` on the high words, so the
// carry travels through the CPU carry flag; the carry out of the high word is
// not captured.
//
// Notes on the asm statement:
//  - `{att | intel}` alternatives supply both assembler dialects.
//  - lo0 is "+&r" (read-write, early-clobber): `add` overwrites it before
//    `adc` has read hi0/hi1, so it must not share a register with an input.
//  - "cc" declares that the flags register is modified.
//  - NOTE(review): the "memory" clobber looks unnecessary because every
//    operand is a register - confirm it is not relied on as a compiler
//    barrier before removing it.
WJR_INTRINSIC_INLINE void __asm_addc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
uint64_t hi0, uint64_t lo1, uint64_t hi1) {
asm("add{q %[lo1], %[lo0]| %[lo0], %[lo1]}\n\t"
"adc{q %[hi1], %[hi0]| %[hi0], %[hi1]}"
: [lo0] "+&r"(lo0), [hi0] "+r"(hi0)
: [lo1] "r"(lo1), [hi1] "r"(hi1)
: "cc", "memory");
// lo0/hi0 are by-value copies updated in place by the asm; publish them.
al = lo0;
ah = hi0;
}

#endif

} // namespace wjr

#endif // WJR_X86_ADD_HPP__
3 changes: 1 addition & 2 deletions include/wjr/x86/divider.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ WJR_INTRINSIC_INLINE void asm_div2by1_adjust(T rax, T div, T &r8, T &rdx) {
template <typename T, std::enable_if_t<std::is_same_v<T, uint64_t>, int> = 0>
WJR_INTRINSIC_INLINE void asm_div3by2_adjust(T d1, T &p, T &v) {
T r8 = p;
asm volatile(
"sub{q %[d1], %[r8]| %[r8], %[d1]}\n\t"
asm("sub{q %[d1], %[r8]| %[r8], %[d1]}\n\t"
"cmovae{q %[r8], %[p]| %[p], %[r8]}\n\t"
"adc{q $-1, %[v]| %[v], -1}"
: [p] "+r"(p), [v] "+r"(v), [r8] "+r"(r8)
Expand Down
20 changes: 20 additions & 0 deletions include/wjr/x86/sub.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,26 @@ namespace wjr {

#endif

// There are certain issues with optimizing in GCC
#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) && defined(WJR_COMPILER_GCC)
#define WJR_HAS_BUILTIN___ASM_SUBC_128 WJR_HAS_DEF
#endif

#if WJR_HAS_BUILTIN(__ASM_SUBC_128)

// Inline-assembly 128-bit subtraction: (ah:al) = (hi0:lo0) - (hi1:lo1).
// Emits `sub` on the low words followed by `sbb` on the high words, so the
// borrow travels through the CPU carry flag; the borrow out of the high word
// is not captured.
//
// Notes on the asm statement:
//  - `{att | intel}` alternatives supply both assembler dialects.
//  - lo0 is "+&r" (read-write, early-clobber): `sub` overwrites it before
//    `sbb` has read hi0/hi1, so it must not share a register with an input.
//  - "cc" declares that the flags register is modified.
//  - NOTE(review): the "memory" clobber looks unnecessary because every
//    operand is a register - confirm it is not relied on as a compiler
//    barrier before removing it.
WJR_INTRINSIC_INLINE void __asm_subc_128(uint64_t &al, uint64_t &ah, uint64_t lo0,
uint64_t hi0, uint64_t lo1, uint64_t hi1) {
asm("sub{q %[lo1], %[lo0]| %[lo0], %[lo1]}\n\t"
"sbb{q %[hi1], %[hi0]| %[hi0], %[hi1]}"
: [lo0] "+&r"(lo0), [hi0] "+r"(hi0)
: [lo1] "r"(lo1), [hi1] "r"(hi1)
: "cc", "memory");
// lo0/hi0 are by-value copies updated in place by the asm; publish them.
al = lo0;
ah = hi0;
}

#endif

} // namespace wjr

#endif // WJR_X86_SUB_HPP__

0 comments on commit 5481e6d

Please sign in to comment.