Skip to content

Commit

Permalink
opt
Browse files Browse the repository at this point in the history
  • Loading branch information
wjr-z committed Jul 4, 2024
1 parent e2f8a50 commit fe82d52
Show file tree
Hide file tree
Showing 12 changed files with 841 additions and 1,430 deletions.
822 changes: 111 additions & 711 deletions godbolt/wjr.hpp

Large diffs are not rendered by default.

24 changes: 12 additions & 12 deletions include/wjr/math/div-impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,22 @@ inline uint128_t div128by64to128(uint64_t &rem, uint64_t lo, uint64_t hi,
inline uint128_t div128by64to128(uint64_t &rem, uint64_t lo, uint64_t hi,
uint64_t div) noexcept;

WJR_INTRINSIC_CONSTEXPR20 void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src,
size_t n,
const div2by1_divider<uint64_t> &div) noexcept;
WJR_INTRINSIC_INLINE void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src,
size_t n,
const div2by1_divider<uint64_t> &div) noexcept;

WJR_INTRINSIC_CONSTEXPR20 void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src,
size_t n, uint64_t div) noexcept;
WJR_INTRINSIC_INLINE void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src,
size_t n, uint64_t div) noexcept;

WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n,
const div3by2_divider<uint64_t> &div) noexcept;
WJR_INTRINSIC_INLINE void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n,
const div3by2_divider<uint64_t> &div) noexcept;

WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n, const uint64_t *div) noexcept;
WJR_INTRINSIC_INLINE void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n, const uint64_t *div) noexcept;

WJR_INTRINSIC_CONSTEXPR20 void div_qr_s(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n, const uint64_t *div, size_t m) noexcept;
WJR_INTRINSIC_INLINE void div_qr_s(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n, const uint64_t *div, size_t m) noexcept;

WJR_INTRINSIC_CONSTEXPR20 uint64_t divexact_dbm1c(uint64_t *dst, const uint64_t *src,
size_t n, uint64_t bd,
Expand Down
175 changes: 22 additions & 153 deletions include/wjr/math/div.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,9 @@ inline uint128_t div128by64to128(uint64_t &rem, uint64_t lo, uint64_t hi,
}

// reference : https://ieeexplore.ieee.org/document/5487506
WJR_INLINE_CONSTEXPR20 uint64_t
div_qr_1_noshift(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n,
const div2by1_divider_noshift<uint64_t> &div) noexcept {
inline uint64_t div_qr_1_noshift(uint64_t *dst, uint64_t &rem, const uint64_t *src,
size_t n,
const div2by1_divider_noshift<uint64_t> &div) noexcept {
WJR_ASSERT_ASSUME(n >= 1);
WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 1, src, n - 1));

Expand Down Expand Up @@ -169,52 +169,10 @@ div_qr_1_noshift(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n,
return qh;
}

WJR_INLINE_CONSTEXPR20 uint64_t
div_qr_1_shift(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n,
const div2by1_divider<uint64_t> &div) noexcept {
WJR_ASSERT_ASSUME(n >= 1);
WJR_ASSERT(div.get_shift() != 0);
WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 1, src, n - 1));

const uint64_t divisor = div.get_divisor();
const uint64_t value = div.get_value();
const auto shift = div.get_shift();

uint64_t qh;
uint64_t lo, hi;

uint64_t rbp = src[n - 1];
--n;
hi = rbp >> (64 - shift);

do {
if (WJR_UNLIKELY(n == 0)) {
qh = div.divide(divisor, value, rbp << shift, hi);
break;
}

lo = src[n - 1];
qh = div.divide(divisor, value, shld(rbp, lo, shift), hi);
rbp = lo;
--n;
extern uint64_t div_qr_1_shift(uint64_t *dst, uint64_t &rem, const uint64_t *src,
size_t n, const div2by1_divider<uint64_t> &div) noexcept;

if (WJR_LIKELY(n != 0)) {
do {
lo = src[n - 1];
dst[n] = div.divide(divisor, value, shld(rbp, lo, shift), hi);
rbp = lo;
--n;
} while (WJR_LIKELY(n != 0));
}

dst[0] = div.divide(divisor, value, rbp << shift, hi);
} while (0);

rem = hi >> shift;
return qh;
}

WJR_INTRINSIC_CONSTEXPR20 uint64_t
WJR_INTRINSIC_INLINE uint64_t
div_qr_1_impl(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n,
const div2by1_divider<uint64_t> &div) noexcept {
if (div.get_shift() == 0) {
Expand All @@ -225,9 +183,9 @@ div_qr_1_impl(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n,
}

// return high quotient limb
WJR_INTRINSIC_CONSTEXPR20 void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src,
size_t n,
const div2by1_divider<uint64_t> &div) noexcept {
WJR_INTRINSIC_INLINE void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src,
size_t n,
const div2by1_divider<uint64_t> &div) noexcept {
WJR_ASSERT_ASSUME(n >= 1);

if (WJR_UNLIKELY(div.is_zero_or_single_bit())) {
Expand Down Expand Up @@ -289,102 +247,14 @@ WJR_INTRINSIC_CONSTEXPR20 void div_qr_1(uint64_t *dst, uint64_t &rem, const uint
dst[n - 1] = div_qr_1_impl(dst, rem, src, n, div2by1_divider<uint64_t>(div));
}

WJR_INLINE_CONSTEXPR20 uint64_t
div_qr_2_noshift(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n,
const div3by2_divider_noshift<uint64_t> &div) noexcept {
WJR_ASSERT_ASSUME(n >= 2);
WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 2, src, n - 2));
WJR_ASSERT_L2(WJR_IS_SEPARATE_P(dst, n - 2, rem, 2));

const uint64_t divisor0 = div.get_divisor0();
const uint64_t divisor1 = div.get_divisor1();
const uint64_t value = div.get_value();

uint64_t qh = 0;
uint64_t u0, u1, u2;

u2 = src[n - 1];
u1 = src[n - 2];

if (__less_equal_128(divisor0, divisor1, u1, u2)) {
__sub_128(u1, u2, u1, u2, divisor0, divisor1);
qh = 1;
}

do {
if (WJR_UNLIKELY(n == 2)) {
break;
}

n -= 2;

do {
u0 = src[n - 1];
dst[n - 1] = div.divide(divisor0, divisor1, value, u0, u1, u2);
--n;
} while (WJR_LIKELY(n != 0));

} while (0);

rem[0] = u1;
rem[1] = u2;
return qh;
}

WJR_INLINE_CONSTEXPR20 uint64_t
div_qr_2_shift(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n,
const div3by2_divider<uint64_t> &div) noexcept {
WJR_ASSERT_ASSUME(n >= 2);
WJR_ASSERT(div.get_shift() != 0);
WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 2, src, n - 2));
WJR_ASSERT_L2(WJR_IS_SEPARATE_P(dst, n - 2, rem, 2));
extern uint64_t div_qr_2_noshift(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n,
const div3by2_divider_noshift<uint64_t> &div) noexcept;

const uint64_t divisor0 = div.get_divisor0();
const uint64_t divisor1 = div.get_divisor1();
const uint64_t value = div.get_value();
const auto shift = div.get_shift();
extern uint64_t div_qr_2_shift(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n, const div3by2_divider<uint64_t> &div) noexcept;

uint64_t qh;
uint64_t u0, u1, u2;
uint64_t rbp;

rbp = src[n - 2];
u2 = src[n - 1];
u1 = shld(u2, rbp, shift);
u2 >>= (64 - shift);

n -= 2;

do {
if (WJR_UNLIKELY(n == 0)) {
qh = div.divide(divisor0, divisor1, value, rbp << shift, u1, u2);
break;
}

u0 = src[n - 1];
qh = div.divide(divisor0, divisor1, value, shld(rbp, u0, shift), u1, u2);
rbp = u0;
--n;

if (WJR_LIKELY(n != 0)) {
do {
u0 = src[n - 1];
dst[n] =
div.divide(divisor0, divisor1, value, shld(rbp, u0, shift), u1, u2);
rbp = u0;
--n;
} while (WJR_LIKELY(n != 0));
}

dst[0] = div.divide(divisor0, divisor1, value, rbp << shift, u1, u2);
} while (0);

rem[0] = shrd(u1, u2, shift);
rem[1] = u2 >> shift;
return qh;
}

WJR_INTRINSIC_CONSTEXPR20 uint64_t
WJR_INTRINSIC_INLINE uint64_t
div_qr_2_impl(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n,
const div3by2_divider<uint64_t> &div) noexcept {
if (div.get_shift() == 0) {
Expand All @@ -394,16 +264,16 @@ div_qr_2_impl(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n,
return div_qr_2_shift(dst, rem, src, n, div);
}

// div_qr_2 overload taking a precomputed div3by2_divider.
// Asserts/assumes n >= 2, forwards every argument to div_qr_2_impl, and stores the
// uint64_t that div_qr_2_impl returns into dst[n - 2].
WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n,
const div3by2_divider<uint64_t> &div) noexcept {
WJR_INTRINSIC_INLINE void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n,
const div3by2_divider<uint64_t> &div) noexcept {
WJR_ASSERT_ASSUME(n >= 2);

// NOTE(review): the returned value is presumably the top quotient limb, with the
// lower limbs and rem filled in by div_qr_2_impl — confirm against its callees.
dst[n - 2] = div_qr_2_impl(dst, rem, src, n, div);
}

WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n, const uint64_t *div) noexcept {
WJR_INTRINSIC_INLINE void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n, const uint64_t *div) noexcept {
WJR_ASSERT_ASSUME(n >= 2);

dst[n - 2] =
Expand All @@ -421,9 +291,8 @@ extern uint64_t dc_div_qr_s(uint64_t *dst, uint64_t *src, size_t n, const uint64
extern void __div_qr_s_impl(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n,
const uint64_t *div, size_t m) noexcept;

WJR_INTRINSIC_CONSTEXPR20 void div_qr_s(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n, const uint64_t *div,
size_t m) noexcept {
WJR_INTRINSIC_INLINE void div_qr_s(uint64_t *dst, uint64_t *rem, const uint64_t *src,
size_t n, const uint64_t *div, size_t m) noexcept {
WJR_ASSERT_ASSUME(m >= 1);
WJR_ASSERT_ASSUME(n >= m);

Expand Down
12 changes: 9 additions & 3 deletions include/wjr/math/mul.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -416,8 +416,14 @@ WJR_INTRINSIC_CONSTEXPR20 uint64_t try_addmul_1(uint64_t *dst, const uint64_t *s
return 0;
}

if (ml == 0) {
return 0;
if constexpr (maxn <= 3) {
if (ml == 0) {
return 0;
}
} else {
if (WJR_UNLIKELY(ml == 0)) {
return 0;
}
}

if constexpr (maxn == 1) {
Expand Down Expand Up @@ -453,7 +459,6 @@ inline constexpr size_t toom3_sqr_threshold = WJR_TOOM3_SQR_THRESHOLD;
// Mode selector enum backed by uint8_t.
// NOTE(review): the enumerator names suggest Toom-Cook multiplication variants, with
// `all` (0x03) presumably meaning "no restriction" — confirm against the users of
// __mul_mode, which are not visible here.
enum class __mul_mode : uint8_t {
toom22 = 0x00,
toom33 = 0x01,
toom44 = 0x02,
all = 0x03,
};

Expand Down Expand Up @@ -652,6 +657,7 @@ void __mul_n(uint64_t *WJR_RESTRICT dst, const uint64_t *src0, const uint64_t *s
} else {
c_out = cf0 * cf1;
}

c_out += try_addmul_1<m0>(dst + n, src1, n, cf0);
c_out += try_addmul_1<m1>(dst + n, src0, n, cf1);
}
Expand Down
11 changes: 9 additions & 2 deletions include/wjr/preprocessor/compiler/attribute.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,11 @@
// Chooses the spelling of WJR_MS_ABI and records its availability in
// WJR_HAS_FEATURE_MS_ABI.
#if defined(WJR_COMPILER_MSVC)
// MSVC: WJR_MS_ABI expands to nothing.
#define WJR_MS_ABI
#define WJR_HAS_FEATURE_MS_ABI WJR_HAS_DEF
#elif defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC)
#elif WJR_HAS_ATTRIBUTE(__ms_abi__)
// Compilers that report the __ms_abi__ attribute: request it explicitly.
#define WJR_MS_ABI __attribute__((__ms_abi__))
#define WJR_HAS_FEATURE_MS_ABI WJR_HAS_DEF
#else
#elif defined(WJR_ENABLE_ASSEMBLY)
// No ms_abi spelling available: WJR_ENABLE_ASSEMBLY is withdrawn if it was defined.
// NOTE(review): presumably the assembly routines rely on the ms_abi calling
// convention — confirm.
#undef WJR_ENABLE_ASSEMBLY
#endif

#define WJR_ASSUME_MAY_NOT_PURE(expr) \
Expand Down Expand Up @@ -238,6 +239,12 @@
#define WJR_MALLOC
#endif

// WJR_NONNULL(idx...) marks the listed pointer parameters (1-based indices, as the
// GCC/Clang `nonnull` attribute expects) as never null. It expands to nothing when
// the compiler does not report support for the attribute.
// The attribute name has to be spelled inside the expansion: forwarding the
// arguments alone would produce `__attribute__((1, 2))`, which is not a valid
// attribute and never applies `nonnull`.
#if WJR_HAS_ATTRIBUTE(nonnull)
#define WJR_NONNULL(...) __attribute__((__nonnull__(__VA_ARGS__)))
#else
#define WJR_NONNULL(...)
#endif

#define WJR_INLINE inline
#define WJR_CONSTEXPR constexpr

Expand Down
24 changes: 12 additions & 12 deletions include/wjr/x86/math/mul-impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,6 @@ namespace wjr {

#if defined(__BMI2__) && defined(__ADX__)

#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM)
#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF
#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF
#elif defined(WJR_ENABLE_ASSEMBLY)
#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF_VAR(3)
#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF_VAR(3)
#endif

#endif

#if defined(__BMI2__) && defined(__ADX__)

#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM)
#define WJR_HAS_BUILTIN_ASM_SUBMUL_1 WJR_HAS_DEF
#elif defined(WJR_ENABLE_ASSEMBLY)
Expand All @@ -79,6 +67,18 @@ namespace wjr {

#endif

// Availability flags for the BASECASE_MUL_S and BASECASE_SQR assembly builtins.
// They are only defined when the compiler targets both BMI2 and ADX.
#if defined(__BMI2__) && defined(__ADX__)

// With GCC-style inline asm both flags take the plain WJR_HAS_DEF value; otherwise,
// when WJR_ENABLE_ASSEMBLY is defined, they take WJR_HAS_DEF_VAR(3). If neither
// holds, the flags stay undefined.
// NOTE(review): the meaning of variant 3 is not visible here — presumably it tags
// an out-of-line assembly implementation; confirm against WJR_HAS_DEF_VAR.
#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM)
#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF
#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF
#elif defined(WJR_ENABLE_ASSEMBLY)
#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF_VAR(3)
#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF_VAR(3)
#endif

#endif

} // namespace wjr

#endif // WJR_X86_MATH_MUL_IMPL_HPP__
Loading

0 comments on commit fe82d52

Please sign in to comment.