Skip to content

Commit

Permalink
fix bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
wjr-z committed Jan 26, 2024
1 parent d27d94e commit 9e5ae53
Show file tree
Hide file tree
Showing 7 changed files with 339 additions and 102 deletions.
46 changes: 43 additions & 3 deletions include/wjr/math/mul.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,11 +210,15 @@ WJR_INTRINSIC_CONSTEXPR_E T addmul_1(T *dst, const T *src, size_t n,
}

#if WJR_HAS_BUILTIN(ASM_ADDMUL_1)
if (is_constant_evaluated()) {
if constexpr (sizeof(T) == 8) {
if (is_constant_evaluated()) {
return fallback_addmul_1(dst, src, n, ml);
}

return asm_addmul_1(dst, src, n, ml);
} else {
return fallback_addmul_1(dst, src, n, ml);
}

return asm_addmul_1(dst, src, n, ml);
#else
return fallback_addmul_1(dst, src, n, ml);
#endif
Expand Down Expand Up @@ -260,6 +264,42 @@ WJR_INTRINSIC_CONSTEXPR_E T submul_1(T *dst, const T *src, size_t n,
#endif
}

// Portable implementation of dst[0..n) = src0[0..n) + (src1[0..n) << cl).
//
// Each limb of src1 is split into the part that stays in the limb after the
// left shift and the part that spills into the next limb. The spill and the
// two carries produced by the additions are folded into one running value
// that is returned after the last limb.
//
// Shifts right by (digits - cl), so cl == 0 must be handled by the caller
// (addlsh_n does so before dispatching here).
template <typename T>
WJR_INTRINSIC_CONSTEXPR T fallback_addlsh_n(T *dst, const T *src0, const T *src1,
                                            size_t n, type_identity_t<T> cl) {
    const T rshift = std::numeric_limits<T>::digits - cl;
    T carry = 0;     // spill + carries travelling into the next limb
    T add_carry = 0; // carry out of the "+ src0[idx]" step

    for (size_t idx = 0; idx != n; ++idx) {
        const T limb = src1[idx];
        const T spill = limb >> rshift;

        T sum = limb << cl;
        // Fold the incoming carry first; `carry` is reused as carry-out.
        sum = addc<T>(sum, carry, 0u, carry);
        dst[idx] = addc<T>(sum, src0[idx], 0u, add_carry);

        carry += spill + add_carry;
    }

    return carry;
}

// Computes dst[0..n) = src0[0..n) + (src1[0..n) << cl) and returns the value
// carried out of the most significant limb.
//
// cl == 0 degenerates to a plain n-limb addition and is forwarded to addc_n.
// Otherwise the hand-written asm kernel is used when it is available, the
// limb type is 8 bytes wide and the call is not being constant-evaluated;
// every other case goes through fallback_addlsh_n.
template <typename T>
WJR_INTRINSIC_CONSTEXPR_E T addlsh_n(T *dst, const T *src0, const T *src1, size_t n,
                                     type_identity_t<T> cl) {
    if (WJR_UNLIKELY(cl == 0)) {
        return wjr::addc_n(dst, src0, src1, n, 0u);
    }

#if WJR_HAS_BUILTIN(ASM_ADDLSH_N)
    // asm_addlsh_n only exists for 64-bit limbs; narrower limb types must not
    // instantiate the call (same guard as addmul_1 uses for asm_addmul_1).
    if constexpr (sizeof(T) == 8) {
        if (is_constant_evaluated()) {
            return fallback_addlsh_n(dst, src0, src1, n, cl);
        }

        return asm_addlsh_n(dst, src0, src1, n, cl);
    } else {
        return fallback_addlsh_n(dst, src0, src1, n, cl);
    }
#else
    return fallback_addlsh_n(dst, src0, src1, n, cl);
#endif
}

// preview :

// native default threshold of toom-cook-2
Expand Down
17 changes: 14 additions & 3 deletions include/wjr/preprocessor/preview.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,18 @@
#endif //

#if defined(NDEBUG)
#define WJR_ASSERT_NOMESSAGE_I(expr) WJR_ASSUME(expr)
#define WJR_ASSERT_MESSAGE_I(expr) WJR_UNREACHABLE()
// NDEBUG variant of the message-less assertion: evaluates expr and calls
// std::abort() when it is false. WJR_UNREACHABLE() after the abort marks the
// failure path as never continuing. The do/while(0) wrapper makes the macro
// usable as a single statement.
#define WJR_ASSERT_NOMESSAGE_I(expr) \
do { \
if (WJR_UNLIKELY(!(expr))) { \
std::abort(); \
WJR_UNREACHABLE(); \
} \
} while (0)
// NDEBUG variant of the failure action for assertions that carry a message:
// aborts unconditionally; expr is not evaluated here.
// NOTE(review): presumably only expanded after the caller has already found
// the assertion to be false -- confirm against the message-checking macro.
#define WJR_ASSERT_MESSAGE_I(expr) \
do { \
std::abort(); \
WJR_UNREACHABLE(); \
} while (0)
#else
#define WJR_ASSERT_NOMESSAGE_I(expr) assert(expr)
#define WJR_ASSERT_MESSAGE_I(expr) \
Expand Down Expand Up @@ -58,8 +68,9 @@
WJR_ASSERT_CHECK_I_MESSAGE) \
(__VA_ARGS__)

// Unchecked assertion: performs no runtime test. The expression is only
// handed to WJR_ASSUME; any further arguments are ignored.
#define WJR_ASSERT_UNCHECK_I(...) \
#define WJR_ASSERT_UNCHECK_I(expr, ...) \
do { \
WJR_ASSUME(expr); \
} while (0)

// level = [0, 2]
Expand Down
4 changes: 2 additions & 2 deletions include/wjr/x86/add.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ namespace wjr {
(defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC))
#define WJR_HAS_BUILTIN_ASM_ADDC WJR_HAS_DEF
#define WJR_HAS_BUILTIN_ASM_ADDC_N WJR_HAS_DEF
#endif

#define WJR_ADDSUB_I 1

#include <wjr/x86/gen_addsub.hpp>

#endif

} // namespace wjr

#endif // WJR_X86_ADD_HPP__
210 changes: 210 additions & 0 deletions include/wjr/x86/gen_addrsblsh_n.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
// Generator header: the including file must define WJR_ADDSUB_I first.
// WJR_ADDSUB_I :
// 0 : RSB (generates asm_rsblsh_n, carry chain uses sbb)
// 1 : ADD (generates asm_addlsh_n, carry chain uses adc)

// Refuse to compile when no variant was selected by the includer.
#ifndef WJR_ADDSUB_I
#error "abort"
#endif

// Name fragment of the generated function and the carry-propagating
// instruction mnemonic, both selected by WJR_ADDSUB_I.
#define WJR_addsub WJR_PP_BOOL_IF(WJR_ADDSUB_I, add, rsb)
#define WJR_adcsbb WJR_PP_BOOL_IF(WJR_ADDSUB_I, adc, sbb)

// x86-64 kernel generated as asm_addlsh_n (WJR_ADDSUB_I == 1) or asm_rsblsh_n
// (WJR_ADDSUB_I == 0).
//
//   add variant: dst[i] = src0[i] + (src1 << cl)[i]
//   rsb variant: dst[i] = (src1 << cl)[i] - src0[i]
//
// for i in [0, n), where (src1 << cl) is the n-limb value shifted left by cl
// bits. Returns the bits shifted out of the top limb combined with the final
// carry/borrow through one last adc/sbb.
//
// Requires n != 0 and cl != 0 (asserted). Uses shlx/shrx, i.e. needs BMI2.
//
// Layout: n % 8 selects an entry point through a relative jump table; the
// entries bias the pointers so that execution can fall into the middle of an
// 8-way unrolled loop. Two registers (r9/r10) alternate between "current
// source limb" and "bits spilled from the previous limb". Only adc/sbb touch
// the carry flag inside the loop (lea, mov, shlx, shrx and jrcxz leave it
// alone), which is what keeps the carry chain intact across iterations.
// The last limb is always handled by a tail (.Ld1 for n == 1, otherwise the
// code after .Lloop_out).
WJR_INLINE uint64_t WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addsub, lsh_n))(
    uint64_t *dst, const uint64_t *src0, const uint64_t *src1, size_t n, uint64_t cl) {
    WJR_ASSERT(n != 0);
    WJR_ASSERT(cl != 0);

    size_t cx = n / 8;       // number of full trips through the unrolled loop
    uint64_t tcl = 64 - cl;  // right-shift count that extracts the spilled bits
    uint64_t r8, r9 = n, r10;

    asm volatile(
        // r9 = n % 8, then jump to .Ll<r9> through the offset table.
        "and{l $7, %k[r9]| %k[r9], 7}\n\t"
        "lea{q| %[r8], [rip +} .Llookup%={(%%rip), %[r8]|]}\n\t"
        "movs{lq (%[r8], %[r9], 4), %[r9]|xd %[r9], DWORD PTR [%[r8] + "
        "%[r9] * 4]}\n\t"
        "lea{q (%[r8], %[r9], 1), %[r9]| %[r9], [%[r9] + %[r8]]}\n\t"
        "jmp{q *%[r9]| %[r9]}\n\t"

        ".align 8\n\t"
        ".Llookup%=:\n\t"
        ".long .Ll0%=-.Llookup%=\n\t"
        ".long .Ll1%=-.Llookup%=\n\t"
        ".long .Ll2%=-.Llookup%=\n\t"
        ".long .Ll3%=-.Llookup%=\n\t"
        ".long .Ll4%=-.Llookup%=\n\t"
        ".long .Ll5%=-.Llookup%=\n\t"
        ".long .Ll6%=-.Llookup%=\n\t"
        ".long .Ll7%=-.Llookup%=\n\t"
        ".align 16\n\t"

        // Entry points: load the first source limb, zero the "previous
        // spill" register (the xor also clears CF) and bias the pointers.
        ".Ll0%=:\n\t"
        "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t"
        "xor %k[r9], %k[r9]\n\t"
        "jmp .Lb0%=\n\t"

        ".Ll2%=:\n\t"
        "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t"
        "xor %k[r9], %k[r9]\n\t"
        "lea{q -48(%[src0]), %[src0]| %[src0], [%[src0] - 48]}\n\t"
        "lea{q -48(%[src1]), %[src1]| %[src1], [%[src1] - 48]}\n\t"
        "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t"
        "jmp .Lb2%=\n\t"

        ".Ll3%=:\n\t"
        "mov{q (%[src1]), %[r9]| %[r9], [%[src1]]}\n\t"
        "xor %k[r10], %k[r10]\n\t"
        "lea{q -40(%[src0]), %[src0]| %[src0], [%[src0] - 40]}\n\t"
        "lea{q -40(%[src1]), %[src1]| %[src1], [%[src1] - 40]}\n\t"
        "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t"
        "jmp .Lb3%=\n\t"

        ".Ll4%=:\n\t"
        "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t"
        "xor %k[r9], %k[r9]\n\t"
        "lea{q -32(%[src0]), %[src0]| %[src0], [%[src0] - 32]}\n\t"
        "lea{q -32(%[src1]), %[src1]| %[src1], [%[src1] - 32]}\n\t"
        "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t"
        "jmp .Lb4%=\n\t"

        ".Ll5%=:\n\t"
        "mov{q (%[src1]), %[r9]| %[r9], [%[src1]]}\n\t"
        "xor %k[r10], %k[r10]\n\t"
        "lea{q -24(%[src0]), %[src0]| %[src0], [%[src0] - 24]}\n\t"
        "lea{q -24(%[src1]), %[src1]| %[src1], [%[src1] - 24]}\n\t"
        "lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 24]}\n\t"
        "jmp .Lb5%=\n\t"

        ".Ll6%=:\n\t"
        "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t"
        "xor %k[r9], %k[r9]\n\t"
        "lea{q -16(%[src0]), %[src0]| %[src0], [%[src0] - 16]}\n\t"
        "lea{q -16(%[src1]), %[src1]| %[src1], [%[src1] - 16]}\n\t"
        "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t"
        "jmp .Lb6%=\n\t"

        ".Ll7%=:\n\t"
        "mov{q (%[src1]), %[r9]| %[r9], [%[src1]]}\n\t"
        "xor %k[r10], %k[r10]\n\t"
        "lea{q -8(%[src0]), %[src0]| %[src0], [%[src0] - 8]}\n\t"
        "lea{q -8(%[src1]), %[src1]| %[src1], [%[src1] - 8]}\n\t"
        "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t"
        "jmp .Lb7%=\n\t"

        // n == 1: single limb, no loop. The addend comes from src0 (it was
        // read from dst before, which is only right for in-place calls).
        ".Ld1%=:\n\t"
        "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t"
        WJR_PP_STR(WJR_adcsbb) "{q (%[src0]), %[r8]| %[r8], [%[src0]]}\n\t"
        "shrx{q %[tcl], %[r9], %[r9]| %[r9], %[r9], %[tcl]}\n\t"
        "mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t"
        WJR_PP_STR(WJR_adcsbb) "{q %[cx], %[r9]| %[r9], %[cx]}\n\t"
        "jmp .Ldone%=\n\t"

        ".Ll1%=:\n\t"
        "mov{q (%[src1]), %[r9]| %[r9], [%[src1]]}\n\t"
        "xor %k[r10], %k[r10]\n\t"
        "jrcxz .Ld1%=\n\t"
        "lea{q 8(%[src0]), %[src0]| %[src0], [%[src0] + 8]}\n\t"
        "lea{q 8(%[src1]), %[src1]| %[src1], [%[src1] + 8]}\n\t"
        "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t"

        ".align 32\n\t"
        ".Lloop%=:\n\t"

        // Each step: r8 = (cur << cl) + previous spill, preload the next
        // source limb, adc/sbb with src0, turn cur into its spill, store.
        // In the Intel-dialect halves loads and adc/sbb are written as
        // "reg, [mem]" so both dialects describe the same operation.
        ".Lb1%=:\n\t"
        "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t"
        "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t"
        "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t"
        WJR_PP_STR(WJR_adcsbb) "{q -8(%[src0]), %[r8]| %[r8], [%[src0] - 8]}\n\t"
        "shrx{q %[tcl], %[r9], %[r9]| %[r9], %[r9], %[tcl]}\n\t"
        "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t"

        ".Lb0%=:\n\t"
        "shlx{q %[cl], %[r10], %[r8]| %[r8], %[r10], %[cl]}\n\t"
        "lea{q (%[r9], %[r8]), %[r8]| %[r8], [%[r9] + %[r8]]}\n\t"
        "mov{q 8(%[src1]), %[r9]| %[r9], [%[src1] + 8]}\n\t"
        WJR_PP_STR(WJR_adcsbb) "{q (%[src0]), %[r8]| %[r8], [%[src0]]}\n\t"
        "shrx{q %[tcl], %[r10], %[r10]| %[r10], %[r10], %[tcl]}\n\t"
        "mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t"

        // Trip counter; lea is used so that CF is not disturbed.
        "lea{q -1(%[cx]), %[cx]| %[cx], [%[cx] - 1]}\n\t"

        ".Lb7%=:\n\t"
        "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t"
        "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t"
        "mov{q 16(%[src1]), %[r10]| %[r10], [%[src1] + 16]}\n\t"
        WJR_PP_STR(WJR_adcsbb) "{q 8(%[src0]), %[r8]| %[r8], [%[src0] + 8]}\n\t"
        "shrx{q %[tcl], %[r9], %[r9]| %[r9], %[r9], %[tcl]}\n\t"
        "mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t"

        ".Lb6%=:\n\t"
        "shlx{q %[cl], %[r10], %[r8]| %[r8], %[r10], %[cl]}\n\t"
        "lea{q (%[r9], %[r8]), %[r8]| %[r8], [%[r9] + %[r8]]}\n\t"
        "mov{q 24(%[src1]), %[r9]| %[r9], [%[src1] + 24]}\n\t"
        WJR_PP_STR(WJR_adcsbb) "{q 16(%[src0]), %[r8]| %[r8], [%[src0] + 16]}\n\t"
        "shrx{q %[tcl], %[r10], %[r10]| %[r10], %[r10], %[tcl]}\n\t"
        "mov{q %[r8], 16(%[dst])| [%[dst] + 16], %[r8]}\n\t"

        ".Lb5%=:\n\t"
        "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t"
        "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t"
        "mov{q 32(%[src1]), %[r10]| %[r10], [%[src1] + 32]}\n\t"
        WJR_PP_STR(WJR_adcsbb) "{q 24(%[src0]), %[r8]| %[r8], [%[src0] + 24]}\n\t"
        "shrx{q %[tcl], %[r9], %[r9]| %[r9], %[r9], %[tcl]}\n\t"
        "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t"

        ".Lb4%=:\n\t"
        "shlx{q %[cl], %[r10], %[r8]| %[r8], %[r10], %[cl]}\n\t"
        "lea{q (%[r9], %[r8]), %[r8]| %[r8], [%[r9] + %[r8]]}\n\t"
        "mov{q 40(%[src1]), %[r9]| %[r9], [%[src1] + 40]}\n\t"
        WJR_PP_STR(WJR_adcsbb) "{q 32(%[src0]), %[r8]| %[r8], [%[src0] + 32]}\n\t"
        "shrx{q %[tcl], %[r10], %[r10]| %[r10], %[r10], %[tcl]}\n\t"
        "mov{q %[r8], 32(%[dst])| [%[dst] + 32], %[r8]}\n\t"

        ".Lb3%=:\n\t"
        "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t"
        "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t"
        "mov{q 48(%[src1]), %[r10]| %[r10], [%[src1] + 48]}\n\t"
        WJR_PP_STR(WJR_adcsbb) "{q 40(%[src0]), %[r8]| %[r8], [%[src0] + 40]}\n\t"
        "shrx{q %[tcl], %[r9], %[r9]| %[r9], %[r9], %[tcl]}\n\t"
        "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t"

        ".Lb2%=:\n\t"
        "shlx{q %[cl], %[r10], %[r8]| %[r8], %[r10], %[cl]}\n\t"
        "lea{q (%[r9], %[r8]), %[r8]| %[r8], [%[r9] + %[r8]]}\n\t"
        "mov{q 56(%[src1]), %[r9]| %[r9], [%[src1] + 56]}\n\t"
        WJR_PP_STR(WJR_adcsbb) "{q 48(%[src0]), %[r8]| %[r8], [%[src0] + 48]}\n\t"
        "shrx{q %[tcl], %[r10], %[r10]| %[r10], %[r10], %[tcl]}\n\t"
        "mov{q %[r8], 48(%[dst])| [%[dst] + 48], %[r8]}\n\t"

        "lea{q 64(%[src0]), %[src0]| %[src0], [%[src0] + 64]}\n\t"
        "lea{q 64(%[src1]), %[src1]| %[src1], [%[src1] + 64]}\n\t"
        "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t"

        "jrcxz .Lloop_out%=\n\t"
        "jmp .Lloop%=\n\t"
        ".Lloop_out%=:\n\t"

        // Tail: the last limb is already in r9. Its addend is src0[n - 1]
        // (previously read from dst, which broke out-of-place calls).
        "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t"
        "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t"
        WJR_PP_STR(WJR_adcsbb) "{q -8(%[src0]), %[r8]| %[r8], [%[src0] - 8]}\n\t"
        "shrx{q %[tcl], %[r9], %[r9]| %[r9], %[r9], %[tcl]}\n\t"
        "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t"
        // cx is 0 here, so this only folds the final carry/borrow into r9.
        WJR_PP_STR(WJR_adcsbb) "{q %[cx], %[r9]| %[r9], %[cx]}\n\t"

        ".Ldone%=:"

        : [dst] "+r"(dst), [src0] "+r"(src0), [src1] "+r"(src1), [cx] "+c"(cx),
          [cl] "+r"(cl), [tcl] "+r"(tcl), [r8] "=r"(r8), [r9] "+r"(r9),
          [r10] "=r"(r10)
        :
        : "cc", "memory");

    WJR_ASSUME(cx == 0);

    return r9;
}

// Drop every helper macro defined by this header, including the selector, so
// the header can be included again to generate the other variant without
// redefining WJR_addsub / WJR_adcsbb.
#undef WJR_adcsbb
#undef WJR_addsub

#undef WJR_ADDSUB_I
Loading

0 comments on commit 9e5ae53

Please sign in to comment.