diff --git a/include/wjr/math/mul.hpp b/include/wjr/math/mul.hpp index a4dc350d..75217cf2 100644 --- a/include/wjr/math/mul.hpp +++ b/include/wjr/math/mul.hpp @@ -210,11 +210,15 @@ WJR_INTRINSIC_CONSTEXPR_E T addmul_1(T *dst, const T *src, size_t n, } #if WJR_HAS_BUILTIN(ASM_ADDMUL_1) - if (is_constant_evaluated()) { + if constexpr (sizeof(T) == 8) { + if (is_constant_evaluated()) { + return fallback_addmul_1(dst, src, n, ml); + } + + return asm_addmul_1(dst, src, n, ml); + } else { return fallback_addmul_1(dst, src, n, ml); } - - return asm_addmul_1(dst, src, n, ml); #else return fallback_addmul_1(dst, src, n, ml); #endif @@ -260,6 +264,42 @@ WJR_INTRINSIC_CONSTEXPR_E T submul_1(T *dst, const T *src, size_t n, #endif } +template +WJR_INTRINSIC_CONSTEXPR T fallback_addlsh_n(T *dst, const T *src0, const T *src1, + size_t n, type_identity_t cl) { + T tcl = std::numeric_limits::digits - cl; + T lo = 0, hi = 0; + T o_in = 0, c_in = 0; + + for (size_t i = 0; i < n; ++i) { + lo = src1[i] << cl; + hi = src1[i] >> tcl; + lo = addc(lo, c_in, 0u, c_in); + dst[i] = addc(lo, src0[i], 0u, o_in); + c_in += hi + o_in; + } + + return c_in; +} + +template +WJR_INTRINSIC_CONSTEXPR_E T addlsh_n(T *dst, const T *src0, const T *src1, size_t n, + type_identity_t cl) { + if (WJR_UNLIKELY(cl == 0)) { + return wjr::addc_n(dst, src0, src1, n, 0u); + } + +#if WJR_HAS_BUILTIN(ASM_ADDLSH_N) + if (is_constant_evaluated()) { + return fallback_addlsh_n(dst, src0, src1, n, cl); + } + + return asm_addlsh_n(dst, src0, src1, n, cl); +#else + return fallback_addlsh_n(dst, src0, src1, n, cl); +#endif +} + // preview : // native default threshold of toom-cook-2 diff --git a/include/wjr/preprocessor/preview.hpp b/include/wjr/preprocessor/preview.hpp index 5b93d639..2948cb47 100644 --- a/include/wjr/preprocessor/preview.hpp +++ b/include/wjr/preprocessor/preview.hpp @@ -29,8 +29,18 @@ #endif // #if defined(NDEBUG) -#define WJR_ASSERT_NOMESSAGE_I(expr) WJR_ASSUME(expr) -#define WJR_ASSERT_MESSAGE_I(expr) 
WJR_UNREACHABLE() +#define WJR_ASSERT_NOMESSAGE_I(expr) \ + do { \ + if (WJR_UNLIKELY(!(expr))) { \ + std::abort(); \ + WJR_UNREACHABLE(); \ + } \ + } while (0) +#define WJR_ASSERT_MESSAGE_I(expr) \ + do { \ + std::abort(); \ + WJR_UNREACHABLE(); \ + } while (0) #else #define WJR_ASSERT_NOMESSAGE_I(expr) assert(expr) #define WJR_ASSERT_MESSAGE_I(expr) \ @@ -58,8 +68,9 @@ WJR_ASSERT_CHECK_I_MESSAGE) \ (__VA_ARGS__) -#define WJR_ASSERT_UNCHECK_I(...) \ +#define WJR_ASSERT_UNCHECK_I(expr, ...) \ do { \ + WJR_ASSUME(expr); \ } while (0) // level = [0, 2] diff --git a/include/wjr/x86/add.hpp b/include/wjr/x86/add.hpp index 19ef8b36..2397f59e 100644 --- a/include/wjr/x86/add.hpp +++ b/include/wjr/x86/add.hpp @@ -13,12 +13,12 @@ namespace wjr { (defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC)) #define WJR_HAS_BUILTIN_ASM_ADDC WJR_HAS_DEF #define WJR_HAS_BUILTIN_ASM_ADDC_N WJR_HAS_DEF -#endif #define WJR_ADDSUB_I 1 - #include +#endif + } // namespace wjr #endif // WJR_X86_ADD_HPP__ \ No newline at end of file diff --git a/include/wjr/x86/gen_addrsblsh_n.hpp b/include/wjr/x86/gen_addrsblsh_n.hpp new file mode 100644 index 00000000..d2247f84 --- /dev/null +++ b/include/wjr/x86/gen_addrsblsh_n.hpp @@ -0,0 +1,210 @@ +// WJR_ADDSUB_I : +// 0 : SUB +// 1 : ADD + +#ifndef WJR_ADDSUB_I +#error "abort" +#endif + +#define WJR_addsub WJR_PP_BOOL_IF(WJR_ADDSUB_I, add, rsb) +#define WJR_adcsbb WJR_PP_BOOL_IF(WJR_ADDSUB_I, adc, sbb) + +WJR_INLINE uint64_t WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addsub, lsh_n))( + uint64_t *dst, const uint64_t *src0, const uint64_t *src1, size_t n, uint64_t cl) { + WJR_ASSERT(n != 0); + WJR_ASSERT(cl != 0); + + size_t cx = n / 8; + uint64_t tcl = 64 - cl; + uint64_t r8, r9 = n, r10; + + asm volatile( + "and{l $7, %k[r9]| %k[r9], 7}\n\t" + "lea{q| %[r8], [rip +} .Llookup%={(%%rip), %[r8]|]}\n\t" + "movs{lq (%[r8], %[r9], 4), %[r9]|xd %[r9], DWORD PTR [%[r8] + " + "%[r9] * 4]}\n\t" + "lea{q (%[r8], %[r9], 1), %[r9]| %[r9], [%[r9] + %[r8]]}\n\t" + 
"jmp{q *%[r9]| %[r9]}\n\t" + + ".align 8\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + ".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" + ".align 16\n\t" + + ".Ll0%=:\n\t" + "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t" + "xor %k[r9], %k[r9]\n\t" + "jmp .Lb0%=\n\t" + + ".Ll2%=:\n\t" + "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t" + "xor %k[r9], %k[r9]\n\t" + "lea{q -48(%[src0]), %[src0]| %[src0], [%[src0] - 48]}\n\t" + "lea{q -48(%[src1]), %[src1]| %[src1], [%[src1] - 48]}\n\t" + "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t" + "jmp .Lb2%=\n\t" + + ".Ll3%=:\n\t" + "mov{q (%[src1]), %[r9]| %[r9], [%[src1]]}\n\t" + "xor %k[r10], %k[r10]\n\t" + "lea{q -40(%[src0]), %[src0]| %[src0], [%[src0] - 40]}\n\t" + "lea{q -40(%[src1]), %[src1]| %[src1], [%[src1] - 40]}\n\t" + "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t" + "jmp .Lb3%=\n\t" + + ".Ll4%=:\n\t" + "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t" + "xor %k[r9], %k[r9]\n\t" + "lea{q -32(%[src0]), %[src0]| %[src0], [%[src0] - 32]}\n\t" + "lea{q -32(%[src1]), %[src1]| %[src1], [%[src1] - 32]}\n\t" + "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t" + "jmp .Lb4%=\n\t" + + ".Ll5%=:\n\t" + "mov{q (%[src1]), %[r9]| %[r9], [%[src1]]}\n\t" + "xor %k[r10], %k[r10]\n\t" + "lea{q -24(%[src0]), %[src0]| %[src0], [%[src0] - 24]}\n\t" + "lea{q -24(%[src1]), %[src1]| %[src1], [%[src1] - 24]}\n\t" + "lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 24]}\n\t" + "jmp .Lb5%=\n\t" + + ".Ll6%=:\n\t" + "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t" + "xor %k[r9], %k[r9]\n\t" + "lea{q -16(%[src0]), %[src0]| %[src0], [%[src0] - 16]}\n\t" + "lea{q -16(%[src1]), %[src1]| %[src1], [%[src1] - 16]}\n\t" + "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t" + "jmp .Lb6%=\n\t" + + ".Ll7%=:\n\t" + "mov{q (%[src1]), %[r9]| %[r9], 
[%[src1]]}\n\t" + "xor %k[r10], %k[r10]\n\t" + "lea{q -8(%[src0]), %[src0]| %[src0], [%[src0] - 8]}\n\t" + "lea{q -8(%[src1]), %[src1]| %[src1], [%[src1] - 8]}\n\t" + "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t" + "jmp .Lb7%=\n\t" + + ".Ld1%=:\n\t" + "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q (%[dst]), %[r8]| [%[dst]], %[r8]}\n\t" + "shrx{q %[tcl], %[r9], %[r9]| %[r9], %[r9], %[tcl]}\n\t" + "mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q %[cx], %[r9]| %[r9], %[cx]}\n\t" + "jmp .Ldone%=\n\t" + + ".Ll1%=:\n\t" + "mov{q (%[src1]), %[r9]| %[r9], [%[src1]]}\n\t" + "xor %k[r10], %k[r10]\n\t" + "jrcxz .Ld1%=\n\t" + "lea{q 8(%[src0]), %[src0]| %[src0], [%[src0] + 8]}\n\t" + "lea{q 8(%[src1]), %[src1]| %[src1], [%[src1] + 8]}\n\t" + "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" + + ".align 32\n\t" + ".Lloop%=:\n\t" + + ".Lb1%=:\n\t" + "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t" + "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t" + "mov{q (%[src1]), %[r10]| %[r10], [%[src1]]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q -8(%[src0]), %[r8]| [%[src0] - 8], %[r8]}\n\t" + "shrx{q %[tcl], %[r9], %[r9]| %[r9],%[r9], %[tcl]}\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + + ".Lb0%=:\n\t" + "shlx{q %[cl], %[r10], %[r8]| %[r8], %[r10], %[cl]}\n\t" + "lea{q (%[r9], %[r8]), %[r8]| %[r8], [%[r9] + %[r8]]}\n\t" + "mov{q 8(%[src1]), %[r9]| %[r9], [%[src1] + 8]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q (%[src0]), %[r8]| [%[src0]], %[r8]}\n\t" + "shrx{q %[tcl], %[r10], %[r10]| %[r10],%[r10], %[tcl]}\n\t" + "mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t" + + "lea{q -1(%[cx]), %[cx]| %[cx], [%[cx] - 1]}\n\t" + + ".Lb7%=:\n\t" + "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t" + "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t" + "mov{q 16(%[src1]), %[r10]| %[r10], [%[src1] + 16]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q 8(%[src0]), %[r8]| [%[src0] + 8], %[r8]}\n\t" + "shrx{q %[tcl],
%[r9], %[r9]| %[r9],%[r9], %[tcl]}\n\t" + "mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t" + + ".Lb6%=:\n\t" + "shlx{q %[cl], %[r10], %[r8]| %[r8], %[r10], %[cl]}\n\t" + "lea{q (%[r9], %[r8]), %[r8]| %[r8], [%[r9] + %[r8]]}\n\t" + "mov{q 24(%[src1]), %[r9]| %[r9], [%[src1] + 24]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q 16(%[src0]), %[r8]| [%[src0] + 16], %[r8]}\n\t" + "shrx{q %[tcl], %[r10], %[r10]| %[r10],%[r10], %[tcl]}\n\t" + "mov{q %[r8], 16(%[dst])| [%[dst] + 16], %[r8]}\n\t" + + ".Lb5%=:\n\t" + "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t" + "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t" + "mov{q 32(%[src1]), %[r10]| %[r10], [%[src1] + 32]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q 24(%[src0]), %[r8]| [%[src0] + 24], %[r8]}\n\t" + "shrx{q %[tcl], %[r9], %[r9]| %[r9],%[r9], %[tcl]}\n\t" + "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t" + + ".Lb4%=:\n\t" + "shlx{q %[cl], %[r10], %[r8]| %[r8], %[r10], %[cl]}\n\t" + "lea{q (%[r9], %[r8]), %[r8]| %[r8], [%[r9] + %[r8]]}\n\t" + "mov{q 40(%[src1]), %[r9]| %[r9], [%[src1] + 40]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q 32(%[src0]), %[r8]| [%[src0] + 32], %[r8]}\n\t" + "shrx{q %[tcl], %[r10], %[r10]| %[r10],%[r10], %[tcl]}\n\t" + "mov{q %[r8], 32(%[dst])| [%[dst] + 32], %[r8]}\n\t" + + ".Lb3%=:\n\t" + "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t" + "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t" + "mov{q 48(%[src1]), %[r10]| %[r10], [%[src1] + 48]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q 40(%[src0]), %[r8]| [%[src0] + 40], %[r8]}\n\t" + "shrx{q %[tcl], %[r9], %[r9]| %[r9],%[r9], %[tcl]}\n\t" + "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t" + + ".Lb2%=:\n\t" + "shlx{q %[cl], %[r10], %[r8]| %[r8], %[r10], %[cl]}\n\t" + "lea{q (%[r9], %[r8]), %[r8]| %[r8], [%[r9] + %[r8]]}\n\t" + "mov{q 56(%[src1]), %[r9]| %[r9], [%[src1] + 56]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q 48(%[src0]), %[r8]| [%[src0] + 48], %[r8]}\n\t" + "shrx{q %[tcl], %[r10], %[r10]| %[r10],%[r10], %[tcl]}\n\t" + "mov{q %[r8],
48(%[dst])| [%[dst] + 48], %[r8]}\n\t" + + "lea{q 64(%[src0]), %[src0]| %[src0], [%[src0] + 64]}\n\t" + "lea{q 64(%[src1]), %[src1]| %[src1], [%[src1] + 64]}\n\t" + "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" + + "jrcxz .Lloop_out%=\n\t" + "jmp .Lloop%=\n\t" + ".Lloop_out%=:\n\t" + + "shlx{q %[cl], %[r9], %[r8]| %[r8], %[r9], %[cl]}\n\t" + "lea{q (%[r10], %[r8]), %[r8]| %[r8], [%[r10] + %[r8]]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q -8(%[dst]), %[r8]| [%[dst] - 8], %[r8]}\n\t" + "shrx{q %[tcl], %[r9], %[r9]| %[r9],%[r9], %[tcl]}\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + WJR_PP_STR(WJR_adcsbb) "{q %[cx], %[r9]| %[r9], %[cx]}\n\t" + + ".Ldone%=:" + + : [dst] "+r"(dst), [src0] "+r"(src0), [src1] "+r"(src1), [cx] "+c"(cx), + [cl] "+r"(cl), [tcl] "+r"(tcl), [r8] "=r"(r8), [r9] "+r"(r9), + [r10] "=r"(r10) + : + : "cc", "memory"); + + WJR_ASSUME(cx == 0); + + return r9; +} + +#undef WJR_adcsbb +#undef WJR_addsub + +#undef WJR_ADDSUB_I \ No newline at end of file diff --git a/include/wjr/x86/gen_addsub.hpp b/include/wjr/x86/gen_addsub.hpp index cd6e0b67..8660515c 100644 --- a/include/wjr/x86/gen_addsub.hpp +++ b/include/wjr/x86/gen_addsub.hpp @@ -6,15 +6,12 @@ #error "abort" #endif -#define WJR_ADDSUB WJR_PP_BOOL_IF(WJR_ADDSUB_I, ADDC, SUBC) #define WJR_addcsubc WJR_PP_BOOL_IF(WJR_ADDSUB_I, addc, subc) #define WJR_adcsbb WJR_PP_BOOL_IF(WJR_ADDSUB_I, adc, sbb) -#if WJR_HAS_BUILTIN(WJR_PP_CONCAT(ASM_, WJR_ADDSUB)) - -template -WJR_INTRINSIC_INLINE T WJR_PP_CONCAT(asm_, WJR_addcsubc)(T a, T b, U c_in, U &c_out) { - static_assert(std::is_same_v, ""); +template +WJR_INTRINSIC_INLINE uint64_t WJR_PP_CONCAT(asm_, WJR_addcsubc)(uint64_t a, uint64_t b, + U c_in, U &c_out) { #if WJR_ADDSUB_I == 0 if (WJR_BUILTIN_CONSTANT_P(c_in)) { @@ -99,16 +96,11 @@ WJR_INTRINSIC_INLINE T WJR_PP_CONCAT(asm_, WJR_addcsubc)(T a, T b, U c_in, U &c_ #undef WJR_REGISTER_BUILTIN_ASM_ADDSUB_0 } -#endif - -#if WJR_HAS_BUILTIN(WJR_PP_CONCAT(ASM_, WJR_PP_CONCAT(WJR_ADDSUB, _N))) -
-template -WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const T *src0, - const T *src1, size_t n, - U c_in) { - static_assert(std::is_same_v, ""); - +template +WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(uint64_t *dst, + const uint64_t *src0, + const uint64_t *src1, + size_t n, U c_in) { if (WJR_BUILTIN_CONSTANT_P(n)) { if (n == 1) { dst[0] = WJR_PP_CONCAT(asm_, WJR_addcsubc)(src0[0], src1[0], c_in, c_in); @@ -116,30 +108,26 @@ WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const } } - const auto cdst = dst; - const auto csrc0 = src0; - const auto csrc1 = src1; - size_t cx = n / 8; - T r8 = c_in, r9, r10 = n & 7, r11; + uint64_t r8 = c_in, r9, r10 = n & 7, r11; asm volatile( "add{b $255, %b[r8]| %b[r8], 255}\n\t" - "lea{q| %[r9], [rip +} .Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%={(%%rip), %[r9]|]}\n\t" + "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + %[r10] * 4]}\n\t" "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r9] + %[r10]]}\n\t" "jmp{q *%[r10]| %[r10]}\n\t" ".align 8\n\t" - ".Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=:\n\t" - ".long .Ll0%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll1%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll2%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll3%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll4%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll5%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll6%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" - ".long .Ll7%=-.Lasm_" WJR_PP_STR(WJR_adcsbb) "_n_lookup%=\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + ".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" 
".align 16\n\t" ".Ll0%=:\n\t" @@ -229,7 +217,7 @@ WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const "lea{q 16(%[dst]), %[dst]| %[dst], [%[dst] + 16]}\n\t" ".align 32\n\t" - ".Lwjr_asm_" WJR_PP_STR(WJR_adcsbb) "_n_loop%=:\n\t" + ".Lloop%=:\n\t" ".Lb2%=:\n\t" "mov{q (%[src0]), %[r9]| %[r9], [%[src0]]}\n\t" @@ -277,7 +265,7 @@ WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" "dec %[cx]\n\t" - "jne .Lwjr_asm_" WJR_PP_STR(WJR_adcsbb) "_n_loop%=\n\t" + "jne .Lloop%=\n\t" WJR_PP_STR(WJR_adcsbb) "{q -8(%[src1]), %[r10]| %[r10], [%[src1] - 8]}\n\t" "mov{q %[r8], -16(%[dst])| [%[dst] - 16], %[r8]}\n\t" @@ -292,19 +280,13 @@ WJR_INLINE U WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addcsubc, _n))(T *dst, const : : "cc", "memory"); - WJR_ASSUME(dst == cdst + n); - WJR_ASSUME(src0 == csrc0 + n); - WJR_ASSUME(src1 == csrc1 + n); WJR_ASSUME(cx == 0); WJR_ASSUME(r9 == 0u || r9 == 1u); return r9; } -#endif - #undef WJR_adcsbb #undef WJR_addcsubc -#undef WJR_ADDSUB #undef WJR_ADDSUB_I \ No newline at end of file diff --git a/include/wjr/x86/mul.hpp b/include/wjr/x86/mul.hpp index b145f5e4..b88db200 100644 --- a/include/wjr/x86/mul.hpp +++ b/include/wjr/x86/mul.hpp @@ -70,30 +70,27 @@ WJR_INTRINSIC_INLINE T mulx(T a, T b, T &hi) { WJR_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t dx) { WJR_ASSERT(n != 0); - const auto cdst = dst; - const auto csrc = src; - size_t cx = n / 8; uint64_t r8, r9, r10 = n, r11; asm volatile( "and{l $7, %k[r10]| %k[r10], 7}\n\t" - "lea{q| %[r9], [rip +} .Lasm_mul_1_lookup%={(%%rip), %[r9]|]}\n\t" + "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + " "%[r10] * 4]}\n\t" "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" "jmp{q *%[r10]| %[r10]}\n\t" ".align 8\n\t" - ".Lasm_mul_1_lookup%=:\n\t" - ".long 
.Ll0%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll1%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll2%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll3%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll4%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll5%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll6%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll7%=-.Lasm_mul_1_lookup%=\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + ".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" ".align 16\n\t" ".Ll0%=:\n\t" @@ -153,7 +150,7 @@ WJR_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" ".align 32\n\t" - ".Lasm_addmul_1_loop%=:\n\t" + ".Lloop%=:\n\t" ".Lb1%=:\n\t" "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" @@ -199,9 +196,9 @@ WJR_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" "dec %[cx]\n\t" - "jne .Lasm_addmul_1_loop%=\n\t" + "jne .Lloop%=\n\t" - "adc{q $0, %[r9]| %[r9], 0}\n\t" + "adc{q %[cx], %[r9]| %[r9], %[cx]}\n\t" "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" ".Ldone%=:" @@ -211,8 +208,6 @@ WJR_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint : : "cc", "memory"); - WJR_ASSUME(dst == cdst + n); - WJR_ASSUME(src == csrc + n); WJR_ASSUME(cx == 0); return r9; @@ -231,30 +226,27 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t dx) { WJR_ASSERT(n != 0); - const auto cdst = dst; - const auto csrc = src; - size_t cx = n / 8; uint64_t r8, r9, r10 = n, r11; asm volatile( "and{l $7, %k[r10]| %k[r10], 7}\n\t" - "lea{q| %[r9], [rip +} .Lasm_mul_1_lookup%={(%%rip), %[r9]|]}\n\t" + "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + " 
"%[r10] * 4]}\n\t" "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" "jmp{q *%[r10]| %[r10]}\n\t" ".align 8\n\t" - ".Lasm_mul_1_lookup%=:\n\t" - ".long .Ll0%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll1%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll2%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll3%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll4%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll5%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll6%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll7%=-.Lasm_mul_1_lookup%=\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + ".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" ".align 16\n\t" ".Ll0%=:\n\t" @@ -299,7 +291,7 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, ".Ld1%=:\n\t" "add{q (%[dst]), %[r8]| [%[dst]], %[r8]}\n\t" - "adc{q $0, %[r9]| %[r9], 0}\n\t" + "adc{q %[cx], %[r9]| %[r9], %[cx]}\n\t" "mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t" "jmp .Ldone%=\n\t" @@ -310,7 +302,7 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" ".align 32\n\t" - ".Lasm_addmul_1_loop%=:\n\t" + ".Lloop%=:\n\t" ".Lb1%=:\n\t" "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" @@ -366,7 +358,7 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" "jrcxz .Lloop_out%=\n\t" - "jmp .Lasm_addmul_1_loop%=\n\t" + "jmp .Lloop%=\n\t" ".Lloop_out%=:\n\t" "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" @@ -381,8 +373,6 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, : : "cc", "memory"); - WJR_ASSUME(dst == cdst + n); - WJR_ASSUME(src == csrc + n); WJR_ASSUME(cx == 0); return r9; @@ -410,9 +400,6 @@ WJR_INLINE uint64_t asm_submul_1(uint64_t *dst, const 
uint64_t *src, size_t n, uint64_t dx) { WJR_ASSERT(n != 0); - const auto cdst = dst; - const auto csrc = src; - size_t cx = n / 8; uint64_t r8, r9, r10 = n & 7, r11; @@ -421,22 +408,22 @@ WJR_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, "mov{b $255, %b[r11]| %b[r11], 255}\n\t" "add{b $1, %b[r11]| %b[r11], 1}\n\t" - "lea{q| %[r9], [rip +} .Lasm_mul_1_lookup%={(%%rip), %[r9]|]}\n\t" + "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + %[r10] * " "4]}\n\t" "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" "jmp{q *%[r10]| %[r10]}\n\t" ".align 8\n\t" - ".Lasm_mul_1_lookup%=:\n\t" - ".long .Ll0%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll1%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll2%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll3%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll4%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll5%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll6%=-.Lasm_mul_1_lookup%=\n\t" - ".long .Ll7%=-.Lasm_mul_1_lookup%=\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + ".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" ".align 16\n\t" ".Ll0%=:\n\t" @@ -500,7 +487,7 @@ WJR_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" ".align 32\n\t" - ".Lasm_addmul_1_loop%=:\n\t" + ".Lloop%=:\n\t" ".Lb1%=:\n\t" "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" @@ -564,7 +551,7 @@ WJR_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" "jrcxz .Lloop_out%=\n\t" - "jmp .Lasm_addmul_1_loop%=\n\t" + "jmp .Lloop%=\n\t" ".Lloop_out%=:\n\t" "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" @@ -579,8 +566,6 @@ WJR_INLINE uint64_t 
asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, : : "cc", "memory"); - WJR_ASSUME(dst == cdst + n); - WJR_ASSUME(src == csrc + n); WJR_ASSUME(cx == 0); return r9; @@ -588,6 +573,15 @@ WJR_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, #endif +#if WJR_HAS_BUILTIN(ASM_MUL_1) +#define WJR_HAS_BUILTIN_ASM_ADDLSH_N WJR_HAS_DEF +#define WJR_HAS_BUILTIN_ASM_SUBLSH_N WJR_HAS_DEF + +#define WJR_ADDSUB_I 1 +#include + +#endif + } // namespace wjr #endif // WJR_X86_MUL_HPP__ \ No newline at end of file diff --git a/include/wjr/x86/sub.hpp b/include/wjr/x86/sub.hpp index fe953064..fab10c33 100644 --- a/include/wjr/x86/sub.hpp +++ b/include/wjr/x86/sub.hpp @@ -13,12 +13,12 @@ namespace wjr { (defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC)) #define WJR_HAS_BUILTIN_ASM_SUBC WJR_HAS_DEF #define WJR_HAS_BUILTIN_ASM_SUBC_N WJR_HAS_DEF -#endif #define WJR_ADDSUB_I 0 - #include +#endif + } // namespace wjr #endif // WJR_X86_SUB_HPP__ \ No newline at end of file