From fe82d5277395f6c1231c8bef9f24fffb24114a16 Mon Sep 17 00:00:00 2001 From: wjr <1966336874@qq.com> Date: Thu, 4 Jul 2024 17:39:42 +0800 Subject: [PATCH] opt --- godbolt/wjr.hpp | 822 +++--------------- include/wjr/math/div-impl.hpp | 24 +- include/wjr/math/div.hpp | 175 +--- include/wjr/math/mul.hpp | 12 +- .../wjr/preprocessor/compiler/attribute.hpp | 11 +- include/wjr/x86/math/mul-impl.hpp | 24 +- include/wjr/x86/math/mul.hpp | 544 +----------- src/wjr/math/div.cpp | 137 +++ src/wjr/math/mul.cpp | 38 +- src/wjr/x86/math/mul.cpp | 469 ++++++++++ tests/units/src/details.hpp | 7 +- tests/units/src/math.cpp | 8 +- 12 files changed, 841 insertions(+), 1430 deletions(-) diff --git a/godbolt/wjr.hpp b/godbolt/wjr.hpp index 4f9f1a77..ed164d15 100644 --- a/godbolt/wjr.hpp +++ b/godbolt/wjr.hpp @@ -1506,10 +1506,11 @@ #if defined(WJR_COMPILER_MSVC) #define WJR_MS_ABI #define WJR_HAS_FEATURE_MS_ABI WJR_HAS_DEF -#elif defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC) +#elif WJR_HAS_ATTRIBUTE(__ms_abi__) #define WJR_MS_ABI __attribute__((__ms_abi__)) #define WJR_HAS_FEATURE_MS_ABI WJR_HAS_DEF -#else +#elif defined(WJR_ENABLE_ASSEMBLY) +#undef WJR_ENABLE_ASSEMBLY #endif #define WJR_ASSUME_MAY_NOT_PURE(expr) \ @@ -1621,6 +1622,12 @@ #define WJR_MALLOC #endif +#if WJR_HAS_ATTRIBUTE(nonnull) +#define WJR_NONNULL(...) __attribute__((__VA_ARGS__)) +#else +#define WJR_NONNULL(...) +#endif + #define WJR_INLINE inline #define WJR_CONSTEXPR constexpr @@ -21876,18 +21883,6 @@ namespace wjr { #if defined(__BMI2__) && defined(__ADX__) -#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF -#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF -#elif defined(WJR_ENABLE_ASSEMBLY) -#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF_VAR(3) -#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF_VAR(3) -#endif - -#endif - -#if defined(__BMI2__) && defined(__ADX__) - #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) #define WJR_HAS_BUILTIN_ASM_SUBMUL_1 WJR_HAS_DEF #elif defined(WJR_ENABLE_ASSEMBLY) @@ -21908,6 +21903,18 @@ namespace wjr { #endif +#if defined(__BMI2__) && defined(__ADX__) + +#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) +#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF +#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF +#elif defined(WJR_ENABLE_ASSEMBLY) +#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF_VAR(3) +#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF_VAR(3) +#endif + +#endif + } // namespace wjr #endif // WJR_X86_MATH_MUL_IMPL_HPP__ @@ -21940,162 +21947,12 @@ WJR_INTRINSIC_INLINE uint64_t builtin_umul128(uint64_t a, uint64_t b, #if WJR_HAS_BUILTIN(ASM_MUL_1) #if WJR_HAS_BUILTIN(ASM_MUL_1) == 1 - -inline uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, - uint64_t rdx) noexcept { - size_t rcx = n / 8; - uint64_t r8, r9, r10 = n, r11; - - const auto pdst = dst; - const auto psrc = src; - - (void)(pdst); - (void)(psrc); - - asm volatile( - "and{l $7, %k[r10]| %k[r10], 7}\n\t" - "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" - "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + " - "%[r10] * 4]}\n\t" - "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" - "jmp{q *%[r10]| %[r10]}\n\t" - - ".align 8\n\t" - ".Llookup%=:\n\t" - ".long .Ll0%=-.Llookup%=\n\t" - ".long .Ll1%=-.Llookup%=\n\t" - ".long .Ll2%=-.Llookup%=\n\t" - ".long .Ll3%=-.Llookup%=\n\t" - ".long .Ll4%=-.Llookup%=\n\t" - ".long .Ll5%=-.Llookup%=\n\t" - ".long .Ll6%=-.Llookup%=\n\t" - ".long .Ll7%=-.Llookup%=\n\t" - ".align 
16\n\t" - - ".Ll0%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "jmp .Lb0%=\n\t" - - ".Ll2%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q -48(%[src]), %[src]| %[src], [%[src] - 48]}\n\t" - "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t" - "inc %[rcx]\n\t" - "jmp .Lb2%=\n\t" - - ".Ll3%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q -40(%[src]), %[src]| %[src], [%[src] - 40]}\n\t" - "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t" - "inc %[rcx]\n\t" - "jmp .Lb3%=\n\t" - - ".Ll4%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q -32(%[src]), %[src]| %[src], [%[src] - 32]}\n\t" - "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t" - "inc %[rcx]\n\t" - "jmp .Lb4%=\n\t" - - ".Ll5%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q -24(%[src]), %[src]| %[src], [%[src] - 24]}\n\t" - "lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 24]}\n\t" - "inc %[rcx]\n\t" - "jmp .Lb5%=\n\t" - - ".Ll6%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q -16(%[src]), %[src]| %[src], [%[src] - 16]}\n\t" - "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t" - "inc %[rcx]\n\t" - "jmp .Lb6%=\n\t" - - ".Ll7%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q -8(%[src]), %[src]| %[src], [%[src] - 8]}\n\t" - "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t" - "inc %[rcx]\n\t" - "jmp .Lb7%=\n\t" - - ".Ld1%=:\n\t" - "jmp .Ldone%=\n\t" - - ".Ll1%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q 8(%[src]), %[src]| %[src], [%[src] + 8]}\n\t" - "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" - "jrcxz .Ld1%=\n\t" - - ".align 32\n\t" - ".Lloop%=:\n\t" - - ".Lb1%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - "adc{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - - ".Lb0%=:\n\t" - "mulx{q 8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 8]}\n\t" - "mov{q %[r10], (%[dst])| [%[dst]], %[r10]}\n\t" - "adc{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - - ".Lb7%=:\n\t" - "mulx{q 16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 16]}\n\t" - "mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t" - "adc{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - - ".Lb6%=:\n\t" - "mulx{q 24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 24]}\n\t" - "mov{q %[r10], 16(%[dst])| [%[dst] + 16], %[r10]}\n\t" - "adc{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - - ".Lb5%=:\n\t" - "mulx{q 32(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 32]}\n\t" - "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t" - "adc{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - - ".Lb4%=:\n\t" - "mulx{q 40(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 40]}\n\t" - "mov{q %[r10], 32(%[dst])| [%[dst] + 32], %[r10]}\n\t" - "adc{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - - ".Lb3%=:\n\t" - "mulx{q 48(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 48]}\n\t" - "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t" - "adc{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - - ".Lb2%=:\n\t" - "mulx{q 56(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 56]}\n\t" - "mov{q %[r10], 48(%[dst])| [%[dst] + 48], %[r10]}\n\t" - "adc{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - - "lea{q 64(%[src]), %[src]| %[src], [%[src] + 64]}\n\t" - "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" - - "dec %[rcx]\n\t" - "jne .Lloop%=\n\t" - - "adc{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" - ".Ldone%=:\n\t" - "mov{q 
%[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - - : [dst] "+&r"(dst), [src] "+&r"(src), [rcx] "+&c"(rcx), [r8] "=&r"(r8), - [r9] "=&r"(r9), [r10] "+&r"(r10), [r11] "=&r"(r11) - : "d"(rdx) - : "cc", "memory"); - - WJR_ASSERT_ASSUME(rcx == 0); - WJR_ASSERT_ASSUME(dst == pdst + n); - WJR_ASSERT_ASSUME(src == psrc + n); - - return r9; -} - +extern uint64_t __wjr_asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, + uint64_t rdx) noexcept; #else - extern "C" WJR_MS_ABI uint64_t __wjr_asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept; +#endif WJR_INTRINSIC_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept { @@ -22104,177 +21961,15 @@ WJR_INTRINSIC_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size #endif -#endif - #if WJR_HAS_BUILTIN(ASM_ADDMUL_1) #if WJR_HAS_BUILTIN(ASM_ADDMUL_1) == 1 - -inline uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, - uint64_t rdx) noexcept { - size_t rcx = n / 8; - uint64_t r8, r9, r10 = n, r11; - - const auto pdst = dst; - const auto psrc = src; - - (void)(pdst); - (void)(psrc); - - asm volatile( - "and{l $7, %k[r10]| %k[r10], 7}\n\t" - "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" - "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + " - "%[r10] * 4]}\n\t" - "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" - "jmp{q *%[r10]| %[r10]}\n\t" - - ".align 8\n\t" - ".Llookup%=:\n\t" - ".long .Ll0%=-.Llookup%=\n\t" - ".long .Ll1%=-.Llookup%=\n\t" - ".long .Ll2%=-.Llookup%=\n\t" - ".long .Ll3%=-.Llookup%=\n\t" - ".long .Ll4%=-.Llookup%=\n\t" - ".long .Ll5%=-.Llookup%=\n\t" - ".long .Ll6%=-.Llookup%=\n\t" - ".long .Ll7%=-.Llookup%=\n\t" - ".align 16\n\t" - - ".Ll0%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "jmp .Lb0%=\n\t" - - ".Ll2%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q 16(%[src]), %[src]| %[src], [%[src] + 16]}\n\t" - "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t" - "jmp .Lb2%=\n\t" - - ".Ll3%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q 24(%[src]), %[src]| %[src], [%[src] + 24]}\n\t" - "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t" - "jmp .Lb3%=\n\t" - - ".Ll4%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q 32(%[src]), %[src]| %[src], [%[src] + 32]}\n\t" - "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t" - "jmp .Lb4%=\n\t" - - ".Ll5%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q 40(%[src]), %[src]| %[src], [%[src] + 40]}\n\t" - "lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 24]}\n\t" - "jmp .Lb5%=\n\t" - - ".Ll6%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q -16(%[src]), %[src]| %[src], [%[src] - 16]}\n\t" - "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t" - "jmp .Lb6%=\n\t" - - ".Ll7%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q -8(%[src]), %[src]| %[src], [%[src] - 8]}\n\t" - "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t" - "jmp .Lb7%=\n\t" - - ".Ld1%=:\n\t" - "add{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - "adc{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" - "jmp .Ldone%=\n\t" - - ".Ll1%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q 8(%[src]), %[src]| %[src], [%[src] + 8]}\n\t" - "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" - "jrcxz 
.Ld1%=\n\t" - - ".align 32\n\t" - ".Lloop%=:\n\t" - - ".Lb1%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - - ".Lb0%=:\n\t" - "mulx{q 8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 8]}\n\t" - "lea{q -1(%[rcx]), %[rcx]| %[rcx], [%[rcx] - 1]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q (%[dst]), %[r10]| %[r10], [%[dst]]}\n\t" - "mov{q %[r10], (%[dst])| [%[dst]], %[r10]}\n\t" - - ".Lb7%=:\n\t" - "mulx{q 16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 16]}\n\t" - "adcx{q 8(%[dst]), %[r8]| %[r8], [%[dst] + 8]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t" - - ".Lb6%=:\n\t" - "mulx{q 24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 24]}\n\t" - "lea{q 64(%[src]), %[src]| %[src], [%[src] + 64]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q 16(%[dst]), %[r10]| %[r10], [%[dst] + 16]}\n\t" - "mov{q %[r10], 16(%[dst])| [%[dst] + 16], %[r10]}\n\t" - - ".Lb5%=:\n\t" - "mulx{q -32(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] - 32]}\n\t" - "adcx{q 24(%[dst]), %[r8]| %[r8], [%[dst] + 24]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t" - - ".Lb4%=:\n\t" - "mulx{q -24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] - 24]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q 32(%[dst]), %[r10]| %[r10], [%[dst] + 32]}\n\t" - "mov{q %[r10], 32(%[dst])| [%[dst] + 32], %[r10]}\n\t" - - ".Lb3%=:\n\t" - "mulx{q -16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] - 16]}\n\t" - "adcx{q 40(%[dst]), %[r8]| %[r8], [%[dst] + 40]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t" - - ".Lb2%=:\n\t" - "mulx{q -8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] - 8]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q 48(%[dst]), %[r10]| %[r10], [%[dst] + 48]}\n\t" - "mov{q %[r10], 48(%[dst])| [%[dst] + 48], %[r10]}\n\t" - - "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" - - "jrcxz .Lloop_out%=\n\t" - "jmp .Lloop%=\n\t" - ".Lloop_out%=:\n\t" - - "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - "adox{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" - "adc{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" - - ".Ldone%=:" - - : [dst] "+&r"(dst), [src] "+&r"(src), [rcx] "+&c"(rcx), [r8] "=&r"(r8), - [r9] "=&r"(r9), [r10] "+&r"(r10), [r11] "=&r"(r11) - : "d"(rdx) - : "cc", "memory"); - - WJR_ASSERT_ASSUME(rcx == 0); - WJR_ASSERT_ASSUME(dst == pdst + n); - WJR_ASSERT_ASSUME(src == psrc + n); - - return r9; -} - +extern uint64_t __wjr_asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, + uint64_t rdx) noexcept; #else - extern "C" WJR_MS_ABI uint64_t __wjr_asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept; +#endif WJR_INTRINSIC_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept { @@ -22283,222 +21978,16 @@ WJR_INTRINSIC_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, s #endif -#endif - -#if WJR_HAS_BUILTIN(ASM_BASECASE_MUL_S) - -#if WJR_HAS_BUILTIN(ASM_BASECASE_MUL_S) == 1 -extern void __wjr_asm_basecase_mul_s_impl(uint64_t *dst, const uint64_t *src0, size_t rdx, - const uint64_t *src1, size_t m) noexcept; -#else -extern "C" WJR_MS_ABI void __wjr_asm_basecase_mul_s_impl(uint64_t *dst, - const 
uint64_t *src0, size_t rdx, - const uint64_t *src1, - size_t m) noexcept; -#endif - -inline void asm_basecase_mul_s(uint64_t *dst, const uint64_t *src0, size_t n, - const uint64_t *src1, size_t m) noexcept { - WJR_ASSERT(n >= m); - WJR_ASSERT(m >= 1); - __wjr_asm_basecase_mul_s_impl(dst, src0, n, src1, m); -} - -#endif - -#if WJR_HAS_BUILTIN(ASM_BASECASE_SQR) - -#if WJR_HAS_BUILTIN(ASM_BASECASE_SQR) == 1 -extern void __wjr_asm_basecase_sqr_impl(uint64_t *dst, const uint64_t *src, - size_t rdx) noexcept; -#else -extern "C" WJR_MS_ABI void __wjr_asm_basecase_sqr_impl(uint64_t *dst, const uint64_t *src, - size_t rdx) noexcept; -#endif - -inline void asm_basecase_sqr(uint64_t *dst, const uint64_t *src, size_t n) noexcept { - WJR_ASSERT(n >= 1); - __wjr_asm_basecase_sqr_impl(dst, src, n); -} - -#endif - #if WJR_HAS_BUILTIN(ASM_SUBMUL_1) #if WJR_HAS_BUILTIN(ASM_SUBMUL_1) == 1 - // slower than asm_addmul_1 -inline uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, - uint64_t rdx) noexcept { - WJR_ASSERT(n != 0); - - size_t rcx = n / 8; - uint64_t r8, r9, r10 = static_cast(n), r11; - - asm volatile( - // set CF = 1, OF = 0 - "and{l $7, %k[r10]| %k[r10], 7}\n\t" - "stc\n\t" - - "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" - "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + %[r10] * " - "4]}\n\t" - "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" - "jmp{q *%[r10]| %[r10]}\n\t" - - ".align 8\n\t" - ".Llookup%=:\n\t" - ".long .Ll0%=-.Llookup%=\n\t" - ".long .Ll1%=-.Llookup%=\n\t" - ".long .Ll2%=-.Llookup%=\n\t" - ".long .Ll3%=-.Llookup%=\n\t" - ".long .Ll4%=-.Llookup%=\n\t" - ".long .Ll5%=-.Llookup%=\n\t" - ".long .Ll6%=-.Llookup%=\n\t" - ".long .Ll7%=-.Llookup%=\n\t" - ".align 16\n\t" - - ".Ll0%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "jmp .Lb0%=\n\t" - - ".Ll2%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q 16(%[src]), %[src]| %[src], [%[src] + 16]}\n\t" - "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t" - "jmp .Lb2%=\n\t" - - ".Ll3%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q 24(%[src]), %[src]| %[src], [%[src] + 24]}\n\t" - "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t" - "jmp .Lb3%=\n\t" - - ".Ll4%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q 32(%[src]), %[src]| %[src], [%[src] + 32]}\n\t" - "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t" - "jmp .Lb4%=\n\t" - - ".Ll5%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q 40(%[src]), %[src]| %[src], [%[src] + 40]}\n\t" - "lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 24]}\n\t" - "jmp .Lb5%=\n\t" - - ".Ll6%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q -16(%[src]), %[src]| %[src], [%[src] - 16]}\n\t" - "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t" - "jmp .Lb6%=\n\t" - - ".Ll7%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q -8(%[src]), %[src]| %[src], [%[src] - 8]}\n\t" - "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t" - "jmp .Lb7%=\n\t" - - ".Ld1%=:\n\t" - "adc{q (%[dst]), %[r8]| %[r8], [%[dst]]}\n\t" - "sbb{q $-1, %[r9]| %[r9], -1}\n\t" - "mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t" - "jmp .Ldone%=\n\t" - - ".Ll1%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "not %[r8]\n\t" - "jrcxz .Ld1%=\n\t" - "lea{q 8(%[src]), %[src]| %[src], [%[src] + 8]}\n\t" - "lea{q 8(%[dst]), %[dst]| %[dst], 
[%[dst] + 8]}\n\t" - - ".align 32\n\t" - ".Lloop%=:\n\t" - - ".Lb1%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - - ".Lb0%=:\n\t" - "not %[r10]\n\t" - "mulx{q 8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 8]}\n\t" - "lea{q -1(%[rcx]), %[rcx]| %[rcx], [%[rcx] - 1]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q (%[dst]), %[r10]| %[r10], [%[dst]]}\n\t" - "mov{q %[r10], (%[dst])| [%[dst]], %[r10]}\n\t" - - ".Lb7%=:\n\t" - "not %[r8]\n\t" - "mulx{q 16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 16]}\n\t" - "adcx{q 8(%[dst]), %[r8]| %[r8], [%[dst] + 8]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t" - - ".Lb6%=:\n\t" - "not %[r10]\n\t" - "mulx{q 24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 24]}\n\t" - "lea{q 64(%[src]), %[src]| %[src], [%[src] + 64]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q 16(%[dst]), %[r10]| %[r10], [%[dst] + 16]}\n\t" - "mov{q %[r10], 16(%[dst])| [%[dst] + 16], %[r10]}\n\t" - - ".Lb5%=:\n\t" - "not %[r8]\n\t" - "mulx{q -32(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] - 32]}\n\t" - "adcx{q 24(%[dst]), %[r8]| %[r8], [%[dst] + 24]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t" - - ".Lb4%=:\n\t" - "mulx{q -24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] - 24]}\n\t" - "not %[r10]\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q 32(%[dst]), %[r10]| %[r10], [%[dst] + 32]}\n\t" - "mov{q %[r10], 32(%[dst])| [%[dst] + 32], %[r10]}\n\t" - - ".Lb3%=:\n\t" - "not %[r8]\n\t" - "mulx{q -16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] - 16]}\n\t" - "adcx{q 40(%[dst]), %[r8]| %[r8], [%[dst] + 40]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t" - - ".Lb2%=:\n\t" - "not %[r10]\n\t" - "mulx{q -8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] - 8]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q 48(%[dst]), %[r10]| %[r10], [%[dst] + 48]}\n\t" - "not %[r8]\n\t" - "mov{q %[r10], 48(%[dst])| [%[dst] + 48], %[r10]}\n\t" - - "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" - - "jrcxz .Lloop_out%=\n\t" - "jmp .Lloop%=\n\t" - ".Lloop_out%=:\n\t" - - "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - "adox{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" - "sbb{q $-1, %[r9]| %[r9], -1}\n\t" - - ".Ldone%=:" - - : [dst] "+&r"(dst), [src] "+&r"(src), [rcx] "+&c"(rcx), [r8] "=&r"(r8), - [r9] "=&r"(r9), [r10] "+&r"(r10), [r11] "=&r"(r11) - : "d"(rdx) - : "cc", "memory"); - - WJR_ASSUME(rcx == 0); - - return r9; -} - +extern uint64_t __wjr_asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, + uint64_t rdx) noexcept; #else - extern "C" WJR_MS_ABI uint64_t __wjr_asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept; +#endif WJR_INTRINSIC_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept { @@ -22507,8 +21996,6 @@ WJR_INTRINSIC_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, s #endif -#endif - #if WJR_HAS_BUILTIN(ASM_ADDLSH_N) #define WJR_ADDSUB_I 1 // WJR_ADDSUB_I : @@ -23081,6 +22568,44 @@ WJR_INTRINSIC_INLINE uint64_t WJR_PP_CONCAT(asm_, WJR_PP_CONCAT(WJR_addsub, lsh_ #undef WJR_ADDSUB_I #endif +#if WJR_HAS_BUILTIN(ASM_BASECASE_MUL_S) + 
+#if WJR_HAS_BUILTIN(ASM_BASECASE_MUL_S) == 1 +extern void __wjr_asm_basecase_mul_s_impl(uint64_t *dst, const uint64_t *src0, size_t rdx, + const uint64_t *src1, size_t m) noexcept; +#else +extern "C" WJR_MS_ABI void __wjr_asm_basecase_mul_s_impl(uint64_t *dst, + const uint64_t *src0, size_t rdx, + const uint64_t *src1, + size_t m) noexcept; +#endif + +inline void asm_basecase_mul_s(uint64_t *dst, const uint64_t *src0, size_t n, + const uint64_t *src1, size_t m) noexcept { + WJR_ASSERT(n >= m); + WJR_ASSERT(m >= 1); + __wjr_asm_basecase_mul_s_impl(dst, src0, n, src1, m); +} + +#endif + +#if WJR_HAS_BUILTIN(ASM_BASECASE_SQR) + +#if WJR_HAS_BUILTIN(ASM_BASECASE_SQR) == 1 +extern void __wjr_asm_basecase_sqr_impl(uint64_t *dst, const uint64_t *src, + size_t rdx) noexcept; +#else +extern "C" WJR_MS_ABI void __wjr_asm_basecase_sqr_impl(uint64_t *dst, const uint64_t *src, + size_t rdx) noexcept; +#endif + +inline void asm_basecase_sqr(uint64_t *dst, const uint64_t *src, size_t n) noexcept { + WJR_ASSERT(n >= 1); + __wjr_asm_basecase_sqr_impl(dst, src, n); +} + +#endif + } // namespace wjr #endif // WJR_X86_MATH_MUL_HPP__ @@ -23480,8 +23005,14 @@ WJR_INTRINSIC_CONSTEXPR20 uint64_t try_addmul_1(uint64_t *dst, const uint64_t *s return 0; } - if (ml == 0) { - return 0; + if constexpr (maxn <= 3) { + if (ml == 0) { + return 0; + } + } else { + if (WJR_UNLIKELY(ml == 0)) { + return 0; + } } if constexpr (maxn == 1) { @@ -23517,7 +23048,6 @@ inline constexpr size_t toom3_sqr_threshold = WJR_TOOM3_SQR_THRESHOLD; enum class __mul_mode : uint8_t { toom22 = 0x00, toom33 = 0x01, - toom44 = 0x02, all = 0x03, }; @@ -23716,6 +23246,7 @@ void __mul_n(uint64_t *WJR_RESTRICT dst, const uint64_t *src0, const uint64_t *s } else { c_out = cf0 * cf1; } + c_out += try_addmul_1(dst + n, src1, n, cf0); c_out += try_addmul_1(dst + n, src0, n, cf1); } @@ -24391,22 +23922,22 @@ inline uint128_t div128by64to128(uint64_t &rem, uint64_t lo, uint64_t hi, inline uint128_t div128by64to128(uint64_t &rem, uint64_t lo, uint64_t hi, uint64_t div) noexcept; -WJR_INTRINSIC_CONSTEXPR20 void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src, - size_t n, - const div2by1_divider &div) noexcept; +WJR_INTRINSIC_INLINE void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src, + size_t n, + const div2by1_divider &div) noexcept; -WJR_INTRINSIC_CONSTEXPR20 void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src, - size_t n, uint64_t div) noexcept; +WJR_INTRINSIC_INLINE void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src, + size_t n, uint64_t div) noexcept; -WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, - size_t n, - const div3by2_divider &div) noexcept; +WJR_INTRINSIC_INLINE void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, + const div3by2_divider &div) noexcept; -WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, - size_t n, const uint64_t *div) noexcept; +WJR_INTRINSIC_INLINE void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, const uint64_t *div) noexcept; -WJR_INTRINSIC_CONSTEXPR20 void div_qr_s(uint64_t *dst, uint64_t *rem, const uint64_t *src, - size_t n, const uint64_t *div, size_t m) noexcept; +WJR_INTRINSIC_INLINE void div_qr_s(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, const uint64_t *div, size_t m) noexcept; WJR_INTRINSIC_CONSTEXPR20 uint64_t divexact_dbm1c(uint64_t *dst, const uint64_t *src, size_t n, uint64_t bd, @@ -24867,9 +24398,9 @@ inline 
uint128_t div128by64to128(uint64_t &rem, uint64_t lo, uint64_t hi, } // reference : https://ieeexplore.ieee.org/document/5487506 -WJR_INLINE_CONSTEXPR20 uint64_t -div_qr_1_noshift(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n, - const div2by1_divider_noshift &div) noexcept { +inline uint64_t div_qr_1_noshift(uint64_t *dst, uint64_t &rem, const uint64_t *src, + size_t n, + const div2by1_divider_noshift &div) noexcept { WJR_ASSERT_ASSUME(n >= 1); WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 1, src, n - 1)); @@ -24905,52 +24436,10 @@ div_qr_1_noshift(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n, return qh; } -WJR_INLINE_CONSTEXPR20 uint64_t -div_qr_1_shift(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n, - const div2by1_divider &div) noexcept { - WJR_ASSERT_ASSUME(n >= 1); - WJR_ASSERT(div.get_shift() != 0); - WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 1, src, n - 1)); - - const uint64_t divisor = div.get_divisor(); - const uint64_t value = div.get_value(); - const auto shift = div.get_shift(); - - uint64_t qh; - uint64_t lo, hi; - - uint64_t rbp = src[n - 1]; - --n; - hi = rbp >> (64 - shift); +extern uint64_t div_qr_1_shift(uint64_t *dst, uint64_t &rem, const uint64_t *src, + size_t n, const div2by1_divider &div) noexcept; - do { - if (WJR_UNLIKELY(n == 0)) { - qh = div.divide(divisor, value, rbp << shift, hi); - break; - } - - lo = src[n - 1]; - qh = div.divide(divisor, value, shld(rbp, lo, shift), hi); - rbp = lo; - --n; - - if (WJR_LIKELY(n != 0)) { - do { - lo = src[n - 1]; - dst[n] = div.divide(divisor, value, shld(rbp, lo, shift), hi); - rbp = lo; - --n; - } while (WJR_LIKELY(n != 0)); - } - - dst[0] = div.divide(divisor, value, rbp << shift, hi); - } while (0); - - rem = hi >> shift; - return qh; -} - -WJR_INTRINSIC_CONSTEXPR20 uint64_t +WJR_INTRINSIC_INLINE uint64_t div_qr_1_impl(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n, const div2by1_divider &div) noexcept { if (div.get_shift() == 0) { @@ -24961,9 +24450,9 @@ div_qr_1_impl(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n, } // return high quotient limb -WJR_INTRINSIC_CONSTEXPR20 void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src, - size_t n, - const div2by1_divider &div) noexcept { +WJR_INTRINSIC_INLINE void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src, + size_t n, + const div2by1_divider &div) noexcept { WJR_ASSERT_ASSUME(n >= 1); if (WJR_UNLIKELY(div.is_zero_or_single_bit())) { @@ -25025,102 +24514,14 @@ WJR_INTRINSIC_CONSTEXPR20 void div_qr_1(uint64_t *dst, uint64_t &rem, const uint dst[n - 1] = div_qr_1_impl(dst, rem, src, n, div2by1_divider(div)); } -WJR_INLINE_CONSTEXPR20 uint64_t -div_qr_2_noshift(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n, - const div3by2_divider_noshift &div) noexcept { - WJR_ASSERT_ASSUME(n >= 2); - WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 2, src, n - 2)); - WJR_ASSERT_L2(WJR_IS_SEPARATE_P(dst, n - 2, rem, 2)); - - const uint64_t divisor0 = div.get_divisor0(); - const uint64_t divisor1 = div.get_divisor1(); - const uint64_t value = div.get_value(); - - uint64_t qh = 0; - uint64_t u0, u1, u2; - - u2 = src[n - 1]; - u1 = src[n - 2]; - - if (__less_equal_128(divisor0, divisor1, u1, u2)) { - __sub_128(u1, u2, u1, u2, divisor0, divisor1); - qh = 1; - } - - do { - if (WJR_UNLIKELY(n == 2)) { - break; - } - - n -= 2; - - do { - u0 = src[n - 1]; - dst[n - 1] = div.divide(divisor0, divisor1, value, u0, u1, u2); - --n; - } while (WJR_LIKELY(n != 0)); - - } while (0); - - rem[0] = u1; - rem[1] = 
u2; - return qh; -} - -WJR_INLINE_CONSTEXPR20 uint64_t -div_qr_2_shift(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n, - const div3by2_divider &div) noexcept { - WJR_ASSERT_ASSUME(n >= 2); - WJR_ASSERT(div.get_shift() != 0); - WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 2, src, n - 2)); - WJR_ASSERT_L2(WJR_IS_SEPARATE_P(dst, n - 2, rem, 2)); - - const uint64_t divisor0 = div.get_divisor0(); - const uint64_t divisor1 = div.get_divisor1(); - const uint64_t value = div.get_value(); - const auto shift = div.get_shift(); - - uint64_t qh; - uint64_t u0, u1, u2; - uint64_t rbp; - - rbp = src[n - 2]; - u2 = src[n - 1]; - u1 = shld(u2, rbp, shift); - u2 >>= (64 - shift); +extern uint64_t div_qr_2_noshift(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, + const div3by2_divider_noshift &div) noexcept; - n -= 2; +extern uint64_t div_qr_2_shift(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, const div3by2_divider &div) noexcept; - do { - if (WJR_UNLIKELY(n == 0)) { - qh = div.divide(divisor0, divisor1, value, rbp << shift, u1, u2); - break; - } - - u0 = src[n - 1]; - qh = div.divide(divisor0, divisor1, value, shld(rbp, u0, shift), u1, u2); - rbp = u0; - --n; - - if (WJR_LIKELY(n != 0)) { - do { - u0 = src[n - 1]; - dst[n] = - div.divide(divisor0, divisor1, value, shld(rbp, u0, shift), u1, u2); - rbp = u0; - --n; - } while (WJR_LIKELY(n != 0)); - } - - dst[0] = div.divide(divisor0, divisor1, value, rbp << shift, u1, u2); - } while (0); - - rem[0] = shrd(u1, u2, shift); - rem[1] = u2 >> shift; - return qh; -} - -WJR_INTRINSIC_CONSTEXPR20 uint64_t +WJR_INTRINSIC_INLINE uint64_t div_qr_2_impl(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n, const div3by2_divider &div) noexcept { if (div.get_shift() == 0) { @@ -25130,16 +24531,16 @@ div_qr_2_impl(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n, return div_qr_2_shift(dst, rem, src, n, div); } -WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, - size_t n, - const div3by2_divider &div) noexcept { +WJR_INTRINSIC_INLINE void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, + const div3by2_divider &div) noexcept { WJR_ASSERT_ASSUME(n >= 2); dst[n - 2] = div_qr_2_impl(dst, rem, src, n, div); } -WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, - size_t n, const uint64_t *div) noexcept { +WJR_INTRINSIC_INLINE void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, const uint64_t *div) noexcept { WJR_ASSERT_ASSUME(n >= 2); dst[n - 2] = @@ -25157,9 +24558,8 @@ extern uint64_t dc_div_qr_s(uint64_t *dst, uint64_t *src, size_t n, const uint64 extern void __div_qr_s_impl(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n, const uint64_t *div, size_t m) noexcept; -WJR_INTRINSIC_CONSTEXPR20 void div_qr_s(uint64_t *dst, uint64_t *rem, const uint64_t *src, - size_t n, const uint64_t *div, - size_t m) noexcept { +WJR_INTRINSIC_INLINE void div_qr_s(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, const uint64_t *div, size_t m) noexcept { WJR_ASSERT_ASSUME(m >= 1); WJR_ASSERT_ASSUME(n >= m); diff --git a/include/wjr/math/div-impl.hpp b/include/wjr/math/div-impl.hpp index 44e91456..af16820a 100644 --- a/include/wjr/math/div-impl.hpp +++ b/include/wjr/math/div-impl.hpp @@ -31,22 +31,22 @@ inline uint128_t div128by64to128(uint64_t &rem, uint64_t lo, uint64_t hi, inline uint128_t div128by64to128(uint64_t &rem, uint64_t lo, uint64_t hi, uint64_t div) noexcept; 
-WJR_INTRINSIC_CONSTEXPR20 void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src, - size_t n, - const div2by1_divider &div) noexcept; +WJR_INTRINSIC_INLINE void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src, + size_t n, + const div2by1_divider &div) noexcept; -WJR_INTRINSIC_CONSTEXPR20 void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src, - size_t n, uint64_t div) noexcept; +WJR_INTRINSIC_INLINE void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src, + size_t n, uint64_t div) noexcept; -WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, - size_t n, - const div3by2_divider &div) noexcept; +WJR_INTRINSIC_INLINE void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, + const div3by2_divider &div) noexcept; -WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, - size_t n, const uint64_t *div) noexcept; +WJR_INTRINSIC_INLINE void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, const uint64_t *div) noexcept; -WJR_INTRINSIC_CONSTEXPR20 void div_qr_s(uint64_t *dst, uint64_t *rem, const uint64_t *src, - size_t n, const uint64_t *div, size_t m) noexcept; +WJR_INTRINSIC_INLINE void div_qr_s(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, const uint64_t *div, size_t m) noexcept; WJR_INTRINSIC_CONSTEXPR20 uint64_t divexact_dbm1c(uint64_t *dst, const uint64_t *src, size_t n, uint64_t bd, diff --git a/include/wjr/math/div.hpp b/include/wjr/math/div.hpp index 85e9d927..20a30ee8 100644 --- a/include/wjr/math/div.hpp +++ b/include/wjr/math/div.hpp @@ -131,9 +131,9 @@ inline uint128_t div128by64to128(uint64_t &rem, uint64_t lo, uint64_t hi, } // reference : https://ieeexplore.ieee.org/document/5487506 -WJR_INLINE_CONSTEXPR20 uint64_t -div_qr_1_noshift(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n, - const div2by1_divider_noshift &div) noexcept { +inline uint64_t div_qr_1_noshift(uint64_t *dst, uint64_t &rem, const uint64_t *src, + size_t n, + const div2by1_divider_noshift &div) noexcept { WJR_ASSERT_ASSUME(n >= 1); WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 1, src, n - 1)); @@ -169,52 +169,10 @@ div_qr_1_noshift(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n, return qh; } -WJR_INLINE_CONSTEXPR20 uint64_t -div_qr_1_shift(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n, - const div2by1_divider &div) noexcept { - WJR_ASSERT_ASSUME(n >= 1); - WJR_ASSERT(div.get_shift() != 0); - WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 1, src, n - 1)); - - const uint64_t divisor = div.get_divisor(); - const uint64_t value = div.get_value(); - const auto shift = div.get_shift(); - - uint64_t qh; - uint64_t lo, hi; - - uint64_t rbp = src[n - 1]; - --n; - hi = rbp >> (64 - shift); - - do { - if (WJR_UNLIKELY(n == 0)) { - qh = div.divide(divisor, value, rbp << shift, hi); - break; - } - - lo = src[n - 1]; - qh = div.divide(divisor, value, shld(rbp, lo, shift), hi); - rbp = lo; - --n; +extern uint64_t div_qr_1_shift(uint64_t *dst, uint64_t &rem, const uint64_t *src, + size_t n, const div2by1_divider &div) noexcept; - if (WJR_LIKELY(n != 0)) { - do { - lo = src[n - 1]; - dst[n] = div.divide(divisor, value, shld(rbp, lo, shift), hi); - rbp = lo; - --n; - } while (WJR_LIKELY(n != 0)); - } - - dst[0] = div.divide(divisor, value, rbp << shift, hi); - } while (0); - - rem = hi >> shift; - return qh; -} - -WJR_INTRINSIC_CONSTEXPR20 uint64_t +WJR_INTRINSIC_INLINE uint64_t div_qr_1_impl(uint64_t *dst, uint64_t &rem, const 
uint64_t *src, size_t n, const div2by1_divider &div) noexcept { if (div.get_shift() == 0) { @@ -225,9 +183,9 @@ div_qr_1_impl(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n, } // return high quotient limb -WJR_INTRINSIC_CONSTEXPR20 void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src, - size_t n, - const div2by1_divider &div) noexcept { +WJR_INTRINSIC_INLINE void div_qr_1(uint64_t *dst, uint64_t &rem, const uint64_t *src, + size_t n, + const div2by1_divider &div) noexcept { WJR_ASSERT_ASSUME(n >= 1); if (WJR_UNLIKELY(div.is_zero_or_single_bit())) { @@ -289,102 +247,14 @@ WJR_INTRINSIC_CONSTEXPR20 void div_qr_1(uint64_t *dst, uint64_t &rem, const uint dst[n - 1] = div_qr_1_impl(dst, rem, src, n, div2by1_divider(div)); } -WJR_INLINE_CONSTEXPR20 uint64_t -div_qr_2_noshift(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n, - const div3by2_divider_noshift &div) noexcept { - WJR_ASSERT_ASSUME(n >= 2); - WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 2, src, n - 2)); - WJR_ASSERT_L2(WJR_IS_SEPARATE_P(dst, n - 2, rem, 2)); - - const uint64_t divisor0 = div.get_divisor0(); - const uint64_t divisor1 = div.get_divisor1(); - const uint64_t value = div.get_value(); - - uint64_t qh = 0; - uint64_t u0, u1, u2; - - u2 = src[n - 1]; - u1 = src[n - 2]; - - if (__less_equal_128(divisor0, divisor1, u1, u2)) { - __sub_128(u1, u2, u1, u2, divisor0, divisor1); - qh = 1; - } - - do { - if (WJR_UNLIKELY(n == 2)) { - break; - } - - n -= 2; - - do { - u0 = src[n - 1]; - dst[n - 1] = div.divide(divisor0, divisor1, value, u0, u1, u2); - --n; - } while (WJR_LIKELY(n != 0)); - - } while (0); - - rem[0] = u1; - rem[1] = u2; - return qh; -} - -WJR_INLINE_CONSTEXPR20 uint64_t -div_qr_2_shift(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n, - const div3by2_divider &div) noexcept { - WJR_ASSERT_ASSUME(n >= 2); - WJR_ASSERT(div.get_shift() != 0); - WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 2, src, n - 2)); - WJR_ASSERT_L2(WJR_IS_SEPARATE_P(dst, n - 2, rem, 2)); +extern uint64_t div_qr_2_noshift(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, + const div3by2_divider_noshift &div) noexcept; - const uint64_t divisor0 = div.get_divisor0(); - const uint64_t divisor1 = div.get_divisor1(); - const uint64_t value = div.get_value(); - const auto shift = div.get_shift(); +extern uint64_t div_qr_2_shift(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, const div3by2_divider &div) noexcept; - uint64_t qh; - uint64_t u0, u1, u2; - uint64_t rbp; - - rbp = src[n - 2]; - u2 = src[n - 1]; - u1 = shld(u2, rbp, shift); - u2 >>= (64 - shift); - - n -= 2; - - do { - if (WJR_UNLIKELY(n == 0)) { - qh = div.divide(divisor0, divisor1, value, rbp << shift, u1, u2); - break; - } - - u0 = src[n - 1]; - qh = div.divide(divisor0, divisor1, value, shld(rbp, u0, shift), u1, u2); - rbp = u0; - --n; - - if (WJR_LIKELY(n != 0)) { - do { - u0 = src[n - 1]; - dst[n] = - div.divide(divisor0, divisor1, value, shld(rbp, u0, shift), u1, u2); - rbp = u0; - --n; - } while (WJR_LIKELY(n != 0)); - } - - dst[0] = div.divide(divisor0, divisor1, value, rbp << shift, u1, u2); - } while (0); - - rem[0] = shrd(u1, u2, shift); - rem[1] = u2 >> shift; - return qh; -} - -WJR_INTRINSIC_CONSTEXPR20 uint64_t +WJR_INTRINSIC_INLINE uint64_t div_qr_2_impl(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n, const div3by2_divider &div) noexcept { if (div.get_shift() == 0) { @@ -394,16 +264,16 @@ div_qr_2_impl(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n, return div_qr_2_shift(dst, rem, 
src, n, div); } -WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, - size_t n, - const div3by2_divider &div) noexcept { +WJR_INTRINSIC_INLINE void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, + const div3by2_divider &div) noexcept { WJR_ASSERT_ASSUME(n >= 2); dst[n - 2] = div_qr_2_impl(dst, rem, src, n, div); } -WJR_INTRINSIC_CONSTEXPR20 void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, - size_t n, const uint64_t *div) noexcept { +WJR_INTRINSIC_INLINE void div_qr_2(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, const uint64_t *div) noexcept { WJR_ASSERT_ASSUME(n >= 2); dst[n - 2] = @@ -421,9 +291,8 @@ extern uint64_t dc_div_qr_s(uint64_t *dst, uint64_t *src, size_t n, const uint64 extern void __div_qr_s_impl(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n, const uint64_t *div, size_t m) noexcept; -WJR_INTRINSIC_CONSTEXPR20 void div_qr_s(uint64_t *dst, uint64_t *rem, const uint64_t *src, - size_t n, const uint64_t *div, - size_t m) noexcept { +WJR_INTRINSIC_INLINE void div_qr_s(uint64_t *dst, uint64_t *rem, const uint64_t *src, + size_t n, const uint64_t *div, size_t m) noexcept { WJR_ASSERT_ASSUME(m >= 1); WJR_ASSERT_ASSUME(n >= m); diff --git a/include/wjr/math/mul.hpp b/include/wjr/math/mul.hpp index 2664dab7..f218be7a 100644 --- a/include/wjr/math/mul.hpp +++ b/include/wjr/math/mul.hpp @@ -416,8 +416,14 @@ WJR_INTRINSIC_CONSTEXPR20 uint64_t try_addmul_1(uint64_t *dst, const uint64_t *s return 0; } - if (ml == 0) { - return 0; + if constexpr (maxn <= 3) { + if (ml == 0) { + return 0; + } + } else { + if (WJR_UNLIKELY(ml == 0)) { + return 0; + } } if constexpr (maxn == 1) { @@ -453,7 +459,6 @@ inline constexpr size_t toom3_sqr_threshold = WJR_TOOM3_SQR_THRESHOLD; enum class __mul_mode : uint8_t { toom22 = 0x00, toom33 = 0x01, - toom44 = 0x02, all = 0x03, }; @@ -652,6 +657,7 @@ void __mul_n(uint64_t *WJR_RESTRICT dst, const uint64_t *src0, const uint64_t *s } else { c_out = cf0 * cf1; } + c_out += try_addmul_1(dst + n, src1, n, cf0); c_out += try_addmul_1(dst + n, src0, n, cf1); } diff --git a/include/wjr/preprocessor/compiler/attribute.hpp b/include/wjr/preprocessor/compiler/attribute.hpp index 54c7bcd0..bc82dfc6 100644 --- a/include/wjr/preprocessor/compiler/attribute.hpp +++ b/include/wjr/preprocessor/compiler/attribute.hpp @@ -123,10 +123,11 @@ #if defined(WJR_COMPILER_MSVC) #define WJR_MS_ABI #define WJR_HAS_FEATURE_MS_ABI WJR_HAS_DEF -#elif defined(WJR_COMPILER_CLANG) || defined(WJR_COMPILER_GCC) +#elif WJR_HAS_ATTRIBUTE(__ms_abi__) #define WJR_MS_ABI __attribute__((__ms_abi__)) #define WJR_HAS_FEATURE_MS_ABI WJR_HAS_DEF -#else +#elif defined(WJR_ENABLE_ASSEMBLY) +#undef WJR_ENABLE_ASSEMBLY #endif #define WJR_ASSUME_MAY_NOT_PURE(expr) \ @@ -238,6 +239,12 @@ #define WJR_MALLOC #endif +#if WJR_HAS_ATTRIBUTE(nonnull) +#define WJR_NONNULL(...) __attribute__((__VA_ARGS__)) +#else +#define WJR_NONNULL(...) 
+#endif + #define WJR_INLINE inline #define WJR_CONSTEXPR constexpr diff --git a/include/wjr/x86/math/mul-impl.hpp b/include/wjr/x86/math/mul-impl.hpp index bd22dc65..bae55d61 100644 --- a/include/wjr/x86/math/mul-impl.hpp +++ b/include/wjr/x86/math/mul-impl.hpp @@ -47,18 +47,6 @@ namespace wjr { #if defined(__BMI2__) && defined(__ADX__) -#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) -#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF -#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF -#elif defined(WJR_ENABLE_ASSEMBLY) -#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF_VAR(3) -#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF_VAR(3) -#endif - -#endif - -#if defined(__BMI2__) && defined(__ADX__) - #if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) #define WJR_HAS_BUILTIN_ASM_SUBMUL_1 WJR_HAS_DEF #elif defined(WJR_ENABLE_ASSEMBLY) @@ -79,6 +67,18 @@ namespace wjr { #endif +#if defined(__BMI2__) && defined(__ADX__) + +#if WJR_HAS_FEATURE(GCC_STYLE_INLINE_ASM) +#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF +#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF +#elif defined(WJR_ENABLE_ASSEMBLY) +#define WJR_HAS_BUILTIN_ASM_BASECASE_MUL_S WJR_HAS_DEF_VAR(3) +#define WJR_HAS_BUILTIN_ASM_BASECASE_SQR WJR_HAS_DEF_VAR(3) +#endif + +#endif + } // namespace wjr #endif // WJR_X86_MATH_MUL_IMPL_HPP__ \ No newline at end of file diff --git a/include/wjr/x86/math/mul.hpp b/include/wjr/x86/math/mul.hpp index 1ab3e7d4..4674dd16 100644 --- a/include/wjr/x86/math/mul.hpp +++ b/include/wjr/x86/math/mul.hpp @@ -32,162 +32,12 @@ WJR_INTRINSIC_INLINE uint64_t builtin_umul128(uint64_t a, uint64_t b, #if WJR_HAS_BUILTIN(ASM_MUL_1) #if WJR_HAS_BUILTIN(ASM_MUL_1) == 1 - -inline uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, - uint64_t rdx) noexcept { - size_t rcx = n / 8; - uint64_t r8, r9, r10 = n, r11; - - const auto pdst = dst; - const auto psrc = src; - - (void)(pdst); - (void)(psrc); - - asm volatile( - "and{l $7, %k[r10]| %k[r10], 7}\n\t" - "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" - "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + " - "%[r10] * 4]}\n\t" - "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" - "jmp{q *%[r10]| %[r10]}\n\t" - - ".align 8\n\t" - ".Llookup%=:\n\t" - ".long .Ll0%=-.Llookup%=\n\t" - ".long .Ll1%=-.Llookup%=\n\t" - ".long .Ll2%=-.Llookup%=\n\t" - ".long .Ll3%=-.Llookup%=\n\t" - ".long .Ll4%=-.Llookup%=\n\t" - ".long .Ll5%=-.Llookup%=\n\t" - ".long .Ll6%=-.Llookup%=\n\t" - ".long .Ll7%=-.Llookup%=\n\t" - ".align 16\n\t" - - ".Ll0%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "jmp .Lb0%=\n\t" - - ".Ll2%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q -48(%[src]), %[src]| %[src], [%[src] - 48]}\n\t" - "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t" - "inc %[rcx]\n\t" - "jmp .Lb2%=\n\t" - - ".Ll3%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q -40(%[src]), %[src]| %[src], [%[src] - 40]}\n\t" - "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t" - "inc %[rcx]\n\t" - "jmp .Lb3%=\n\t" - - ".Ll4%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q -32(%[src]), %[src]| %[src], [%[src] - 32]}\n\t" - "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t" - "inc %[rcx]\n\t" - "jmp .Lb4%=\n\t" - - ".Ll5%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q -24(%[src]), %[src]| %[src], [%[src] - 24]}\n\t" - "lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 
24]}\n\t" - "inc %[rcx]\n\t" - "jmp .Lb5%=\n\t" - - ".Ll6%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q -16(%[src]), %[src]| %[src], [%[src] - 16]}\n\t" - "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t" - "inc %[rcx]\n\t" - "jmp .Lb6%=\n\t" - - ".Ll7%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q -8(%[src]), %[src]| %[src], [%[src] - 8]}\n\t" - "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t" - "inc %[rcx]\n\t" - "jmp .Lb7%=\n\t" - - ".Ld1%=:\n\t" - "jmp .Ldone%=\n\t" - - ".Ll1%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q 8(%[src]), %[src]| %[src], [%[src] + 8]}\n\t" - "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" - "jrcxz .Ld1%=\n\t" - - ".align 32\n\t" - ".Lloop%=:\n\t" - - ".Lb1%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - "adc{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - - ".Lb0%=:\n\t" - "mulx{q 8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 8]}\n\t" - "mov{q %[r10], (%[dst])| [%[dst]], %[r10]}\n\t" - "adc{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - - ".Lb7%=:\n\t" - "mulx{q 16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 16]}\n\t" - "mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t" - "adc{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - - ".Lb6%=:\n\t" - "mulx{q 24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 24]}\n\t" - "mov{q %[r10], 16(%[dst])| [%[dst] + 16], %[r10]}\n\t" - "adc{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - - ".Lb5%=:\n\t" - "mulx{q 32(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 32]}\n\t" - "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t" - "adc{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - - ".Lb4%=:\n\t" - "mulx{q 40(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 40]}\n\t" - "mov{q %[r10], 32(%[dst])| [%[dst] + 32], %[r10]}\n\t" - "adc{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - - ".Lb3%=:\n\t" - "mulx{q 48(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 48]}\n\t" - "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t" - "adc{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - - ".Lb2%=:\n\t" - "mulx{q 56(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 56]}\n\t" - "mov{q %[r10], 48(%[dst])| [%[dst] + 48], %[r10]}\n\t" - "adc{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - - "lea{q 64(%[src]), %[src]| %[src], [%[src] + 64]}\n\t" - "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" - - "dec %[rcx]\n\t" - "jne .Lloop%=\n\t" - - "adc{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" - ".Ldone%=:\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - - : [dst] "+&r"(dst), [src] "+&r"(src), [rcx] "+&c"(rcx), [r8] "=&r"(r8), - [r9] "=&r"(r9), [r10] "+&r"(r10), [r11] "=&r"(r11) - : "d"(rdx) - : "cc", "memory"); - - WJR_ASSERT_ASSUME(rcx == 0); - WJR_ASSERT_ASSUME(dst == pdst + n); - WJR_ASSERT_ASSUME(src == psrc + n); - - return r9; -} - +extern uint64_t __wjr_asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, + uint64_t rdx) noexcept; #else - extern "C" WJR_MS_ABI uint64_t __wjr_asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept; +#endif WJR_INTRINSIC_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept { @@ -196,185 +46,49 @@ WJR_INTRINSIC_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size #endif -#endif - #if WJR_HAS_BUILTIN(ASM_ADDMUL_1) #if WJR_HAS_BUILTIN(ASM_ADDMUL_1) == 1 +extern uint64_t __wjr_asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, + uint64_t rdx) noexcept; +#else +extern "C" 
WJR_MS_ABI uint64_t __wjr_asm_addmul_1(uint64_t *dst, const uint64_t *src, + size_t n, uint64_t rdx) noexcept; +#endif -inline uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, - uint64_t rdx) noexcept { - size_t rcx = n / 8; - uint64_t r8, r9, r10 = n, r11; - - const auto pdst = dst; - const auto psrc = src; - - (void)(pdst); - (void)(psrc); - - asm volatile( - "and{l $7, %k[r10]| %k[r10], 7}\n\t" - "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" - "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + " - "%[r10] * 4]}\n\t" - "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" - "jmp{q *%[r10]| %[r10]}\n\t" - - ".align 8\n\t" - ".Llookup%=:\n\t" - ".long .Ll0%=-.Llookup%=\n\t" - ".long .Ll1%=-.Llookup%=\n\t" - ".long .Ll2%=-.Llookup%=\n\t" - ".long .Ll3%=-.Llookup%=\n\t" - ".long .Ll4%=-.Llookup%=\n\t" - ".long .Ll5%=-.Llookup%=\n\t" - ".long .Ll6%=-.Llookup%=\n\t" - ".long .Ll7%=-.Llookup%=\n\t" - ".align 16\n\t" - - ".Ll0%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "jmp .Lb0%=\n\t" - - ".Ll2%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q 16(%[src]), %[src]| %[src], [%[src] + 16]}\n\t" - "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t" - "jmp .Lb2%=\n\t" - - ".Ll3%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q 24(%[src]), %[src]| %[src], [%[src] + 24]}\n\t" - "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t" - "jmp .Lb3%=\n\t" - - ".Ll4%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q 32(%[src]), %[src]| %[src], [%[src] + 32]}\n\t" - "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t" - "jmp .Lb4%=\n\t" - - ".Ll5%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q 40(%[src]), %[src]| %[src], [%[src] + 40]}\n\t" - "lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 24]}\n\t" - "jmp .Lb5%=\n\t" - - ".Ll6%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q -16(%[src]), %[src]| %[src], [%[src] - 16]}\n\t" - "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t" - "jmp .Lb6%=\n\t" - - ".Ll7%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q -8(%[src]), %[src]| %[src], [%[src] - 8]}\n\t" - "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t" - "jmp .Lb7%=\n\t" - - ".Ld1%=:\n\t" - "add{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - "adc{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" - "jmp .Ldone%=\n\t" - - ".Ll1%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q 8(%[src]), %[src]| %[src], [%[src] + 8]}\n\t" - "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" - "jrcxz .Ld1%=\n\t" - - ".align 32\n\t" - ".Lloop%=:\n\t" - - ".Lb1%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - - ".Lb0%=:\n\t" - "mulx{q 8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 8]}\n\t" - "lea{q -1(%[rcx]), %[rcx]| %[rcx], [%[rcx] - 1]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q (%[dst]), %[r10]| %[r10], [%[dst]]}\n\t" - "mov{q %[r10], (%[dst])| [%[dst]], %[r10]}\n\t" - - ".Lb7%=:\n\t" - "mulx{q 16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 16]}\n\t" - "adcx{q 8(%[dst]), %[r8]| %[r8], [%[dst] + 8]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q 
%[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t" - - ".Lb6%=:\n\t" - "mulx{q 24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 24]}\n\t" - "lea{q 64(%[src]), %[src]| %[src], [%[src] + 64]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q 16(%[dst]), %[r10]| %[r10], [%[dst] + 16]}\n\t" - "mov{q %[r10], 16(%[dst])| [%[dst] + 16], %[r10]}\n\t" - - ".Lb5%=:\n\t" - "mulx{q -32(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] - 32]}\n\t" - "adcx{q 24(%[dst]), %[r8]| %[r8], [%[dst] + 24]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t" - - ".Lb4%=:\n\t" - "mulx{q -24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] - 24]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q 32(%[dst]), %[r10]| %[r10], [%[dst] + 32]}\n\t" - "mov{q %[r10], 32(%[dst])| [%[dst] + 32], %[r10]}\n\t" - - ".Lb3%=:\n\t" - "mulx{q -16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] - 16]}\n\t" - "adcx{q 40(%[dst]), %[r8]| %[r8], [%[dst] + 40]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t" - - ".Lb2%=:\n\t" - "mulx{q -8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] - 8]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q 48(%[dst]), %[r10]| %[r10], [%[dst] + 48]}\n\t" - "mov{q %[r10], 48(%[dst])| [%[dst] + 48], %[r10]}\n\t" - - "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" - - "jrcxz .Lloop_out%=\n\t" - "jmp .Lloop%=\n\t" - ".Lloop_out%=:\n\t" - - "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - "adox{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" - "adc{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" - - ".Ldone%=:" - - : [dst] "+&r"(dst), [src] "+&r"(src), [rcx] "+&c"(rcx), [r8] "=&r"(r8), - [r9] "=&r"(r9), [r10] "+&r"(r10), [r11] "=&r"(r11) - : "d"(rdx) - : "cc", "memory"); +WJR_INTRINSIC_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, + uint64_t rdx) noexcept { + return __wjr_asm_addmul_1(dst, src, n, rdx); +} - WJR_ASSERT_ASSUME(rcx == 0); - WJR_ASSERT_ASSUME(dst == pdst + n); - WJR_ASSERT_ASSUME(src == psrc + n); +#endif - return r9; -} +#if WJR_HAS_BUILTIN(ASM_SUBMUL_1) +#if WJR_HAS_BUILTIN(ASM_SUBMUL_1) == 1 +// slower than asm_addmul_1 +extern uint64_t __wjr_asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, + uint64_t rdx) noexcept; #else - -extern "C" WJR_MS_ABI uint64_t __wjr_asm_addmul_1(uint64_t *dst, const uint64_t *src, +extern "C" WJR_MS_ABI uint64_t __wjr_asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept; +#endif -WJR_INTRINSIC_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, +WJR_INTRINSIC_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t rdx) noexcept { - return __wjr_asm_addmul_1(dst, src, n, rdx); + return __wjr_asm_submul_1(dst, src, n, rdx); } #endif +#if WJR_HAS_BUILTIN(ASM_ADDLSH_N) +#define WJR_ADDSUB_I 1 +#include +#endif + +#if WJR_HAS_BUILTIN(ASM_RSBLSH_N) +#define WJR_ADDSUB_I 0 +#include #endif #if WJR_HAS_BUILTIN(ASM_BASECASE_MUL_S) @@ -415,202 +129,6 @@ inline void asm_basecase_sqr(uint64_t *dst, const uint64_t *src, size_t n) noexc #endif -#if WJR_HAS_BUILTIN(ASM_SUBMUL_1) - -#if WJR_HAS_BUILTIN(ASM_SUBMUL_1) == 1 - -// slower than asm_addmul_1 -inline uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, - uint64_t rdx) noexcept { - WJR_ASSERT(n != 0); - - size_t rcx = n / 8; - uint64_t r8, r9, r10 = static_cast(n), r11; - - asm volatile( - // set CF = 1, OF = 0 - "and{l 
$7, %k[r10]| %k[r10], 7}\n\t" - "stc\n\t" - - "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" - "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + %[r10] * " - "4]}\n\t" - "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" - "jmp{q *%[r10]| %[r10]}\n\t" - - ".align 8\n\t" - ".Llookup%=:\n\t" - ".long .Ll0%=-.Llookup%=\n\t" - ".long .Ll1%=-.Llookup%=\n\t" - ".long .Ll2%=-.Llookup%=\n\t" - ".long .Ll3%=-.Llookup%=\n\t" - ".long .Ll4%=-.Llookup%=\n\t" - ".long .Ll5%=-.Llookup%=\n\t" - ".long .Ll6%=-.Llookup%=\n\t" - ".long .Ll7%=-.Llookup%=\n\t" - ".align 16\n\t" - - ".Ll0%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "jmp .Lb0%=\n\t" - - ".Ll2%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q 16(%[src]), %[src]| %[src], [%[src] + 16]}\n\t" - "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t" - "jmp .Lb2%=\n\t" - - ".Ll3%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q 24(%[src]), %[src]| %[src], [%[src] + 24]}\n\t" - "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t" - "jmp .Lb3%=\n\t" - - ".Ll4%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q 32(%[src]), %[src]| %[src], [%[src] + 32]}\n\t" - "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t" - "jmp .Lb4%=\n\t" - - ".Ll5%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q 40(%[src]), %[src]| %[src], [%[src] + 40]}\n\t" - "lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 24]}\n\t" - "jmp .Lb5%=\n\t" - - ".Ll6%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "lea{q -16(%[src]), %[src]| %[src], [%[src] - 16]}\n\t" - "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t" - "jmp .Lb6%=\n\t" - - ".Ll7%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "lea{q -8(%[src]), %[src]| %[src], [%[src] - 8]}\n\t" - "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t" - "jmp .Lb7%=\n\t" - - ".Ld1%=:\n\t" - "adc{q (%[dst]), %[r8]| %[r8], [%[dst]]}\n\t" - "sbb{q $-1, %[r9]| %[r9], -1}\n\t" - "mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t" - "jmp .Ldone%=\n\t" - - ".Ll1%=:\n\t" - "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" - "not %[r8]\n\t" - "jrcxz .Ld1%=\n\t" - "lea{q 8(%[src]), %[src]| %[src], [%[src] + 8]}\n\t" - "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" - - ".align 32\n\t" - ".Lloop%=:\n\t" - - ".Lb1%=:\n\t" - "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" - "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - - ".Lb0%=:\n\t" - "not %[r10]\n\t" - "mulx{q 8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 8]}\n\t" - "lea{q -1(%[rcx]), %[rcx]| %[rcx], [%[rcx] - 1]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q (%[dst]), %[r10]| %[r10], [%[dst]]}\n\t" - "mov{q %[r10], (%[dst])| [%[dst]], %[r10]}\n\t" - - ".Lb7%=:\n\t" - "not %[r8]\n\t" - "mulx{q 16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 16]}\n\t" - "adcx{q 8(%[dst]), %[r8]| %[r8], [%[dst] + 8]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t" - - ".Lb6%=:\n\t" - "not %[r10]\n\t" - "mulx{q 24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 24]}\n\t" - "lea{q 64(%[src]), %[src]| %[src], [%[src] + 64]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q 16(%[dst]), %[r10]| %[r10], [%[dst] + 16]}\n\t" - "mov{q %[r10], 16(%[dst])| 
[%[dst] + 16], %[r10]}\n\t" - - ".Lb5%=:\n\t" - "not %[r8]\n\t" - "mulx{q -32(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] - 32]}\n\t" - "adcx{q 24(%[dst]), %[r8]| %[r8], [%[dst] + 24]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t" - - ".Lb4%=:\n\t" - "mulx{q -24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] - 24]}\n\t" - "not %[r10]\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q 32(%[dst]), %[r10]| %[r10], [%[dst] + 32]}\n\t" - "mov{q %[r10], 32(%[dst])| [%[dst] + 32], %[r10]}\n\t" - - ".Lb3%=:\n\t" - "not %[r8]\n\t" - "mulx{q -16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] - 16]}\n\t" - "adcx{q 40(%[dst]), %[r8]| %[r8], [%[dst] + 40]}\n\t" - "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" - "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t" - - ".Lb2%=:\n\t" - "not %[r10]\n\t" - "mulx{q -8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] - 8]}\n\t" - "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" - "adcx{q 48(%[dst]), %[r10]| %[r10], [%[dst] + 48]}\n\t" - "not %[r8]\n\t" - "mov{q %[r10], 48(%[dst])| [%[dst] + 48], %[r10]}\n\t" - - "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" - - "jrcxz .Lloop_out%=\n\t" - "jmp .Lloop%=\n\t" - ".Lloop_out%=:\n\t" - - "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" - "adox{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" - "sbb{q $-1, %[r9]| %[r9], -1}\n\t" - - ".Ldone%=:" - - : [dst] "+&r"(dst), [src] "+&r"(src), [rcx] "+&c"(rcx), [r8] "=&r"(r8), - [r9] "=&r"(r9), [r10] "+&r"(r10), [r11] "=&r"(r11) - : "d"(rdx) - : "cc", "memory"); - - WJR_ASSUME(rcx == 0); - - return r9; -} - -#else - -extern "C" WJR_MS_ABI uint64_t __wjr_asm_submul_1(uint64_t *dst, const uint64_t *src, - size_t n, uint64_t rdx) noexcept; - -WJR_INTRINSIC_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, - uint64_t rdx) noexcept { - return __wjr_asm_submul_1(dst, src, n, rdx); -} - -#endif - -#endif - -#if WJR_HAS_BUILTIN(ASM_ADDLSH_N) -#define WJR_ADDSUB_I 1 -#include -#endif - -#if WJR_HAS_BUILTIN(ASM_RSBLSH_N) -#define WJR_ADDSUB_I 0 -#include -#endif - } // namespace wjr #endif // WJR_X86_MATH_MUL_HPP__ \ No newline at end of file diff --git a/src/wjr/math/div.cpp b/src/wjr/math/div.cpp index 3e57cf53..7e7b193e 100644 --- a/src/wjr/math/div.cpp +++ b/src/wjr/math/div.cpp @@ -7,6 +7,143 @@ namespace { inline constexpr size_t dc_div_qr_threshold = WJR_DC_DIV_QR_THRESHOLD; } +uint64_t div_qr_1_shift(uint64_t *dst, uint64_t &rem, const uint64_t *src, size_t n, + const div2by1_divider<uint64_t> &div) noexcept { + WJR_ASSERT_ASSUME(n >= 1); + WJR_ASSERT(div.get_shift() != 0); + WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 1, src, n - 1)); + + const uint64_t divisor = div.get_divisor(); + const uint64_t value = div.get_value(); + const auto shift = div.get_shift(); + + uint64_t qh; + uint64_t lo, hi; + + uint64_t rbp = src[n - 1]; + --n; + hi = rbp >> (64 - shift); + + do { + if (WJR_UNLIKELY(n == 0)) { + qh = div.divide(divisor, value, rbp << shift, hi); + break; + } + + lo = src[n - 1]; + qh = div.divide(divisor, value, shld(rbp, lo, shift), hi); + rbp = lo; + --n; + + if (WJR_LIKELY(n != 0)) { + do { + lo = src[n - 1]; + dst[n] = div.divide(divisor, value, shld(rbp, lo, shift), hi); + rbp = lo; + --n; + } while (WJR_LIKELY(n != 0)); + } + + dst[0] = div.divide(divisor, value, rbp << shift, hi); + } while (0); + + rem = hi >> shift; + return qh; +} + +uint64_t div_qr_2_noshift(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n, + const 
div3by2_divider_noshift<uint64_t> &div) noexcept { + WJR_ASSERT_ASSUME(n >= 2); + WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 2, src, n - 2)); + WJR_ASSERT_L2(WJR_IS_SEPARATE_P(dst, n - 2, rem, 2)); + + const uint64_t divisor0 = div.get_divisor0(); + const uint64_t divisor1 = div.get_divisor1(); + const uint64_t value = div.get_value(); + + uint64_t qh = 0; + uint64_t u0, u1, u2; + + u2 = src[n - 1]; + u1 = src[n - 2]; + + if (__less_equal_128(divisor0, divisor1, u1, u2)) { + __sub_128(u1, u2, u1, u2, divisor0, divisor1); + qh = 1; + } + + do { + if (WJR_UNLIKELY(n == 2)) { + break; + } + + n -= 2; + + do { + u0 = src[n - 1]; + dst[n - 1] = div.divide(divisor0, divisor1, value, u0, u1, u2); + --n; + } while (WJR_LIKELY(n != 0)); + + } while (0); + + rem[0] = u1; + rem[1] = u2; + return qh; +} + +uint64_t div_qr_2_shift(uint64_t *dst, uint64_t *rem, const uint64_t *src, size_t n, + const div3by2_divider<uint64_t> &div) noexcept { + WJR_ASSERT_ASSUME(n >= 2); + WJR_ASSERT(div.get_shift() != 0); + WJR_ASSERT_L2(WJR_IS_SAME_OR_DECR_P(dst, n - 2, src, n - 2)); + WJR_ASSERT_L2(WJR_IS_SEPARATE_P(dst, n - 2, rem, 2)); + + const uint64_t divisor0 = div.get_divisor0(); + const uint64_t divisor1 = div.get_divisor1(); + const uint64_t value = div.get_value(); + const auto shift = div.get_shift(); + + uint64_t qh; + uint64_t u0, u1, u2; + uint64_t rbp; + + rbp = src[n - 2]; + u2 = src[n - 1]; + u1 = shld(u2, rbp, shift); + u2 >>= (64 - shift); + + n -= 2; + + do { + if (WJR_UNLIKELY(n == 0)) { + qh = div.divide(divisor0, divisor1, value, rbp << shift, u1, u2); + break; + } + + u0 = src[n - 1]; + qh = div.divide(divisor0, divisor1, value, shld(rbp, u0, shift), u1, u2); + rbp = u0; + --n; + + if (WJR_LIKELY(n != 0)) { + do { + u0 = src[n - 1]; + dst[n] = + div.divide(divisor0, divisor1, value, shld(rbp, u0, shift), u1, u2); + rbp = u0; + --n; + } while (WJR_LIKELY(n != 0)); + } + + dst[0] = div.divide(divisor0, divisor1, value, rbp << shift, u1, u2); + } while (0); + + rem[0] = shrd(u1, u2, shift); + rem[1] = u2 >> shift; + return qh; +} + uint64_t sb_div_qr_s(uint64_t *dst, uint64_t *src, size_t n, const uint64_t *div, size_t m, uint64_t dinv) noexcept { using divider = div3by2_divider<uint64_t>; diff --git a/src/wjr/math/mul.cpp b/src/wjr/math/mul.cpp index e7ee2ac5..ec15c259 100644 --- a/src/wjr/math/mul.cpp +++ b/src/wjr/math/mul.cpp @@ -1677,11 +1677,11 @@ void toom53_mul_s(uint64_t *WJR_RESTRICT dst, const uint64_t *src0, size_t n, } // W2 = T0 * T2; f(1) - __mul_n<__mul_mode::toom44, 4, 2>(w2p, t0p, t2p, l, stk, cf2, cft0, cft2); + __mul_n<__mul_mode::all, in_place_max, 2>(w2p, t0p, t2p, l, stk, cf2, cft0, cft2); // W3 = T1 * W5; f(-1) neg0 ^= neg1; - __mul_n<__mul_mode::toom44, 2, 1>(w3p, t1p, w5p, l, stk, cf3, cft1, cf5); + __mul_n<__mul_mode::all, 2, 1>(w3p, t1p, w5p, l, stk, cf3, cft1, cf5); // W5 = (W5 + V2) << 1 - V0; v(-2) if (!neg1) { @@ -1733,10 +1733,10 @@ void toom53_mul_s(uint64_t *WJR_RESTRICT dst, const uint64_t *src0, size_t n, // W1 = T1 * W5; f(-2) neg1 ^= neg2; - __mul_n<__mul_mode::toom44, in_place_max, 4>(w1p, t1p, w5p, l, stk, cf1, cft1, cf5); + __mul_n<__mul_mode::all>(w1p, t1p, w5p, l, stk, cf1, cft1, cf5); // W4 = T0 * T2; f(2) - __mul_n<__mul_mode::toom44, in_place_max, 6>(w4p, t0p, t2p, l, stk, cf4, cft0, cft2); + __mul_n<__mul_mode::all>(w4p, t0p, t2p, l, stk, cf4, cft0, cft2); // T0 = (((2U0 + U1) << 1 + U2) << 1 + U3) << 1 + U4; 16 * u(1/2) cft0 = addlsh_n(t0p, u1p, u0p, l, 1); @@ -1751,10 +1751,10 @@ void toom53_mul_s(uint64_t *WJR_RESTRICT dst, const uint64_t *src0, size_t n, WJR_ASSERT(cft1 <= 
6); // W5 = T0 * T1; 64 * f(1/2) - __mul_n<__mul_mode::toom44>(w5p, t0p, t1p, l, stk, cf5, cft0, cft1); + __mul_n<__mul_mode::all>(w5p, t0p, t1p, l, stk, cf5, cft0, cft1); // W0 = U0 * V0; f(0) - __mul_n<__mul_mode::toom44>(w0p, u0p, v0p, l, stk); + __mul_n<__mul_mode::all>(w0p, u0p, v0p, l, stk); // W6 = U4 * V2; f(inf) if (rn >= rm) { @@ -1902,11 +1902,11 @@ void toom44_mul_s(uint64_t *WJR_RESTRICT dst, const uint64_t *src0, size_t n, } // W2 = T0 * W4; f(1) - __mul_n<__mul_mode::toom44, 3, 3>(w2p, t0p, w4p, l, stk, cf2, cft0, cf4); + __mul_n<__mul_mode::all, 3, 3>(w2p, t0p, w4p, l, stk, cf2, cft0, cf4); // W3 = T1 * W1; f(-1) neg0 ^= neg1; - __mul_n<__mul_mode::toom44, 1, 1>(w3p, t1p, w1p, l, stk, cf3, cft1, cf1); + __mul_n<__mul_mode::all, 1, 1>(w3p, t1p, w1p, l, stk, cf3, cft1, cf1); // T0 = U0 + 4U2; cft0 = addlsh_n(t0p, u0p, u2p, l, 2); @@ -1949,11 +1949,11 @@ void toom44_mul_s(uint64_t *WJR_RESTRICT dst, const uint64_t *src0, size_t n, } // W4 = T0 * W1; f(2) - __mul_n<__mul_mode::toom44>(w4p, t0p, w1p, l, stk, cf4, cft0, cf1); + __mul_n<__mul_mode::all>(w4p, t0p, w1p, l, stk, cf4, cft0, cf1); // W1 = T1 * W5; f(-2) neg1 ^= neg2; - __mul_n<__mul_mode::toom44>(w1p, t1p, w5p, l, stk, cf1, cft1, cf5); + __mul_n<__mul_mode::all>(w1p, t1p, w5p, l, stk, cf1, cft1, cf5); // T0 = ((2U0 + U1) << 1 + U2) << 1 + U3; 8 * u(1/2) cft0 = addlsh_n(t0p, u1p, u0p, l, 1); @@ -1968,10 +1968,10 @@ void toom44_mul_s(uint64_t *WJR_RESTRICT dst, const uint64_t *src0, size_t n, WJR_ASSERT(cft1 <= 14); // W5 = T0 * T1; 64 * f(1/2) - __mul_n<__mul_mode::toom44>(w5p, t0p, t1p, l, stk, cf5, cft0, cft1); + __mul_n<__mul_mode::all>(w5p, t0p, t1p, l, stk, cf5, cft0, cft1); // W0 = U0 * V0; f(0) - __mul_n<__mul_mode::toom44>(w0p, u0p, v0p, l, stk); + __mul_n<__mul_mode::all>(w0p, u0p, v0p, l, stk); // W6 = U3 * V3; f(inf) if (rn >= rm) { @@ -2084,10 +2084,10 @@ void toom4_sqr(uint64_t *WJR_RESTRICT dst, const uint64_t *src, size_t n, WJR_ASSERT(cft0 <= 3); // W2 = T0 ^ 2; f(1) - __sqr<__mul_mode::toom44, 3>(w2p, t0p, l, stk, cf2, cft0); + __sqr<__mul_mode::all, 3>(w2p, t0p, l, stk, cf2, cft0); // W3 = T1 ^ 2; f(-1) - __sqr<__mul_mode::toom44, 1>(w3p, t1p, l, stk, cf3, cft1); + __sqr<__mul_mode::all, 1>(w3p, t1p, l, stk, cf3, cft1); // T0 = U0 + 4U2; cft0 = addlsh_n(t0p, u0p, u2p, l, 2); @@ -2109,10 +2109,10 @@ void toom4_sqr(uint64_t *WJR_RESTRICT dst, const uint64_t *src, size_t n, WJR_ASSERT(cft0 <= 14); // W4 = T0 ^ 2; f(2) - __sqr<__mul_mode::toom44>(w4p, t0p, l, stk, cf4, cft0); + __sqr<__mul_mode::all>(w4p, t0p, l, stk, cf4, cft0); // W1 = T1 * W5; f(-2) - __sqr<__mul_mode::toom44>(w1p, t1p, l, stk, cf1, cft1); + __sqr<__mul_mode::all>(w1p, t1p, l, stk, cf1, cft1); // T0 = ((2U0 + U1) << 1 + U2) << 1 + U3; 8 * u(1/2) cft0 = addlsh_n(t0p, u1p, u0p, l, 1); @@ -2121,13 +2121,13 @@ void toom4_sqr(uint64_t *WJR_RESTRICT dst, const uint64_t *src, size_t n, WJR_ASSERT(cft0 <= 14); // W5 = T0 ^ 2; 64 * f(1/2) - __sqr<__mul_mode::toom44>(w5p, t0p, l, stk, cf5, cft0); + __sqr<__mul_mode::all>(w5p, t0p, l, stk, cf5, cft0); // W0 = U0 * V0; f(0) - __sqr<__mul_mode::toom44>(w0p, u0p, l, stk); + __sqr<__mul_mode::all>(w0p, u0p, l, stk); // W6 = U3 * V3; f(inf) - __sqr<__mul_mode::toom44>(w6p, u3p, rn, stk); + __sqr<__mul_mode::all>(w6p, u3p, rn, stk); toom_interpolation_7p_s(dst, w1p, l, rn, rn, toom_interpolation_7p_struct<uint64_t>{0, 0, cf1, cf2, cf3, cf4, cf5}); diff --git a/src/wjr/x86/math/mul.cpp b/src/wjr/x86/math/mul.cpp index d56bda12..99ad4ace 100644 --- a/src/wjr/x86/math/mul.cpp +++ b/src/wjr/x86/math/mul.cpp @@ -2,6 
+2,475 @@ namespace wjr { +#if WJR_HAS_BUILTIN(ASM_MUL_1) == 1 + +uint64_t __wjr_asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, + uint64_t rdx) noexcept { + size_t rcx = n / 8; + uint64_t r8, r9, r10 = n, r11; + + asm volatile( + "and{l $7, %k[r10]| %k[r10], 7}\n\t" + "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" + "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + " + "%[r10] * 4]}\n\t" + "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" + "jmp{q *%[r10]| %[r10]}\n\t" + + ".align 8\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + ".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" + ".align 16\n\t" + + ".Ll0%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "jmp .Lb0%=\n\t" + + ".Ll2%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "lea{q -48(%[src]), %[src]| %[src], [%[src] - 48]}\n\t" + "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t" + "inc %[rcx]\n\t" + "jmp .Lb2%=\n\t" + + ".Ll3%=:\n\t" + "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q -40(%[src]), %[src]| %[src], [%[src] - 40]}\n\t" + "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t" + "inc %[rcx]\n\t" + "jmp .Lb3%=\n\t" + + ".Ll4%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "lea{q -32(%[src]), %[src]| %[src], [%[src] - 32]}\n\t" + "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t" + "inc %[rcx]\n\t" + "jmp .Lb4%=\n\t" + + ".Ll5%=:\n\t" + "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q -24(%[src]), %[src]| %[src], [%[src] - 24]}\n\t" + "lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 24]}\n\t" + "inc %[rcx]\n\t" + "jmp .Lb5%=\n\t" + + ".Ll6%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "lea{q -16(%[src]), %[src]| %[src], [%[src] - 16]}\n\t" + "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t" + "inc %[rcx]\n\t" + "jmp .Lb6%=\n\t" + + ".Ll7%=:\n\t" + "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q -8(%[src]), %[src]| %[src], [%[src] - 8]}\n\t" + "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t" + "inc %[rcx]\n\t" + "jmp .Lb7%=\n\t" + + ".Ld1%=:\n\t" + "jmp .Ldone%=\n\t" + + ".Ll1%=:\n\t" + "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q 8(%[src]), %[src]| %[src], [%[src] + 8]}\n\t" + "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" + "jrcxz .Ld1%=\n\t" + + ".align 32\n\t" + ".Lloop%=:\n\t" + + ".Lb1%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + "adc{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + + ".Lb0%=:\n\t" + "mulx{q 8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 8]}\n\t" + "mov{q %[r10], (%[dst])| [%[dst]], %[r10]}\n\t" + "adc{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + + ".Lb7%=:\n\t" + "mulx{q 16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 16]}\n\t" + "mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t" + "adc{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + + ".Lb6%=:\n\t" + "mulx{q 24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 24]}\n\t" + "mov{q %[r10], 16(%[dst])| [%[dst] + 16], %[r10]}\n\t" + "adc{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + + ".Lb5%=:\n\t" + "mulx{q 32(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 32]}\n\t" + "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t" + "adc{q %[r9], 
%[r10]| %[r10], %[r9]}\n\t" + + ".Lb4%=:\n\t" + "mulx{q 40(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 40]}\n\t" + "mov{q %[r10], 32(%[dst])| [%[dst] + 32], %[r10]}\n\t" + "adc{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + + ".Lb3%=:\n\t" + "mulx{q 48(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 48]}\n\t" + "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t" + "adc{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + + ".Lb2%=:\n\t" + "mulx{q 56(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 56]}\n\t" + "mov{q %[r10], 48(%[dst])| [%[dst] + 48], %[r10]}\n\t" + "adc{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + + "lea{q 64(%[src]), %[src]| %[src], [%[src] + 64]}\n\t" + "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" + + "dec %[rcx]\n\t" + "jne .Lloop%=\n\t" + + "adc{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" + ".Ldone%=:\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + + : [dst] "+&r"(dst), [src] "+&r"(src), [rcx] "+&c"(rcx), [r8] "=&r"(r8), + [r9] "=&r"(r9), [r10] "+&r"(r10), [r11] "=&r"(r11) + : "d"(rdx) + : "cc", "memory"); + + return r9; +} + +#endif + +#if WJR_HAS_BUILTIN(ASM_ADDMUL_1) == 1 + +uint64_t __wjr_asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, + uint64_t rdx) noexcept { + size_t rcx = n / 8; + uint64_t r8, r9, r10 = n, r11; + + asm volatile( + "and{l $7, %k[r10]| %k[r10], 7}\n\t" + "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" + "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + " + "%[r10] * 4]}\n\t" + "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" + "jmp{q *%[r10]| %[r10]}\n\t" + + ".align 8\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + ".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" + ".align 16\n\t" + + ".Ll0%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "jmp .Lb0%=\n\t" + + ".Ll2%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "lea{q 16(%[src]), %[src]| %[src], [%[src] + 16]}\n\t" + "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t" + "jmp .Lb2%=\n\t" + + ".Ll3%=:\n\t" + "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q 24(%[src]), %[src]| %[src], [%[src] + 24]}\n\t" + "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t" + "jmp .Lb3%=\n\t" + + ".Ll4%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "lea{q 32(%[src]), %[src]| %[src], [%[src] + 32]}\n\t" + "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t" + "jmp .Lb4%=\n\t" + + ".Ll5%=:\n\t" + "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q 40(%[src]), %[src]| %[src], [%[src] + 40]}\n\t" + "lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 24]}\n\t" + "jmp .Lb5%=\n\t" + + ".Ll6%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "lea{q -16(%[src]), %[src]| %[src], [%[src] - 16]}\n\t" + "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t" + "jmp .Lb6%=\n\t" + + ".Ll7%=:\n\t" + "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q -8(%[src]), %[src]| %[src], [%[src] - 8]}\n\t" + "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t" + "jmp .Lb7%=\n\t" + + ".Ld1%=:\n\t" + "add{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + "adc{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" + "jmp .Ldone%=\n\t" + + ".Ll1%=:\n\t" + "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], 
[%[src]]}\n\t" + "lea{q 8(%[src]), %[src]| %[src], [%[src] + 8]}\n\t" + "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" + "jrcxz .Ld1%=\n\t" + + ".align 32\n\t" + ".Lloop%=:\n\t" + + ".Lb1%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" + "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + + ".Lb0%=:\n\t" + "mulx{q 8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 8]}\n\t" + "lea{q -1(%[rcx]), %[rcx]| %[rcx], [%[rcx] - 1]}\n\t" + "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + "adcx{q (%[dst]), %[r10]| %[r10], [%[dst]]}\n\t" + "mov{q %[r10], (%[dst])| [%[dst]], %[r10]}\n\t" + + ".Lb7%=:\n\t" + "mulx{q 16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 16]}\n\t" + "adcx{q 8(%[dst]), %[r8]| %[r8], [%[dst] + 8]}\n\t" + "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + "mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t" + + ".Lb6%=:\n\t" + "mulx{q 24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 24]}\n\t" + "lea{q 64(%[src]), %[src]| %[src], [%[src] + 64]}\n\t" + "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + "adcx{q 16(%[dst]), %[r10]| %[r10], [%[dst] + 16]}\n\t" + "mov{q %[r10], 16(%[dst])| [%[dst] + 16], %[r10]}\n\t" + + ".Lb5%=:\n\t" + "mulx{q -32(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] - 32]}\n\t" + "adcx{q 24(%[dst]), %[r8]| %[r8], [%[dst] + 24]}\n\t" + "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t" + + ".Lb4%=:\n\t" + "mulx{q -24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] - 24]}\n\t" + "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + "adcx{q 32(%[dst]), %[r10]| %[r10], [%[dst] + 32]}\n\t" + "mov{q %[r10], 32(%[dst])| [%[dst] + 32], %[r10]}\n\t" + + ".Lb3%=:\n\t" + "mulx{q -16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] - 16]}\n\t" + "adcx{q 40(%[dst]), %[r8]| %[r8], [%[dst] + 40]}\n\t" + "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t" + + ".Lb2%=:\n\t" + "mulx{q -8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] - 8]}\n\t" + "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + "adcx{q 48(%[dst]), %[r10]| %[r10], [%[dst] + 48]}\n\t" + "mov{q %[r10], 48(%[dst])| [%[dst] + 48], %[r10]}\n\t" + + "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" + + "jrcxz .Lloop_out%=\n\t" + "jmp .Lloop%=\n\t" + ".Lloop_out%=:\n\t" + + "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + "adox{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" + "adc{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" + + ".Ldone%=:" + + : [dst] "+&r"(dst), [src] "+&r"(src), [rcx] "+&c"(rcx), [r8] "=&r"(r8), + [r9] "=&r"(r9), [r10] "+&r"(r10), [r11] "=&r"(r11) + : "d"(rdx) + : "cc", "memory"); + + return r9; +} + +#endif + +#if WJR_HAS_BUILTIN(ASM_SUBMUL_1) == 1 + +uint64_t __wjr_asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, + uint64_t rdx) noexcept { + size_t rcx = n / 8; + uint64_t r8, r9, r10 = static_cast<uint32_t>(n), r11; + + asm volatile( + // set CF = 1, OF = 0 + "and{l $7, %k[r10]| %k[r10], 7}\n\t" + "stc\n\t" + + "lea{q| %[r9], [rip +} .Llookup%={(%%rip), %[r9]|]}\n\t" + "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + %[r10] * " + "4]}\n\t" + "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" + "jmp{q *%[r10]| %[r10]}\n\t" + + ".align 8\n\t" + ".Llookup%=:\n\t" + ".long .Ll0%=-.Llookup%=\n\t" + ".long .Ll1%=-.Llookup%=\n\t" + ".long .Ll2%=-.Llookup%=\n\t" + ".long .Ll3%=-.Llookup%=\n\t" + ".long .Ll4%=-.Llookup%=\n\t" + 
".long .Ll5%=-.Llookup%=\n\t" + ".long .Ll6%=-.Llookup%=\n\t" + ".long .Ll7%=-.Llookup%=\n\t" + ".align 16\n\t" + + ".Ll0%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "jmp .Lb0%=\n\t" + + ".Ll2%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "lea{q 16(%[src]), %[src]| %[src], [%[src] + 16]}\n\t" + "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t" + "jmp .Lb2%=\n\t" + + ".Ll3%=:\n\t" + "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q 24(%[src]), %[src]| %[src], [%[src] + 24]}\n\t" + "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t" + "jmp .Lb3%=\n\t" + + ".Ll4%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "lea{q 32(%[src]), %[src]| %[src], [%[src] + 32]}\n\t" + "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t" + "jmp .Lb4%=\n\t" + + ".Ll5%=:\n\t" + "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q 40(%[src]), %[src]| %[src], [%[src] + 40]}\n\t" + "lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 24]}\n\t" + "jmp .Lb5%=\n\t" + + ".Ll6%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "lea{q -16(%[src]), %[src]| %[src], [%[src] - 16]}\n\t" + "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t" + "jmp .Lb6%=\n\t" + + ".Ll7%=:\n\t" + "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q -8(%[src]), %[src]| %[src], [%[src] - 8]}\n\t" + "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t" + "jmp .Lb7%=\n\t" + + ".Ld1%=:\n\t" + "adc{q (%[dst]), %[r8]| %[r8], [%[dst]]}\n\t" + "sbb{q $-1, %[r9]| %[r9], -1}\n\t" + "mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t" + "jmp .Ldone%=\n\t" + + ".Ll1%=:\n\t" + "mulx{q (%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "not %[r8]\n\t" + "jrcxz .Ld1%=\n\t" + "lea{q 8(%[src]), %[src]| %[src], [%[src] + 8]}\n\t" + "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" + + ".align 32\n\t" + ".Lloop%=:\n\t" + + ".Lb1%=:\n\t" + "mulx{q (%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t" + "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" + "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + + ".Lb0%=:\n\t" + "not %[r10]\n\t" + "mulx{q 8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 8]}\n\t" + "lea{q -1(%[rcx]), %[rcx]| %[rcx], [%[rcx] - 1]}\n\t" + "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + "adcx{q (%[dst]), %[r10]| %[r10], [%[dst]]}\n\t" + "mov{q %[r10], (%[dst])| [%[dst]], %[r10]}\n\t" + + ".Lb7%=:\n\t" + "not %[r8]\n\t" + "mulx{q 16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] + 16]}\n\t" + "adcx{q 8(%[dst]), %[r8]| %[r8], [%[dst] + 8]}\n\t" + "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + "mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t" + + ".Lb6%=:\n\t" + "not %[r10]\n\t" + "mulx{q 24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] + 24]}\n\t" + "lea{q 64(%[src]), %[src]| %[src], [%[src] + 64]}\n\t" + "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + "adcx{q 16(%[dst]), %[r10]| %[r10], [%[dst] + 16]}\n\t" + "mov{q %[r10], 16(%[dst])| [%[dst] + 16], %[r10]}\n\t" + + ".Lb5%=:\n\t" + "not %[r8]\n\t" + "mulx{q -32(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] - 32]}\n\t" + "adcx{q 24(%[dst]), %[r8]| %[r8], [%[dst] + 24]}\n\t" + "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t" + + ".Lb4%=:\n\t" + "mulx{q -24(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] - 24]}\n\t" + "not %[r10]\n\t" + "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + "adcx{q 32(%[dst]), %[r10]| %[r10], [%[dst] 
+ 32]}\n\t" + "mov{q %[r10], 32(%[dst])| [%[dst] + 32], %[r10]}\n\t" + + ".Lb3%=:\n\t" + "not %[r8]\n\t" + "mulx{q -16(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src] - 16]}\n\t" + "adcx{q 40(%[dst]), %[r8]| %[r8], [%[dst] + 40]}\n\t" + "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t" + + ".Lb2%=:\n\t" + "not %[r10]\n\t" + "mulx{q -8(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src] - 8]}\n\t" + "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + "adcx{q 48(%[dst]), %[r10]| %[r10], [%[dst] + 48]}\n\t" + "not %[r8]\n\t" + "mov{q %[r10], 48(%[dst])| [%[dst] + 48], %[r10]}\n\t" + + "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" + + "jrcxz .Lloop_out%=\n\t" + "jmp .Lloop%=\n\t" + ".Lloop_out%=:\n\t" + + "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + "adox{q %[rcx], %[r9]| %[r9], %[rcx]}\n\t" + "sbb{q $-1, %[r9]| %[r9], -1}\n\t" + + ".Ldone%=:" + + : [dst] "+&r"(dst), [src] "+&r"(src), [rcx] "+&c"(rcx), [r8] "=&r"(r8), + [r9] "=&r"(r9), [r10] "+&r"(r10), [r11] "=&r"(r11) + : "d"(rdx) + : "cc", "memory"); + + WJR_ASSUME(rcx == 0); + + return r9; +} + +#endif + #if WJR_HAS_BUILTIN(ASM_BASECASE_MUL_S) == 1 void __wjr_asm_basecase_mul_s_impl(uint64_t *dst, const uint64_t *src0, size_t rdx, diff --git a/tests/units/src/details.hpp b/tests/units/src/details.hpp index 5eaea7d3..a25b5f3a 100644 --- a/tests/units/src/details.hpp +++ b/tests/units/src/details.hpp @@ -1,3 +1,6 @@ +#ifndef WJR_UNITS_DETAILS_HPP__ +#define WJR_UNITS_DETAILS_HPP__ + #include #include #include @@ -7,4 +10,6 @@ static std::mt19937_64 __mt_rand(time(0)); static auto mt_rand = std::ref(__mt_rand); -#define WJR_TESTS_NOT_OPT(x) (*(volatile decltype(x) *)&(x)) \ No newline at end of file +#define WJR_TESTS_NOT_OPT(x) (*(volatile decltype(x) *)&(x)) + +#endif // WJR_UNITS_DETAILS_HPP__ \ No newline at end of file diff --git a/tests/units/src/math.cpp b/tests/units/src/math.cpp index 5ba3c714..8c8f1555 100644 --- a/tests/units/src/math.cpp +++ b/tests/units/src/math.cpp @@ -25,7 +25,7 @@ TEST(math, popcount_ctz_clz) { auto ctz_ans = popcount((type)(lowbit(n) - 1)); \ WJR_ASSERT((x == 0 ? std::numeric_limits<type>::digits : fallback_ctz(x)) == \ ctz_ans) \ - WJR_PP_BOOL_IF( \ + WJR_PP_BOOL_IF_NE( \ WJR_HAS_BUILTIN(CTZ), ; \ do { WJR_ASSERT((countr_zero(x) == ctz_ans)); } while (0), ); #define WJR_TEST_CLZ_I(type, x, ans) \ @@ -52,7 +52,7 @@ TEST(math, popcount_ctz_clz) { }(); \ WJR_ASSERT((x == 0 ? std::numeric_limits<type>::digits : fallback_clz(x)) == \ clz_ans) \ - WJR_PP_BOOL_IF( \ + WJR_PP_BOOL_IF_NE( \ WJR_HAS_BUILTIN(CTZ), ; \ do { WJR_ASSERT((countl_zero(x) == clz_ans)); } while (0), ); @@ -170,7 +170,7 @@ TEST(math, addc) { WJR_ASSERT((builtin_addc(x, y, ci, co) == ans && co == ans_co)); \ } while (0), \ {}); \ - WJR_PP_BOOL_IF( \ + WJR_PP_BOOL_IF_NE( \ WJR_HAS_BUILTIN(ASM_ADDC), do { \ if constexpr (std::is_same_v<type, uint64_t>) { \ WJR_ASSERT((asm_addc(x, y, ci, co) == ans && co == ans_co)); \ @@ -357,7 +357,7 @@ TEST(math, sub) { WJR_HAS_BUILTIN(SUBC), ; do { \ WJR_ASSERT((builtin_subc(x, y, ci, co) == ans && co == ans_co)); \ } while (0), ) \ - WJR_PP_BOOL_IF( \ + WJR_PP_BOOL_IF_NE( \ WJR_HAS_BUILTIN(ASM_SUBC), ; do { \ if constexpr (std::is_same_v<type, uint64_t>) { \ WJR_ASSERT((asm_subc(x, y, ci, co) == ans && co == ans_co)); \
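
For reference, the three kernels moved into src/wjr/x86/math/mul.cpp implement the classic GMP-style single-limb primitives: mul_1 (dst = src * x), addmul_1 (dst += src * x) and submul_1 (dst -= src * x), each returning the limb that falls out at the top. A minimal portable sketch of the semantics the asm must match (function names are illustrative, not library API; assumes a compiler with unsigned __int128):

    #include <cstddef>
    #include <cstdint>

    // dst[0..n) = src[0..n) * x; returns the high limb of the product.
    uint64_t ref_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t x) {
        uint64_t carry = 0;
        for (size_t i = 0; i < n; ++i) {
            const unsigned __int128 p = (unsigned __int128)src[i] * x + carry;
            dst[i] = (uint64_t)p;
            carry = (uint64_t)(p >> 64);
        }
        return carry;
    }

    // dst[0..n) += src[0..n) * x; returns the carry-out limb.
    uint64_t ref_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t x) {
        uint64_t carry = 0;
        for (size_t i = 0; i < n; ++i) {
            const unsigned __int128 p = (unsigned __int128)src[i] * x + dst[i] + carry;
            dst[i] = (uint64_t)p;
            carry = (uint64_t)(p >> 64);
        }
        return carry;
    }

    // dst[0..n) -= src[0..n) * x; returns the borrow-out limb.
    uint64_t ref_submul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t x) {
        uint64_t borrow = 0;
        for (size_t i = 0; i < n; ++i) {
            const unsigned __int128 p = (unsigned __int128)src[i] * x + borrow;
            const uint64_t lo = (uint64_t)p;
            borrow = (uint64_t)(p >> 64) + (dst[i] < lo);
            dst[i] -= lo;
        }
        return borrow;
    }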
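
x86 provides adcx/adox to run two independent carry chains through the mulx products, but no subtracting counterpart, so __wjr_asm_submul_1 rewrites the subtraction as addition of the one's complement: dst - m = dst + ~m + 1. The `stc` seeds CF = 1 for the "+ 1", each low product limb is inverted with `not` before being adcx-ed into dst, and from then on CF carries the inverted borrow from limb to limb; the closing `sbb $-1, r9` folds that inverted carry back into the returned borrow limb. A scalar model of the same chain (illustrative only, not the library's code):

    #include <cstddef>
    #include <cstdint>

    uint64_t submul_1_complement(uint64_t *dst, const uint64_t *src, size_t n,
                                 uint64_t x) {
        unsigned carry = 1; // stc: CF starts at 1, supplying the "+ 1"
        uint64_t hi = 0;    // high-product chain (the adox chain in the asm)
        for (size_t i = 0; i < n; ++i) {
            const unsigned __int128 p = (unsigned __int128)src[i] * x + hi;
            const uint64_t lo = (uint64_t)p;
            hi = (uint64_t)(p >> 64);
            // dst + ~lo + CF equals dst - lo - (1 - CF) mod 2^64, and the
            // carry-out is the inverted borrow, exactly what adcx chains.
            const unsigned __int128 s =
                (unsigned __int128)dst[i] + (uint64_t)~lo + carry;
            dst[i] = (uint64_t)s;
            carry = (unsigned)(s >> 64);
        }
        return hi + 1 - carry; // matches the final "sbb $-1, r9": the borrow-out
    }

For any input this returns the same value and leaves the same dst as ref_submul_1 above; the asm simply keeps the two chains in CF and OF so the loop body has no flag-clobbering fixup instructions.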
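
All three kernels also share one dispatch shape: rcx = n / 8 counts full 8-limb rounds, n % 8 indexes the `.long .LlK%= - .Llookup%=` offset table, and the indirect jmp enters the 8-way unrolled loop at the matching residue, so no separate remainder loop runs. In portable C++ the equivalent control flow is a Duff's-device switch that falls through into the unrolled body (a sketch under the kernels' own precondition n >= 1; the asm additionally splits the first round out of the loop, which this model does not):

    #include <cstddef>
    #include <cstdint>

    uint64_t mul_1_duff(uint64_t *dst, const uint64_t *src, size_t n, uint64_t x) {
        uint64_t carry = 0;
        size_t i = 0;
        auto step = [&] {
            const unsigned __int128 p = (unsigned __int128)src[i] * x + carry;
            dst[i] = (uint64_t)p;
            carry = (uint64_t)(p >> 64);
            ++i;
        };
        size_t iters = (n + 7) / 8; // rounds remaining after the entry jump
        switch (n % 8) {            // the .Llookup jump table
        case 0: do { step();
        case 7:      step();
        case 6:      step();
        case 5:      step();
        case 4:      step();
        case 3:      step();
        case 2:      step();
        case 1:      step();
                } while (--iters != 0);
        }
        return carry;
    }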
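
On the division side, div_qr_1_shift and div_qr_2_shift handle an unnormalized divisor without materializing a shifted copy of src: with shift = clz(divisor), every step hands div.divide() the next 64 normalized numerator bits assembled on the fly as shld(rbp, lo, shift), and the remainder is de-normalized at the end with hi >> shift (shrd for the two-limb case). Assuming 0 < shift < 64, the funnel-shift helpers amount to (a sketch; the argument order matches the call sites above):

    #include <cstdint>

    // Top 64 bits of (hi:lo) << c: hi shifted left, refilled from lo.
    inline uint64_t shld(uint64_t hi, uint64_t lo, unsigned c) {
        return (hi << c) | (lo >> (64 - c));
    }

    // Low 64 bits of (hi:lo) >> c: the mirror image, used to un-normalize
    // the remainder, e.g. rem[0] = shrd(u1, u2, shift).
    inline uint64_t shrd(uint64_t lo, uint64_t hi, unsigned c) {
        return (lo >> c) | (hi << (64 - c));
    }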
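
The div2by1_divider object carries the divisor, the shift, and a precomputed value; for a normalized d that value is v = floor((2^128 - 1) / d) - 2^64, which lets divide() replace each 128-by-64 hardware division with one widening multiply plus small corrections. This presumably corresponds to the 2-by-1 step of Moller and Granlund, "Improved division by invariant integers" (IEEE Trans. Computers, 2011), Algorithm 4; a sketch of that algorithm, not the library's exact member function:

    #include <cstdint>

    // One 2-by-1 division step with a precomputed reciprocal: returns q and
    // sets r such that (u1:u0) = q * d + r, for d with its top bit set,
    // v = floor((2^128 - 1) / d) - 2^64, and u1 < d.
    inline uint64_t div2by1(uint64_t &r, uint64_t u1, uint64_t u0, uint64_t d,
                            uint64_t v) {
        unsigned __int128 q = (unsigned __int128)v * u1;
        q += ((unsigned __int128)u1 << 64) | u0;

        uint64_t q1 = (uint64_t)(q >> 64) + 1; // candidate quotient
        const uint64_t q0 = (uint64_t)q;
        uint64_t r0 = u0 - q1 * d;             // candidate remainder (mod 2^64)

        if (r0 > q0) { // quotient one too large
            --q1;
            r0 += d;
        }
        if (r0 >= d) { // rare second adjustment
            ++q1;
            r0 -= d;
        }
        r = r0;
        return q1;
    }

div_qr_1_shift then performs one such step per limb on the shld-assembled numerator, and div3by2_divider applies the same reciprocal idea to a two-limb divisor for the div_qr_2 paths.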