From 33af29bc3c2ba4b3c01ab025011ff36d78cb0728 Mon Sep 17 00:00:00 2001
From: wjr <1966336874@qq.com>
Date: Fri, 19 Jan 2024 20:01:39 +0800
Subject: [PATCH] fix

---
 include/wjr/math/mul.hpp        | 127 +++++++++------
 include/wjr/stack_allocator.hpp |  10 +-
 include/wjr/x86/mul.hpp         | 277 ++++++++++++++++++++------------
 3 files changed, 259 insertions(+), 155 deletions(-)

diff --git a/include/wjr/math/mul.hpp b/include/wjr/math/mul.hpp
index c7de1fa1..7a76225d 100644
--- a/include/wjr/math/mul.hpp
+++ b/include/wjr/math/mul.hpp
@@ -286,15 +286,27 @@ WJR_INTRINSIC_INLINE void __rec_mul_n(T *dst, const T *src0, size_t n, const T *
 template <typename T>
 void basecase_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m);
 
+// l = ceil(n / 2)
+// stk usage : 2 * l
+// recursive stk max usage : 4 * l + 128
 template <typename T>
 void toom22_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *stk);
 
+// l = max(ceil(n / 3), ceil(m / 2))
+// stk usage : 4 * l
+// recursive stk max usage : 6 * l + 128
 template <typename T>
 void toom32_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *stk);
 
+// l = max(ceil(n / 4), ceil(m / 2))
+// stk usage : 6 * l + 3
+// recursive stk max usage : 8 * l + 131
 template <typename T>
 void toom42_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *stk);
 
+// l = ceil(n / 3)
+// stk usage : 6 * l + 3
+// recursive stk max usage : 9 * l + 576
 template <typename T>
 void toom33_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *stk);
 
@@ -323,7 +335,7 @@ void mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m) {
     }
 
     if (m < toom33_mul_threshold) {
-        unique_stack_ptr ptr(math_details::stack_alloc, sizeof(T) * (2 * n + 128));
+        unique_stack_ptr ptr(math_details::stack_alloc, sizeof(T) * (6 * m + 131));
         T *stk = static_cast<T *>(ptr.get());
         if (n >= 3 * m) {
             unique_stack_ptr tmpp(math_details::stack_alloc, sizeof(T) * (4 * m));
@@ -331,7 +343,7 @@ void mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m) {
 
             toom42_mul_s(dst, src0, 2 * m, src1, m, stk);
             n -= 2 * m;
-            src1 += 2 * m;
+            src0 += 2 * m;
             dst += 2 * m;
 
             T cf = 0;
@@ -339,7 +351,7 @@ void mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m) {
             while (n >= 3 * m) {
                 toom42_mul_s(tmp, src0, 2 * m, src1, m, stk);
                 n -= 2 * m;
-                src1 += 2 * m;
+                src0 += 2 * m;
 
                 cf = addc_n(dst, dst, tmp, m, cf);
                 std::copy(tmp + m, tmp + 3 * m, dst + m);
@@ -373,7 +385,7 @@ void mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m) {
         return;
     }
 
-    unique_stack_ptr ptr(math_details::stack_alloc, sizeof(T) * (3 * n + 256));
+    unique_stack_ptr ptr(math_details::stack_alloc, sizeof(T) * (9 * m + 576));
     T *stk = static_cast<T *>(ptr.get());
     if (n >= 3 * m) {
         unique_stack_ptr tmpp(math_details::stack_alloc, sizeof(T) * (4 * m));
@@ -381,7 +393,7 @@ void mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m) {
 
         toom42_mul_s(dst, src0, 2 * m, src1, m, stk);
         n -= 2 * m;
-        src1 += 2 * m;
+        src0 += 2 * m;
         dst += 2 * m;
 
         T cf = 0;
@@ -389,7 +401,7 @@ void mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m) {
         while (n >= 3 * m) {
             toom42_mul_s(tmp, src0, 2 * m, src1, m, stk);
             n -= 2 * m;
-            src1 += 2 * m;
+            src0 += 2 * m;
 
             cf = addc_n(dst, dst, tmp, m, cf);
             std::copy(tmp + m, tmp + 3 * m, dst + m);
@@ -552,7 +564,7 @@ WJR_INTRINSIC_INLINE void __rec_mul_n(T *dst, const T *src0, const T *src1, size
         return basecase_mul_s(dst, src0, n, src1, n);
     }
 
-    if (n < toom33_mul_threshold) {
+    if (mode <= __rec_mul_mode::toom22 || n < toom33_mul_threshold) {
         toom22_mul_s(dst, src0, n, src1, n, stk);
         return;
     }
@@ -645,12 +657,17 @@ void toom22_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *s
 template <typename T>
 void toom32_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *stk) {
-    WJR_ASSERT(3 * m <= 2 * n);
-    WJR_ASSERT(3 * m > n);
+    WJR_ASSERT(m + 2 <= n);
+    WJR_ASSERT(n + 6 <= 3 * m);
 
-    const size_t l = (n + 2) / 3;
+    const size_t l = (2 * n >= 3 * m ? (n + 2) / 3 : (m + 1) / 2);
     const size_t rn = n - l * 2;
     const size_t rm = m - l;
+    const size_t maxr = std::max(rn, rm);
+
+    WJR_ASSERT(0 < rn && rn <= l);
+    WJR_ASSERT(0 < rm && rm <= l);
+    WJR_ASSERT(rn + rm >= l);
 
     const auto u0p = src0;
     const auto u1p = src0 + l;
     const auto u2p = src0 + l * 2;
@@ -661,10 +678,10 @@ void toom32_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *s
 
     auto w0p = dst;
     auto w1p = stk;
-    auto w2p = stk + (2 * l + 1);
+    auto w2p = stk + (2 * l);
     auto w3p = dst + l * 3;
 
-    stk += 2 * (2 * l + 1);
+    stk += 4 * l;
 
     T cf0 = 0, cf1 = 0, cf2 = 0, cf3 = 0;
     bool neg0 = 0, neg3 = 0;
@@ -690,7 +707,6 @@ void toom32_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *s
     if (WJR_UNLIKELY(cf3 != 0)) {
         cf1 += addc_n(w1p + l, w1p + l, w2p, l, 0u);
     }
-    w1p[l * 2] = cf1;
 
     // W0 = W0 - U1 : u(-1)
     if (cf0) {
@@ -714,48 +730,55 @@ void toom32_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *s
     if (WJR_UNLIKELY(cf0 != 0)) {
         cf2 += addc_n(w2p + l, w2p + l, w3p, l, 0u);
     }
-    w2p[l * 2] = cf2;
 
     // W0 = U0 * V0 : (non-negative) r(0) = r0
     __rec_mul_n<__rec_mul_mode::toom22>(w0p, u0p, v0p, l, stk);
 
     // W3 = U2 * V1 : (non-negative) r(inf) = r3
-    __rec_mul_s<__rec_mul_mode::toom22>(w3p, u2p, rn, v1p, rm, stk);
+    if (maxr == rn) {
+        __rec_mul_s<__rec_mul_mode::toom22>(w3p, u2p, rn, v1p, rm, stk);
+    } else {
+        __rec_mul_s<__rec_mul_mode::toom22>(w3p, v1p, rm, u2p, rn, stk);
+    }
 
     // W1 = (W1 - W2) >> 1 : (non-negative) (r(1) - r(-1)) / 2
     {
         if (!neg0) {
-            cf1 = subc_n(w1p, w1p, w2p, l * 2 + 1, 0u);
+            cf1 = cf1 - cf2 - subc_n(w1p, w1p, w2p, l * 2, 0u);
         } else {
-            cf1 = addc_n(w1p, w1p, w2p, l * 2 + 1, 0u);
+            cf1 = cf1 + cf2 + addc_n(w1p, w1p, w2p, l * 2, 0u);
         }
 
-        WJR_ASSERT(cf1 == 0);
-        rshift_n(w1p, w1p, l * 2 + 1, 1u);
+        rshift_n(w1p, w1p, l * 2, 1u);
+        if (cf1 & 1) {
+            w1p[l * 2 - 1] |= 1ull << 63;
+        }
+        cf1 >>= 1;
     }
 
     // W2 = (W1 + W2) - W0 : (non-negative) r2
     {
         if (!neg0) {
-            cf2 = addc_n(w2p, w1p, w2p, l * 2 + 1, 0u);
+            cf2 = cf1 + cf2 + addc_n(w2p, w1p, w2p, l * 2, 0u);
         } else {
-            cf2 = subc_n(w2p, w1p, w2p, l * 2 + 1, 0u);
+            cf2 = cf1 - cf2 - subc_n(w2p, w1p, w2p, l * 2, 0u);
        }
 
-        WJR_ASSERT(cf2 == 0);
-        cf2 -= subc_s(w2p, w2p, l * 2 + 1, w0p, l * 2, 0u);
-        WJR_ASSERT(cf2 == 0);
+        cf2 -= subc_n(w2p, w2p, w0p, l * 2, 0u);
+        if (l != maxr) {
+            WJR_ASSERT(cf2 == 0);
+            cf2 = w2p[l + maxr];
+        }
     }
 
     // W1 = W1 - W3 : (non-negative) r1
-    cf1 = subc_s(w1p, w1p, l * 2 + 1, w3p, rn + rm, 0u);
-    WJR_ASSERT(cf1 == 0);
-    cf1 = w1p[l * 2];
+    cf1 -= subc_s(w1p, w1p, l * 2, w3p, rn + rm, 0u);
 
     // W = W3*x^3+W2*x^2+W1*x+W0
     cf0 = addc_n(w0p + l, w0p + l, w1p, l, 0u);
     cf0 = addc_n(dst + l * 2, w1p + l, w2p, l, cf0);
-    cf0 = addc_s(w3p, w3p, rn + rm, w2p + l, rn + 1, cf0);
+    cf0 = addc_n(w3p, w3p, w2p + l, maxr, cf0);
+    cf0 = addc_1(w3p + maxr, w3p + maxr, (rn + rm) - maxr, cf2, cf0);
     WJR_ASSERT(cf0 == 0);
     cf0 = addc_1(w3p, w3p, rn + rm, cf1, 0u);
     WJR_ASSERT(cf0 == 0);
@@ -766,8 +789,9 @@ template <typename T, std::enable_if_t<is_nonbool_unsigned_integral_v<T>, int> = 0>
 WJR_CONSTEXPR_E void divexact_by3(T *dst, const T *src, size_t n);
 
 template <typename T>
-void toom_interpolation_5p_s(T *dst, T *w1p, size_t l, size_t rn, size_t rm,
-                             bool neg2) {
+void toom_interpolation_5p_s(T *dst, T *w1p, size_t l, size_t rn, size_t rm, bool neg2) {
+    const size_t maxr = std::max(rn, rm);
+
     auto w0p = dst;
     auto w2p = w1p + (2 * l + 1);
     auto w3p = w1p + (2 * l + 1) * 2;
@@ -808,16 +832,12 @@ void toom_interpolation_5p_s(T *dst, T *w1p, size_t l, size_t rn, size_t rm,
 
         cf3 = subc_n(w3p, w3p, w2p, l * 2 + 1, 0u);
         WJR_ASSERT(cf3 == 0);
 
-        if (rn != l) {
-            WJR_ASSERT(w3p[l * 2] == 0);
-        }
-
-        (void)rshift_n(w3p, w3p, l + rn + 1, 1u);
+        (void)rshift_n(w3p, w3p, l + maxr + 1, 1u);
 
         T cf5 = lshift_n(dst + l * 2, w4p, rn + rm, 1u);
         cf3 = subc_n(w3p, w3p, dst + l * 2, rn + rm, 0u);
-        cf3 = subc_1(w3p + rn + rm, w3p + rn + rm, (l + rn + 1) - (rn + rm), cf5, cf3);
+        cf3 = subc_1(w3p + rn + rm, w3p + rn + rm, (l + maxr + 1) - (rn + rm), cf5, cf3);
         WJR_ASSERT(cf3 == 0);
     }
 
@@ -826,7 +846,7 @@ void toom_interpolation_5p_s(T *dst, T *w1p, size_t l, size_t rn, size_t rm,
     WJR_ASSERT(cf2 == 0);
 
     // W3 = W4 * x + W3 : r4 * x + r3
-    cf3 = addc_s(w4p, w4p, rn + rm, w3p + l, rn + 1, 0u);
+    cf3 = addc_s(w4p, w4p, rn + rm, w3p + l, maxr + 1, 0u);
 
     // W1 = W2 * x + W1 :
     cf2 = addc_s(w2p, w2p, l * 2, w1p + l, l + 1, 0u);
@@ -836,8 +856,8 @@ void toom_interpolation_5p_s(T *dst, T *w1p, size_t l, size_t rn, size_t rm,
     cf1 = cf3 + subc_n(dst + l * 2, w2p, w4p, rn + rm, cf1);
     cf2 += w2p[l * 2];
     if (l * 2 != rn + rm) {
-        cf1 = cf2 -
-              subc_1(dst + l * 2 + rn + rm, w2p + rn + rm, (l * 2) - (rn + rm), cf1, 0u);
+        cf1 = cf2 - subc_1(dst + (l * 2) + (rn + rm), w2p + (rn + rm),
+                           (l * 2) - (rn + rm), cf1, 0u);
     } else {
         cf1 = cf2 - cf1;
     }
@@ -853,15 +873,16 @@ template <typename T>
 void toom42_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *stk) {
-    WJR_ASSERT(2 * m <= n);
-    WJR_ASSERT(4 * m > n);
-    WJR_ASSUME(n >= m);
 
-    const size_t l = (n + 3) / 4;
+    const size_t l = (n >= 2 * m ? (n + 3) / 4 : (m + 1) / 2);
     const size_t rn = n - l * 3;
     const size_t rm = m - l;
-    WJR_ASSERT(rm <= l);
+    const size_t maxr = std::max(rn, rm);
+
+    WJR_ASSERT(0 < rn && rn <= l);
+    WJR_ASSERT(0 < rm && rm <= l);
+    WJR_ASSERT(rn + rm >= l);
 
     const auto u0p = src0;
     const auto u1p = src0 + l;
     const auto u2p = src0 + l * 2;
     const auto u3p = src0 + l * 3;
 
     const auto v0p = src1;
@@ -898,7 +919,7 @@ void toom42_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *s
         } else {
             WJR_ASSERT(cf1 == 1);
             neg3 = 1;
-            cf3 -= -subc_n(w3p, w1p, w0p, l, 0u);
+            cf3 -= subc_n(w3p, w1p, w0p, l, 0u);
         }
     } else {
         ptrdiff_t p = abs_subc_n(w3p, w0p, w1p, l);
@@ -942,15 +963,17 @@ void toom42_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *s
 
     // W0 = U0 +(U1 +(U2 +U3<<1)<<1)<<1 : (non-negative) u(2)
     {
-        cf0 = lshift_n(w0p, u3p, rn, 1u);
-        cf0 += addc_n(w0p, w0p, u2p, rn, 0u);
+        cf3 = lshift_n(w0p, u3p, rn, 1u);
+        cf0 = addc_s(w0p, u2p, l, w0p, rn, 0u);
         if (l != rn) {
-            cf0 = addc_1(w0p + rn, w0p + rn, l - rn, cf0, 0u);
+            cf0 += addc_1(w0p + rn, w0p + rn, l - rn, cf3, 0u);
+        } else {
+            cf0 += cf3;
         }
 
         cf0 += cf0 + lshift_n(w0p, w0p, l, 1u);
         cf0 += addc_n(w0p, w0p, u1p, l, 0u);
-        cf0 += cf0 + lshift_n(w0p, w0p, n, 1u);
+        cf0 += cf0 + lshift_n(w0p, w0p, l, 1u);
         cf0 += addc_n(w0p, w0p, u0p, l, 0u);
         WJR_ASSERT(cf0 <= 14);
     }
@@ -982,7 +1005,11 @@ void toom42_mul_s(T *dst, const T *src0, size_t n, const T *src1, size_t m, T *s
     __rec_mul_n<__rec_mul_mode::toom22>(w0p, u0p, v0p, l, stk);
 
     // W4 = U3 * V1 : (non-negative) r(inf) = r4
-    __rec_mul_s<__rec_mul_mode::toom22>(w4p, u3p, rn, v1p, rm, stk);
+    if (maxr == rn) {
+        __rec_mul_s<__rec_mul_mode::toom22>(w4p, u3p, rn, v1p, rm, stk);
+    } else {
+        __rec_mul_s<__rec_mul_mode::toom22>(w4p, v1p, rm, u3p, rn, stk);
+    }
 
     return toom_interpolation_5p_s(dst, w1p, l, rn, rm, neg2);
 }
diff --git a/include/wjr/stack_allocator.hpp b/include/wjr/stack_allocator.hpp
index dbf77af5..1d99fa56 100644
--- a/include/wjr/stack_allocator.hpp
+++ b/include/wjr/stack_allocator.hpp
@@ -178,6 +178,8 @@ class stack_allocator {
     static thread_local alloc __alloc;
 
 public:
+    using pointer = void *;
+
     stack_allocator() = default;
     stack_allocator(const stack_allocator &) = default;
     stack_allocator &operator=(const stack_allocator &) = default;
@@ -203,14 +205,18 @@ thread_local typename stack_allocator
 
 template <typename StackAllocator>
 class unique_stack_ptr {
+    using pointer = typename StackAllocator::pointer;
+
 public:
     WJR_INTRINSIC_CONSTEXPR20 unique_stack_ptr(const StackAllocator &al, size_t size)
-        : pair(al, Malloc{al.allocate(size), size}) {}
+        : pair(al, Malloc{nullptr, size}) {
+        pair.second().ptr = static_cast<pointer>(al.allocate(size));
+    }
 
     WJR_INTRINSIC_CONSTEXPR20 ~unique_stack_ptr() {
         auto &al = pair.first();
         auto &mlo = pair.second();
-        al.deallocate(mlo.ptr, mlo.size);
+        al.deallocate(static_cast<pointer>(mlo.ptr), mlo.size);
     }
 
     unique_stack_ptr(const unique_stack_ptr &) = delete;
diff --git a/include/wjr/x86/mul.hpp b/include/wjr/x86/mul.hpp
index 033887a7..f3a7975c 100644
--- a/include/wjr/x86/mul.hpp
+++ b/include/wjr/x86/mul.hpp
@@ -93,6 +93,7 @@ WJR_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint
         ".long .Ll7%=-.Lasm_mul_1_lookup%=\n\t"
         ".align 16\n\t"
 
+        // UB
         ".Ld0%=:\n\t"
         "xor %k[r9], %k[r9]\n\t"
         "jmp .Ldone%=\n\t"
@@ -256,6 +257,7 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n,
         ".long .Ll7%=-.Lasm_mul_1_lookup%=\n\t"
         ".align 16\n\t"
 
+        // UB
         ".Ld0%=:\n\t"
         "xor %k[r9], %k[r9]\n\t"
         "jmp .Ldone%=\n\t"
@@ -374,8 +376,8 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n,
         ".Lloop_out%=:\n\t"
 
         "adcx{q 
-8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" - "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" "adox %[cx], %[r9]\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" "adc %[cx], %[r9]\n\t" ".Ldone%=:" @@ -409,118 +411,187 @@ WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n, // slower than asm_addmul_1 // TODO : optimize -WJR_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src0, size_t n, - uint64_t src1) { - size_t m = n / 8; +WJR_INLINE uint64_t asm_submul_1(uint64_t *dst, const uint64_t *src, size_t n, + uint64_t dx) { + size_t cx = n / 8; n &= 7; - dst += n - 8; - src0 += n - 8; + uint64_t r8, r9, r11; - uint64_t t0 = n; - uint64_t t1; - uint64_t t2 = 0; - uint64_t t3; + auto cdst = dst; + auto csrc = src; asm volatile( - "xor %k[t1], %k[t1]\n\t" - "mov{q $127, %[t3]| %[t3], 127}\n\t" - // set CF = 0, OF = 1 - "add{b $1, %b[t3]| %b[t3], 1}\n\t" - - "lea{q| %[t3], [rip +} .Lasm_submul_1_lookup%={(%%rip), %[t3]|]}\n\t" - "movs{lq (%[t3], %[t0], 4), %[t0]|xd %[t0], DWORD PTR [%[t3] + " - "%[t0] * " - "4]}\n\t" - "lea{q (%[t3], %[t0], 1), %[t0]| %[t0], [%[t0] + %[t3]]}\n\t" - "jmp{q *%[t0]| %[t0]}\n\t" + // set CF = 1, OF = 0 + "mov{b $255, %b[r11]| %b[r11], 255}\n\t" + "add{b $1, %b[r11]| %b[r11], 1}\n\t" + + "lea{q| %[r9], [rip +} .Lasm_mul_1_lookup%={(%%rip), %[r9]|]}\n\t" + "movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + %[r10] * 4]}\n\t" + "lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t" + "jmp{q *%[r10]| %[r10]}\n\t" ".align 8\n\t" - ".Lasm_submul_1_lookup%=:\n\t" - ".long .Lcase0%=-.Lasm_submul_1_lookup%=\n\t" - ".long .Lcase1%=-.Lasm_submul_1_lookup%=\n\t" - ".long .Lcase2%=-.Lasm_submul_1_lookup%=\n\t" - ".long .Lcase3%=-.Lasm_submul_1_lookup%=\n\t" - ".long .Lcase4%=-.Lasm_submul_1_lookup%=\n\t" - ".long .Lcase5%=-.Lasm_submul_1_lookup%=\n\t" - ".long .Lcase6%=-.Lasm_submul_1_lookup%=\n\t" - ".long .Lcase7%=-.Lasm_submul_1_lookup%=\n\t" - - ".Lasm_submul_1_loop%=:\n\t" - - "lea{q 64(%[src0]), %[src0]| %[src0], [%[src0] + 64]}\n\t" + ".Lasm_mul_1_lookup%=:\n\t" + ".long .Ll0%=-.Lasm_mul_1_lookup%=\n\t" + ".long .Ll1%=-.Lasm_mul_1_lookup%=\n\t" + ".long .Ll2%=-.Lasm_mul_1_lookup%=\n\t" + ".long .Ll3%=-.Lasm_mul_1_lookup%=\n\t" + ".long .Ll4%=-.Lasm_mul_1_lookup%=\n\t" + ".long .Ll5%=-.Lasm_mul_1_lookup%=\n\t" + ".long .Ll6%=-.Lasm_mul_1_lookup%=\n\t" + ".long .Ll7%=-.Lasm_mul_1_lookup%=\n\t" + ".align 16\n\t" + + // UB + ".Ld0%=:\n\t" + "xor %k[r9], %k[r9]\n\t" + "jmp .Ldone%=\n\t" + + ".Ll0%=:\n\t" + "jrcxz .Ld0%=\n\t" + "mulx {(%[src]), %[r10], %[r11]|%[r11], %[r10], [%[src]]}\n\t" + "not %[r10]\n\t" + "jmp .Lb0%=\n\t" + + ".Ll2%=:\n\t" + "mulx {(%[src]), %[r10], %[r11]|%[r11], %[r10], [%[src]]}\n\t" + "lea{q -48(%[src]), %[src]| %[src], [%[src] - 48]}\n\t" + "not %[r10]\n\t" + "lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t" + "jmp .Lb2%=\n\t" + + ".Ll3%=:\n\t" + "mulx {(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q -40(%[src]), %[src]| %[src], [%[src] - 40]}\n\t" + "not %[r8]\n\t" + "lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t" + "jmp .Lb3%=\n\t" + + ".Ll4%=:\n\t" + "mulx {(%[src]), %[r10], %[r11]|%[r11], %[r10], [%[src]]}\n\t" + "lea{q -32(%[src]), %[src]| %[src], [%[src] - 32]}\n\t" + "not %[r10]\n\t" + "lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t" + "jmp .Lb4%=\n\t" + + ".Ll5%=:\n\t" + "mulx {(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q -24(%[src]), %[src]| %[src], [%[src] - 24]}\n\t" + "not %[r8]\n\t" + "lea{q -24(%[dst]), %[dst]| 
%[dst], [%[dst] - 24]}\n\t" + "jmp .Lb5%=\n\t" + + ".Ll6%=:\n\t" + "mulx {(%[src]), %[r10], %[r11]|%[r11], %[r10], [%[src]]}\n\t" + "lea{q -16(%[src]), %[src]| %[src], [%[src] - 16]}\n\t" + "not %[r10]\n\t" + "lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t" + "jmp .Lb6%=\n\t" + + ".Ll7%=:\n\t" + "mulx {(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "lea{q -8(%[src]), %[src]| %[src], [%[src] - 8]}\n\t" + "not %[r8]\n\t" + "lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t" + "jmp .Lb7%=\n\t" + + ".Ld1%=:\n\t" + "adc{q (%[dst]), %[r8]| [%[dst]], %[r8]}\n\t" + "sbb{q $-1, %[r9]| %[r9], -1}\n\t" + "mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t" + "jmp .Ldone%=\n\t" + + ".Ll1%=:\n\t" + "mulx {(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t" + "not %[r8]\n\t" + "jrcxz .Ld1%=\n\t" + "lea{q 8(%[src]), %[src]| %[src], [%[src] + 8]}\n\t" + "lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t" + + ".align 32\n\t" + ".Lasm_addmul_1_loop%=:\n\t" + + ".Lb1%=:\n\t" + "mulx {(%[src]), %[r10], %[r11]|%[r11], %[r10], [%[src]]}\n\t" + "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" + "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + "not %[r10]\n\t" + + ".Lb0%=:\n\t" + "mulx {8(%[src]), %[r8], %[r9]|%[r9], %[r8], [%[src] + 8]}\n\t" + "adcx{q (%[dst]), %[r10]| %[r10], [%[dst]]}\n\t" + "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + "mov{q %[r10], (%[dst])| [%[dst]], %[r10]}\n\t" + "not %[r8]\n\t" + + "lea{q -1(%[cx]), %[cx]| %[cx], [%[cx] - 1]}\n\t" + + ".Lb7%=:\n\t" + "mulx {16(%[src]), %[r10], %[r11]|%[r11], %[r10], [%[src] + 16]}\n\t" + "adcx{q 8(%[dst]), %[r8]| %[r8], [%[dst] + 8]}\n\t" + "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + "mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t" + "not %[r10]\n\t" + + ".Lb6%=:\n\t" + "mulx {24(%[src]), %[r8], %[r9]|%[r9], %[r8], [%[src] + 24]}\n\t" + "adcx{q 16(%[dst]), %[r10]| %[r10], [%[dst] + 16]}\n\t" + "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + "mov{q %[r10], 16(%[dst])| [%[dst] + 16], %[r10]}\n\t" + "not %[r8]\n\t" + + ".Lb5%=:\n\t" + "mulx {32(%[src]), %[r10], %[r11]|%[r11], %[r10], [%[src] + 32]}\n\t" + "adcx{q 24(%[dst]), %[r8]| %[r8], [%[dst] + 24]}\n\t" + "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + "mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t" + "not %[r10]\n\t" + + ".Lb4%=:\n\t" + "mulx {40(%[src]), %[r8], %[r9]|%[r9], %[r8], [%[src] + 40]}\n\t" + "adcx{q 32(%[dst]), %[r10]| %[r10], [%[dst] + 32]}\n\t" + "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + "mov{q %[r10], 32(%[dst])| [%[dst] + 32], %[r10]}\n\t" + "not %[r8]\n\t" + + ".Lb3%=:\n\t" + "mulx {48(%[src]), %[r10], %[r11]|%[r11], %[r10], [%[src] + 48]}\n\t" + "adcx{q 40(%[dst]), %[r8]| %[r8], [%[dst] + 40]}\n\t" + "adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t" + "mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t" + "not %[r10]\n\t" + + ".Lb2%=:\n\t" + "mulx {56(%[src]), %[r8], %[r9]|%[r9], %[r8], [%[src] + 56]}\n\t" + "adcx{q 48(%[dst]), %[r10]| %[r10], [%[dst] + 48]}\n\t" + "adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t" + "mov{q %[r10], 48(%[dst])| [%[dst] + 48], %[r10]}\n\t" + "not %[r8]\n\t" + + "lea{q 64(%[src]), %[src]| %[src], [%[src] + 64]}\n\t" "lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t" - "lea{q -1(%[m]), %[m]| %[m], [%[m] - 1]}\n\t" - - "mulx {(%[src0]), %[t0], %[t1]|%[t1], %[t0], [%[src0]]}\n\t" - "adcx{q %[t2], %[t0]| %[t0], %[t2]}\n\t" - "not %[t0]\n\t" - "adox{q (%[dst]), %[t0]| %[t0], [%[dst]]}\n\t" - "mov{q %[t0], (%[dst])| [%[dst]], %[t0]}\n\t" - - ".Lcase7%=:\n\t" - "mulx {8(%[src0]), %[t0], %[t2]|%[t2], 
%[t0], [%[src0] + 8]}\n\t" - "adcx{q %[t1], %[t0]| %[t0], %[t1]}\n\t" - "not %[t0]\n\t" - "adox{q 8(%[dst]), %[t0]| %[t0], [%[dst] + 8]}\n\t" - "mov{q %[t0], 8(%[dst])| [%[dst] + 8], %[t0]}\n\t" - - ".Lcase6%=:\n\t" - "mulx {16(%[src0]), %[t0], %[t1]|%[t1], %[t0], [%[src0] + 16]}\n\t" - "adcx{q %[t2], %[t0]| %[t0], %[t2]}\n\t" - "not %[t0]\n\t" - "adox{q 16(%[dst]), %[t0]| %[t0], [%[dst] + 16]}\n\t" - "mov{q %[t0], 16(%[dst])| [%[dst] + 16], %[t0]}\n\t" - - ".Lcase5%=:\n\t" - "mulx {24(%[src0]), %[t0], %[t2]|%[t2], %[t0], [%[src0] + 24]}\n\t" - "adcx{q %[t1], %[t0]| %[t0], %[t1]}\n\t" - "not %[t0]\n\t" - "adox{q 24(%[dst]), %[t0]| %[t0], [%[dst] + 24]}\n\t" - "mov{q %[t0], 24(%[dst])| [%[dst] + 24], %[t0]}\n\t" - - ".Lcase4%=:\n\t" - "mulx {32(%[src0]), %[t0], %[t1]|%[t1], %[t0], [%[src0] + 32]}\n\t" - "adcx{q %[t2], %[t0]| %[t0], %[t2]}\n\t" - "not %[t0]\n\t" - "adox{q 32(%[dst]), %[t0]| %[t0], [%[dst] + 32]}\n\t" - "mov{q %[t0], 32(%[dst])| [%[dst] + 32], %[t0]}\n\t" - - ".Lcase3%=:\n\t" - "mulx {40(%[src0]), %[t0], %[t2]|%[t2], %[t0], [%[src0] + 40]}\n\t" - "adcx{q %[t1], %[t0]| %[t0], %[t1]}\n\t" - "not %[t0]\n\t" - "adox{q 40(%[dst]), %[t0]| %[t0], [%[dst] + 40]}\n\t" - "mov{q %[t0], 40(%[dst])| [%[dst] + 40], %[t0]}\n\t" - - ".Lcase2%=:\n\t" - "mulx {48(%[src0]), %[t0], %[t1]|%[t1], %[t0], [%[src0] + 48]}\n\t" - "adcx{q %[t2], %[t0]| %[t0], %[t2]}\n\t" - "not %[t0]\n\t" - "adox{q 48(%[dst]), %[t0]| %[t0], [%[dst] + 48]}\n\t" - "mov{q %[t0], 48(%[dst])| [%[dst] + 48], %[t0]}\n\t" - - ".Lcase1%=:\n\t" - "mulx {56(%[src0]), %[t0], %[t2]|%[t2], %[t0], [%[src0] + 56]}\n\t" - "adcx{q %[t1], %[t0]| %[t0], %[t1]}\n\t" - "not %[t0]\n\t" - "adox{q 56(%[dst]), %[t0]| %[t0], [%[dst] + 56]}\n\t" - "mov{q %[t0], 56(%[dst])| [%[dst] + 56], %[t0]}\n\t" - - ".Lcase0%=:\n\t" - "jrcxz .Lasm_submul_1_loop_out%=\n\t" - "jmp .Lasm_submul_1_loop%=\n\t" - - ".Lasm_submul_1_loop_out%=:\n\t" - "seto %b[t0]\n\t" - "mov{zbl %b[t0], %k[t0]|zx %k[t0], %b[t0]}\n\t" - "adc{q $1, %[t2]| %[t2], 1}\n\t" - "sub{q %[t0], %[t2]| %[t2], %[t0]}" - : [dst] "+r"(dst), [src0] "+r"(src0), [src1] "+d"(src1), [m] "+c"(m), - [t0] "+r"(t0), [t1] "=r"(t1), [t2] "+r"(t2), [t3] "=r"(t3) + + "jrcxz .Lloop_out%=\n\t" + "jmp .Lasm_addmul_1_loop%=\n\t" + ".Lloop_out%=:\n\t" + + "adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t" + "adox{q %[cx], %[r9]| %[r9], %[cx]}\n\t" + "mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t" + "sbb{q $-1, %[r9]| %[r9], -1}\n\t" + + ".Ldone%=:" + + : [dst] "+r"(dst), [src] "+r"(src), "+d"(dx), [cx] "+c"(cx), [r8] "=r"(r8), + [r9] "=r"(r9), [r10] "+r"(n), [r11] "=r"(r11) : : "cc", "memory"); - return t2; + WJR_ASSUME(dst == cdst + n); + WJR_ASSUME(src == csrc + n); + + return r9; } #endif
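-- 
Reviewer note (appended after the signature delimiter, not part of the commit):
the rewritten asm_submul_1 above reuses the asm_addmul_1 skeleton instead of
keeping a separate subtract loop. Each product's low word is complemented
("not %[r8]" / "not %[r10]") and the CF chain is seeded with 1 by the
"mov $255 / add $1" prologue, exploiting x - y = x + ~y + 1 (mod 2^64); the
adox chain folds in the deferred high words, and the closing
"adox %[cx], %[r9]" plus "sbb $-1, %[r9]" merge the two flag chains into the
returned borrow. The sketch below models that bookkeeping in plain C++
(GCC/Clang's unsigned __int128 stands in for mulx; the helper names are mine,
not wjr APIs) and cross-checks it against a textbook submul_1:

#include <cstdint>
#include <cstdio>
#include <cstring>

using u64 = std::uint64_t;
using u128 = unsigned __int128;

// Textbook borrow propagation: dst[0..n) -= src[0..n) * v, returns borrow-out.
u64 submul_1_ref(u64 *dst, const u64 *src, std::size_t n, u64 v) {
    u64 borrow = 0;
    for (std::size_t i = 0; i < n; ++i) {
        u128 p = (u128)src[i] * v;
        u64 lo = (u64)p, hi = (u64)(p >> 64);
        u64 d = dst[i];
        u64 t = d - lo;
        u64 b1 = t > d;        // borrow from d - lo
        u64 t2 = t - borrow;
        u64 b2 = t2 > t;       // borrow from the carried-in borrow
        dst[i] = t2;
        borrow = hi + b1 + b2; // fits: hi <= 2^64 - 2
    }
    return borrow;
}

// Emulation of the patched asm: of/cf model the OF/CF chains, hi_prev the
// deferred high word (r9/r11); cf starts at 1, the "+1" of x + ~y + 1.
u64 submul_1_complement(u64 *dst, const u64 *src, std::size_t n, u64 v) {
    u64 of = 0, cf = 1, hi_prev = 0;
    for (std::size_t i = 0; i < n; ++i) {
        u128 p = (u128)src[i] * v;            // mulx
        u128 s = (u128)(u64)p + hi_prev + of; // adox: lo += prev hi + OF
        of = (u64)(s >> 64);
        u64 lo = ~(u64)s;                     // not
        u128 t = (u128)lo + dst[i] + cf;      // adcx: lo += dst[i] + CF
        dst[i] = (u64)t;
        cf = (u64)(t >> 64);
        hi_prev = (u64)(p >> 64);
    }
    return hi_prev + of + 1 - cf; // "adox %[cx], %[r9]" then "sbb $-1, %[r9]"
}

int main() {
    u64 a[3] = {5, 7, 9}, b[3] = {5, 7, 9};
    const u64 s[3] = {0x0123456789abcdefULL, ~0ULL, 42};
    u64 r0 = submul_1_complement(a, s, 3, 0x9e3779b97f4a7c15ULL);
    u64 r1 = submul_1_ref(b, s, 3, 0x9e3779b97f4a7c15ULL);
    std::printf("borrow match: %d, limbs match: %d\n", r0 == r1,
                std::memcmp(a, b, sizeof(a)) == 0);
}

On the same reading, the "recursive stk max usage" comments added in mul.hpp
follow from summing each level's "stk usage" down the recursion: toom22 uses
2 * l limbs at the top with l = ceil(n / 2) and recurses on size-l operands,
so the total is bounded by the geometric series 2l + l + l/2 + ... < 4l, with
a small constant (128) absorbing the per-level ceil rounding.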