opt pipeline of addmul_1
wjr-z committed Jan 14, 2024
1 parent 624cbe8 commit 995717f
Showing 2 changed files with 149 additions and 77 deletions.
8 changes: 8 additions & 0 deletions include/wjr/preprocessor/preview.hpp
@@ -120,4 +120,12 @@
#define WJR_ASM_PIC_JMPL(LABEL, TABLE) ".long " #LABEL "-" #TABLE
#define WJR_ASM_NOPIC_JMPL(LABEL) ".quad " #LABEL

#if defined(__linux__)
#define WJR_ASM_SECTION(...) ".section " WJR_PP_STRS(__VA_ARGS__) "\n\t"
#define WJR_ASM_PREVIOUS() ".previous\n\t"
#else
#define WJR_ASM_SECTION(...) "\n\t"
#define WJR_ASM_PREVIOUS() "\n\t"
#endif

#endif // ! WJR_PREPROCESSOR_PREVIEW_HPP__
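
For context: on Linux the new macro pair brackets a stretch of inline assembly so that data (for example, a jump table) can be emitted into a named section, after which .previous returns the assembler to the section it was in; on other platforms both macros collapse to a plain "\n\t" so the surrounding template string stays well-formed. Below is a hedged usage sketch, not taken from this commit: it assumes WJR_PP_STRS stringizes its arguments, assumes the header is reachable as <wjr/preprocessor/preview.hpp>, uses a hypothetical read_table_entry helper, and sticks to plain AT&T syntax rather than the {att|intel} dialect alternatives used elsewhere in the repository.

    #include <cstdint>
    #include <wjr/preprocessor/preview.hpp> // assumed include path

    // Illustrative only: place a 4-entry constant table in .rodata on Linux via
    // the new macros and load entry i (0 <= i < 4).  On non-Linux targets the
    // macros expand to "\n\t", the table is emitted inline in .text, and the
    // jmp simply skips over it.
    inline uint64_t read_table_entry(uint64_t i) {
        uint64_t v;
        asm("leaq .Ltab%=(%%rip), %[v]\n\t"
            "movq (%[v], %[i], 8), %[v]\n\t"
            "jmp .Lskip%=\n\t"       // harmless detour when the table is in .rodata
            WJR_ASM_SECTION(.rodata) // ".section .rodata\n\t" on Linux
            ".Ltab%=:\n\t"
            ".quad 1, 2, 4, 8\n\t"
            WJR_ASM_PREVIOUS()       // ".previous\n\t" on Linux
            ".Lskip%=:"
            : [v] "=&r"(v)
            : [i] "r"(i));
        return v;
    }
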
218 changes: 141 additions & 77 deletions include/wjr/x86/mul.hpp
@@ -67,7 +67,7 @@ WJR_INTRINSIC_INLINE T mulx(T a, T b, T &hi) {

#if WJR_HAS_BUILTIN(ASM_MUL_1)

WJR_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t ml) {
WJR_NOINLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t ml) {
size_t m = n / 8;
n &= 7;

@@ -224,105 +224,169 @@ WJR_INLINE uint64_t asm_mul_1(uint64_t *dst, const uint64_t *src, size_t n, uint64_t ml) {

#if WJR_HAS_BUILTIN(ASM_ADDMUL_1)

WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src0, size_t n,
uint64_t src1) {
WJR_INLINE uint64_t asm_addmul_1(uint64_t *dst, const uint64_t *src, size_t n,
uint64_t ml) {
size_t m = n / 8;
n &= 7;
dst += n - 8;
src0 += n - 8;

uint64_t t0 = n;
uint64_t t1;
uint64_t t2 = 0;
uint64_t t3;
uint64_t r8, r9, r10 = n, r11;

asm volatile(
"xor %k[t1], %k[t1]\n\t"
"lea{q| %[t3], [rip +} .Lasm_addmul_1_lookup%={(%%rip), %[t3]|]}\n\t"
"movs{lq (%[t3], %[t0], 4), %[t0]|xd %[t0], DWORD PTR [%[t3] + "
"%[t0] * "
"4]}\n\t"
"lea{q (%[t3], %[t0], 1), %[t0]| %[t0], [%[t0] + %[t3]]}\n\t"
"jmp{q *%[t0]| %[t0]}\n\t"
"xor %k[r8], %k[r8]\n\t"
"lea{q| %[r9], [rip +} .Lasm_mul_1_lookup%={(%%rip), %[r9]|]}\n\t"
"movs{lq (%[r9], %[r10], 4), %[r10]|xd %[r10], DWORD PTR [%[r9] + %[r10] * 4]}\n\t"
"lea{q (%[r9], %[r10], 1), %[r10]| %[r10], [%[r10] + %[r9]]}\n\t"
"jmp{q *%[r10]| %[r10]}\n\t"

".align 8\n\t"
".Lasm_addmul_1_lookup%=:\n\t"
".long .Lcase0%=-.Lasm_addmul_1_lookup%=\n\t"
".long .Lcase1%=-.Lasm_addmul_1_lookup%=\n\t"
".long .Lcase2%=-.Lasm_addmul_1_lookup%=\n\t"
".long .Lcase3%=-.Lasm_addmul_1_lookup%=\n\t"
".long .Lcase4%=-.Lasm_addmul_1_lookup%=\n\t"
".long .Lcase5%=-.Lasm_addmul_1_lookup%=\n\t"
".long .Lcase6%=-.Lasm_addmul_1_lookup%=\n\t"
".long .Lcase7%=-.Lasm_addmul_1_lookup%=\n\t"
".Lasm_mul_1_lookup%=:\n\t"
".long .Ll0%=-.Lasm_mul_1_lookup%=\n\t"
".long .Ll1%=-.Lasm_mul_1_lookup%=\n\t"
".long .Ll2%=-.Lasm_mul_1_lookup%=\n\t"
".long .Ll3%=-.Lasm_mul_1_lookup%=\n\t"
".long .Ll4%=-.Lasm_mul_1_lookup%=\n\t"
".long .Ll5%=-.Lasm_mul_1_lookup%=\n\t"
".long .Ll6%=-.Lasm_mul_1_lookup%=\n\t"
".long .Ll7%=-.Lasm_mul_1_lookup%=\n\t"
".align 16\n\t"

".Lasm_addmul_1_loop%=:\n\t"
".Ld0%=:\n\t"
"xor %k[r9], %k[r9]\n\t"
"jmp .Ldone%=\n\t"

"lea{q 64(%[src0]), %[src0]| %[src0], [%[src0] + 64]}\n\t"
"lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t"
"lea{q -1(%[m]), %[m]| %[m], [%[m] - 1]}\n\t"
".Ll0%=:\n\t"
"jrcxz .Ld0%=\n\t"
"mulx {(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t"
"jmp .Lb0%=\n\t"

"mulx {(%[src0]), %[t0], %[t1]|%[t1], %[t0], [%[src0]]}\n\t"
"adcx{q %[t2], %[t0]| %[t0], %[t2]}\n\t"
"adox{q (%[dst]), %[t0]| %[t0], [%[dst]]}\n\t"
"mov{q %[t0], (%[dst])| [%[dst]], %[t0]}\n\t"
".Ld1%=:\n\t"
"add{q (%[dst]), %[r8]| [%[dst]], %[r8]}\n\t"
"adc{q $0, %[r9]| %[r9], 0}\n\t"
"mov{q %[r8], (%[dst])| [%[dst]], %[r8]}\n\t"
"jmp .Ldone%=\n\t"

".Lcase7%=:\n\t"
"mulx {8(%[src0]), %[t0], %[t2]|%[t2], %[t0], [%[src0] + 8]}\n\t"
"adcx{q %[t1], %[t0]| %[t0], %[t1]}\n\t"
"adox{q 8(%[dst]), %[t0]| %[t0], [%[dst] + 8]}\n\t"
"mov{q %[t0], 8(%[dst])| [%[dst] + 8], %[t0]}\n\t"
".Ll1%=:\n\t"
"mulx {(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t"
"jrcxz .Ld1%=\n\t"
"lea{q 8(%[src]), %[src]| %[src], [%[src] + 8]}\n\t"
"lea{q 8(%[dst]), %[dst]| %[dst], [%[dst] + 8]}\n\t"
"jmp .Lb1%=\n\t"

".Lcase6%=:\n\t"
"mulx {16(%[src0]), %[t0], %[t1]|%[t1], %[t0], [%[src0] + 16]}\n\t"
"adcx{q %[t2], %[t0]| %[t0], %[t2]}\n\t"
"adox{q 16(%[dst]), %[t0]| %[t0], [%[dst] + 16]}\n\t"
"mov{q %[t0], 16(%[dst])| [%[dst] + 16], %[t0]}\n\t"
".Ll2%=:\n\t"
"mulx {(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t"
"lea{q 1(%[m]), %[m]| %[m], [%[m] + 1]}\n\t"
"lea{q -48(%[src]), %[src]| %[src], [%[src] - 48]}\n\t"
"lea{q -48(%[dst]), %[dst]| %[dst], [%[dst] - 48]}\n\t"
"jmp .Lb2%=\n\t"

".Lcase5%=:\n\t"
"mulx {24(%[src0]), %[t0], %[t2]|%[t2], %[t0], [%[src0] + 24]}\n\t"
"adcx{q %[t1], %[t0]| %[t0], %[t1]}\n\t"
"adox{q 24(%[dst]), %[t0]| %[t0], [%[dst] + 24]}\n\t"
"mov{q %[t0], 24(%[dst])| [%[dst] + 24], %[t0]}\n\t"
".Ll3%=:\n\t"
"mulx {(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t"
"lea{q 1(%[m]), %[m]| %[m], [%[m] + 1]}\n\t"
"lea{q -40(%[src]), %[src]| %[src], [%[src] - 40]}\n\t"
"lea{q -40(%[dst]), %[dst]| %[dst], [%[dst] - 40]}\n\t"
"jmp .Lb3%=\n\t"

".Lcase4%=:\n\t"
"mulx {32(%[src0]), %[t0], %[t1]|%[t1], %[t0], [%[src0] + 32]}\n\t"
"adcx{q %[t2], %[t0]| %[t0], %[t2]}\n\t"
"adox{q 32(%[dst]), %[t0]| %[t0], [%[dst] + 32]}\n\t"
"mov{q %[t0], 32(%[dst])| [%[dst] + 32], %[t0]}\n\t"
".Ll4%=:\n\t"
"mulx {(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t"
"lea{q 1(%[m]), %[m]| %[m], [%[m] + 1]}\n\t"
"lea{q -32(%[src]), %[src]| %[src], [%[src] - 32]}\n\t"
"lea{q -32(%[dst]), %[dst]| %[dst], [%[dst] - 32]}\n\t"
"jmp .Lb4%=\n\t"

".Lcase3%=:\n\t"
"mulx {40(%[src0]), %[t0], %[t2]|%[t2], %[t0], [%[src0] + 40]}\n\t"
"adcx{q %[t1], %[t0]| %[t0], %[t1]}\n\t"
"adox{q 40(%[dst]), %[t0]| %[t0], [%[dst] + 40]}\n\t"
"mov{q %[t0], 40(%[dst])| [%[dst] + 40], %[t0]}\n\t"
".Ll5%=:\n\t"
"mulx {(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t"
"lea{q 1(%[m]), %[m]| %[m], [%[m] + 1]}\n\t"
"lea{q -24(%[src]), %[src]| %[src], [%[src] - 24]}\n\t"
"lea{q -24(%[dst]), %[dst]| %[dst], [%[dst] - 24]}\n\t"
"jmp .Lb5%=\n\t"

".Lcase2%=:\n\t"
"mulx {48(%[src0]), %[t0], %[t1]|%[t1], %[t0], [%[src0] + 48]}\n\t"
"adcx{q %[t2], %[t0]| %[t0], %[t2]}\n\t"
"adox{q 48(%[dst]), %[t0]| %[t0], [%[dst] + 48]}\n\t"
"mov{q %[t0], 48(%[dst])| [%[dst] + 48], %[t0]}\n\t"
".Ll6%=:\n\t"
"mulx {(%[src]), %[r10], %[r11]| %[r11], %[r10], [%[src]]}\n\t"
"lea{q 1(%[m]), %[m]| %[m], [%[m] + 1]}\n\t"
"lea{q -16(%[src]), %[src]| %[src], [%[src] - 16]}\n\t"
"lea{q -16(%[dst]), %[dst]| %[dst], [%[dst] - 16]}\n\t"
"jmp .Lb6%=\n\t"

".Lcase1%=:\n\t"
"mulx {56(%[src0]), %[t0], %[t2]|%[t2], %[t0], [%[src0] + 56]}\n\t"
"adcx{q %[t1], %[t0]| %[t0], %[t1]}\n\t"
"adox{q 56(%[dst]), %[t0]| %[t0], [%[dst] + 56]}\n\t"
"mov{q %[t0], 56(%[dst])| [%[dst] + 56], %[t0]}\n\t"
".Ll7%=:\n\t"
"mulx {(%[src]), %[r8], %[r9]| %[r9], %[r8], [%[src]]}\n\t"
"lea{q 1(%[m]), %[m]| %[m], [%[m] + 1]}\n\t"
"lea{q -8(%[src]), %[src]| %[src], [%[src] - 8]}\n\t"
"lea{q -8(%[dst]), %[dst]| %[dst], [%[dst] - 8]}\n\t"
"jmp .Lb7%=\n\t"

".Lcase0%=:\n\t"
"jrcxz .Lasm_addmul_1_loop_out%=\n\t"
".align 32\n\t"
".Lasm_addmul_1_loop%=:\n\t"

".Lb1%=:\n\t"
"mulx {(%[src]), %[r10], %[r11]|%[r11], %[r10], [%[src]]}\n\t"
"adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t"
"adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t"
"mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t"

".Lb0%=:\n\t"
"mulx {8(%[src]), %[r8], %[r9]|%[r9], %[r8], [%[src] + 8]}\n\t"
"adcx{q (%[dst]), %[r10]| %[r10], [%[dst]]}\n\t"
"adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t"
"mov{q %[r10], (%[dst])| [%[dst]], %[r10]}\n\t"

".Lb7%=:\n\t"
"mulx {16(%[src]), %[r10], %[r11]|%[r11], %[r10], [%[src] + 16]}\n\t"
"adcx{q 8(%[dst]), %[r8]| %[r8], [%[dst] + 8]}\n\t"
"adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t"
"mov{q %[r8], 8(%[dst])| [%[dst] + 8], %[r8]}\n\t"

".Lb6%=:\n\t"
"mulx {24(%[src]), %[r8], %[r9]|%[r9], %[r8], [%[src] + 24]}\n\t"
"adcx{q 16(%[dst]), %[r10]| %[r10], [%[dst] + 16]}\n\t"
"adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t"
"mov{q %[r10], 16(%[dst])| [%[dst] + 16], %[r10]}\n\t"

".Lb5%=:\n\t"
"mulx {32(%[src]), %[r10], %[r11]|%[r11], %[r10], [%[src] + 32]}\n\t"
"adcx{q 24(%[dst]), %[r8]| %[r8], [%[dst] + 24]}\n\t"
"adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t"
"mov{q %[r8], 24(%[dst])| [%[dst] + 24], %[r8]}\n\t"

".Lb4%=:\n\t"
"mulx {40(%[src]), %[r8], %[r9]|%[r9], %[r8], [%[src] + 40]}\n\t"
"adcx{q 32(%[dst]), %[r10]| %[r10], [%[dst] + 32]}\n\t"
"adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t"
"mov{q %[r10], 32(%[dst])| [%[dst] + 32], %[r10]}\n\t"

".Lb3%=:\n\t"
"mulx {48(%[src]), %[r10], %[r11]|%[r11], %[r10], [%[src] + 48]}\n\t"
"adcx{q 40(%[dst]), %[r8]| %[r8], [%[dst] + 40]}\n\t"
"adox{q %[r9], %[r10]| %[r10], %[r9]}\n\t"
"mov{q %[r8], 40(%[dst])| [%[dst] + 40], %[r8]}\n\t"

".Lb2%=:\n\t"
"mulx {56(%[src]), %[r8], %[r9]|%[r9], %[r8], [%[src] + 56]}\n\t"
"adcx{q 48(%[dst]), %[r10]| %[r10], [%[dst] + 48]}\n\t"
"adox{q %[r11], %[r8]| %[r8], %[r11]}\n\t"
"mov{q %[r10], 48(%[dst])| [%[dst] + 48], %[r10]}\n\t"

"lea{q 64(%[src]), %[src]| %[src], [%[src] + 64]}\n\t"
"lea{q 64(%[dst]), %[dst]| %[dst], [%[dst] + 64]}\n\t"
"lea{q -1(%[m]), %[m]| %[m], [%[m] - 1]}\n\t"

"jrcxz .Lloop_out%=\n\t"
"jmp .Lasm_addmul_1_loop%=\n\t"
".Lloop_out%=:\n\t"

"adcx{q -8(%[dst]), %[r8]| %[r8], [%[dst] - 8]}\n\t"
"seto %b[r10]\n\t"
"mov{zbl %b[r10], %k[r10]|zx %k[r10], %b[r10]}\n\t"
"adcx{q %[r10], %[r9]| %[r9], %[r10]}\n\t"
"mov{q %[r8], -8(%[dst])| [%[dst] - 8], %[r8]}\n\t"

".Lasm_addmul_1_loop_out%=:\n\t"
"seto %b[t0]\n\t"
"mov{zbl %b[t0], %k[t0]|zx %k[t0], %b[t0]}\n\t"
"adc{q %[t0], %[t2]| %[t2], %[t0]}"
: [dst] "+r"(dst), [src0] "+r"(src0), [src1] "+d"(src1), [m] "+c"(m),
[t0] "+r"(t0), [t1] "=r"(t1), [t2] "+r"(t2), [t3] "=r"(t3)
".Ldone%=:"

: [dst] "+r"(dst), [src] "+r"(src), "+d"(ml), [m] "+c"(m), [r8] "=r"(r8),
[r9] "=r"(r9), [r10] "+r"(r10), [r11] "=r"(r11)
:
: "cc", "memory");

return t2;
return r9;
}

#endif
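
For readers following the assembly above, the operation asm_addmul_1 performs is dst[0..n) += src[0..n) * ml, returning the final carry limb. A portable reference sketch follows; it is not the commit's code, and it assumes a compiler providing unsigned __int128 and uses a hypothetical addmul_1_ref name.

    #include <cstddef>
    #include <cstdint>

    // Reference semantics only: multiply the n-limb number in src by the single
    // limb ml, add the product into dst in place, and return the carry-out limb.
    inline uint64_t addmul_1_ref(uint64_t *dst, const uint64_t *src, size_t n,
                                 uint64_t ml) {
        uint64_t carry = 0;
        for (size_t i = 0; i < n; ++i) {
            unsigned __int128 p = (unsigned __int128)src[i] * ml;
            uint64_t lo = (uint64_t)p + carry;                 // add incoming carry
            uint64_t hi = (uint64_t)(p >> 64) + (lo < carry);  // carry from that add
            uint64_t sum = dst[i] + lo;                        // fold into destination
            hi += (sum < lo);                                  // carry from that add
            dst[i] = sum;
            carry = hi;                                        // propagate to next limb
        }
        return carry;
    }

In the diff, the adcx chain (CF) folds the products into dst while the adox chain (OF) propagates the high halves, keeping the two carry chains independent so the unrolled mulx iterations can overlap.
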
