From 43c95d627d7b0248d093d5c3bd679d9078b5d245 Mon Sep 17 00:00:00 2001
From: Santeri Hannula
Date: Thu, 10 Oct 2024 17:47:00 +0300
Subject: [PATCH 1/3] ext: fix generated gmp x86_64 linux assembly sources

---
 ext/gmp/build.zig                             |    2 +-
 ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s     |   14 +-
 ext/gmp/gen/x86_64-linux/mpn/add_n.s          |  207 +--
 ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s      |  179 +--
 ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s      |  194 ++-
 ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s       |  300 ++--
 ext/gmp/gen/x86_64-linux/mpn/addmul_1.s       |  218 ++-
 ext/gmp/gen/x86_64-linux/mpn/addmul_2.s       |  244 ++-
 ext/gmp/gen/x86_64-linux/mpn/and_n.s          |   63 +-
 ext/gmp/gen/x86_64-linux/mpn/andn_n.s         |   61 +-
 ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s       |   77 +-
 ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s      |  160 +-
 ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s      |  165 +-
 ext/gmp/gen/x86_64-linux/mpn/com.s            |  303 +---
 ext/gmp/gen/x86_64-linux/mpn/copyd.s          |  213 +--
 ext/gmp/gen/x86_64-linux/mpn/copyi.s          |  249 +--
 ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s  |    6 +-
 ext/gmp/gen/x86_64-linux/mpn/divrem_1.s       |   15 -
 ext/gmp/gen/x86_64-linux/mpn/gcd_11.s         |  164 +-
 ext/gmp/gen/x86_64-linux/mpn/gcd_22.s         |  319 +++-
 ext/gmp/gen/x86_64-linux/mpn/hamdist.s        |  204 +--
 ext/gmp/gen/x86_64-linux/mpn/ior_n.s          |   63 +-
 ext/gmp/gen/x86_64-linux/mpn/iorn_n.s         |   61 +-
 ext/gmp/gen/x86_64-linux/mpn/lshift.s         |  237 ++-
 ext/gmp/gen/x86_64-linux/mpn/lshiftc.s        |  259 ++--
 ext/gmp/gen/x86_64-linux/mpn/mul_1.s          |  219 +--
 ext/gmp/gen/x86_64-linux/mpn/mul_2.s          |  190 ++-
 ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s   |  625 ++++----
 ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s |  641 ++++----
 .../gen/x86_64-linux/mpn/mulmid_basecase.s    |  573 +++++++
 ext/gmp/gen/x86_64-linux/mpn/nand_n.s         |   63 +-
 ext/gmp/gen/x86_64-linux/mpn/nior_n.s         |   63 +-
 ext/gmp/gen/x86_64-linux/mpn/popcount.s       |  189 +--
 ext/gmp/gen/x86_64-linux/mpn/redc_1.s         |  792 ++++++----
 ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s      |  179 +--
 ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s      |  194 ++-
 ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s       |  300 ++--
 ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s      |   83 +-
 ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s      |   83 +-
 ext/gmp/gen/x86_64-linux/mpn/rshift.s         |  251 ++-
 ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s  |  197 +--
 ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s   | 1372 ++++++++---------
 ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s     |   14 +-
 ext/gmp/gen/x86_64-linux/mpn/sub_n.s          |  207 +--
 ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s      |  171 +-
 ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s      |  190 ---
 ext/gmp/gen/x86_64-linux/mpn/submul_1.s       |  203 ++-
 ext/gmp/gen/x86_64-linux/mpn/xnor_n.s         |   61 +-
 ext/gmp/gen/x86_64-linux/mpn/xor_n.s          |   63 +-
 49 files changed, 5334 insertions(+), 5566 deletions(-)
 create mode 100644 ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s
 delete mode 100644 ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s

diff --git a/ext/gmp/build.zig b/ext/gmp/build.zig
index efa83a9895..cdfad41394 100644
--- a/ext/gmp/build.zig
+++ b/ext/gmp/build.zig
@@ -685,6 +685,7 @@ const x86_64_linux_asm_sources = [_][]const u8{
     "gen/x86_64-linux/mpn/mul_2.s",
     "gen/x86_64-linux/mpn/mul_basecase.s",
     "gen/x86_64-linux/mpn/mullo_basecase.s",
+    "gen/x86_64-linux/mpn/mulmid_basecase.s",
     "gen/x86_64-linux/mpn/nand_n.s",
     "gen/x86_64-linux/mpn/nior_n.s",
     "gen/x86_64-linux/mpn/popcount.s",
@@ -703,7 +704,6 @@ const x86_64_linux_asm_sources = [_][]const u8{
     "gen/x86_64-linux/mpn/sub_err3_n.s",
     "gen/x86_64-linux/mpn/sub_n.s",
     "gen/x86_64-linux/mpn/sublsh1_n.s",
-    "gen/x86_64-linux/mpn/sublsh2_n.s",
     "gen/x86_64-linux/mpn/submul_1.s",
     "gen/x86_64-linux/mpn/xnor_n.s",
"gen/x86_64-linux/mpn/xor_n.s", diff --git a/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s b/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s index 6c2ae338b4..2cbba6ad10 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s @@ -189,20 +189,20 @@ __gmpn_add_err1_n: .align 32, 0x90 .Lloop: - mov (%rsi,%r9,8), %r14 shr $1, %al mov -8(%r8), %r10 mov $0, %r13d + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 adc (%rdx,%r9,8), %r14 cmovnc %r13, %r10 - mov %r14, (%rdi,%r9,8) - mov 8(%rsi,%r9,8), %r15 - mov 16(%rsi,%r9,8), %r14 adc 8(%rdx,%r9,8), %r15 mov -16(%r8), %r11 + mov %r14, (%rdi,%r9,8) + mov 16(%rsi,%r9,8), %r14 + mov %r15, 8(%rdi,%r9,8) cmovnc %r13, %r11 mov -24(%r8), %r12 - mov %r15, 8(%rdi,%r9,8) adc 16(%rdx,%r9,8), %r14 cmovnc %r13, %r12 mov 24(%rsi,%r9,8), %r15 @@ -215,12 +215,12 @@ __gmpn_add_err1_n: adc $0, %rbp add %r12, %rbx adc $0, %rbp - lea -32(%r8), %r8 mov %r14, 16(%rdi,%r9,8) add %r13, %rbx + lea -32(%r8), %r8 adc $0, %rbp + mov %r15, 24(%rdi,%r9,8) add $4, %r9 - mov %r15, -8(%rdi,%r9,8) jnz .Lloop .Lend: diff --git a/ext/gmp/gen/x86_64-linux/mpn/add_n.s b/ext/gmp/gen/x86_64-linux/mpn/add_n.s index 400fe976ec..14cc32b0b9 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/add_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/add_n.s @@ -94,20 +94,18 @@ __gmpn_add_nc: - mov %ecx, %eax - shr $3, %rcx - and $7, %eax - - lea .Ltab(%rip), %r9 - neg %r8 + shr $2, %rcx + and $3, %eax + bt $0, %r8 + jrcxz .Llt4 - movslq (%r9,%rax,4), %rax - lea (%r9,%rax), %rax - jmp *%rax + mov (%rsi), %r8 + mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid .size __gmpn_add_nc,.-__gmpn_add_nc - .align 16, 0x90 .globl __gmpn_add_n .type __gmpn_add_n,@function @@ -115,159 +113,82 @@ __gmpn_add_nc: __gmpn_add_n: - mov %ecx, %eax - shr $3, %rcx - and $7, %eax - - lea .Ltab(%rip), %r9 + shr $2, %rcx + and $3, %eax + jrcxz .Llt4 - movslq (%r9,%rax,4), %rax - lea (%r9,%rax), %rax - jmp *%rax - - -.L0: mov (%rsi), %r8 + mov (%rsi), %r8 mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid + +.Llt4: dec %eax + mov (%rsi), %r8 + jnz .L2 adc (%rdx), %r8 - jmp .Le0 + mov %r8, (%rdi) + adc %eax, %eax + + ret -.L4: mov (%rsi), %r8 +.L2: dec %eax mov 8(%rsi), %r9 + jnz .L3 adc (%rdx), %r8 - lea -32(%rsi), %rsi - lea -32(%rdx), %rdx - lea -32(%rdi), %rdi - inc %rcx - jmp .Le4 - -.L5: mov (%rsi), %r11 - mov 8(%rsi), %r8 - mov 16(%rsi), %r9 - adc (%rdx), %r11 - lea -24(%rsi), %rsi - lea -24(%rdx), %rdx - lea -24(%rdi), %rdi - inc %rcx - jmp .Le5 - -.L6: mov (%rsi), %r10 - adc (%rdx), %r10 - mov 8(%rsi), %r11 - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi - inc %rcx - jmp .Le6 - -.L7: mov (%rsi), %r9 - mov 8(%rsi), %r10 - adc (%rdx), %r9 - adc 8(%rdx), %r10 - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi - inc %rcx - jmp .Le7 + adc 8(%rdx), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + adc %eax, %eax + + ret - .align 16, 0x90 -.Ltop: -.Le3: mov %r9, 40(%rdi) -.Le2: mov %r10, 48(%rdi) -.Le1: mov (%rsi), %r8 - mov 8(%rsi), %r9 +.L3: mov 16(%rsi), %r10 adc (%rdx), %r8 - mov %r11, 56(%rdi) - lea 64(%rdi), %rdi -.Le0: mov 16(%rsi), %r10 adc 8(%rdx), %r9 adc 16(%rdx), %r10 mov %r8, (%rdi) -.Le7: mov 24(%rsi), %r11 mov %r9, 8(%rdi) -.Le6: mov 32(%rsi), %r8 - mov 40(%rsi), %r9 - adc 24(%rdx), %r11 mov %r10, 16(%rdi) -.Le5: adc 32(%rdx), %r8 - mov %r11, 24(%rdi) -.Le4: mov 48(%rsi), %r10 - mov 56(%rsi), %r11 - mov %r8, 32(%rdi) - lea 64(%rsi), %rsi - adc 40(%rdx), %r9 - adc 48(%rdx), %r10 - adc 56(%rdx), %r11 - lea 64(%rdx), %rdx - dec %rcx - jnz .Ltop - -.Lend: mov %r9, 40(%rdi) - mov %r10, 
48(%rdi) - mov %r11, 56(%rdi) - mov %ecx, %eax - adc %ecx, %eax + setc %al ret .align 16, 0x90 -.L3: mov (%rsi), %r9 - mov 8(%rsi), %r10 - mov 16(%rsi), %r11 - adc (%rdx), %r9 - adc 8(%rdx), %r10 - adc 16(%rdx), %r11 - jrcxz .Lx3 - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea -40(%rdi), %rdi - jmp .Le3 -.Lx3: mov %r9, (%rdi) - mov %r10, 8(%rdi) - mov %r11, 16(%rdi) - mov %ecx, %eax - adc %ecx, %eax - - ret +.Ltop: adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + mov %r8, (%rdi) + lea 32(%rsi), %rsi + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + dec %rcx + mov %r11, 24(%rdi) + lea 32(%rdx), %rdx + mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi +.Lmid: mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + jnz .Ltop - .align 16, 0x90 -.L1: mov (%rsi), %r11 - adc (%rdx), %r11 - jrcxz .Lx1 - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea -56(%rdi), %rdi - jmp .Le1 -.Lx1: mov %r11, (%rdi) - mov %ecx, %eax - adc %ecx, %eax - - ret +.Lend: lea 32(%rsi), %rsi + adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + lea 32(%rdx), %rdx + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + mov %r11, 24(%rdi) + lea 32(%rdi), %rdi - .align 16, 0x90 -.L2: mov (%rsi), %r10 - mov 8(%rsi), %r11 - adc (%rdx), %r10 - adc 8(%rdx), %r11 - jrcxz .Lx2 - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea -48(%rdi), %rdi - jmp .Le2 -.Lx2: mov %r10, (%rdi) - mov %r11, 8(%rdi) - mov %ecx, %eax - adc %ecx, %eax + inc %eax + dec %eax + jnz .Llt4 + adc %eax, %eax ret .size __gmpn_add_n,.-__gmpn_add_n - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab diff --git a/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s b/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s index cac8dd4b70..e3d3aae6c0 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s @@ -46,15 +46,6 @@ - - - - - - - - - @@ -77,6 +68,9 @@ + + + @@ -86,29 +80,6 @@ .text - .align 16, 0x90 - .globl __gmpn_addlsh1_nc - .type __gmpn_addlsh1_nc,@function - -__gmpn_addlsh1_nc: - - - - push %rbp - mov %r8, %rax - neg %rax - xor %ebp, %ebp - mov (%rdx), %r8 - shrd $63, %r8, %rbp - mov %ecx, %r9d - and $3, %r9d - je .Lb00 - cmp $2, %r9d - jc .Lb01 - je .Lb10 - jmp .Lb11 - .size __gmpn_addlsh1_nc,.-__gmpn_addlsh1_nc - .align 16, 0x90 .globl __gmpn_addlsh1_n .type __gmpn_addlsh1_n,@function @@ -117,96 +88,92 @@ __gmpn_addlsh1_n: push %rbp - xor %ebp, %ebp + mov (%rdx), %r8 - shrd $63, %r8, %rbp mov %ecx, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + xor %ebp, %ebp and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - mov 16(%rdx), %r10 - shrd $63, %r10, %r9 - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - adc 16(%rsi), %r9 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, %rbp - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi +.Lb11: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 sbb %eax, %eax - sub $3, %rcx - ja .Ltop - jmp .Lend - -.Lb01: add %eax, %eax - adc (%rsi), %rbp - mov %rbp, (%rdi) - mov %r8, %rbp - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi + add (%rsi,%rcx,8), %r8 + adc 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + adc 16(%rsi,%rcx,8), %r10 + mov %r10, 16(%rdi,%rcx,8) + sbb %ebp, %ebp + add $3, %rcx + 
jmp .Lent + +.Lb10: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 sbb %eax, %eax - sub $1, %rcx - ja .Ltop - jmp .Lend - -.Lb10: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, %rbp - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi + add (%rsi,%rcx,8), %r8 + adc 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb %ebp, %ebp + add $2, %rcx + jmp .Lent + +.Lb01: add %r8, %r8 sbb %eax, %eax - sub $2, %rcx - ja .Ltop - jmp .Lend + add (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + sbb %ebp, %ebp + inc %rcx +.Lent: jns .Lend .align 16, 0x90 -.Ltop: mov (%rdx), %r8 - shrd $63, %r8, %rbp -.Lb00: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - mov 16(%rdx), %r10 - shrd $63, %r10, %r9 - mov 24(%rdx), %r11 - shrd $63, %r11, %r10 - lea 32(%rdx), %rdx - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - adc 16(%rsi), %r9 - adc 24(%rsi), %r10 - lea 32(%rsi), %rsi - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, 24(%rdi) - mov %r11, %rbp - lea 32(%rdi), %rdi +.Ltop: add %eax, %eax + + mov (%rdx,%rcx,8), %r8 +.Lb00: adc %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + mov 24(%rdx,%rcx,8), %r11 + adc %r11, %r11 + sbb %eax, %eax - sub $4, %rcx - jnz .Ltop + add %ebp, %ebp + + adc (%rsi,%rcx,8), %r8 + nop + adc 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + adc 16(%rsi,%rcx,8), %r10 + adc 24(%rsi,%rcx,8), %r11 + mov %r10, 16(%rdi,%rcx,8) + mov %r11, 24(%rdi,%rcx,8) + + sbb %ebp, %ebp + add $4, %rcx + js .Ltop + +.Lend: + + add %ebp, %eax + neg %eax + -.Lend: shr $63, %rbp - add %eax, %eax - adc $0, %rbp - mov %rbp, %rax pop %rbp ret .size __gmpn_addlsh1_n,.-__gmpn_addlsh1_n - diff --git a/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s b/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s index 313daa83e2..00e20905cc 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s @@ -46,10 +46,13 @@ - - - - + + + + + + + @@ -87,30 +90,11 @@ - .text - .align 16, 0x90 - .globl __gmpn_addlsh2_nc - .type __gmpn_addlsh2_nc,@function - -__gmpn_addlsh2_nc: - - push %rbp - mov %r8, %rax - neg %rax - xor %ebp, %ebp - mov (%rdx), %r8 - shrd $62, %r8, %rbp - mov %ecx, %r9d - and $3, %r9d - je .Lb00 - cmp $2, %r9d - jc .Lb01 - je .Lb10 - jmp .Lb11 - .size __gmpn_addlsh2_nc,.-__gmpn_addlsh2_nc + + .text .align 16, 0x90 .globl __gmpn_addlsh2_n .type __gmpn_addlsh2_n,@function @@ -118,96 +102,102 @@ __gmpn_addlsh2_nc: __gmpn_addlsh2_n: - push %rbp - xor %ebp, %ebp + push %r12 + push %r13 + push %r14 + push %r15 + mov (%rdx), %r8 - shrd $62, %r8, %rbp + lea (,%r8,4), %r12 + shr $62, %r8 + mov %ecx, %eax - and $3, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + and $3, %al je .Lb00 - cmp $2, %eax + cmp $2, %al jc .Lb01 je .Lb10 -.Lb11: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - mov 16(%rdx), %r10 - shrd $62, %r10, %r9 - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - adc 16(%rsi), %r9 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, %rbp - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi - sbb %eax, %eax - sub $3, %rcx - ja .Ltop +.Lb11: mov 8(%rdx,%rcx,8), %r10 + lea (%r8,%r10,4), %r14 + shr $62, %r10 + mov 16(%rdx,%rcx,8), %r11 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + add (%rsi,%rcx,8), %r12 + adc 8(%rsi,%rcx,8), %r14 + adc 16(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + 
mov %r14, 8(%rdi,%rcx,8) + mov %r15, 16(%rdi,%rcx,8) + add $3, %rcx + js .Ltop jmp .Lend -.Lb01: add %eax, %eax - adc (%rsi), %rbp - mov %rbp, (%rdi) - mov %r8, %rbp - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - sbb %eax, %eax - sub $1, %rcx - ja .Ltop +.Lb01: mov %r8, %r11 + add (%rsi,%rcx,8), %r12 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + add $1, %rcx + js .Ltop jmp .Lend -.Lb10: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, %rbp - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi - sbb %eax, %eax - sub $2, %rcx - ja .Ltop +.Lb10: mov 8(%rdx,%rcx,8), %r11 + lea (%r8,%r11,4), %r15 + shr $62, %r11 + add (%rsi,%rcx,8), %r12 + adc 8(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r15, 8(%rdi,%rcx,8) + add $2, %rcx + js .Ltop jmp .Lend +.Lb00: mov 8(%rdx,%rcx,8), %r9 + mov 16(%rdx,%rcx,8), %r10 + jmp .Le00 + .align 16, 0x90 -.Ltop: mov (%rdx), %r8 - shrd $62, %r8, %rbp -.Lb00: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - mov 16(%rdx), %r10 - shrd $62, %r10, %r9 - mov 24(%rdx), %r11 - shrd $62, %r11, %r10 - lea 32(%rdx), %rdx - add %eax, %eax - adc (%rsi), %rbp - adc 8(%rsi), %r8 - adc 16(%rsi), %r9 - adc 24(%rsi), %r10 - lea 32(%rsi), %rsi - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, 24(%rdi) - mov %r11, %rbp - lea 32(%rdi), %rdi - sbb %eax, %eax - sub $4, %rcx - jnz .Ltop - -.Lend: shr $62, %rbp - add %eax, %eax - adc $0, %rbp - mov %rbp, %rax - pop %rbp +.Ltop: mov 16(%rdx,%rcx,8), %r10 + mov (%rdx,%rcx,8), %r8 + mov 8(%rdx,%rcx,8), %r9 + lea (%r11,%r8,4), %r12 + shr $62, %r8 +.Le00: lea (%r8,%r9,4), %r13 + shr $62, %r9 + mov 24(%rdx,%rcx,8), %r11 + lea (%r9,%r10,4), %r14 + shr $62, %r10 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + add %eax, %eax + adc (%rsi,%rcx,8), %r12 + adc 8(%rsi,%rcx,8), %r13 + adc 16(%rsi,%rcx,8), %r14 + adc 24(%rsi,%rcx,8), %r15 + mov %r12, (%rdi,%rcx,8) + mov %r13, 8(%rdi,%rcx,8) + mov %r14, 16(%rdi,%rcx,8) + sbb %eax, %eax + mov %r15, 24(%rdi,%rcx,8) + add $4, %rcx + js .Ltop +.Lend: + + + sub %r11d, %eax + neg %eax + + pop %r15 + pop %r14 + pop %r13 + pop %r12 ret .size __gmpn_addlsh2_n,.-__gmpn_addlsh2_n diff --git a/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s b/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s index 00e16c8d00..2d261d5e37 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s @@ -65,32 +65,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -103,7 +78,7 @@ .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_addlsh_n .type __gmpn_addlsh_n,@function @@ -111,142 +86,143 @@ __gmpn_addlsh_n: + push %r12 + push %rbp + push %rbx + + mov (%rdx), %rax + + mov $0, %ebp + sub %rcx, %rbp + + lea -16(%rsi,%rcx,8), %rsi + lea -16(%rdi,%rcx,8), %rdi + lea 16(%rdx,%rcx,8), %r12 + + mov %rcx, %r9 + + mov %r8, %rcx + mov $1, %r8d + shl %cl, %r8 + + mul %r8 + + and $3, %r9d + jz .Lb0 + cmp $2, %r9d + jc .Lb1 + jz .Lb2 + +.Lb3: mov %rax, %r11 + add 16(%rsi,%rbp,8), %r11 + mov -8(%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov (%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $3, %rbp + jnz .Llo3 + jmp .Lcj3 + +.Lb2: mov %rax, %rbx + mov -8(%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $2, %rbp + jz .Lcj2 + mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + xor %ecx, %ecx + jmp .Llo2 + +.Lb1: mov %rax, %r9 + mov %rdx, %r10 + add $1, %rbp + jnz .Lgt1 + add 
8(%rsi,%rbp,8), %r9 + jmp .Lcj1 +.Lgt1: mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add 8(%rsi,%rbp,8), %r9 + adc 16(%rsi,%rbp,8), %r10 + adc 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + jmp .Llo1 + +.Lb0: mov %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add 16(%rsi,%rbp,8), %r10 + adc 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jz .Lend - mov (%rdx), %r10 - - mov %ecx, %eax - shr $3, %rcx - xor %r9d, %r9d - sub %r8, %r9 - and $7, %eax - - lea .Ltab(%rip), %r11 - - movslq (%r11,%rax,4), %rax - add %r11, %rax - jmp *%rax - - -.L0: lea 32(%rsi), %rsi - lea 32(%rdx), %rdx - lea 32(%rdi), %rdi - xor %r11d, %r11d - jmp .Le0 - -.L7: mov %r10, %r11 - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi - xor %r10d, %r10d - jmp .Le7 - -.L6: lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi - xor %r11d, %r11d - jmp .Le6 - -.L5: mov %r10, %r11 - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - xor %r10d, %r10d - jmp .Le5 - -.Lend: adc 24(%rsi), %rax - mov %rax, -40(%rdi) - .byte 0xc4,194,179,0xf7,195 - adc %rcx, %rax + .align 8, 0x90 +.Ltop: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Llo3: mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %r11, -8(%rdi,%rbp,8) +.Llo2: mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add %ecx, %ecx + adc (%rsi,%rbp,8), %rbx + adc 8(%rsi,%rbp,8), %r9 + adc 16(%rsi,%rbp,8), %r10 + adc 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rbx, (%rdi,%rbp,8) +.Llo1: mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov %r9, 8(%rdi,%rbp,8) +.Llo0: mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jnz .Ltop + +.Lend: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Lcj3: mov %r11, -8(%rdi,%rbp,8) +.Lcj2: add %ecx, %ecx + adc (%rsi,%rbp,8), %rbx + adc 8(%rsi,%rbp,8), %r9 + mov %rbx, (%rdi,%rbp,8) +.Lcj1: mov %r9, 8(%rdi,%rbp,8) + mov %rdx, %rax + adc $0, %rax + pop %rbx + pop %rbp + pop %r12 ret - - .align 32, 0x90 -.Ltop: jrcxz .Lend - mov -32(%rdx), %r10 - adc 24(%rsi), %rax - lea 64(%rsi), %rsi - .byte 0xc4,66,179,0xf7,219 - mov %rax, -40(%rdi) -.Le0: dec %rcx - .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov -24(%rdx), %r11 - adc -32(%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, -32(%rdi) -.Le7: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov -16(%rdx), %r10 - adc -24(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, -24(%rdi) -.Le6: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov -8(%rdx), %r11 - adc -16(%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, -16(%rdi) -.Le5: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov (%rdx), %r10 - adc -8(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, -8(%rdi) -.Le4: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov 8(%rdx), %r11 - adc (%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, (%rdi) -.Le3: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov 16(%rdx), %r10 - adc 8(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, 8(%rdi) -.Le2: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov 24(%rdx), %r11 - adc 16(%rsi), %rax - lea 64(%rdx), %rdx - .byte 0xc4,66,179,0xf7,210 - mov %rax, 16(%rdi) - lea 64(%rdi), %rdi -.Le1: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - jmp 
.Ltop - -.L4: xor %r11d, %r11d - jmp .Le4 - -.L3: mov %r10, %r11 - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi - xor %r10d, %r10d - jmp .Le3 - -.L2: lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi - xor %r11d, %r11d - jmp .Le2 - -.L1: mov %r10, %r11 - lea -24(%rsi), %rsi - lea 40(%rdx), %rdx - lea 40(%rdi), %rdi - xor %r10d, %r10d - jmp .Le1 .size __gmpn_addlsh_n,.-__gmpn_addlsh_n - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab - diff --git a/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s b/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s index 715dc68504..8daf1ac3cd 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s @@ -67,8 +67,6 @@ - - @@ -77,136 +75,122 @@ + + + + + + + + .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_addmul_1 .type __gmpn_addmul_1,@function __gmpn_addmul_1: - - mov %rcx, %r10 - mov %rdx, %rcx - mov %edx, %r8d - shr $3, %rcx - and $7, %r8d - mov %r10, %rdx - lea .Ltab(%rip), %r10 - movslq (%r10,%r8,4), %r8 - lea (%r8, %r10), %r10 - jmp *%r10 - - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .Lf0-.Ltab - .long .Lf1-.Ltab - .long .Lf2-.Ltab - .long .Lf3-.Ltab - .long .Lf4-.Ltab - .long .Lf5-.Ltab - .long .Lf6-.Ltab - .long .Lf7-.Ltab - .text -.Lf0: .byte 0xc4,98,171,0xf6,6 - lea -8(%rsi), %rsi - lea -8(%rdi), %rdi - lea -1(%rcx), %rcx - jmp .Lb0 - -.Lf3: .byte 0xc4,226,179,0xf6,6 - lea 16(%rsi), %rsi - lea -48(%rdi), %rdi - jmp .Lb3 - -.Lf4: .byte 0xc4,98,171,0xf6,6 - lea 24(%rsi), %rsi - lea -40(%rdi), %rdi - jmp .Lb4 - -.Lf5: .byte 0xc4,226,179,0xf6,6 - lea 32(%rsi), %rsi - lea -32(%rdi), %rdi - jmp .Lb5 - -.Lf6: .byte 0xc4,98,171,0xf6,6 - lea 40(%rsi), %rsi - lea -24(%rdi), %rdi - jmp .Lb6 - -.Lf1: .byte 0xc4,226,179,0xf6,6 - jrcxz .L1 - jmp .Lb1 -.L1: add (%rdi), %r9 - mov %r9, (%rdi) - adc %rcx, %rax - - ret -.Lend: .byte 0xf3,76,0x0f,0x38,0xf6,15 - mov %r9, (%rdi) - .byte 0xf3,72,0x0f,0x38,0xf6,193 - adc %rcx, %rax - - ret - nop;nop;nop;nop - -.Lf2: .byte 0xc4,98,171,0xf6,6 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - .byte 0xc4,226,179,0xf6,6 - - .align 32, 0x90 -.Ltop: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, -8(%rdi) - jrcxz .Lend -.Lb1: .byte 0xc4,98,171,0xf6,70,8 - .byte 0xf3,76,0x0f,0x38,0xf6,15 - lea -1(%rcx), %rcx - mov %r9, (%rdi) - .byte 0x66,76,0x0f,0x38,0xf6,208 -.Lb0: .byte 0xc4,226,179,0xf6,70,16 - .byte 0x66,77,0x0f,0x38,0xf6,200 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) -.Lb7: .byte 0xc4,98,171,0xf6,70,24 - lea 64(%rsi), %rsi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xf3,76,0x0f,0x38,0xf6,79,16 - mov %r9, 16(%rdi) -.Lb6: .byte 0xc4,226,179,0xf6,70,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, 24(%rdi) -.Lb5: .byte 0xc4,98,171,0xf6,70,232 - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xf3,76,0x0f,0x38,0xf6,79,32 - mov %r9, 32(%rdi) -.Lb4: .byte 0xc4,226,179,0xf6,70,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, 40(%rdi) -.Lb3: .byte 0xf3,76,0x0f,0x38,0xf6,79,48 - .byte 0xc4,98,171,0xf6,70,248 - mov %r9, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xc4,226,179,0xf6,6 - jmp .Ltop - -.Lf7: .byte 0xc4,226,179,0xf6,6 - lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - jmp .Lb7 - .size __gmpn_addmul_1,.-__gmpn_addmul_1 + mov (%rsi), %rax + 
push %rbx + mov %rdx, %rbx + + mul %rcx + mov %rbx, %r11 + + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jz .Lb2 + jg .Lb3 + +.Lb1: dec %r11 + jne .Lgt1 + add %rax, (%rdi) + jmp .Lret +.Lgt1: lea 8(%rsi,%r11,8), %rsi + lea -8(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (%rsi,%r11,8), %rax + mov %rdx, %r8 + jmp .LL1 + +.Lb0: lea (%rsi,%r11,8), %rsi + lea -16(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp .LL0 + +.Lb3: lea -8(%rsi,%r11,8), %rsi + lea -24(%rdi,%r11,8), %rdi + neg %r11 + mov %rax, %rbx + mov %rdx, %r10 + jmp .LL3 + +.Lb2: lea -16(%rsi,%r11,8), %rsi + lea -32(%rdi,%r11,8), %rdi + neg %r11 + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %rdx, %r9 + jmp .LL2 + + .align 16, 0x90 +.Ltop: add %r10, (%rdi,%r11,8) + adc %rax, %r9 + mov (%rsi,%r11,8), %rax + adc %rdx, %r8 + mov $0, %r10d +.LL1: mul %rcx + add %r9, 8(%rdi,%r11,8) + adc %rax, %r8 + adc %rdx, %rbx +.LL0: mov 8(%rsi,%r11,8), %rax + mul %rcx + add %r8, 16(%rdi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 +.LL3: mov 16(%rsi,%r11,8), %rax + mul %rcx + add %rbx, 24(%rdi,%r11,8) + mov $0, %r8d + mov %r8, %rbx + adc %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %r8, %r9 + adc %rdx, %r9 +.LL2: mul %rcx + add $4, %r11 + js .Ltop + + add %r10, (%rdi,%r11,8) + adc %rax, %r9 + adc %r8, %rdx + add %r9, 8(%rdi,%r11,8) +.Lret: adc $0, %rdx + mov %rdx, %rax + + pop %rbx + + + ret + .size __gmpn_addmul_1,.-__gmpn_addmul_1 diff --git a/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s b/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s index 7fd478bd41..5883dab926 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s +++ b/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s @@ -83,171 +83,125 @@ + .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_addmul_2 .type __gmpn_addmul_2,@function __gmpn_addmul_2: + mov %rdx, %r11 push %rbx push %rbp - push %r12 - push %r13 - mov (%rcx), %r8 + mov 0(%rcx), %r8 mov 8(%rcx), %r9 - mov %rdx, %r11 - shr $2, %r11 - - test $1, %dl - jnz .Lbx1 - -.Lbx0: mov (%rdi), %r12 - mov 8(%rdi), %r13 - test $2, %dl - jnz .Lb10 - -.Lb00: mov (%rsi), %rdx - lea 16(%rsi), %rsi - .byte 0xc4,194,251,0xf6,200 - add %rax, %r12 - .byte 0xc4,194,251,0xf6,233 - adc $0, %rcx - mov %r12, (%rdi) - add %rax, %r13 - adc $0, %rbp - mov -8(%rsi), %rdx - lea 16(%rdi), %rdi - jmp .Llo0 + mov %edx, %ebx + mov (%rsi), %rax + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + mul %r8 + neg %r11 + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jc .Lb1 + jz .Lb2 + +.Lb3: mov %rax, %rcx + mov %rdx, %rbp + xor %r10d, %r10d + mov 8(%rsi,%r11,8), %rax + dec %r11 + jmp .Llo3 -.Lb10: mov (%rsi), %rdx - inc %r11 - .byte 0xc4,194,251,0xf6,200 - add %rax, %r12 - adc $0, %rcx - .byte 0xc4,194,251,0xf6,233 - mov %r12, (%rdi) - mov 16(%rdi), %r12 - add %rax, %r13 - adc $0, %rbp - xor %rbx, %rbx +.Lb2: mov %rax, %rbp + mov 8(%rsi,%r11,8), %rax + mov %rdx, %r10 + xor %ebx, %ebx + add $-2, %r11 jmp .Llo2 -.Lbx1: mov (%rdi), %r13 - mov 8(%rdi), %r12 - test $2, %dl - jnz .Lb11 - -.Lb01: mov (%rsi), %rdx - .byte 0xc4,66,251,0xf6,208 - add %rax, %r13 - adc $0, %r10 - .byte 0xc4,194,251,0xf6,217 - add %rax, %r12 - adc $0, %rbx - mov 8(%rsi), %rdx - mov %r13, (%rdi) - mov 16(%rdi), %r13 - .byte 0xc4,194,251,0xf6,200 - lea 24(%rdi), %rdi - lea 24(%rsi), %rsi +.Lb1: mov %rax, %r10 + mov 8(%rsi,%r11,8), %rax + mov %rdx, %rbx + xor %ecx, %ecx + inc %r11 jmp .Llo1 -.Lb11: mov (%rsi), %rdx - inc %r11 - .byte 0xc4,66,251,0xf6,208 - add %rax, %r13 - adc $0, %r10 - .byte 0xc4,194,251,0xf6,217 
- add %rax, %r12 - adc $0, %rbx - mov %r13, (%rdi) - mov 8(%rsi), %rdx - .byte 0xc4,194,251,0xf6,200 - lea 8(%rdi), %rdi - lea 8(%rsi), %rsi - jmp .Llo3 +.Lb0: mov $0, %r10d + mov %rax, %rbx + mov 8(%rsi,%r11,8), %rax + mov %rdx, %rcx + xor %ebp, %ebp + jmp .Llo0 + + .align 32, 0x90 +.Ltop: mov $0, %ecx + mul %r8 + add %rax, %r10 + mov (%rsi,%r11,8), %rax + adc %rdx, %rbx + adc $0, %ecx +.Llo1: mul %r9 + add %r10, (%rdi,%r11,8) + mov $0, %r10d + adc %rax, %rbx + mov $0, %ebp + mov 8(%rsi,%r11,8), %rax + adc %rdx, %rcx + mul %r8 + add %rax, %rbx + mov 8(%rsi,%r11,8), %rax + adc %rdx, %rcx + adc $0, %ebp +.Llo0: mul %r9 + add %rbx, 8(%rdi,%r11,8) + adc %rax, %rcx + adc %rdx, %rbp + mov 16(%rsi,%r11,8), %rax + mul %r8 + add %rax, %rcx + adc %rdx, %rbp + adc $0, %r10d + mov 16(%rsi,%r11,8), %rax +.Llo3: mul %r9 + add %rcx, 16(%rdi,%r11,8) + adc %rax, %rbp + adc %rdx, %r10 + xor %ebx, %ebx + mov 24(%rsi,%r11,8), %rax + mul %r8 + add %rax, %rbp + mov 24(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc $0, %ebx +.Llo2: mul %r9 + add %rbp, 24(%rdi,%r11,8) + adc %rax, %r10 + adc %rdx, %rbx + mov 32(%rsi,%r11,8), %rax + add $4, %r11 + js .Ltop + +.Lend: xor %ecx, %ecx + mul %r8 + add %rax, %r10 + mov (%rsi), %rax + adc %rdx, %rbx + adc %ecx, %ecx + mul %r9 + add %r10, (%rdi) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%rdi) + mov %rcx, %rax - .align 16, 0x90 -.Ltop: .byte 0xc4,66,251,0xf6,208 - add %rbx, %r13 - adc $0, %rbp - add %rax, %r13 - adc $0, %r10 - .byte 0xc4,194,251,0xf6,217 - add %rax, %r12 - adc $0, %rbx - lea 32(%rdi), %rdi - add %rcx, %r13 - mov -16(%rsi), %rdx - mov %r13, -24(%rdi) - adc $0, %r10 - add %rbp, %r12 - mov -8(%rdi), %r13 - .byte 0xc4,194,251,0xf6,200 - adc $0, %rbx -.Llo1: add %rax, %r12 - .byte 0xc4,194,251,0xf6,233 - adc $0, %rcx - add %r10, %r12 - mov %r12, -16(%rdi) - adc $0, %rcx - add %rax, %r13 - adc $0, %rbp - add %rbx, %r13 - mov -8(%rsi), %rdx - adc $0, %rbp -.Llo0: .byte 0xc4,66,251,0xf6,208 - add %rax, %r13 - adc $0, %r10 - mov (%rdi), %r12 - .byte 0xc4,194,251,0xf6,217 - add %rax, %r12 - adc $0, %rbx - add %rcx, %r13 - mov %r13, -8(%rdi) - adc $0, %r10 - mov (%rsi), %rdx - add %rbp, %r12 - .byte 0xc4,194,251,0xf6,200 - adc $0, %rbx -.Llo3: add %rax, %r12 - adc $0, %rcx - .byte 0xc4,194,251,0xf6,233 - add %r10, %r12 - mov 8(%rdi), %r13 - mov %r12, (%rdi) - mov 16(%rdi), %r12 - adc $0, %rcx - add %rax, %r13 - adc $0, %rbp -.Llo2: mov 8(%rsi), %rdx - lea 32(%rsi), %rsi - dec %r11 - jnz .Ltop - -.Lend: .byte 0xc4,66,251,0xf6,208 - add %rbx, %r13 - adc $0, %rbp - add %rax, %r13 - adc $0, %r10 - .byte 0xc4,194,235,0xf6,193 - add %rcx, %r13 - mov %r13, 8(%rdi) - adc $0, %r10 - add %rbp, %rdx - adc $0, %rax - add %r10, %rdx - mov %rdx, 16(%rdi) - adc $0, %rax - - pop %r13 - pop %r12 pop %rbp pop %rbx diff --git a/ext/gmp/gen/x86_64-linux/mpn/and_n.s b/ext/gmp/gen/x86_64-linux/mpn/and_n.s index 0bdc08b1fb..946906ecf6 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/and_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/and_n.s @@ -90,7 +90,6 @@ - .text @@ -103,50 +102,42 @@ __gmpn_and_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: and (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: and (%rsi), %r8 - 
mov %r8, (%rdi) - dec %rcx +.Lb01: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - and (%rsi), %r8 - and 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - and 16(%rsi), %r8 - and 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + and (%rsi,%rcx,8), %r8 + and 8(%rsi,%rcx,8), %r9 + nop + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + and 16(%rsi,%rcx,8), %r8 + and 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/andn_n.s b/ext/gmp/gen/x86_64-linux/mpn/andn_n.s index 73fe85c5fd..aee1df4efc 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/andn_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/andn_n.s @@ -92,7 +92,6 @@ - .text @@ -106,54 +105,46 @@ __gmpn_andn_n: mov (%rdx), %r8 not %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: and (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: and (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 +.Ltop: mov (%rdx,%rcx,8), %r8 not %r8 -.Lb00: mov 8(%rdx), %r9 +.Lb00: mov 8(%rdx,%rcx,8), %r9 not %r9 - and (%rsi), %r8 - and 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 + and (%rsi,%rcx,8), %r8 + and 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 not %r8 -.Le10: mov 24(%rdx), %r9 +.Le10: mov 24(%rdx,%rcx,8), %r9 not %r9 - lea 32(%rdx), %rdx - and 16(%rsi), %r8 - and 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + and 16(%rsi,%rcx,8), %r8 + and 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s b/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s index 3c96e43ecb..4f58778551 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s @@ -145,63 +145,46 @@ __gmpn_pi1_bdiv_q_1: dec %r10 jz .Lone - lea 8(%rsi,%r10,8), %rsi + mov 8(%rsi), %rdx + lea (%rsi,%r10,8), %rsi lea (%rdi,%r10,8), %rdi neg %r10 - test %ecx, %ecx - jnz .Lunorm + shrd %cl, %rdx, %rax + xor %ebx, %ebx - jmp .Lnent + jmp .Lent .align 8, 0x90 -.Lntop:mul %r11 - mov -8(%rsi,%r10,8), %rax +.Ltop: + + + + + + + + mul %r11 + mov (%rsi,%r10,8), %rax + mov 8(%rsi,%r10,8), %r9 + shrd %cl, %r9, %rax + nop sub %rbx, %rax - setc %bl + setc %bl sub %rdx, %rax - adc $0, %ebx -.Lnent:imul %r8, %rax - mov %rax, (%rdi,%r10,8) - inc %r10 - jnz .Lntop - - mov -8(%rsi), %r9 - jmp .Lcom - 
-.Lunorm: - mov (%rsi,%r10,8), %r9 - shr %cl, %rax - neg %ecx - shl %cl, %r9 - neg %ecx - or %r9, %rax - xor %ebx, %ebx - jmp .Luent - - .align 8, 0x90 -.Lutop:mul %r11 - mov (%rsi,%r10,8), %rax - shl %cl, %rax - neg %ecx - or %r9, %rax + adc $0, %ebx +.Lent: imul %r8, %rax + mov %rax, (%rdi,%r10,8) + inc %r10 + jnz .Ltop + + mul %r11 + mov (%rsi), %rax + shr %cl, %rax sub %rbx, %rax - setc %bl sub %rdx, %rax - adc $0, %ebx -.Luent:imul %r8, %rax - mov (%rsi,%r10,8), %r9 - shr %cl, %r9 - neg %ecx - mov %rax, (%rdi,%r10,8) - inc %r10 - jnz .Lutop - -.Lcom: mul %r11 - sub %rbx, %r9 - sub %rdx, %r9 - imul %r8, %r9 - mov %r9, (%rdi) + imul %r8, %rax + mov %rax, (%rdi) pop %rbx ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s b/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s index 063d5dc7d7..b046e3642c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s @@ -60,11 +60,6 @@ - - - - - @@ -73,7 +68,9 @@ - + + + @@ -92,92 +89,101 @@ __gmpn_cnd_add_n: push %rbx + push %rbp + push %r12 + push %r13 + push %r14 neg %rdi - sbb %rbx, %rbx - - test $1, %r8b - jz .Lx0 -.Lx1: test $2, %r8b - jz .Lb1 - -.Lb3: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov 16(%rcx), %r10 - and %rbx, %rdi - and %rbx, %r9 - and %rbx, %r10 - add (%rdx), %rdi - mov %rdi, (%rsi) - adc 8(%rdx), %r9 - mov %r9, 8(%rsi) - adc 16(%rdx), %r10 - mov %r10, 16(%rsi) + sbb %rdi, %rdi + + lea (%rcx,%r8,8), %rcx + lea (%rdx,%r8,8), %rdx + lea (%rsi,%r8,8), %rsi + + mov %r8d, %eax + neg %r8 + and $3, %eax + jz .Ltop + cmp $2, %eax + jc .Lb1 + jz .Lb2 + +.Lb3: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + add %r12, %r10 + mov %r10, (%rsi,%r8,8) + adc %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + adc %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) sbb %eax, %eax - lea 24(%rdx), %rdx - lea 24(%rcx), %rcx - lea 24(%rsi), %rsi - sub $3, %r8 - jnz .Ltop + add $3, %r8 + js .Ltop jmp .Lend -.Lx0: xor %eax, %eax - test $2, %r8b - jz .Ltop - -.Lb2: mov (%rcx), %rdi - mov 8(%rcx), %r9 - and %rbx, %rdi - and %rbx, %r9 - add (%rdx), %rdi - mov %rdi, (%rsi) - adc 8(%rdx), %r9 - mov %r9, 8(%rsi) +.Lb2: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r13 + add %r12, %r10 + mov %r10, (%rsi,%r8,8) + adc %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) sbb %eax, %eax - lea 16(%rdx), %rdx - lea 16(%rcx), %rcx - lea 16(%rsi), %rsi - sub $2, %r8 - jnz .Ltop + add $2, %r8 + js .Ltop jmp .Lend -.Lb1: mov (%rcx), %rdi - and %rbx, %rdi - add (%rdx), %rdi - mov %rdi, (%rsi) +.Lb1: mov (%rcx,%r8,8), %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + add %r12, %r10 + mov %r10, (%rsi,%r8,8) sbb %eax, %eax - lea 8(%rdx), %rdx - lea 8(%rcx), %rcx - lea 8(%rsi), %rsi - dec %r8 - jz .Lend + add $1, %r8 + jns .Lend .align 16, 0x90 -.Ltop: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov 16(%rcx), %r10 - mov 24(%rcx), %r11 - lea 32(%rcx), %rcx - and %rbx, %rdi - and %rbx, %r9 - and %rbx, %r10 - and %rbx, %r11 +.Ltop: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + mov 24(%rcx,%r8,8), %r11 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + and %rdi, %r11 + mov 24(%rdx,%r8,8), %r9 add %eax, %eax - adc (%rdx), %rdi - mov %rdi, (%rsi) - adc 8(%rdx), %r9 - mov %r9, 8(%rsi) - adc 16(%rdx), %r10 - mov %r10, 16(%rsi) - adc 24(%rdx), 
%r11 - lea 32(%rdx), %rdx - mov %r11, 24(%rsi) - lea 32(%rsi), %rsi + adc %r12, %r10 + mov %r10, (%rsi,%r8,8) + adc %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + adc %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) + adc %r11, %r9 + mov %r9, 24(%rsi,%r8,8) sbb %eax, %eax - sub $4, %r8 - jnz .Ltop + add $4, %r8 + js .Ltop .Lend: neg %eax + pop %r14 + pop %r13 + pop %r12 + pop %rbp pop %rbx ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s b/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s index 40b0e30be4..596dd8fd48 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s @@ -60,14 +60,6 @@ - - - - - - - - @@ -75,8 +67,6 @@ - - @@ -102,102 +92,95 @@ __gmpn_cnd_sub_n: push %rbp push %r12 push %r13 + push %r14 neg %rdi - sbb %rbx, %rbx - - test $1, %r8b - jz .Lx0 -.Lx1: test $2, %r8b - jz .Lb1 - -.Lb3: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov 16(%rcx), %r10 - and %rbx, %rdi - mov (%rdx), %r12 - and %rbx, %r9 - mov 8(%rdx), %r13 - and %rbx, %r10 - mov 16(%rdx), %rbp - sub %rdi, %r12 - mov %r12, (%rsi) - sbb %r9, %r13 - mov %r13, 8(%rsi) - sbb %r10, %rbp - mov %rbp, 16(%rsi) + sbb %rdi, %rdi + + lea (%rcx,%r8,8), %rcx + lea (%rdx,%r8,8), %rdx + lea (%rsi,%r8,8), %rsi + + mov %r8d, %eax + neg %r8 + and $3, %eax + jz .Ltop + cmp $2, %eax + jc .Lb1 + jz .Lb2 + +.Lb3: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + sub %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + sbb %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) sbb %eax, %eax - lea 24(%rdx), %rdx - lea 24(%rcx), %rcx - lea 24(%rsi), %rsi - sub $3, %r8 - jnz .Ltop + add $3, %r8 + js .Ltop jmp .Lend -.Lx0: xor %eax, %eax - test $2, %r8b - jz .Ltop - -.Lb2: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov (%rdx), %r12 - and %rbx, %rdi - mov 8(%rdx), %r13 - and %rbx, %r9 - sub %rdi, %r12 - mov %r12, (%rsi) - sbb %r9, %r13 - mov %r13, 8(%rsi) +.Lb2: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r13 + sub %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) sbb %eax, %eax - lea 16(%rdx), %rdx - lea 16(%rcx), %rcx - lea 16(%rsi), %rsi - sub $2, %r8 - jnz .Ltop + add $2, %r8 + js .Ltop jmp .Lend -.Lb1: mov (%rcx), %rdi - mov (%rdx), %r12 - and %rbx, %rdi - sub %rdi, %r12 - mov %r12, (%rsi) +.Lb1: mov (%rcx,%r8,8), %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + sub %r12, %r10 + mov %r10, (%rsi,%r8,8) sbb %eax, %eax - lea 8(%rdx), %rdx - lea 8(%rcx), %rcx - lea 8(%rsi), %rsi - dec %r8 - jz .Lend + add $1, %r8 + jns .Lend .align 16, 0x90 -.Ltop: mov (%rcx), %rdi - mov 8(%rcx), %r9 - mov 16(%rcx), %r10 - mov 24(%rcx), %r11 - lea 32(%rcx), %rcx - and %rbx, %rdi - mov (%rdx), %r12 - and %rbx, %r9 - mov 8(%rdx), %r13 - and %rbx, %r10 - mov 16(%rdx), %rbp - and %rbx, %r11 +.Ltop: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + mov 24(%rcx,%r8,8), %r11 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + and %rdi, %r11 + mov 24(%rdx,%r8,8), %r9 add %eax, %eax - mov 24(%rdx), %rax - lea 32(%rdx), %rdx - sbb %rdi, %r12 - mov %r12, (%rsi) - sbb %r9, %r13 - mov %r13, 8(%rsi) - sbb %r10, %rbp - mov %rbp, 16(%rsi) - sbb %r11, %rax - mov %rax, 24(%rsi) - lea 32(%rsi), %rsi + sbb %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %r13, %rbx + mov %rbx, 
8(%rsi,%r8,8) + sbb %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) + sbb %r11, %r9 + mov %r9, 24(%rsi,%r8,8) sbb %eax, %eax - sub $4, %r8 - jnz .Ltop + add $4, %r8 + js .Ltop .Lend: neg %eax + pop %r14 pop %r13 pop %r12 pop %rbp diff --git a/ext/gmp/gen/x86_64-linux/mpn/com.s b/ext/gmp/gen/x86_64-linux/mpn/com.s index 9d4f49cfc0..ff14001990 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/com.s +++ b/ext/gmp/gen/x86_64-linux/mpn/com.s @@ -39,44 +39,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -99,237 +61,50 @@ .text - .align 64, 0x90 + .align 32, 0x90 .globl __gmpn_com .type __gmpn_com,@function __gmpn_com: - - cmp $7, %rdx - jbe .Lbc - - pcmpeqb %xmm5, %xmm5 - - test $8, %dil - jz .Lrp_aligned - - mov (%rsi), %r8 - lea 8(%rsi), %rsi - not %r8 - mov %r8, (%rdi) - lea 8(%rdi), %rdi - dec %rdx - -.Lrp_aligned: - test $8, %sil - jnz .Luent - - jmp .Lam - - .align 16, 0x90 -.Latop:movaps 0(%rsi), %xmm0 - movaps 16(%rsi), %xmm1 - movaps 32(%rsi), %xmm2 - movaps 48(%rsi), %xmm3 - lea 64(%rsi), %rsi - pxor %xmm5, %xmm0 - pxor %xmm5, %xmm1 - pxor %xmm5, %xmm2 - pxor %xmm5, %xmm3 - movaps %xmm0, (%rdi) - movaps %xmm1, 16(%rdi) - movaps %xmm2, 32(%rdi) - movaps %xmm3, 48(%rdi) - lea 64(%rdi), %rdi -.Lam: sub $8, %rdx - jnc .Latop - - test $4, %dl - jz 1f - movaps (%rsi), %xmm0 - movaps 16(%rsi), %xmm1 - lea 32(%rsi), %rsi - pxor %xmm5, %xmm0 - pxor %xmm5, %xmm1 - movaps %xmm0, (%rdi) - movaps %xmm1, 16(%rdi) - lea 32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movaps (%rsi), %xmm0 - lea 16(%rsi), %rsi - pxor %xmm5, %xmm0 - movaps %xmm0, (%rdi) - lea 16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - not %r8 - mov %r8, (%rdi) - -1: - ret - -.Luent: - - - - - lea -40(%rsi), %rax - sub %rdi, %rax - cmp $80, %rax - jbe .Lbc - - sub $16, %rdx - jc .Luend - - movaps 120(%rsi), %xmm3 - - sub $16, %rdx - jmp .Lum - - .align 16, 0x90 -.Lutop:movaps 120(%rsi), %xmm3 - pxor %xmm5, %xmm0 - movaps %xmm0, -128(%rdi) - sub $16, %rdx -.Lum: movaps 104(%rsi), %xmm2 - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movaps 88(%rsi), %xmm1 - pxor %xmm5, %xmm3 - movaps %xmm3, 112(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movaps 72(%rsi), %xmm0 - pxor %xmm5, %xmm2 - movaps %xmm2, 96(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps 56(%rsi), %xmm3 - pxor %xmm5, %xmm1 - movaps %xmm1, 80(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - movaps 40(%rsi), %xmm2 - pxor %xmm5, %xmm0 - movaps %xmm0, 64(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movaps 24(%rsi), %xmm1 - pxor %xmm5, %xmm3 - movaps %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movaps 8(%rsi), %xmm0 - pxor %xmm5, %xmm2 - movaps %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps -8(%rsi), %xmm3 - pxor %xmm5, %xmm1 - movaps %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 128(%rsi), %rsi - lea 128(%rdi), %rdi - jnc .Lutop - - pxor %xmm5, %xmm0 - movaps %xmm0, -128(%rdi) - -.Luend:test $8, %dl - jz 1f - movaps 56(%rsi), %xmm3 - movaps 40(%rsi), %xmm2 - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movaps 24(%rsi), %xmm1 - pxor %xmm5, %xmm3 - movaps %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movaps 8(%rsi), %xmm0 - pxor %xmm5, %xmm2 - movaps %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps -8(%rsi), %xmm3 - pxor %xmm5, %xmm1 - movaps %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 64(%rsi), %rsi - pxor %xmm5, %xmm0 - movaps %xmm0, (%rdi) - lea 64(%rdi), %rdi - -1: test $4, %dl - jz 1f - movaps 24(%rsi), %xmm1 - movaps 8(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps -8(%rsi), %xmm3 - pxor %xmm5, %xmm1 - movaps %xmm1, 
16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 32(%rsi), %rsi - pxor %xmm5, %xmm0 - movaps %xmm0, (%rdi) - lea 32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movaps 8(%rsi), %xmm0 - movaps -8(%rsi), %xmm3 - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 16(%rsi), %rsi - pxor %xmm5, %xmm0 - movaps %xmm0, (%rdi) - lea 16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - not %r8 - mov %r8, (%rdi) - -1: - ret - - - - -.Lbc: lea -8(%rdi), %rdi - sub $4, %edx - jc .Lend - - .align 16, 0x90 -.Ltop: mov (%rsi), %r8 - mov 8(%rsi), %r9 - lea 32(%rdi), %rdi - mov 16(%rsi), %r10 - mov 24(%rsi), %r11 - lea 32(%rsi), %rsi - not %r8 - not %r9 - not %r10 - not %r11 - mov %r8, -24(%rdi) - mov %r9, -16(%rdi) - sub $4, %edx - mov %r10, -8(%rdi) - mov %r11, (%rdi) - jnc .Ltop - -.Lend: test $1, %dl - jz 1f - mov (%rsi), %r8 - not %r8 - mov %r8, 8(%rdi) - lea 8(%rdi), %rdi - lea 8(%rsi), %rsi -1: test $2, %dl - jz 1f - mov (%rsi), %r8 - mov 8(%rsi), %r9 - not %r8 - not %r9 - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) -1: + movq (%rsi), %r8 + movl %edx, %eax + leaq (%rsi,%rdx,8), %rsi + leaq (%rdi,%rdx,8), %rdi + negq %rdx + andl $3, %eax + je .Lb00 + cmpl $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: notq %r8 + movq %r8, (%rdi,%rdx,8) + decq %rdx + jmp .Le11 +.Lb10: addq $-2, %rdx + jmp .Le10 + .byte 0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: notq %r8 + movq %r8, (%rdi,%rdx,8) + incq %rdx + jz .Lret + +.Loop: movq (%rsi,%rdx,8), %r8 +.Lb00: movq 8(%rsi,%rdx,8), %r9 + notq %r8 + notq %r9 + movq %r8, (%rdi,%rdx,8) + movq %r9, 8(%rdi,%rdx,8) +.Le11: movq 16(%rsi,%rdx,8), %r8 +.Le10: movq 24(%rsi,%rdx,8), %r9 + notq %r8 + notq %r9 + movq %r8, 16(%rdi,%rdx,8) + movq %r9, 24(%rdi,%rdx,8) + addq $4, %rdx + jnc .Loop +.Lret: ret .size __gmpn_com,.-__gmpn_com - diff --git a/ext/gmp/gen/x86_64-linux/mpn/copyd.s b/ext/gmp/gen/x86_64-linux/mpn/copyd.s index 583e8c9ec5..f375481084 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/copyd.s +++ b/ext/gmp/gen/x86_64-linux/mpn/copyd.s @@ -45,35 +45,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -102,178 +73,36 @@ __gmpn_copyd: - - lea -8(%rsi,%rdx,8), %rsi - lea -8(%rdi,%rdx,8), %rdi - - cmp $7, %rdx - jbe .Lbc - - test $8, %dil - jnz .Lrp_aligned - - mov (%rsi), %rax - mov %rax, (%rdi) - lea -8(%rsi), %rsi - lea -8(%rdi), %rdi - dec %rdx - -.Lrp_aligned: - test $8, %sil - jz .Luent - - jmp .Lam - - .align 16, 0x90 -.Latop:movaps -8(%rsi), %xmm0 - movaps -24(%rsi), %xmm1 - movaps -40(%rsi), %xmm2 - movaps -56(%rsi), %xmm3 - lea -64(%rsi), %rsi - movaps %xmm0, -8(%rdi) - movaps %xmm1, -24(%rdi) - movaps %xmm2, -40(%rdi) - movaps %xmm3, -56(%rdi) - lea -64(%rdi), %rdi -.Lam: sub $8, %rdx - jnc .Latop - - test $4, %dl - jz 1f - movaps -8(%rsi), %xmm0 - movaps -24(%rsi), %xmm1 - lea -32(%rsi), %rsi - movaps %xmm0, -8(%rdi) - movaps %xmm1, -24(%rdi) - lea -32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movaps -8(%rsi), %xmm0 - lea -16(%rsi), %rsi - movaps %xmm0, -8(%rdi) - lea -16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) - -1: - ret - -.Luent:sub $16, %rdx - movaps (%rsi), %xmm0 - jc .Luend - - .align 16, 0x90 -.Lutop:sub $16, %rdx - movaps -16(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -8(%rdi) - movaps -32(%rsi), %xmm2 - .byte 0x66,0x0f,0x3a,0x0f,202,8 - movaps %xmm1, -24(%rdi) - movaps -48(%rsi), %xmm3 - .byte 0x66,0x0f,0x3a,0x0f,211,8 - movaps %xmm2, -40(%rdi) - movaps -64(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,216,8 - movaps %xmm3, -56(%rdi) - movaps -80(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -72(%rdi) - 
movaps -96(%rsi), %xmm2 - .byte 0x66,0x0f,0x3a,0x0f,202,8 - movaps %xmm1, -88(%rdi) - movaps -112(%rsi), %xmm3 - .byte 0x66,0x0f,0x3a,0x0f,211,8 - movaps %xmm2, -104(%rdi) - movaps -128(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,216,8 - movaps %xmm3, -120(%rdi) - lea -128(%rsi), %rsi - lea -128(%rdi), %rdi - jnc .Lutop - -.Luend:test $8, %dl - jz 1f - movaps -16(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -8(%rdi) - movaps -32(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps %xmm1, -24(%rdi) - movaps -48(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -40(%rdi) - movaps -64(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps %xmm1, -56(%rdi) - lea -64(%rsi), %rsi - lea -64(%rdi), %rdi - -1: test $4, %dl - jz 1f - movaps -16(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -8(%rdi) - movaps -32(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps %xmm1, -24(%rdi) - lea -32(%rsi), %rsi - lea -32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movaps -16(%rsi), %xmm1 - .byte 0x66,0x0f,0x3a,0x0f,193,8 - movaps %xmm0, -8(%rdi) - lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) - -1: - ret - - - - -.Lbc: sub $4, %edx + lea (%rdi,%rdx,8), %rdi + sub $4, %rdx jc .Lend + nop - .align 16, 0x90 -.Ltop: mov (%rsi), %r8 +.Ltop: mov (%rsi), %rax mov -8(%rsi), %r9 lea -32(%rdi), %rdi mov -16(%rsi), %r10 mov -24(%rsi), %r11 lea -32(%rsi), %rsi - mov %r8, 32(%rdi) - mov %r9, 24(%rdi) - - mov %r10, 16(%rdi) - mov %r11, 8(%rdi) - - -.Lend: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) + mov %rax, 24(%rdi) + mov %r9, 16(%rdi) + sub $4, %rdx + mov %r10, 8(%rdi) + mov %r11, (%rdi) + jnc .Ltop + +.Lend: shr %edx + jnc 1f + mov (%rsi), %rax + mov %rax, -8(%rdi) lea -8(%rdi), %rdi lea -8(%rsi), %rsi -1: test $2, %dl - jz 1f - mov (%rsi), %r8 +1: shr %edx + jnc 1f + mov (%rsi), %rax mov -8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, -8(%rdi) -1: - ret + mov %rax, -8(%rdi) + mov %r9, -16(%rdi) +1: ret .size __gmpn_copyd,.-__gmpn_copyd - diff --git a/ext/gmp/gen/x86_64-linux/mpn/copyi.s b/ext/gmp/gen/x86_64-linux/mpn/copyi.s index a5c971baa6..dc746b2270 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/copyi.s +++ b/ext/gmp/gen/x86_64-linux/mpn/copyi.s @@ -45,38 +45,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -100,225 +68,40 @@ .text .align 64, 0x90 + .byte 0,0,0,0,0,0 .globl __gmpn_copyi .type __gmpn_copyi,@function __gmpn_copyi: - - - cmp $7, %rdx - jbe .Lbc - - test $8, %dil - jz .Lrp_aligned - - movsq - dec %rdx - -.Lrp_aligned: - test $8, %sil - jnz .Luent - - jmp .Lam - - .align 16, 0x90 -.Latop:movdqa 0(%rsi), %xmm0 - movdqa 16(%rsi), %xmm1 - movdqa 32(%rsi), %xmm2 - movdqa 48(%rsi), %xmm3 - lea 64(%rsi), %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 16(%rdi) - movdqa %xmm2, 32(%rdi) - movdqa %xmm3, 48(%rdi) - lea 64(%rdi), %rdi -.Lam: sub $8, %rdx - jnc .Latop - - test $4, %dl - jz 1f - movdqa (%rsi), %xmm0 - movdqa 16(%rsi), %xmm1 - lea 32(%rsi), %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 16(%rdi) - lea 32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movdqa (%rsi), %xmm0 - lea 16(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) - -1: - ret - -.Luent: - - - cmp $16, %rdx - jc .Lued0 - - - - - - - movaps 120(%rsi), %xmm7 - movaps 104(%rsi), %xmm6 - movaps 88(%rsi), %xmm5 - movaps 72(%rsi), %xmm4 - movaps 56(%rsi), %xmm3 - movaps 40(%rsi), %xmm2 - lea 128(%rsi), %rsi - sub $32, %rdx - jc .Lued1 - - .align 16, 0x90 
-.Lutop:movaps -104(%rsi), %xmm1 - sub $16, %rdx - movaps -120(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,254,8 - movaps -136(%rsi), %xmm8 - movdqa %xmm7, 112(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,245,8 - movaps 120(%rsi), %xmm7 - movdqa %xmm6, 96(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,236,8 - movaps 104(%rsi), %xmm6 - movdqa %xmm5, 80(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,227,8 - movaps 88(%rsi), %xmm5 - movdqa %xmm4, 64(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movaps 72(%rsi), %xmm4 - movdqa %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movaps 56(%rsi), %xmm3 - movdqa %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps 40(%rsi), %xmm2 - movdqa %xmm1, 16(%rdi) - .byte 0x66,65,0x0f,0x3a,0x0f,192,8 - lea 128(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 128(%rdi), %rdi - jnc .Lutop - -.Lued1:movaps -104(%rsi), %xmm1 - movaps -120(%rsi), %xmm0 - movaps -136(%rsi), %xmm8 - .byte 0x66,0x0f,0x3a,0x0f,254,8 - movdqa %xmm7, 112(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,245,8 - movdqa %xmm6, 96(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,236,8 - movdqa %xmm5, 80(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,227,8 - movdqa %xmm4, 64(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movdqa %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movdqa %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movdqa %xmm1, 16(%rdi) - .byte 0x66,65,0x0f,0x3a,0x0f,192,8 - movdqa %xmm0, (%rdi) - lea 128(%rdi), %rdi - - - - - - -.Lued0:test $8, %dl - jz 1f - movaps 56(%rsi), %xmm3 - movaps 40(%rsi), %xmm2 - movaps 24(%rsi), %xmm1 - movaps 8(%rsi), %xmm0 - movaps -8(%rsi), %xmm4 - .byte 0x66,0x0f,0x3a,0x0f,218,8 - movdqa %xmm3, 48(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,209,8 - movdqa %xmm2, 32(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movdqa %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,196,8 - lea 64(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 64(%rdi), %rdi - -1: test $4, %dl - jz 1f - movaps 24(%rsi), %xmm1 - movaps 8(%rsi), %xmm0 - .byte 0x66,0x0f,0x3a,0x0f,200,8 - movaps -8(%rsi), %xmm3 - movdqa %xmm1, 16(%rdi) - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 32(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 32(%rdi), %rdi - -1: test $2, %dl - jz 1f - movdqa 8(%rsi), %xmm0 - movdqa -8(%rsi), %xmm3 - .byte 0x66,0x0f,0x3a,0x0f,195,8 - lea 16(%rsi), %rsi - movdqa %xmm0, (%rdi) - lea 16(%rdi), %rdi - -1: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, (%rdi) - -1: - ret - - - - -.Lbc: lea -8(%rdi), %rdi - sub $4, %edx + lea -8(%rdi), %rdi + sub $4, %rdx jc .Lend - .align 16, 0x90 -.Ltop: mov (%rsi), %r8 +.Ltop: mov (%rsi), %rax mov 8(%rsi), %r9 lea 32(%rdi), %rdi mov 16(%rsi), %r10 mov 24(%rsi), %r11 lea 32(%rsi), %rsi - mov %r8, -24(%rdi) + mov %rax, -24(%rdi) mov %r9, -16(%rdi) - + sub $4, %rdx mov %r10, -8(%rdi) mov %r11, (%rdi) + jnc .Ltop - -.Lend: test $1, %dl - jz 1f - mov (%rsi), %r8 - mov %r8, 8(%rdi) +.Lend: shr %edx + jnc 1f + mov (%rsi), %rax + mov %rax, 8(%rdi) lea 8(%rdi), %rdi lea 8(%rsi), %rsi -1: test $2, %dl - jz 1f - mov (%rsi), %r8 +1: shr %edx + jnc 1f + mov (%rsi), %rax mov 8(%rsi), %r9 - mov %r8, 8(%rdi) + mov %rax, 8(%rdi) mov %r9, 16(%rdi) -1: - ret +1: ret .size __gmpn_copyi,.-__gmpn_copyi - diff --git a/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s b/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s index 652beccbf2..fd8ce8e9e6 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s @@ -158,17 +158,18 @@ __gmpn_div_qr_1n_pi1: dec %r8 mov %rcx, %rax jz .Lfinal + mov $0, %r14d .align 16, 0x90 + .Lloop: - mov %r9, %r14 + cmovc %r9, %r14 mov %r12, %r15 - and %r12, %r14 neg %r15 mul %r9 add %rdx, %r14 @@ -195,6 
+196,7 @@ __gmpn_div_qr_1n_pi1: mov %r10, %rax adc %rdx, %rax mov %r14, (%rdi, %r8, 8) + mov $0, %r14d sbb %r12, %r12 dec %r8 mov %rax, %rcx diff --git a/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s b/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s index 5363432e8d..e689bd27f4 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s @@ -64,20 +64,6 @@ - - - - - - - - - - - - - - @@ -347,4 +333,3 @@ __gmpn_divrem_1: ret .size __gmpn_divrem_1,.-__gmpn_divrem_1 - diff --git a/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s b/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s index 4647639cd9..cf35d253b9 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s +++ b/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s @@ -76,13 +76,140 @@ - - - - - - - + .section .rodata + .align 64, 0x90 +ctz_table: + + .byte 7 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 6 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + + .size ctz_table,.-ctz_table @@ -99,22 +226,31 @@ __gmpn_gcd_11: - jmp .Lodd + mov ctz_table@GOTPCREL(%rip), %r8 + + + jmp .Lent .align 16, 0x90 .Ltop: cmovc %rdx, %rdi cmovc %rax, %rsi +.Lmid: and $127, %edx + movzbl (%r8,%rdx), %ecx + jz .Lshift_alot shr %cl, %rdi -.Lodd: mov %rsi, %rdx - sub %rdi, %rdx - bsf %rdx, %rcx - mov %rdi, %rax - sub %rsi, %rdi +.Lent: mov %rdi, %rax + mov %rsi, %rdx + sub %rdi, %rdx + sub %rsi, %rdi jnz .Ltop .Lend: ret - .size __gmpn_gcd_11,.-__gmpn_gcd_11 +.Lshift_alot: + shr $7, %rdi + mov %rdi, %rdx + jmp .Lmid + .size __gmpn_gcd_11,.-__gmpn_gcd_11 diff --git a/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s b/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s index e3d86b92e4..60f4c714c9 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s +++ b/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s @@ -67,6 +67,276 @@ + + + + + + + + + .section .rodata + .align 64, 0x90 +ctz_table: + + .byte 8 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + 
.byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 6 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 7 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 6 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + + .size ctz_table,.-ctz_table @@ -92,32 +362,40 @@ __gmpn_gcd_22: + mov %rcx, %rax + + mov ctz_table@GOTPCREL(%rip), %r10 + + .align 16, 0x90 -.Ltop: mov %rcx, %r10 - sub %rsi, %r10 +.Ltop: mov %rax, %rcx + sub %rsi, %rcx jz .Llowz mov %rdx, %r11 sbb %rdi, %r11 - rep;bsf %r10, %rax - mov %rsi, %r8 - sub %rcx, %rsi mov %rdi, %r9 + + sub %rax, %rsi sbb %rdx, %rdi -.Lbck: cmovc %r10, %rsi +.Lbck: cmovc %rcx, %rsi cmovc %r11, %rdi - cmovc %r8, %rcx + cmovc %r8, %rax cmovc %r9, %rdx - xor %r10d, %r10d - sub %rax, %r10 - .byte 0xc4,98,169,0xf7,207 - .byte 0xc4,226,251,0xf7,246 - .byte 0xc4,226,251,0xf7,255 - or %r9, %rsi + and $255, %ecx + movzbl (%r10,%rcx), %ecx + jz .Lcount_better + +.Lshr: shr %cl, %rsi + mov %rdi, %r11 + shr %cl, %rdi + neg %rcx + shl %cl, %r11 + or %r11, %rsi test %rdx, %rdx jnz .Ltop @@ -125,29 +403,32 @@ __gmpn_gcd_22: jnz .Ltop .Lgcd_11: - mov %rcx, %rdi + mov %rax, %rdi jmp __gmpn_gcd_11@PLT +.Lcount_better: + rep;bsf %rsi, %rcx + jmp .Lshr + .Llowz: - mov %rdx, %r10 - sub %rdi, %r10 + mov %rdx, %rcx + sub %rdi, %rcx je .Lend xor %r11, %r11 mov %rsi, %r8 mov %rdi, %r9 - rep;bsf %r10, %rax mov %rdi, %rsi xor %rdi, %rdi sub %rdx, %rsi jmp .Lbck -.Lend: mov %rcx, %rax +.Lend: + -.Lret: ret .size __gmpn_gcd_22,.-__gmpn_gcd_22 diff --git a/ext/gmp/gen/x86_64-linux/mpn/hamdist.s b/ext/gmp/gen/x86_64-linux/mpn/hamdist.s index 1c5d6e4192..1ab3a8cca6 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/hamdist.s +++ b/ext/gmp/gen/x86_64-linux/mpn/hamdist.s @@ -60,21 +60,16 @@ - - - - - - - - - - - - - - - + + + + + + + + + + @@ -91,119 +86,82 @@ 
__gmpn_hamdist: - + push %rbx + mov $0x5555555555555555, %r10 push %rbp - - mov (%rdi), %r10 - xor (%rsi), %r10 - - mov %edx, %r8d - and $3, %r8d - - xor %ecx, %ecx - .byte 0xf3,0x49,0x0f,0xb8,0xc2 - - lea .Ltab(%rip), %r9 - - movslq (%r9,%r8,4), %r8 - add %r9, %r8 - jmp *%r8 - - -.L3: mov 8(%rdi), %r10 - mov 16(%rdi), %r11 - xor 8(%rsi), %r10 - xor 16(%rsi), %r11 - xor %ebp, %ebp - sub $4, %rdx - jle .Lx3 - mov 24(%rdi), %r8 - mov 32(%rdi), %r9 - add $24, %rdi - add $24, %rsi - jmp .Le3 - -.L0: mov 8(%rdi), %r9 - xor 8(%rsi), %r9 - mov 16(%rdi), %r10 - mov 24(%rdi), %r11 - xor %ebx, %ebx - xor 16(%rsi), %r10 - xor 24(%rsi), %r11 - add $32, %rdi - add $32, %rsi - sub $4, %rdx - jle .Lx4 + mov $0x3333333333333333, %r11 + push %r12 + lea (%rdi,%rdx,8), %rdi + mov $0x0f0f0f0f0f0f0f0f, %rcx + lea (%rsi,%rdx,8), %rsi + neg %rdx + mov $0x0101010101010101, %r12 + xor %eax, %eax + test $1, %dl + jz .Ltop + + mov (%rdi,%rdx,8), %r8 + xor (%rsi,%rdx,8), %r8 + + mov %r8, %r9 + shr %r8 + and %r10, %r8 + sub %r8, %r9 + + mov %r9, %r8 + shr $2, %r9 + and %r11, %r8 + and %r11, %r9 + add %r8, %r9 + + dec %rdx + jmp .Lmid .align 16, 0x90 -.Ltop: -.Le0: .byte 0xf3,0x49,0x0f,0xb8,0xe9 - mov (%rdi), %r8 - mov 8(%rdi), %r9 - add %rbx, %rax -.Le3: .byte 0xf3,0x49,0x0f,0xb8,0xda - xor (%rsi), %r8 - xor 8(%rsi), %r9 - add %rbp, %rcx -.Le2: .byte 0xf3,0x49,0x0f,0xb8,0xeb - mov 16(%rdi), %r10 - mov 24(%rdi), %r11 - add $32, %rdi - add %rbx, %rax -.Le1: .byte 0xf3,0x49,0x0f,0xb8,0xd8 - xor 16(%rsi), %r10 - xor 24(%rsi), %r11 - add $32, %rsi - add %rbp, %rcx - sub $4, %rdx - jg .Ltop - -.Lx4: .byte 0xf3,0x49,0x0f,0xb8,0xe9 - add %rbx, %rax -.Lx3: .byte 0xf3,0x49,0x0f,0xb8,0xda - add %rbp, %rcx - .byte 0xf3,0x49,0x0f,0xb8,0xeb - add %rbx, %rax - add %rbp, %rcx -.Lx2: add %rcx, %rax -.Lx1: pop %rbp +.Ltop: mov (%rdi,%rdx,8), %r8 + mov 8(%rdi,%rdx,8), %rbx + xor (%rsi,%rdx,8), %r8 + xor 8(%rsi,%rdx,8), %rbx + + mov %r8, %r9 + mov %rbx, %rbp + shr %r8 + shr %rbx + and %r10, %r8 + and %r10, %rbx + sub %r8, %r9 + sub %rbx, %rbp + + mov %r9, %r8 + mov %rbp, %rbx + shr $2, %r9 + shr $2, %rbp + and %r11, %r8 + and %r11, %r9 + and %r11, %rbx + and %r11, %rbp + add %r8, %r9 + add %rbx, %rbp + + add %rbp, %r9 +.Lmid: mov %r9, %r8 + shr $4, %r9 + and %rcx, %r8 + and %rcx, %r9 + add %r8, %r9 + + imul %r12, %r9 + shr $56, %r9 + + add %r9, %rax + add $2, %rdx + jnc .Ltop + +.Lend: + pop %r12 + pop %rbp pop %rbx ret - -.L2: mov 8(%rdi), %r11 - xor 8(%rsi), %r11 - sub $2, %rdx - jle .Ln2 - mov 16(%rdi), %r8 - mov 24(%rdi), %r9 - xor %ebx, %ebx - xor 16(%rsi), %r8 - xor 24(%rsi), %r9 - add $16, %rdi - add $16, %rsi - jmp .Le2 -.Ln2: .byte 0xf3,0x49,0x0f,0xb8,0xcb - jmp .Lx2 - -.L1: dec %rdx - jle .Lx1 - mov 8(%rdi), %r8 - mov 16(%rdi), %r9 - xor 8(%rsi), %r8 - xor 16(%rsi), %r9 - xor %ebp, %ebp - mov 24(%rdi), %r10 - mov 32(%rdi), %r11 - add $40, %rdi - add $8, %rsi - jmp .Le1 - .size __gmpn_hamdist,.-__gmpn_hamdist - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab diff --git a/ext/gmp/gen/x86_64-linux/mpn/ior_n.s b/ext/gmp/gen/x86_64-linux/mpn/ior_n.s index fc23fd7190..6509f28b3b 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/ior_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/ior_n.s @@ -90,7 +90,6 @@ - .text @@ -103,50 +102,42 @@ __gmpn_ior_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: or (%rsi), %r8 - mov %r8, (%rdi) - inc 
%rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: or (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx +.Lb01: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - or (%rsi), %r8 - or 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - or 16(%rsi), %r8 - or 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + or (%rsi,%rcx,8), %r8 + or 8(%rsi,%rcx,8), %r9 + nop + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + or 16(%rsi,%rcx,8), %r8 + or 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s b/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s index e13105d814..b199ca33ff 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s @@ -92,7 +92,6 @@ - .text @@ -106,54 +105,46 @@ __gmpn_iorn_n: mov (%rdx), %r8 not %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: or (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: or (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 +.Ltop: mov (%rdx,%rcx,8), %r8 not %r8 -.Lb00: mov 8(%rdx), %r9 +.Lb00: mov 8(%rdx,%rcx,8), %r9 not %r9 - or (%rsi), %r8 - or 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 + or (%rsi,%rcx,8), %r8 + or 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 not %r8 -.Le10: mov 24(%rdx), %r9 +.Le10: mov 24(%rdx,%rcx,8), %r9 not %r9 - lea 32(%rdx), %rdx - or 16(%rsi), %r8 - or 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + or 16(%rsi,%rcx,8), %r8 + or 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/lshift.s b/ext/gmp/gen/x86_64-linux/mpn/lshift.s index ebd4035c21..89e9566e3c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/lshift.s +++ b/ext/gmp/gen/x86_64-linux/mpn/lshift.s @@ -41,32 +41,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -89,123 +63,124 @@ .text - .align 64, 0x90 + .align 32, 0x90 .globl __gmpn_lshift .type __gmpn_lshift,@function __gmpn_lshift: - movd %ecx, %xmm4 - mov $64, %eax - sub %ecx, %eax - movd %eax, %xmm5 - - neg %ecx + neg %ecx mov -8(%rsi,%rdx,8), %rax - shr %cl, %rax - - cmp $3, %rdx 
- jle .Lbc + shr %cl, %rax - lea (%rdi,%rdx,8), %ecx - test $8, %cl - jz .Lrp_aligned - - - movq -8(%rsi,%rdx,8), %xmm0 - movq -16(%rsi,%rdx,8), %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, -8(%rdi,%rdx,8) - dec %rdx - -.Lrp_aligned: + neg %ecx lea 1(%rdx), %r8d - - and $6, %r8d - jz .Lba0 - cmp $4, %r8d - jz .Lba4 - jc .Lba2 -.Lba6: add $-4, %rdx - jmp .Li56 -.Lba0: add $-6, %rdx - jmp .Li70 -.Lba4: add $-2, %rdx - jmp .Li34 -.Lba2: add $-8, %rdx - jle .Lend - + and $3, %r8d + je .Lrlx + + dec %r8d + jne .L1 + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + jmp .Lrll + +.L1: dec %r8d + je .L1x + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + neg %ecx +.L1x: + cmp $1, %rdx + je .Last + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + mov -24(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + mov %r10, -8(%rdi,%rdx,8) + mov %r11, -16(%rdi,%rdx,8) + sub $2, %rdx + +.Lrll: neg %ecx +.Lrlx: mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + + sub $4, %rdx + jb .Lend .align 16, 0x90 -.Ltop: movdqu 40(%rsi,%rdx,8), %xmm1 - movdqu 48(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, 48(%rdi,%rdx,8) -.Li70: - movdqu 24(%rsi,%rdx,8), %xmm1 - movdqu 32(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, 32(%rdi,%rdx,8) -.Li56: - movdqu 8(%rsi,%rdx,8), %xmm1 - movdqu 16(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, 16(%rdi,%rdx,8) -.Li34: - movdqu -8(%rsi,%rdx,8), %xmm1 - movdqu (%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, (%rdi,%rdx,8) - sub $8, %rdx - jg .Ltop - -.Lend: test $1, %dl - jnz .Lend8 - - movdqu (%rsi), %xmm1 - pxor %xmm0, %xmm0 - punpcklqdq %xmm1, %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movdqa %xmm0, (%rdi) +.Ltop: - ret - + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + mov 8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + mov %r10, 24(%rdi,%rdx,8) + mov %r11, 16(%rdi,%rdx,8) + + mov 0(%rsi,%rdx,8), %r8 + mov -8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + shr %cl, %r9 - .align 16, 0x90 -.Lbc: dec %edx - jz .Lend8 - - movq (%rsi,%rdx,8), %xmm1 - movq -8(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, (%rdi,%rdx,8) - sub $2, %edx - jl .Lend8 - movq 8(%rsi), %xmm1 - movq (%rsi), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, 8(%rdi) - -.Lend8:movq (%rsi), %xmm0 - psllq %xmm4, %xmm0 - movq %xmm0, (%rdi) + + neg %ecx + mov 8(%rsi,%rdx,8), %r10 + mov 0(%rsi,%rdx,8), %r11 + shl %cl, %r10 + or %r10, %r8 + shl %cl, %r11 + or %r11, %r9 + mov %r8, 8(%rdi,%rdx,8) + mov %r9, 0(%rdi,%rdx,8) + + mov -8(%rsi,%rdx,8), %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r10 + shl %cl, %r11 + + sub $4, %rdx + jae .Ltop +.Lend: + neg %ecx + mov 8(%rsi), %r8 + shr %cl, %r8 + or %r8, %r10 + mov (%rsi), %r9 + shr %cl, %r9 + or %r9, %r11 + mov %r10, 16(%rdi) + mov %r11, 8(%rdi) + + neg %ecx +.Last: mov (%rsi), %r10 + shl %cl, %r10 + mov %r10, (%rdi) ret .size __gmpn_lshift,.-__gmpn_lshift - diff --git 
a/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s b/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s index 1ed069b688..680994041a 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s +++ b/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s @@ -41,32 +41,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -89,134 +63,135 @@ .text - .align 64, 0x90 + .align 32, 0x90 .globl __gmpn_lshiftc .type __gmpn_lshiftc,@function __gmpn_lshiftc: - movd %ecx, %xmm4 - mov $64, %eax - sub %ecx, %eax - movd %eax, %xmm5 - - neg %ecx + neg %ecx mov -8(%rsi,%rdx,8), %rax - shr %cl, %rax - - pcmpeqb %xmm3, %xmm3 - - cmp $3, %rdx - jle .Lbc - - lea (%rdi,%rdx,8), %ecx - test $8, %cl - jz .Lrp_aligned - - - movq -8(%rsi,%rdx,8), %xmm0 - movq -16(%rsi,%rdx,8), %xmm1 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, -8(%rdi,%rdx,8) - dec %rdx + shr %cl, %rax -.Lrp_aligned: + neg %ecx lea 1(%rdx), %r8d - - and $6, %r8d - jz .Lba0 - cmp $4, %r8d - jz .Lba4 - jc .Lba2 -.Lba6: add $-4, %rdx - jmp .Li56 -.Lba0: add $-6, %rdx - jmp .Li70 -.Lba4: add $-2, %rdx - jmp .Li34 -.Lba2: add $-8, %rdx - jle .Lend - + and $3, %r8d + je .Lrlx + + dec %r8d + jne .L1 + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + not %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + jmp .Lrll + +.L1: dec %r8d + je .L1x + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + not %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + neg %ecx +.L1x: + cmp $1, %rdx + je .Last + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + mov -24(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, -8(%rdi,%rdx,8) + mov %r11, -16(%rdi,%rdx,8) + sub $2, %rdx + +.Lrll: neg %ecx +.Lrlx: mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + + sub $4, %rdx + jb .Lend .align 16, 0x90 -.Ltop: movdqu 40(%rsi,%rdx,8), %xmm1 - movdqu 48(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, 48(%rdi,%rdx,8) -.Li70: - movdqu 24(%rsi,%rdx,8), %xmm1 - movdqu 32(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, 32(%rdi,%rdx,8) -.Li56: - movdqu 8(%rsi,%rdx,8), %xmm1 - movdqu 16(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, 16(%rdi,%rdx,8) -.Li34: - movdqu -8(%rsi,%rdx,8), %xmm1 - movdqu (%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm0 - psrlq %xmm5, %xmm1 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, (%rdi,%rdx,8) - sub $8, %rdx - jg .Ltop - -.Lend: test $1, %dl - jnz .Lend8 - - movdqu (%rsi), %xmm1 - pxor %xmm0, %xmm0 - punpcklqdq %xmm1, %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movdqa %xmm0, (%rdi) +.Ltop: - ret - + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + mov 8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, 24(%rdi,%rdx,8) + mov %r11, 16(%rdi,%rdx,8) + + mov 0(%rsi,%rdx,8), %r8 + mov -8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + shr %cl, %r9 - .align 16, 0x90 -.Lbc: dec %edx - jz .Lend8 - - movq (%rsi,%rdx,8), %xmm1 - movq -8(%rsi,%rdx,8), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, (%rdi,%rdx,8) - sub $2, %edx - jl 
.Lend8 - movq 8(%rsi), %xmm1 - movq (%rsi), %xmm0 - psllq %xmm4, %xmm1 - psrlq %xmm5, %xmm0 - por %xmm1, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, 8(%rdi) - -.Lend8:movq (%rsi), %xmm0 - psllq %xmm4, %xmm0 - pxor %xmm3, %xmm0 - movq %xmm0, (%rdi) + + neg %ecx + mov 8(%rsi,%rdx,8), %r10 + mov 0(%rsi,%rdx,8), %r11 + shl %cl, %r10 + or %r10, %r8 + shl %cl, %r11 + or %r11, %r9 + not %r8 + not %r9 + mov %r8, 8(%rdi,%rdx,8) + mov %r9, 0(%rdi,%rdx,8) + + mov -8(%rsi,%rdx,8), %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r10 + shl %cl, %r11 + + sub $4, %rdx + jae .Ltop +.Lend: + neg %ecx + mov 8(%rsi), %r8 + shr %cl, %r8 + or %r8, %r10 + mov (%rsi), %r9 + shr %cl, %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, 16(%rdi) + mov %r11, 8(%rdi) + + neg %ecx +.Last: mov (%rsi), %r10 + shl %cl, %r10 + not %r10 + mov %r10, (%rdi) ret .size __gmpn_lshiftc,.-__gmpn_lshiftc - diff --git a/ext/gmp/gen/x86_64-linux/mpn/mul_1.s b/ext/gmp/gen/x86_64-linux/mpn/mul_1.s index e8de366075..1644074e4d 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/mul_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/mul_1.s @@ -66,8 +66,6 @@ - - @@ -81,120 +79,127 @@ + .text - .align 32, 0x90 + .align 16, 0x90 + .globl __gmpn_mul_1c + .type __gmpn_mul_1c,@function + +__gmpn_mul_1c: + + + + + push %rbx + mov %r8, %r10 + + jmp .Lcommon + .size __gmpn_mul_1c,.-__gmpn_mul_1c + .globl __gmpn_mul_1 .type __gmpn_mul_1,@function __gmpn_mul_1: - mov %rcx, %r10 - mov %rdx, %rcx - mov %edx, %r8d - shr $3, %rcx - and $7, %r8d - mov %r10, %rdx - lea .Ltab(%rip), %r10 - movslq (%r10,%r8,4), %r8 - lea (%r8, %r10), %r10 - jmp *%r10 - - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .Lf0-.Ltab - .long .Lf1-.Ltab - .long .Lf2-.Ltab - .long .Lf3-.Ltab - .long .Lf4-.Ltab - .long .Lf5-.Ltab - .long .Lf6-.Ltab - .long .Lf7-.Ltab - .text -.Lf0: .byte 0xc4,98,171,0xf6,6 - lea 56(%rsi), %rsi - lea -8(%rdi), %rdi - jmp .Lb0 - -.Lf3: .byte 0xc4,226,179,0xf6,6 - lea 16(%rsi), %rsi - lea 16(%rdi), %rdi - inc %rcx - jmp .Lb3 - -.Lf4: .byte 0xc4,98,171,0xf6,6 - lea 24(%rsi), %rsi - lea 24(%rdi), %rdi - inc %rcx - jmp .Lb4 - -.Lf5: .byte 0xc4,226,179,0xf6,6 - lea 32(%rsi), %rsi - lea 32(%rdi), %rdi - inc %rcx - jmp .Lb5 - -.Lf6: .byte 0xc4,98,171,0xf6,6 - lea 40(%rsi), %rsi - lea 40(%rdi), %rdi - inc %rcx - jmp .Lb6 - -.Lf7: .byte 0xc4,226,179,0xf6,6 - lea 48(%rsi), %rsi - lea 48(%rdi), %rdi - inc %rcx - jmp .Lb7 - -.Lf1: .byte 0xc4,226,179,0xf6,6 - test %rcx, %rcx - jnz .Lb1 -.L1: mov %r9, (%rdi) - ret -.Lf2: .byte 0xc4,98,171,0xf6,6 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - .byte 0xc4,226,179,0xf6,6 - test %rcx, %rcx - jz .Lend - - .align 32, 0x90 -.Ltop: mov %r10, -8(%rdi) - adc %r8, %r9 -.Lb1: .byte 0xc4,98,171,0xf6,70,8 - adc %rax, %r10 - lea 64(%rsi), %rsi - mov %r9, (%rdi) -.Lb0: mov %r10, 8(%rdi) - .byte 0xc4,226,179,0xf6,70,208 - lea 64(%rdi), %rdi - adc %r8, %r9 -.Lb7: .byte 0xc4,98,171,0xf6,70,216 - mov %r9, -48(%rdi) - adc %rax, %r10 -.Lb6: mov %r10, -40(%rdi) - .byte 0xc4,226,179,0xf6,70,224 - adc %r8, %r9 -.Lb5: .byte 0xc4,98,171,0xf6,70,232 - mov %r9, -32(%rdi) - adc %rax, %r10 -.Lb4: .byte 0xc4,226,179,0xf6,70,240 - mov %r10, -24(%rdi) - adc %r8, %r9 -.Lb3: .byte 0xc4,98,171,0xf6,70,248 - adc %rax, %r10 - mov %r9, -16(%rdi) - dec %rcx - .byte 0xc4,226,179,0xf6,6 - jnz .Ltop - -.Lend: mov %r10, -8(%rdi) - adc %r8, %r9 - mov %r9, (%rdi) - adc %rcx, %rax + + push %rbx + xor %r10, %r10 +.Lcommon: + mov (%rsi), %rax + mov %rdx, %rbx + + mul %rcx + mov %rbx, %r11 + + add %r10, %rax + adc $0, %rdx + + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jz 
.Lb2 + jg .Lb3 + +.Lb1: dec %r11 + jne .Lgt1 + mov %rax, (%rdi) + jmp .Lret +.Lgt1: lea 8(%rsi,%r11,8), %rsi + lea -8(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (%rsi,%r11,8), %rax + mov %rdx, %r8 + jmp .LL1 + +.Lb0: lea (%rsi,%r11,8), %rsi + lea -16(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp .LL0 + +.Lb3: lea -8(%rsi,%r11,8), %rsi + lea -24(%rdi,%r11,8), %rdi + neg %r11 + mov %rax, %rbx + mov %rdx, %r10 + jmp .LL3 + +.Lb2: lea -16(%rsi,%r11,8), %rsi + lea -32(%rdi,%r11,8), %rdi + neg %r11 + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %rdx, %r9 + jmp .LL2 + + .align 16, 0x90 +.Ltop: mov %r10, (%rdi,%r11,8) + add %rax, %r9 + mov (%rsi,%r11,8), %rax + adc %rdx, %r8 + mov $0, %r10d +.LL1: mul %rcx + mov %r9, 8(%rdi,%r11,8) + add %rax, %r8 + adc %rdx, %rbx +.LL0: mov 8(%rsi,%r11,8), %rax + mul %rcx + mov %r8, 16(%rdi,%r11,8) + add %rax, %rbx + adc %rdx, %r10 +.LL3: mov 16(%rsi,%r11,8), %rax + mul %rcx + mov %rbx, 24(%rdi,%r11,8) + mov $0, %r8d + mov %r8, %rbx + add %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %r8, %r9 + adc %rdx, %r9 +.LL2: mul %rcx + add $4, %r11 + js .Ltop + + mov %r10, (%rdi,%r11,8) + add %rax, %r9 + adc %r8, %rdx + mov %r9, 8(%rdi,%r11,8) + add %r8, %rdx +.Lret: mov %rdx, %rax + + pop %rbx + + ret .size __gmpn_mul_1,.-__gmpn_mul_1 - diff --git a/ext/gmp/gen/x86_64-linux/mpn/mul_2.s b/ext/gmp/gen/x86_64-linux/mpn/mul_2.s index 395391597e..0c3310dfad 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/mul_2.s +++ b/ext/gmp/gen/x86_64-linux/mpn/mul_2.s @@ -81,13 +81,17 @@ + + + + .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_mul_2 .type __gmpn_mul_2,@function @@ -100,88 +104,112 @@ __gmpn_mul_2: mov (%rcx), %r8 mov 8(%rcx), %r9 - lea 3(%rdx), %r11 - shr $2, %r11 - - test $1, %dl - jnz .Lbx1 + mov (%rsi), %rax + + mov %rdx, %r11 + neg %r11 + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + + and $3, %edx + jz .Lm2p0 + cmp $2, %edx + jc .Lm2p1 + jz .Lm2p2 +.Lm2p3: + mul %r8 + xor %r10d, %r10d + mov %rax, %rcx + mov %rdx, %rbp + mov 8(%rsi,%r11,8), %rax + add $-1, %r11 + mul %r9 + add %rax, %rbp + jmp .Lm23 +.Lm2p0: + mul %r8 + xor %ebp, %ebp + mov %rax, %rbx + mov %rdx, %rcx + jmp .Lm20 +.Lm2p1: + mul %r8 + xor %r10d, %r10d + xor %ebx, %ebx + xor %ecx, %ecx + add $1, %r11 + jmp .Lm2top +.Lm2p2: + mul %r8 + xor %ebx, %ebx + xor %ecx, %ecx + mov %rax, %rbp + mov %rdx, %r10 + mov 8(%rsi,%r11,8), %rax + add $-2, %r11 + jmp .Lm22 -.Lbx0: xor %rbx, %rbx - test $2, %dl - mov (%rsi), %rdx - .byte 0xc4,194,211,0xf6,200 - jz .Llo0 -.Lb10: lea -16(%rdi), %rdi - lea -16(%rsi), %rsi - jmp .Llo2 - -.Lbx1: xor %rbp, %rbp - test $2, %dl - mov (%rsi), %rdx - .byte 0xc4,66,227,0xf6,208 - jnz .Lb11 - -.Lb01: lea -24(%rdi), %rdi - lea 8(%rsi), %rsi - jmp .Llo1 - -.Lb11: lea -8(%rdi), %rdi - lea -8(%rsi), %rsi - jmp .Llo3 - - .align 16, 0x90 -.Ltop: .byte 0xc4,194,251,0xf6,217 - add %rax, %rbp - mov (%rsi), %rdx - .byte 0xc4,194,251,0xf6,200 - adc $0, %rbx - add %rax, %rbp - adc $0, %rcx - add %r10, %rbp -.Llo0: mov %rbp, (%rdi) - adc $0, %rcx - .byte 0xc4,194,251,0xf6,233 - add %rax, %rbx - mov 8(%rsi), %rdx - adc $0, %rbp - .byte 0xc4,66,251,0xf6,208 - add %rax, %rbx - adc $0, %r10 - add %rcx, %rbx -.Llo3: mov %rbx, 8(%rdi) - adc $0, %r10 - .byte 0xc4,194,251,0xf6,217 - add %rax, %rbp - mov 16(%rsi), %rdx - .byte 0xc4,194,251,0xf6,200 - adc $0, %rbx - add %rax, %rbp - adc $0, %rcx - add %r10, %rbp -.Llo2: mov %rbp, 16(%rdi) - adc $0, %rcx - .byte 0xc4,194,251,0xf6,233 
- add %rax, %rbx - mov 24(%rsi), %rdx - adc $0, %rbp - .byte 0xc4,66,251,0xf6,208 - add %rax, %rbx - adc $0, %r10 - add %rcx, %rbx - lea 32(%rsi), %rsi -.Llo1: mov %rbx, 24(%rdi) - adc $0, %r10 - dec %r11 - lea 32(%rdi), %rdi - jnz .Ltop - -.Lend: .byte 0xc4,194,235,0xf6,193 - add %rdx, %rbp - adc $0, %rax - add %r10, %rbp - mov %rbp, (%rdi) - adc $0, %rax + .align 32, 0x90 +.Lm2top: + add %rax, %r10 + adc %rdx, %rbx + mov 0(%rsi,%r11,8), %rax + adc $0, %ecx + mov $0, %ebp + mul %r9 + add %rax, %rbx + mov %r10, 0(%rdi,%r11,8) + adc %rdx, %rcx + mov 8(%rsi,%r11,8), %rax + mul %r8 + add %rax, %rbx + adc %rdx, %rcx + adc $0, %ebp +.Lm20: mov 8(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rcx + adc %rdx, %rbp + mov 16(%rsi,%r11,8), %rax + mov $0, %r10d + mul %r8 + add %rax, %rcx + mov 16(%rsi,%r11,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r9 + add %rax, %rbp + mov %rbx, 8(%rdi,%r11,8) +.Lm23: adc %rdx, %r10 + mov 24(%rsi,%r11,8), %rax + mul %r8 + mov $0, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov %rcx, 16(%rdi,%r11,8) + mov 24(%rsi,%r11,8), %rax + mov $0, %ecx + adc $0, %ebx +.Lm22: mul %r9 + add %rax, %r10 + mov %rbp, 24(%rdi,%r11,8) + adc %rdx, %rbx + mov 32(%rsi,%r11,8), %rax + mul %r8 + add $4, %r11 + js .Lm2top + + + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + mov (%rsi), %rax + mul %r9 + mov %r10, (%rdi) + add %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%rdi) + mov %rcx, %rax pop %rbp pop %rbx diff --git a/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s b/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s index 498782526f..2cfb7aaa17 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s +++ b/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s @@ -84,295 +84,400 @@ + .text + .align 16, 0x90 + .globl __gmpn_mul_basecase + .type __gmpn_mul_basecase,@function + +__gmpn_mul_basecase: + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + xor %r13d, %r13d + mov (%rsi), %rax + mov (%rcx), %r12 + sub %rdx, %r13 + mov %r13, %r11 + mov %edx, %ebx + lea (%rdi,%rdx,8), %rdi + lea (%rsi,%rdx,8), %rsi + mul %r12 + test $1, %r8b + jz .Lmul_2 - .text - .align 16, 0x90 - .globl __gmpn_mul_basecase - .type __gmpn_mul_basecase,@function - -__gmpn_mul_basecase: - +.Lmul_1: + and $3, %ebx + jz .Lmul_1_prologue_0 + cmp $2, %ebx + jc .Lmul_1_prologue_1 + jz .Lmul_1_prologue_2 - cmp $2, %rdx - ja .Lgen - mov (%rcx), %rdx - .byte 0xc4,98,251,0xf6,14 - je .Ls2x +.Lmul_1_prologue_3: + add $-1, %r11 + lea .Laddmul_outer_3(%rip), %r14 + mov %rax, %r10 + mov %rdx, %rbx + jmp .Lmul_1_entry_3 -.Ls11: mov %rax, (%rdi) - mov %r9, 8(%rdi) - - ret +.Lmul_1_prologue_0: + mov %rax, %rbp + mov %rdx, %r10 + lea .Laddmul_outer_0(%rip), %r14 + jmp .Lmul_1_entry_0 + +.Lmul_1_prologue_1: + cmp $-1, %r13 + jne 2f + mov %rax, -8(%rdi) + mov %rdx, (%rdi) + jmp .Lret +2: add $1, %r11 + lea .Laddmul_outer_1(%rip), %r14 + mov %rax, %r15 + mov %rdx, %rbp + xor %r10d, %r10d + mov (%rsi,%r11,8), %rax + jmp .Lmul_1_entry_1 + +.Lmul_1_prologue_2: + add $-2, %r11 + lea .Laddmul_outer_2(%rip), %r14 + mov %rax, %rbx + mov %rdx, %r15 + mov 24(%rsi,%r11,8), %rax + xor %ebp, %ebp + xor %r10d, %r10d + jmp .Lmul_1_entry_2 -.Ls2x: cmp $2, %r8 - .byte 0xc4,98,187,0xf6,86,8 - je .Ls22 -.Ls21: add %r8, %r9 - adc $0, %r10 - mov %rax, (%rdi) - mov %r9, 8(%rdi) - mov %r10, 16(%rdi) - ret -.Ls22: add %r8, %r9 - adc $0, %r10 - mov 8(%rcx), %rdx - mov %rax, (%rdi) - .byte 0xc4,98,187,0xf6,30 - .byte 0xc4,226,251,0xf6,86,8 - add %r11, %rax - adc $0, %rdx - add %r8, %r9 - adc %rax, %r10 - adc $0, %rdx - mov %r9, 8(%rdi) - mov %r10, 16(%rdi) - mov 
%rdx, 24(%rdi) + .align 16, 0x90 +.Lmul_1_top: + mov %rbx, -16(%rdi,%r11,8) + add %rax, %r15 + mov (%rsi,%r11,8), %rax + adc %rdx, %rbp +.Lmul_1_entry_1: + xor %ebx, %ebx + mul %r12 + mov %r15, -8(%rdi,%r11,8) + add %rax, %rbp + adc %rdx, %r10 +.Lmul_1_entry_0: + mov 8(%rsi,%r11,8), %rax + mul %r12 + mov %rbp, (%rdi,%r11,8) + add %rax, %r10 + adc %rdx, %rbx +.Lmul_1_entry_3: + mov 16(%rsi,%r11,8), %rax + mul %r12 + mov %r10, 8(%rdi,%r11,8) + xor %ebp, %ebp + mov %rbp, %r10 + add %rax, %rbx + mov 24(%rsi,%r11,8), %rax + mov %rbp, %r15 + adc %rdx, %r15 +.Lmul_1_entry_2: + mul %r12 + add $4, %r11 + js .Lmul_1_top + + mov %rbx, -16(%rdi) + add %rax, %r15 + mov %r15, -8(%rdi) + adc %rdx, %rbp + mov %rbp, (%rdi) + + add $-1, %r8 + jz .Lret + + mov 8(%rcx), %r12 + mov 16(%rcx), %r9 + + lea 8(%rcx), %rcx + lea 8(%rdi), %rdi + + jmp *%r14 + + + + + .align 16, 0x90 +.Lmul_2: + mov 8(%rcx), %r9 + + and $3, %ebx + jz .Lmul_2_prologue_0 + cmp $2, %ebx + jz .Lmul_2_prologue_2 + jc .Lmul_2_prologue_1 + +.Lmul_2_prologue_3: + lea .Laddmul_outer_3(%rip), %r14 + add $2, %r11 + mov %rax, -16(%rdi,%r11,8) + mov %rdx, %rbp + xor %r10d, %r10d + xor %ebx, %ebx + mov -16(%rsi,%r11,8), %rax + jmp .Lmul_2_entry_3 + + .align 16, 0x90 +.Lmul_2_prologue_0: + add $3, %r11 + mov %rax, %rbx + mov %rdx, %r15 + xor %ebp, %ebp + mov -24(%rsi,%r11,8), %rax + lea .Laddmul_outer_0(%rip), %r14 + jmp .Lmul_2_entry_0 + + .align 16, 0x90 +.Lmul_2_prologue_1: + mov %rax, %r10 + mov %rdx, %rbx + xor %r15d, %r15d + lea .Laddmul_outer_1(%rip), %r14 + jmp .Lmul_2_entry_1 + + .align 16, 0x90 +.Lmul_2_prologue_2: + add $1, %r11 + lea .Laddmul_outer_2(%rip), %r14 + mov $0, %ebx + mov $0, %r15d + mov %rax, %rbp + mov -8(%rsi,%r11,8), %rax + mov %rdx, %r10 + jmp .Lmul_2_entry_2 + - ret .align 16, 0x90 -.Lgen: - push %rbx - push %rbp - push %r12 - push %r14 +.Lmul_2_top: + mov -32(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rbx + adc %rdx, %r15 + mov -24(%rsi,%r11,8), %rax + xor %ebp, %ebp + mul %r12 + add %rax, %rbx + mov -24(%rsi,%r11,8), %rax + adc %rdx, %r15 + adc $0, %ebp +.Lmul_2_entry_0: + mul %r9 + add %rax, %r15 + mov %rbx, -24(%rdi,%r11,8) + adc %rdx, %rbp + mov -16(%rsi,%r11,8), %rax + mul %r12 + mov $0, %r10d + add %rax, %r15 + adc %rdx, %rbp + mov -16(%rsi,%r11,8), %rax + adc $0, %r10d + mov $0, %ebx + mov %r15, -16(%rdi,%r11,8) +.Lmul_2_entry_3: + mul %r9 + add %rax, %rbp + mov -8(%rsi,%r11,8), %rax + adc %rdx, %r10 + mov $0, %r15d + mul %r12 + add %rax, %rbp + mov -8(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc %r15d, %ebx +.Lmul_2_entry_2: + mul %r9 + add %rax, %r10 + mov %rbp, -8(%rdi,%r11,8) + adc %rdx, %rbx + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %r15d +.Lmul_2_entry_1: + add $4, %r11 + mov %r10, -32(%rdi,%r11,8) + js .Lmul_2_top + + mov -32(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rbx + mov %rbx, (%rdi) + adc %rdx, %r15 + mov %r15, 8(%rdi) + + add $-2, %r8 + jz .Lret + + mov 16(%rcx), %r12 + mov 24(%rcx), %r9 + + lea 16(%rcx), %rcx + lea 16(%rdi), %rdi + + jmp *%r14 - mov %rcx, %r14 - lea 1(%rdx), %rbx + + + + + + + +.Laddmul_outer_0: + add $3, %r13 + lea 0(%rip), %r14 + + mov %r13, %r11 + mov -24(%rsi,%r13,8), %rax + mul %r12 + mov %rax, %rbx + mov -24(%rsi,%r13,8), %rax + mov %rdx, %r15 + xor %ebp, %ebp + jmp .Laddmul_entry_0 + +.Laddmul_outer_1: + mov %r13, %r11 + mov (%rsi,%r13,8), %rax + mul %r12 + mov %rax, %r10 + mov (%rsi,%r13,8), %rax + mov %rdx, %rbx + xor %r15d, %r15d + jmp .Laddmul_entry_1 + +.Laddmul_outer_2: + add $1, %r13 + lea 0(%rip), %r14 + + mov %r13, %r11 + mov 
-8(%rsi,%r13,8), %rax + mul %r12 + xor %ebx, %ebx + mov %rax, %rbp + xor %r15d, %r15d + mov %rdx, %r10 + mov -8(%rsi,%r13,8), %rax + jmp .Laddmul_entry_2 + +.Laddmul_outer_3: + add $2, %r13 + lea 0(%rip), %r14 + + mov %r13, %r11 + mov -16(%rsi,%r13,8), %rax + xor %r10d, %r10d + mul %r12 + mov %rax, %r15 + mov -16(%rsi,%r13,8), %rax mov %rdx, %rbp - mov %edx, %eax - and $-8, %rbx - shr $3, %rbp - neg %rbx - and $7, %eax - - mov %rbp, %rcx - mov (%r14), %rdx - lea 8(%r14), %r14 - - lea .Lmtab(%rip), %r10 - movslq (%r10,%rax,4), %r11 - lea (%r11, %r10), %r10 - jmp *%r10 - - -.Lmf0: .byte 0xc4,98,171,0xf6,30 - lea 56(%rsi), %rsi - lea -8(%rdi), %rdi - jmp .Lmb0 - -.Lmf3: .byte 0xc4,98,155,0xf6,14 - lea 16(%rsi), %rsi - lea 16(%rdi), %rdi - inc %rcx - jmp .Lmb3 - -.Lmf4: .byte 0xc4,98,171,0xf6,30 - lea 24(%rsi), %rsi - lea 24(%rdi), %rdi - inc %rcx - jmp .Lmb4 - -.Lmf5: .byte 0xc4,98,155,0xf6,14 - lea 32(%rsi), %rsi - lea 32(%rdi), %rdi - inc %rcx - jmp .Lmb5 - -.Lmf6: .byte 0xc4,98,171,0xf6,30 - lea 40(%rsi), %rsi - lea 40(%rdi), %rdi - inc %rcx - jmp .Lmb6 - -.Lmf7: .byte 0xc4,98,155,0xf6,14 - lea 48(%rsi), %rsi - lea 48(%rdi), %rdi - inc %rcx - jmp .Lmb7 - -.Lmf1: .byte 0xc4,98,155,0xf6,14 - jmp .Lmb1 - -.Lmf2: .byte 0xc4,98,171,0xf6,30 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - .byte 0xc4,98,155,0xf6,14 + jmp .Laddmul_entry_3 + + + + .align 16, 0x90 +.Laddmul_top: + add %r10, -32(%rdi,%r11,8) + adc %rax, %rbx + mov -24(%rsi,%r11,8), %rax + adc %rdx, %r15 + xor %ebp, %ebp + mul %r12 + add %rax, %rbx + mov -24(%rsi,%r11,8), %rax + adc %rdx, %r15 + adc %ebp, %ebp +.Laddmul_entry_0: + mul %r9 + xor %r10d, %r10d + add %rbx, -24(%rdi,%r11,8) + adc %rax, %r15 + mov -16(%rsi,%r11,8), %rax + adc %rdx, %rbp + mul %r12 + add %rax, %r15 + mov -16(%rsi,%r11,8), %rax + adc %rdx, %rbp + adc $0, %r10d +.Laddmul_entry_3: + mul %r9 + add %r15, -16(%rdi,%r11,8) + adc %rax, %rbp + mov -8(%rsi,%r11,8), %rax + adc %rdx, %r10 + mul %r12 + xor %ebx, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %r15d + mov -8(%rsi,%r11,8), %rax + adc %r15d, %ebx +.Laddmul_entry_2: + mul %r9 + add %rbp, -8(%rdi,%r11,8) + adc %rax, %r10 + adc %rdx, %rbx + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + mov (%rsi,%r11,8), %rax + adc %rdx, %rbx + adc $0, %r15d +.Laddmul_entry_1: + mul %r9 + add $4, %r11 + js .Laddmul_top + + add %r10, -8(%rdi) + adc %rax, %rbx + mov %rbx, (%rdi) + adc %rdx, %r15 + mov %r15, 8(%rdi) + + add $-2, %r8 + jz .Lret + + lea 16(%rdi), %rdi + lea 16(%rcx), %rcx + + mov (%rcx), %r12 + mov 8(%rcx), %r9 + + jmp *%r14 .align 16, 0x90 -.Lm1top: - mov %r10, -8(%rdi) - adc %r11, %r12 -.Lmb1: .byte 0xc4,98,171,0xf6,94,8 - adc %r9, %r10 - lea 64(%rsi), %rsi - mov %r12, (%rdi) -.Lmb0: mov %r10, 8(%rdi) - .byte 0xc4,98,155,0xf6,78,208 - lea 64(%rdi), %rdi - adc %r11, %r12 -.Lmb7: .byte 0xc4,98,171,0xf6,94,216 - mov %r12, -48(%rdi) - adc %r9, %r10 -.Lmb6: mov %r10, -40(%rdi) - .byte 0xc4,98,155,0xf6,78,224 - adc %r11, %r12 -.Lmb5: .byte 0xc4,98,171,0xf6,94,232 - mov %r12, -32(%rdi) - adc %r9, %r10 -.Lmb4: .byte 0xc4,98,155,0xf6,78,240 - mov %r10, -24(%rdi) - adc %r11, %r12 -.Lmb3: .byte 0xc4,98,171,0xf6,94,248 - adc %r9, %r10 - mov %r12, -16(%rdi) - dec %rcx - .byte 0xc4,98,155,0xf6,14 - jnz .Lm1top - -.Lm1end: - mov %r10, -8(%rdi) - adc %r11, %r12 - mov %r12, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - - dec %r8 - jz .Ldone - - lea .Latab(%rip), %r10 - movslq (%r10,%rax,4), %rax - lea (%rax, %r10), %rax - - -.Louter: - lea (%rsi,%rbx,8), %rsi - mov %rbp, %rcx - mov (%r14), %rdx - lea 8(%r14), %r14 - jmp *%rax - 
-.Lf0: .byte 0xc4,98,171,0xf6,94,8 - lea 8(%rdi,%rbx,8), %rdi - lea -1(%rcx), %rcx - jmp .Lb0 - -.Lf3: .byte 0xc4,98,155,0xf6,78,240 - lea -56(%rdi,%rbx,8), %rdi - jmp .Lb3 - -.Lf4: .byte 0xc4,98,171,0xf6,94,232 - lea -56(%rdi,%rbx,8), %rdi - jmp .Lb4 - -.Lf5: .byte 0xc4,98,155,0xf6,78,224 - lea -56(%rdi,%rbx,8), %rdi - jmp .Lb5 - -.Lf6: .byte 0xc4,98,171,0xf6,94,216 - lea -56(%rdi,%rbx,8), %rdi - jmp .Lb6 - -.Lf7: .byte 0xc4,98,155,0xf6,78,16 - lea 8(%rdi,%rbx,8), %rdi - jmp .Lb7 - -.Lf1: .byte 0xc4,98,155,0xf6,14 - lea 8(%rdi,%rbx,8), %rdi - jmp .Lb1 - -.Lam1end: - .byte 0xf3,76,0x0f,0x38,0xf6,39 - .byte 0xf3,76,0x0f,0x38,0xf6,201 - mov %r12, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - - dec %r8 - jnz .Louter -.Ldone: +.Lret: pop %r15 pop %r14 + pop %r13 pop %r12 pop %rbp pop %rbx ret -.Lf2: - .byte 0xc4,98,171,0xf6,94,248 - lea 8(%rdi,%rbx,8), %rdi - .byte 0xc4,98,155,0xf6,14 - - .align 16, 0x90 -.Lam1top: - .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,227 - mov %r10, -8(%rdi) - jrcxz .Lam1end -.Lb1: .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,39 - lea -1(%rcx), %rcx - mov %r12, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 -.Lb0: .byte 0xc4,98,155,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,227 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) -.Lb7: .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,103,16 - mov %r12, 16(%rdi) -.Lb6: .byte 0xc4,98,155,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,227 - mov %r10, 24(%rdi) -.Lb5: .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,103,32 - mov %r12, 32(%rdi) -.Lb4: .byte 0xc4,98,155,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,227 - mov %r10, 40(%rdi) -.Lb3: .byte 0xf3,76,0x0f,0x38,0xf6,103,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r12, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,155,0xf6,14 - jmp .Lam1top - - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Lmtab:.long .Lmf0-.Lmtab - .long .Lmf1-.Lmtab - .long .Lmf2-.Lmtab - .long .Lmf3-.Lmtab - .long .Lmf4-.Lmtab - .long .Lmf5-.Lmtab - .long .Lmf6-.Lmtab - .long .Lmf7-.Lmtab -.Latab:.long .Lf0-.Latab - .long .Lf1-.Latab - .long .Lf2-.Latab - .long .Lf3-.Latab - .long .Lf4-.Latab - .long .Lf5-.Latab - .long .Lf6-.Latab - .long .Lf7-.Latab - .text .size __gmpn_mul_basecase,.-__gmpn_mul_basecase diff --git a/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s b/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s index 81d8b64e47..d76272ca92 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s +++ b/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s @@ -55,6 +55,16 @@ + + + + + + + + + + @@ -67,340 +77,363 @@ .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_mullo_basecase .type __gmpn_mullo_basecase,@function __gmpn_mullo_basecase: - cmp $4, %ecx - jae .Lbig + cmp $4, %rcx + jge .Lgen + mov (%rsi), %rax + mov (%rdx), %r8 - mov %rdx, %r11 - mov (%rsi), %rdx + lea .Ltab(%rip), %r9 + movslq (%r9,%rcx,4), %r10 + add %r10, %r9 + jmp *%r9 + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .Ltab-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab + .long .L3-.Ltab + .text - cmp $2, %ecx - jae .Lgt1 -.Ln1: imul (%r11), %rdx - mov %rdx, (%rdi) +.L1: imul %r8, %rax + mov %rax, (%rdi) ret -.Lgt1: ja .Lgt2 -.Ln2: mov (%r11), %r9 - .byte 0xc4,194,251,0xf6,209 + +.L2: mov 8(%rdx), %r11 + imul %rax, %r11 + mul %r8 mov %rax, (%rdi) - 
mov 8(%rsi), %rax - imul %r9, %rax - add %rax, %rdx - mov 8(%r11), %r9 - mov (%rsi), %rcx - imul %r9, %rcx - add %rcx, %rdx - mov %rdx, 8(%rdi) + imul 8(%rsi), %r8 + lea (%r11, %rdx), %rax + add %r8, %rax + mov %rax, 8(%rdi) ret -.Lgt2: -.Ln3: mov (%r11), %r9 - .byte 0xc4,66,251,0xf6,209 - mov %rax, (%rdi) - mov 8(%rsi), %rdx - .byte 0xc4,194,251,0xf6,209 - imul 16(%rsi), %r9 - add %rax, %r10 - adc %rdx, %r9 - mov 8(%r11), %r8 - mov (%rsi), %rdx - .byte 0xc4,194,251,0xf6,208 - add %rax, %r10 + +.L3: mov 8(%rdx), %r9 + mov 16(%rdx), %r11 + mul %r8 + mov %rax, (%rdi) + mov (%rsi), %rax + mov %rdx, %rcx + mul %r9 + imul 8(%rsi), %r9 + mov 16(%rsi), %r10 + imul %r8, %r10 + add %rax, %rcx adc %rdx, %r9 - imul 8(%rsi), %r8 - add %r8, %r9 - mov %r10, 8(%rdi) - mov 16(%r11), %r10 - mov (%rsi), %rax - imul %rax, %r10 add %r10, %r9 + mov 8(%rsi), %rax + mul %r8 + add %rax, %rcx + adc %rdx, %r9 + mov %r11, %rax + imul (%rsi), %rax + add %rax, %r9 + mov %rcx, 8(%rdi) mov %r9, 16(%rdi) ret - .align 16, 0x90 -.Lbig: push %r14 - push %r12 - push %rbx +.L0m4: +.L1m4: +.L2m4: +.L3m4: +.Lgen: push %rbx push %rbp - mov -8(%rdx,%rcx,8), %r14 - imul (%rsi), %r14 - lea -3(%rcx), %ebp - lea 8(%rdx), %r11 - mov (%rdx), %rdx - - mov %ecx, %eax - shr $3, %ecx - and $7, %eax - lea .Lmtab(%rip), %r10 - movslq (%r10,%rax,4), %rax - lea (%rax, %r10), %r10 - jmp *%r10 - - -.Lmf0: .byte 0xc4,98,171,0xf6,6 - lea 56(%rsi), %rsi + push %r13 + push %r14 + push %r15 + + mov (%rsi), %rax + mov (%rdx), %r13 + mov %rdx, %r11 + + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + neg %rcx + + mul %r13 + + test $1, %cl + jz .Lmul_2 + +.Lmul_1: lea -8(%rdi), %rdi - lea .Lf7(%rip), %rbx - jmp .Lmb0 + lea -8(%rsi), %rsi + test $2, %cl + jnz .Lmul_1_prologue_3 + +.Lmul_1_prologue_2: + lea -1(%rcx), %r9 + lea .Laddmul_outer_1(%rip), %r8 + mov %rax, %rbx + mov %rdx, %r15 + xor %ebp, %ebp + xor %r10d, %r10d + mov 16(%rsi,%rcx,8), %rax + jmp .Lmul_1_entry_2 + +.Lmul_1_prologue_3: + lea 1(%rcx), %r9 + lea .Laddmul_outer_3(%rip), %r8 + mov %rax, %rbp + mov %rdx, %r10 + xor %ebx, %ebx + jmp .Lmul_1_entry_0 + + .align 16, 0x90 +.Lmul_1_top: + mov %rbx, -16(%rdi,%r9,8) + add %rax, %r15 + mov (%rsi,%r9,8), %rax + adc %rdx, %rbp + xor %ebx, %ebx + mul %r13 + mov %r15, -8(%rdi,%r9,8) + add %rax, %rbp + adc %rdx, %r10 +.Lmul_1_entry_0: + mov 8(%rsi,%r9,8), %rax + mul %r13 + mov %rbp, (%rdi,%r9,8) + add %rax, %r10 + adc %rdx, %rbx + mov 16(%rsi,%r9,8), %rax + mul %r13 + mov %r10, 8(%rdi,%r9,8) + xor %ebp, %ebp + mov %rbp, %r10 + add %rax, %rbx + mov 24(%rsi,%r9,8), %rax + mov %rbp, %r15 + adc %rdx, %r15 +.Lmul_1_entry_2: + mul %r13 + add $4, %r9 + js .Lmul_1_top + + mov %rbx, -16(%rdi) + add %rax, %r15 + mov %r15, -8(%rdi) + adc %rdx, %rbp + + imul (%rsi), %r13 + add %r13, %rbp + mov %rbp, (%rdi) + + add $1, %rcx + jz .Lret + + mov 8(%r11), %r13 + mov 16(%r11), %r14 -.Lmf3: .byte 0xc4,226,179,0xf6,6 lea 16(%rsi), %rsi - lea 16(%rdi), %rdi - jrcxz .Lmc - inc %ecx - lea .Lf2(%rip), %rbx - jmp .Lmb3 + lea 8(%r11), %r11 + lea 24(%rdi), %rdi + + jmp *%r8 -.Lmc: .byte 0xc4,98,171,0xf6,70,248 + +.Lmul_2: + mov 8(%r11), %r14 + test $2, %cl + jz .Lmul_2_prologue_3 + + .align 16, 0x90 +.Lmul_2_prologue_1: + lea 0(%rcx), %r9 + mov %rax, %r10 + mov %rdx, %rbx + xor %r15d, %r15d + mov (%rsi,%rcx,8), %rax + lea .Laddmul_outer_3(%rip), %r8 + jmp .Lmul_2_entry_1 + + .align 16, 0x90 +.Lmul_2_prologue_3: + lea 2(%rcx), %r9 + mov $0, %r10d + mov %rax, %r15 + mov (%rsi,%rcx,8), %rax + mov %rdx, %rbp + lea .Laddmul_outer_1(%rip), %r8 + jmp .Lmul_2_entry_3 + + .align 
16, 0x90 +.Lmul_2_top: + mov -32(%rsi,%r9,8), %rax + mul %r14 + add %rax, %rbx + adc %rdx, %r15 + mov -24(%rsi,%r9,8), %rax + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %r15 + adc $0, %ebp + mul %r14 + add %rax, %r15 + mov %rbx, -24(%rdi,%r9,8) + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + mul %r13 + mov $0, %r10d + add %rax, %r15 + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + adc $0, %r10d +.Lmul_2_entry_3: + mov $0, %ebx + mov %r15, -16(%rdi,%r9,8) + mul %r14 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mov $0, %r15d + mul %r13 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + adc %r15d, %ebx + mul %r14 add %rax, %r10 - mov %r9, -16(%rdi) - .byte 0xc4,226,179,0xf6,6 - mov %r10, -8(%rdi) - adc %r8, %r9 - mov %r9, (%rdi) - jmp .Lc2 - -.Lmf4: .byte 0xc4,98,171,0xf6,6 - lea 24(%rsi), %rsi - lea 24(%rdi), %rdi - inc %ecx - lea .Lf3(%rip), %rbx - jmp .Lmb4 - -.Lmf5: .byte 0xc4,226,179,0xf6,6 - lea 32(%rsi), %rsi - lea 32(%rdi), %rdi - inc %ecx - lea .Lf4(%rip), %rbx - jmp .Lmb5 - -.Lmf6: .byte 0xc4,98,171,0xf6,6 - lea 40(%rsi), %rsi - lea 40(%rdi), %rdi - inc %ecx - lea .Lf5(%rip), %rbx - jmp .Lmb6 - -.Lmf7: .byte 0xc4,226,179,0xf6,6 - lea 48(%rsi), %rsi - lea 48(%rdi), %rdi - lea .Lf6(%rip), %rbx - jmp .Lmb7 - -.Lmf1: .byte 0xc4,226,179,0xf6,6 - lea .Lf0(%rip), %rbx - jmp .Lmb1 - -.Lmf2: .byte 0xc4,98,171,0xf6,6 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - lea .Lf1(%rip), %rbx - .byte 0xc4,226,179,0xf6,6 - - - .align 32, 0x90 -.Lmtop:mov %r10, -8(%rdi) - adc %r8, %r9 -.Lmb1: .byte 0xc4,98,171,0xf6,70,8 - adc %rax, %r10 - lea 64(%rsi), %rsi - mov %r9, (%rdi) -.Lmb0: mov %r10, 8(%rdi) - .byte 0xc4,226,179,0xf6,70,208 - lea 64(%rdi), %rdi - adc %r8, %r9 -.Lmb7: .byte 0xc4,98,171,0xf6,70,216 - mov %r9, -48(%rdi) - adc %rax, %r10 -.Lmb6: mov %r10, -40(%rdi) - .byte 0xc4,226,179,0xf6,70,224 - adc %r8, %r9 -.Lmb5: .byte 0xc4,98,171,0xf6,70,232 - mov %r9, -32(%rdi) - adc %rax, %r10 -.Lmb4: .byte 0xc4,226,179,0xf6,70,240 - mov %r10, -24(%rdi) - adc %r8, %r9 -.Lmb3: .byte 0xc4,98,171,0xf6,70,248 + mov %rbp, -8(%rdi,%r9,8) + adc %rdx, %rbx + mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %r15d +.Lmul_2_entry_1: + add $4, %r9 + mov %r10, -32(%rdi,%r9,8) + js .Lmul_2_top + + imul -16(%rsi), %r14 + add %r14, %rbx + imul -8(%rsi), %r13 + add %r13, %rbx + mov %rbx, -8(%rdi) + + add $2, %rcx + jz .Lret + + mov 16(%r11), %r13 + mov 24(%r11), %r14 + + lea 16(%r11), %r11 + lea 16(%rdi), %rdi + + jmp *%r8 + + +.Laddmul_outer_1: + lea -2(%rcx), %r9 + mov -16(%rsi,%rcx,8), %rax + mul %r13 + mov %rax, %r10 + mov -16(%rsi,%rcx,8), %rax + mov %rdx, %rbx + xor %r15d, %r15d + lea .Laddmul_outer_3(%rip), %r8 + jmp .Laddmul_entry_1 + +.Laddmul_outer_3: + lea 0(%rcx), %r9 + mov -16(%rsi,%rcx,8), %rax + xor %r10d, %r10d + mul %r13 + mov %rax, %r15 + mov -16(%rsi,%rcx,8), %rax + mov %rdx, %rbp + lea .Laddmul_outer_1(%rip), %r8 + jmp .Laddmul_entry_3 + + .align 16, 0x90 +.Laddmul_top: + add %r10, -32(%rdi,%r9,8) + adc %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %r15 + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %r15 + adc %ebp, %ebp + mul %r14 + xor %r10d, %r10d + add %rbx, -24(%rdi,%r9,8) + adc %rax, %r15 + mov -16(%rsi,%r9,8), %rax + adc %rdx, %rbp + mul %r13 + add %rax, %r15 + mov -16(%rsi,%r9,8), %rax + adc %rdx, %rbp + adc $0, %r10d +.Laddmul_entry_3: + mul %r14 + add %r15, -16(%rdi,%r9,8) + adc %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mul %r13 + xor %ebx, %ebx 
+ add %rax, %rbp + adc %rdx, %r10 + mov $0, %r15d + mov -8(%rsi,%r9,8), %rax + adc %r15d, %ebx + mul %r14 + add %rbp, -8(%rdi,%r9,8) adc %rax, %r10 - mov %r9, -16(%rdi) - dec %ecx - .byte 0xc4,226,179,0xf6,6 - jnz .Lmtop - -.Lmend:mov %r10, -8(%rdi) - adc %r8, %r9 - mov %r9, (%rdi) - adc %rcx, %rax - - lea 8(,%rbp,8), %r12 - neg %r12 - shr $3, %ebp - jmp .Lent - -.Lf0: .byte 0xc4,98,171,0xf6,6 - lea -8(%rsi), %rsi - lea -8(%rdi), %rdi - lea .Lf7(%rip), %rbx - jmp .Lb0 - -.Lf1: .byte 0xc4,226,179,0xf6,6 - lea -1(%rbp), %ebp - lea .Lf0(%rip), %rbx - jmp .Lb1 - -.Lend: .byte 0xf3,76,0x0f,0x38,0xf6,15 - mov %r9, (%rdi) - .byte 0xf3,72,0x0f,0x38,0xf6,193 - adc %rcx, %rax - lea 8(%r12), %r12 -.Lent: .byte 0xc4,98,171,0xf6,70,8 - add %rax, %r14 - add %r10, %r14 - lea (%rsi,%r12), %rsi - lea 8(%rdi,%r12), %rdi - mov (%r11), %rdx - lea 8(%r11), %r11 - or %ebp, %ecx - jmp *%rbx + adc %rdx, %rbx + mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + mov (%rsi,%r9,8), %rax + adc %rdx, %rbx + adc $0, %r15d +.Laddmul_entry_1: + mul %r14 + add $4, %r9 + js .Laddmul_top + + add %r10, -32(%rdi) + adc %rax, %rbx + + imul -24(%rsi), %r13 + add %r13, %rbx + add %rbx, -24(%rdi) + + add $2, %rcx + jns .Lret + + lea 16(%r11), %r11 + + mov (%r11), %r13 + mov 8(%r11), %r14 -.Lf7: .byte 0xc4,226,179,0xf6,6 lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - lea .Lf6(%rip), %rbx - jmp .Lb7 - -.Lf2: .byte 0xc4,98,171,0xf6,6 - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - .byte 0xc4,226,179,0xf6,6 - lea .Lf1(%rip), %rbx - - - .align 32, 0x90 -.Ltop: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, -8(%rdi) - jrcxz .Lend -.Lb1: .byte 0xc4,98,171,0xf6,70,8 - .byte 0xf3,76,0x0f,0x38,0xf6,15 - lea -1(%rcx), %ecx - mov %r9, (%rdi) - .byte 0x66,76,0x0f,0x38,0xf6,208 -.Lb0: .byte 0xc4,226,179,0xf6,70,16 - .byte 0x66,77,0x0f,0x38,0xf6,200 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) -.Lb7: .byte 0xc4,98,171,0xf6,70,24 - lea 64(%rsi), %rsi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xf3,76,0x0f,0x38,0xf6,79,16 - mov %r9, 16(%rdi) -.Lb6: .byte 0xc4,226,179,0xf6,70,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, 24(%rdi) -.Lb5: .byte 0xc4,98,171,0xf6,70,232 - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xf3,76,0x0f,0x38,0xf6,79,32 - mov %r9, 32(%rdi) -.Lb4: .byte 0xc4,226,179,0xf6,70,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, 40(%rdi) -.Lb3: .byte 0xf3,76,0x0f,0x38,0xf6,79,48 - .byte 0xc4,98,171,0xf6,70,248 - mov %r9, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xc4,226,179,0xf6,6 - jmp .Ltop - -.Lf6: .byte 0xc4,98,171,0xf6,6 - lea 40(%rsi), %rsi - lea -24(%rdi), %rdi - lea .Lf5(%rip), %rbx - jmp .Lb6 - -.Lf5: .byte 0xc4,226,179,0xf6,6 - lea 32(%rsi), %rsi - lea -32(%rdi), %rdi - lea .Lf4(%rip), %rbx - jmp .Lb5 - -.Lf4: .byte 0xc4,98,171,0xf6,6 - lea 24(%rsi), %rsi - lea -40(%rdi), %rdi - lea .Lf3(%rip), %rbx - jmp .Lb4 - -.Lf3: .byte 0xc4,226,179,0xf6,6 - lea 16(%rsi), %rsi - lea -48(%rdi), %rdi - jrcxz .Lcor - lea .Lf2(%rip), %rbx - jmp .Lb3 - -.Lcor: .byte 0xf3,76,0x0f,0x38,0xf6,79,48 - .byte 0xc4,98,171,0xf6,70,248 - mov %r9, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,76,0x0f,0x38,0xf6,208 - .byte 0xc4,226,179,0xf6,6 - .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, -8(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,15 - mov %r9, (%rdi) - .byte 0xf3,72,0x0f,0x38,0xf6,193 -.Lc2: - .byte 0xc4,98,171,0xf6,70,8 - adc %rax, %r14 - add %r10, %r14 - mov (%r11), %rdx - 
test %ecx, %ecx - .byte 0xc4,98,171,0xf6,70,240 - .byte 0xc4,226,179,0xf6,70,248 - .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,200 - mov %r10, -8(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,15 - .byte 0xf3,72,0x0f,0x38,0xf6,193 - adc %rcx, %rax - .byte 0xc4,98,171,0xf6,6 - add %rax, %r14 - add %r10, %r14 - mov 8(%r11), %rdx - .byte 0xc4,226,243,0xf6,70,240 - add %r9, %rcx - mov %rcx, (%rdi) - adc $0, %rax - .byte 0xc4,98,171,0xf6,70,248 - add %rax, %r14 - add %r10, %r14 - mov %r14, 8(%rdi) + + jmp *%r8 + +.Lret: pop %r15 + pop %r14 + pop %r13 pop %rbp pop %rbx - pop %r12 - pop %r14 ret .size __gmpn_mullo_basecase,.-__gmpn_mullo_basecase - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Lmtab:.long .Lmf7-.Lmtab - .long .Lmf0-.Lmtab - .long .Lmf1-.Lmtab - .long .Lmf2-.Lmtab - .long .Lmf3-.Lmtab - .long .Lmf4-.Lmtab - .long .Lmf5-.Lmtab - .long .Lmf6-.Lmtab diff --git a/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s b/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s new file mode 100644 index 0000000000..b607e84aca --- /dev/null +++ b/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s @@ -0,0 +1,573 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_mulmid_basecase + .type __gmpn_mulmid_basecase,@function + +__gmpn_mulmid_basecase: + + + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rcx, %r15 + + + lea 1(%rdx), %r13 + sub %r8, %r13 + + lea (%rdi,%r13,8), %rdi + + cmp $4, %r13 + jc .Ldiagonal + + lea (%rsi,%rdx,8), %rsi + + test $1, %r8 + jz .Lmul_2 + + + + +.Lmul_1: + mov %r13d, %ebx + + neg %r13 + mov (%rsi,%r13,8), %rax + mov (%r15), %r12 + mul %r12 + + and $-4, %r13 + mov %r13, %r11 + + and $3, %ebx + jz .Lmul_1_prologue_0 + cmp $2, %ebx + jc .Lmul_1_prologue_1 + jz .Lmul_1_prologue_2 + +.Lmul_1_prologue_3: + mov %rax, %r10 + mov %rdx, %rbx + lea .Laddmul_prologue_3(%rip), %r14 + jmp .Lmul_1_entry_3 + + .align 16, 0x90 +.Lmul_1_prologue_0: + mov %rax, %rbp + mov %rdx, %r10 + lea .Laddmul_prologue_0(%rip), %r14 + jmp .Lmul_1_entry_0 + + .align 16, 0x90 +.Lmul_1_prologue_1: + add $4, %r11 + mov %rax, %rcx + mov %rdx, %rbp + mov $0, %r10d + mov (%rsi,%r11,8), %rax + lea .Laddmul_prologue_1(%rip), %r14 + jmp .Lmul_1_entry_1 + + .align 16, 0x90 +.Lmul_1_prologue_2: + mov %rax, %rbx + mov %rdx, %rcx + mov 24(%rsi,%r11,8), %rax + mov $0, %ebp + mov $0, %r10d + lea .Laddmul_prologue_2(%rip), %r14 + jmp .Lmul_1_entry_2 + + + + + .align 16, 0x90 +.Lmul_1_top: + mov %rbx, -16(%rdi,%r11,8) + add %rax, %rcx + mov (%rsi,%r11,8), %rax + adc %rdx, %rbp +.Lmul_1_entry_1: + mov $0, %ebx + mul %r12 + mov %rcx, -8(%rdi,%r11,8) + add %rax, %rbp + adc %rdx, %r10 +.Lmul_1_entry_0: + mov 8(%rsi,%r11,8), %rax + mul %r12 + mov %rbp, (%rdi,%r11,8) + add %rax, %r10 + adc %rdx, %rbx +.Lmul_1_entry_3: + mov 16(%rsi,%r11,8), %rax + mul %r12 + mov %r10, 8(%rdi,%r11,8) + mov $0, %ebp + mov %rbp, %r10 + add %rax, %rbx + mov 24(%rsi,%r11,8), %rax + mov %rbp, %rcx + adc %rdx, %rcx +.Lmul_1_entry_2: + mul %r12 + add $4, %r11 + js .Lmul_1_top + + mov %rbx, -16(%rdi) + add %rax, %rcx + mov %rcx, -8(%rdi) + mov %rbp, 8(%rdi) + adc %rdx, %rbp + mov %rbp, (%rdi) + + dec %r8 + jz .Lret + + lea -8(%rsi), %rsi + lea 8(%r15), %r15 + + mov %r13, %r11 + mov (%r15), %r12 + mov 8(%r15), %r9 + + jmp *%r14 + + + + + .align 16, 0x90 +.Lmul_2: + mov %r13d, %ebx + + neg %r13 + mov -8(%rsi,%r13,8), %rax + mov (%r15), %r12 + mov 8(%r15), %r9 + 
mul %r9 + + and $-4, %r13 + mov %r13, %r11 + + and $3, %ebx + jz .Lmul_2_prologue_0 + cmp $2, %ebx + jc .Lmul_2_prologue_1 + jz .Lmul_2_prologue_2 + +.Lmul_2_prologue_3: + mov %rax, %rcx + mov %rdx, %rbp + lea .Laddmul_prologue_3(%rip), %r14 + jmp .Lmul_2_entry_3 + + .align 16, 0x90 +.Lmul_2_prologue_0: + mov %rax, %rbx + mov %rdx, %rcx + lea .Laddmul_prologue_0(%rip), %r14 + jmp .Lmul_2_entry_0 + + .align 16, 0x90 +.Lmul_2_prologue_1: + mov %rax, %r10 + mov %rdx, %rbx + mov $0, %ecx + lea .Laddmul_prologue_1(%rip), %r14 + jmp .Lmul_2_entry_1 + + .align 16, 0x90 +.Lmul_2_prologue_2: + mov %rax, %rbp + mov %rdx, %r10 + mov $0, %ebx + mov 16(%rsi,%r11,8), %rax + lea .Laddmul_prologue_2(%rip), %r14 + jmp .Lmul_2_entry_2 + + + + + .align 16, 0x90 +.Lmul_2_top: + mov -8(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rbx + adc %rdx, %rcx +.Lmul_2_entry_0: + mov $0, %ebp + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %rbx + mov (%rsi,%r11,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r9 + add %rax, %rcx + mov %rbx, (%rdi,%r11,8) + adc %rdx, %rbp +.Lmul_2_entry_3: + mov 8(%rsi,%r11,8), %rax + mul %r12 + mov $0, %r10d + add %rax, %rcx + adc %rdx, %rbp + mov $0, %ebx + adc $0, %r10d + mov 8(%rsi,%r11,8), %rax + mov %rcx, 8(%rdi,%r11,8) + mul %r9 + add %rax, %rbp + mov 16(%rsi,%r11,8), %rax + adc %rdx, %r10 +.Lmul_2_entry_2: + mov $0, %ecx + mul %r12 + add %rax, %rbp + mov 16(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r9 + add %rax, %r10 + mov %rbp, 16(%rdi,%r11,8) + adc %rdx, %rbx +.Lmul_2_entry_1: + mov 24(%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + add $4, %r11 + mov %r10, -8(%rdi,%r11,8) + jnz .Lmul_2_top + + mov %rbx, (%rdi) + mov %rcx, 8(%rdi) + + sub $2, %r8 + jz .Lret + + lea 16(%r15), %r15 + lea -16(%rsi), %rsi + + mov %r13, %r11 + mov (%r15), %r12 + mov 8(%r15), %r9 + + jmp *%r14 + + + + + .align 16, 0x90 +.Laddmul_prologue_0: + mov -8(%rsi,%r11,8), %rax + mul %r9 + mov %rax, %rcx + mov %rdx, %rbp + mov $0, %r10d + jmp .Laddmul_entry_0 + + .align 16, 0x90 +.Laddmul_prologue_1: + mov 16(%rsi,%r11,8), %rax + mul %r9 + mov %rax, %rbx + mov %rdx, %rcx + mov $0, %ebp + mov 24(%rsi,%r11,8), %rax + jmp .Laddmul_entry_1 + + .align 16, 0x90 +.Laddmul_prologue_2: + mov 8(%rsi,%r11,8), %rax + mul %r9 + mov %rax, %r10 + mov %rdx, %rbx + mov $0, %ecx + jmp .Laddmul_entry_2 + + .align 16, 0x90 +.Laddmul_prologue_3: + mov (%rsi,%r11,8), %rax + mul %r9 + mov %rax, %rbp + mov %rdx, %r10 + mov $0, %ebx + mov $0, %ecx + jmp .Laddmul_entry_3 + + + + .align 16, 0x90 +.Laddmul_top: + mov $0, %r10d + add %rax, %rbx + mov -8(%rsi,%r11,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r9 + add %rbx, -8(%rdi,%r11,8) + adc %rax, %rcx + adc %rdx, %rbp +.Laddmul_entry_0: + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %rcx + mov (%rsi,%r11,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r9 + add %rcx, (%rdi,%r11,8) + mov $0, %ecx + adc %rax, %rbp + mov $0, %ebx + adc %rdx, %r10 +.Laddmul_entry_3: + mov 8(%rsi,%r11,8), %rax + mul %r12 + add %rax, %rbp + mov 8(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r9 + add %rbp, 8(%rdi,%r11,8) + adc %rax, %r10 + adc %rdx, %rbx +.Laddmul_entry_2: + mov 16(%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + mov 16(%rsi,%r11,8), %rax + adc %rdx, %rbx + adc $0, %ecx + mul %r9 + add %r10, 16(%rdi,%r11,8) + nop + adc %rax, %rbx + mov $0, %ebp + mov 24(%rsi,%r11,8), %rax + adc %rdx, %rcx +.Laddmul_entry_1: + mul %r12 + add $4, %r11 + jnz .Laddmul_top + + add %rax, %rbx + adc %rdx, %rcx + adc $0, %ebp + + add %rbx, -8(%rdi) + 
adc %rcx, (%rdi) + adc %rbp, 8(%rdi) + + sub $2, %r8 + jz .Lret + + lea 16(%r15), %r15 + lea -16(%rsi), %rsi + + mov %r13, %r11 + mov (%r15), %r12 + mov 8(%r15), %r9 + + jmp *%r14 + + + + + .align 16, 0x90 +.Ldiagonal: + xor %ebx, %ebx + xor %ecx, %ecx + xor %ebp, %ebp + + neg %r13 + + mov %r8d, %eax + and $3, %eax + jz .Ldiag_prologue_0 + cmp $2, %eax + jc .Ldiag_prologue_1 + jz .Ldiag_prologue_2 + +.Ldiag_prologue_3: + lea -8(%r15), %r15 + mov %r15, %r10 + add $1, %r8 + mov %r8, %r11 + lea .Ldiag_entry_3(%rip), %r14 + jmp .Ldiag_entry_3 + +.Ldiag_prologue_0: + mov %r15, %r10 + mov %r8, %r11 + lea 0(%rip), %r14 + mov -8(%rsi,%r11,8), %rax + jmp .Ldiag_entry_0 + +.Ldiag_prologue_1: + lea 8(%r15), %r15 + mov %r15, %r10 + add $3, %r8 + mov %r8, %r11 + lea 0(%rip), %r14 + mov -8(%r10), %rax + jmp .Ldiag_entry_1 + +.Ldiag_prologue_2: + lea -16(%r15), %r15 + mov %r15, %r10 + add $2, %r8 + mov %r8, %r11 + lea 0(%rip), %r14 + mov 16(%r10), %rax + jmp .Ldiag_entry_2 + + + + + .align 16, 0x90 +.Ldiag_top: + add %rax, %rbx + adc %rdx, %rcx + mov -8(%rsi,%r11,8), %rax + adc $0, %rbp +.Ldiag_entry_0: + mulq (%r10) + add %rax, %rbx + adc %rdx, %rcx + adc $0, %rbp +.Ldiag_entry_3: + mov -16(%rsi,%r11,8), %rax + mulq 8(%r10) + add %rax, %rbx + mov 16(%r10), %rax + adc %rdx, %rcx + adc $0, %rbp +.Ldiag_entry_2: + mulq -24(%rsi,%r11,8) + add %rax, %rbx + mov 24(%r10), %rax + adc %rdx, %rcx + lea 32(%r10), %r10 + adc $0, %rbp +.Ldiag_entry_1: + mulq -32(%rsi,%r11,8) + sub $4, %r11 + jnz .Ldiag_top + + add %rax, %rbx + adc %rdx, %rcx + adc $0, %rbp + + mov %rbx, (%rdi,%r13,8) + + inc %r13 + jz .Ldiag_end + + mov %r8, %r11 + mov %r15, %r10 + + lea 8(%rsi), %rsi + mov %rcx, %rbx + mov %rbp, %rcx + xor %ebp, %ebp + + jmp *%r14 + +.Ldiag_end: + mov %rcx, (%rdi) + mov %rbp, 8(%rdi) + +.Lret: pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + .size __gmpn_mulmid_basecase,.-__gmpn_mulmid_basecase diff --git a/ext/gmp/gen/x86_64-linux/mpn/nand_n.s b/ext/gmp/gen/x86_64-linux/mpn/nand_n.s index ad4e827623..04593b9b51 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/nand_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/nand_n.s @@ -94,7 +94,6 @@ - .text @@ -107,56 +106,48 @@ __gmpn_nand_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: and (%rsi), %r8 +.Lb11: and (%rsi,%rcx,8), %r8 not %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: and (%rsi), %r8 + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: and (%rsi,%rcx,8), %r8 not %r8 - mov %r8, (%rdi) - dec %rcx + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - and (%rsi), %r8 + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + and (%rsi,%rcx,8), %r8 not %r8 - and 8(%rsi), %r9 + and 8(%rsi,%rcx,8), %r9 not %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - and 16(%rsi), %r8 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + and 16(%rsi,%rcx,8), %r8 not %r8 - and 24(%rsi), %r9 - lea 32(%rsi), %rsi + and 24(%rsi,%rcx,8), %r9 not %r9 - mov 
%r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/nior_n.s b/ext/gmp/gen/x86_64-linux/mpn/nior_n.s index 68dffa7222..8ea0437f09 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/nior_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/nior_n.s @@ -94,7 +94,6 @@ - .text @@ -107,56 +106,48 @@ __gmpn_nior_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: or (%rsi), %r8 +.Lb11: or (%rsi,%rcx,8), %r8 not %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: or (%rsi), %r8 + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: or (%rsi,%rcx,8), %r8 not %r8 - mov %r8, (%rdi) - dec %rcx + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - or (%rsi), %r8 + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + or (%rsi,%rcx,8), %r8 not %r8 - or 8(%rsi), %r9 + or 8(%rsi,%rcx,8), %r9 not %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - or 16(%rsi), %r8 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + or 16(%rsi,%rcx,8), %r8 not %r8 - or 24(%rsi), %r9 - lea 32(%rsi), %rsi + or 24(%rsi,%rcx,8), %r9 not %r9 - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/popcount.s b/ext/gmp/gen/x86_64-linux/mpn/popcount.s index d118f5bda4..243219e87c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/popcount.s +++ b/ext/gmp/gen/x86_64-linux/mpn/popcount.s @@ -59,16 +59,15 @@ - - - - - - - - - - + + + + + + + + + @@ -86,110 +85,76 @@ __gmpn_popcount: - - - mov %esi, %r8d - and $7, %r8d - - .byte 0xf3,0x48,0x0f,0xb8,0x07 - xor %ecx, %ecx - - lea .Ltab(%rip), %r9 - - movslq (%r9,%r8,4), %r8 - add %r9, %r8 - jmp *%r8 - - -.L3: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 - add $24, %rdi - sub $8, %rsi - jg .Le34 - add %r10, %rax - add %r11, %rax -.Ls1: - ret - -.L1: sub $8, %rsi - jle .Ls1 - .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 - add $8, %rdi - jmp .Le12 - -.L7: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 - add $-8, %rdi - jmp .Le07 - -.L0: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 - jmp .Le07 - -.L4: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 - add $32, %rdi - sub $8, %rsi - jle .Lx4 + + push %rbx + mov $0x5555555555555555, %r10 + push %rbp + mov $0x3333333333333333, %r11 + lea (%rdi,%rsi,8), %rdi + mov $0x0f0f0f0f0f0f0f0f, %rcx + neg %rsi + mov $0x0101010101010101, %rdx + xor %eax, %eax + test $1, %sil + jz .Ltop + + mov (%rdi,%rsi,8), %r8 + + mov %r8, %r9 + shr %r8 + and %r10, %r8 + sub %r8, %r9 + + mov %r9, %r8 + shr $2, %r9 + and %r11, %r8 + and %r11, %r9 + add 
%r8, %r9 + + dec %rsi + jmp .Lmid .align 16, 0x90 -.Ltop: -.Le34: .byte 0xf3,0x4c,0x0f,0xb8,0x07 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x08 - add %r10, %rcx - add %r11, %rax -.Le12: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 - add %r8, %rcx - add %r9, %rax -.Le07: .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x20 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x28 - add %r10, %rcx - add %r11, %rax -.Le56: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x30 - .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x38 - add $64, %rdi - add %r8, %rcx - add %r9, %rax - sub $8, %rsi - jg .Ltop - -.Lx4: add %r10, %rcx - add %r11, %rax -.Lx2: add %rcx, %rax - +.Ltop: mov (%rdi,%rsi,8), %r8 + mov 8(%rdi,%rsi,8), %rbx + + mov %r8, %r9 + mov %rbx, %rbp + shr %r8 + shr %rbx + and %r10, %r8 + and %r10, %rbx + sub %r8, %r9 + sub %rbx, %rbp + + mov %r9, %r8 + mov %rbp, %rbx + shr $2, %r9 + shr $2, %rbp + and %r11, %r8 + and %r11, %r9 + and %r11, %rbx + and %r11, %rbp + add %r8, %r9 + add %rbx, %rbp + + add %rbp, %r9 +.Lmid: mov %r9, %r8 + shr $4, %r9 + and %rcx, %r8 + and %rcx, %r9 + add %r8, %r9 + + imul %rdx, %r9 + shr $56, %r9 + + add %r9, %rax + add $2, %rsi + jnc .Ltop + +.Lend: + pop %rbp + pop %rbx ret - -.L2: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 - sub $8, %rsi - jle .Lx2 - .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 - add $16, %rdi - jmp .Le12 - -.L5: .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 - add $-24, %rdi - jmp .Le56 - -.L6: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 - .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 - .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 - add $-16, %rdi - jmp .Le56 .size __gmpn_popcount,.-__gmpn_popcount - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab diff --git a/ext/gmp/gen/x86_64-linux/mpn/redc_1.s b/ext/gmp/gen/x86_64-linux/mpn/redc_1.s index a5912b7b6d..da7fd88758 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/redc_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/redc_1.s @@ -63,6 +63,11 @@ + + + + + @@ -77,14 +82,15 @@ - - + + + .text - .align 16, 0x90 + .align 32, 0x90 .globl __gmpn_redc_1 .type __gmpn_redc_1,@function @@ -92,356 +98,506 @@ __gmpn_redc_1: - push %rbx push %rbp + mov (%rsi), %rbp + push %rbx + imul %r8, %rbp push %r12 push %r13 push %r14 push %r15 - push %rdi - mov %rdx, %rdi - mov (%rsi), %rdx - - neg %rcx - push %r8 - imul %r8, %rdx - mov %rcx, %r15 - - test $1, %cl - jnz .Lbx1 - -.Lbx0: test $2, %cl - jz .Lo0b - - cmp $-2, %ecx - jnz .Lo2 - - - mov 8(%rsp), %rbx - lea 16(%rsp), %rsp - .byte 0xc4,98,179,0xf6,39 - .byte 0xc4,98,163,0xf6,87,8 - add %r12, %r11 - adc $0, %r10 - add (%rsi), %r9 - adc 8(%rsi), %r11 - adc $0, %r10 - mov %r11, %rdx - imul %r8, %rdx - .byte 0xc4,98,147,0xf6,39 - .byte 0xc4,98,139,0xf6,127,8 - xor %eax, %eax - add %r12, %r14 - adc $0, %r15 - add %r11, %r13 - adc 16(%rsi), %r14 - adc $0, %r15 - add %r14, %r10 - adc 24(%rsi), %r15 - mov %r10, (%rbx) - mov %r15, 8(%rbx) - setc %al - jmp .Lret -.Lo2: lea 2(%rcx), %r14 - .byte 0xc4,98,179,0xf6,7 - .byte 0xc4,98,163,0xf6,87,8 - sar $2, %r14 - add %r8, %r11 - jmp .Llo2 + mov %rcx, %r12 + neg %r12 + lea (%rdx,%rcx,8), %r13 + lea -16(%rsi,%rcx,8), %rsi + + mov %ecx, %eax + and $3, %eax + lea 4(%rax), %r9 + cmp $4, %ecx + cmovg %r9, %rax + lea .Ltab(%rip), %r9 + + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax + + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .L0-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab 
+ .long .L3-.Ltab + .long .L0m4-.Ltab + .long .L1m4-.Ltab + .long .L2m4-.Ltab + .long .L3m4-.Ltab + .text .align 16, 0x90 -.Ltp2: adc %rax, %r9 - lea 32(%rsi), %rsi - adc %r8, %r11 -.Llo2: .byte 0xc4,98,147,0xf6,103,16 - mov (%rsi), %r8 - .byte 0xc4,226,227,0xf6,71,24 - lea 32(%rdi), %rdi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov 8(%rsi), %r10 - mov 16(%rsi), %r12 - add %r9, %r8 - mov 24(%rsi), %rbp - mov %r8, (%rsi) - adc %r11, %r10 - .byte 0xc4,98,179,0xf6,7 - mov %r10, 8(%rsi) - adc %r13, %r12 - mov %r12, 16(%rsi) - adc %rbx, %rbp - .byte 0xc4,98,163,0xf6,87,8 - mov %rbp, 24(%rsi) - inc %r14 - jnz .Ltp2 - -.Led2: mov 56(%rsi,%rcx,8), %rdx - lea 16(%rdi,%rcx,8), %rdi - adc %rax, %r9 - adc %r8, %r11 - mov 32(%rsi), %r8 - adc $0, %r10 - imul (%rsp), %rdx - mov 40(%rsi), %rax - add %r9, %r8 - mov %r8, 32(%rsi) - adc %r11, %rax - mov %rax, 40(%rsi) - lea 56(%rsi,%rcx,8), %rsi - adc $0, %r10 - mov %r10, -8(%rsi) - inc %r15 - jnz .Lo2 - - jmp .Lcj - - -.Lbx1: test $2, %cl - jz .Lo3a - -.Lo1a: cmp $-1, %ecx - jnz .Lo1b - - - mov 8(%rsp), %rbx - lea 16(%rsp), %rsp - .byte 0xc4,98,163,0xf6,23 - add (%rsi), %r11 - adc 8(%rsi), %r10 - mov %r10, (%rbx) +.L1: mov (%rdx), %rax + mul %rbp + add 8(%rsi), %rax + adc 16(%rsi), %rdx + mov %rdx, (%rdi) mov $0, %eax - setc %al + adc %eax, %eax jmp .Lret -.Lo1b: lea 24(%rdi), %rdi -.Lo1: lea 1(%rcx), %r14 - .byte 0xc4,98,163,0xf6,87,232 - .byte 0xc4,98,147,0xf6,103,240 - .byte 0xc4,226,227,0xf6,71,248 - sar $2, %r14 - add %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov (%rsi), %r10 - mov 8(%rsi), %r12 - mov 16(%rsi), %rbp - add %r11, %r10 - jmp .Llo1 .align 16, 0x90 -.Ltp1: adc %rax, %r9 - lea 32(%rsi), %rsi - adc %r8, %r11 - .byte 0xc4,98,147,0xf6,103,16 - mov -8(%rsi), %r8 - .byte 0xc4,226,227,0xf6,71,24 - lea 32(%rdi), %rdi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov (%rsi), %r10 - mov 8(%rsi), %r12 - add %r9, %r8 - mov 16(%rsi), %rbp - mov %r8, -8(%rsi) - adc %r11, %r10 -.Llo1: .byte 0xc4,98,179,0xf6,7 +.L2: mov (%rdx), %rax + mul %rbp + xor %r14d, %r14d + mov %rax, %r10 + mov -8(%r13), %rax + mov %rdx, %r9 + mul %rbp + add (%rsi), %r10 + adc %rax, %r9 + adc %rdx, %r14 + add 8(%rsi), %r9 + adc $0, %r14 + mov %r9, %rbp + imul %r8, %rbp + mov -16(%r13), %rax + mul %rbp + xor %ebx, %ebx + mov %rax, %r10 + mov -8(%r13), %rax + mov %rdx, %r11 + mul %rbp + add %r9, %r10 + adc %rax, %r11 + adc %rdx, %rbx + add 16(%rsi), %r11 + adc $0, %rbx + xor %eax, %eax + add %r11, %r14 + adc 24(%rsi), %rbx + mov %r14, (%rdi) + mov %rbx, 8(%rdi) + adc %eax, %eax + jmp .Lret + + +.L3: mov (%rdx), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov -16(%r13), %rax + mul %rbp + xor %r9d, %r9d + xor %r14d, %r14d + add -8(%rsi), %rbx + adc %rax, %r10 + mov -8(%r13), %rax + adc %rdx, %r9 + mul %rbp + add (%rsi), %r10 mov %r10, (%rsi) - adc %r13, %r12 - mov %r12, 8(%rsi) - adc %rbx, %rbp - .byte 0xc4,98,163,0xf6,87,8 - mov %rbp, 16(%rsi) - inc %r14 - jnz .Ltp1 - -.Led1: mov 48(%rsi,%rcx,8), %rdx - lea 40(%rdi,%rcx,8), %rdi adc %rax, %r9 - adc %r8, %r11 - mov 24(%rsi), %r8 - adc $0, %r10 - imul (%rsp), %rdx - mov 32(%rsi), %rax - add %r9, %r8 - mov %r8, 24(%rsi) - adc %r11, %rax - mov %rax, 32(%rsi) - lea 48(%rsi,%rcx,8), %rsi - adc $0, %r10 - mov %r10, -8(%rsi) - inc %r15 - jnz .Lo1 - - jmp .Lcj - -.Lo3a: cmp $-3, %ecx - jnz .Lo3b - - -.Ln3: .byte 0xc4,226,227,0xf6,7 - .byte 0xc4,98,179,0xf6,119,8 + adc %rdx, %r14 + mov %r10, %rbp + imul %r8, %rbp + add %r9, 8(%rsi) + adc $0, %r14 + mov %r14, -8(%rsi) + + mov -24(%r13), %rax + mul %rbp + mov %rax, %rbx + 
mov %rdx, %r10 + mov -16(%r13), %rax + mul %rbp + xor %r9d, %r9d + xor %r14d, %r14d add (%rsi), %rbx - .byte 0xc4,98,163,0xf6,87,16 - adc %rax, %r9 - adc %r14, %r11 - mov 8(%rsi), %r14 - mov %r8, %rdx - adc $0, %r10 - mov 16(%rsi), %rax - add %r9, %r14 - mov %r14, 8(%rsi) - .byte 0xc4,66,235,0xf6,238 - adc %r11, %rax - mov %rax, 16(%rsi) - adc $0, %r10 - mov %r10, (%rsi) - lea 8(%rsi), %rsi - inc %r15 - jnz .Ln3 - - jmp .Lcj - -.Lo3b: lea 8(%rdi), %rdi -.Lo3: lea 4(%rcx), %r14 - .byte 0xc4,226,227,0xf6,71,248 - .byte 0xc4,98,179,0xf6,7 - mov (%rsi), %rbp - .byte 0xc4,98,163,0xf6,87,8 - sar $2, %r14 - add %rbx, %rbp - nop + adc %rax, %r10 + mov -8(%r13), %rax + adc %rdx, %r9 + mul %rbp + add 8(%rsi), %r10 + mov %r10, 8(%rsi) + adc %rax, %r9 + adc %rdx, %r14 + mov %r10, %rbp + imul %r8, %rbp + add %r9, 16(%rsi) + adc $0, %r14 + mov %r14, (%rsi) + + mov -24(%r13), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov -16(%r13), %rax + mul %rbp + xor %r9d, %r9d + xor %r14d, %r14d + add 8(%rsi), %rbx + adc %rax, %r10 + mov -8(%r13), %rax + adc %rdx, %r9 + mul %rbp + add 16(%rsi), %r10 adc %rax, %r9 - jmp .Llo3 + adc %rdx, %r14 + add 24(%rsi), %r9 + adc $0, %r14 + + xor %eax, %eax + add -8(%rsi), %r10 + adc (%rsi), %r9 + adc 32(%rsi), %r14 + mov %r10, (%rdi) + mov %r9, 8(%rdi) + mov %r14, 16(%rdi) + adc %eax, %eax + jmp .Lret + .align 16, 0x90 -.Ltp3: adc %rax, %r9 - lea 32(%rsi), %rsi -.Llo3: adc %r8, %r11 - .byte 0xc4,98,147,0xf6,103,16 - mov 8(%rsi), %r8 - .byte 0xc4,226,227,0xf6,71,24 - lea 32(%rdi), %rdi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov 16(%rsi), %r10 - mov 24(%rsi), %r12 - add %r9, %r8 - mov 32(%rsi), %rbp - mov %r8, 8(%rsi) - adc %r11, %r10 - .byte 0xc4,98,179,0xf6,7 - mov %r10, 16(%rsi) - adc %r13, %r12 - mov %r12, 24(%rsi) - adc %rbx, %rbp - .byte 0xc4,98,163,0xf6,87,8 - mov %rbp, 32(%rsi) - inc %r14 - jnz .Ltp3 - -.Led3: mov 64(%rsi,%rcx,8), %rdx - lea 24(%rdi,%rcx,8), %rdi +.L2m4: +.Llo2: mov (%r13,%r12,8), %rax + mul %rbp + xor %r14d, %r14d + xor %ebx, %ebx + mov %rax, %r10 + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mov %rdx, %r9 + mul %rbp + add 16(%rsi,%r12,8), %r10 adc %rax, %r9 - adc %r8, %r11 - mov 40(%rsi), %r8 - adc $0, %r10 - imul (%rsp), %rdx - mov 48(%rsi), %rax - add %r9, %r8 - mov %r8, 40(%rsi) - adc %r11, %rax - mov %rax, 48(%rsi) - lea 64(%rsi,%rcx,8), %rsi - adc $0, %r10 - mov %r10, -8(%rsi) - inc %r15 - jnz .Lo3 - - jmp .Lcj - -.Lo0b: lea 16(%rdi), %rdi -.Lo0: mov %rcx, %r14 - .byte 0xc4,98,147,0xf6,103,240 - .byte 0xc4,226,227,0xf6,71,248 - sar $2, %r14 - add %r12, %rbx - adc $0, %rax - mov (%rsi), %r12 - mov 8(%rsi), %rbp - .byte 0xc4,98,179,0xf6,7 - add %r13, %r12 - jmp .Llo0 + mov 16(%r13,%r12,8), %rax + adc %rdx, %r14 + mul %rbp + mov $0, %r10d + lea 2(%r12), %r11 + add %r9, %r15 + imul %r8, %r15 + jmp .Le2 .align 16, 0x90 -.Ltp0: adc %rax, %r9 - lea 32(%rsi), %rsi - adc %r8, %r11 - .byte 0xc4,98,147,0xf6,103,16 +.Lli2: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp +.Le2: add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp + add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli2 + +.Lle2: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + 
add $8, %rsi + mov %r15, %rbp + dec %rcx + jnz .Llo2 + + mov %r12, %rcx + sar $2, %rcx + lea 32(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + mov -16(%rsi), %r8 - .byte 0xc4,226,227,0xf6,71,24 - lea 32(%rdi), %rdi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov -8(%rsi), %r10 - mov (%rsi), %r12 - add %r9, %r8 - mov 8(%rsi), %rbp - mov %r8, -16(%rsi) - adc %r11, %r10 - .byte 0xc4,98,179,0xf6,7 - mov %r10, -8(%rsi) - adc %r13, %r12 - mov %r12, (%rsi) -.Llo0: adc %rbx, %rbp - .byte 0xc4,98,163,0xf6,87,8 - mov %rbp, 8(%rsi) - inc %r14 - jnz .Ltp0 - -.Led0: mov 40(%rsi,%rcx,8), %rdx - lea 32(%rdi,%rcx,8), %rdi + mov -8(%rsi), %r9 + add -16(%rdx), %r8 + adc -8(%rdx), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + lea 16(%rdi), %rdi + jmp .Laddx + + + .align 16, 0x90 +.L1m4: +.Llo1: mov (%r13,%r12,8), %rax + xor %r9, %r9 + xor %ebx, %ebx + mul %rbp + mov %rax, %r9 + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mov %rdx, %r14 + mov $0, %r10d + mul %rbp + add 16(%rsi,%r12,8), %r9 + adc %rax, %r14 + adc %rdx, %rbx + mov 16(%r13,%r12,8), %rax + mul %rbp + lea 1(%r12), %r11 + add %r14, %r15 + imul %r8, %r15 + jmp .Le1 + + .align 16, 0x90 +.Lli1: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp + add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp +.Le1: add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli1 + +.Lle1: add %r10, (%rsi) adc %rax, %r9 - adc %r8, %r11 - mov 16(%rsi), %r8 - adc $0, %r10 - imul (%rsp), %rdx - mov 24(%rsi), %rax - add %r9, %r8 - mov %r8, 16(%rsi) - adc %r11, %rax - mov %rax, 24(%rsi) - lea 40(%rsi,%rcx,8), %rsi - adc $0, %r10 - mov %r10, -8(%rsi) - inc %r15 - jnz .Lo0 - -.Lcj: - mov 8(%rsp), %rdi - lea 16-8(%rsp), %rsp - lea (%rsi,%rcx,8), %rdx - neg %ecx + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + add $8, %rsi + mov %r15, %rbp + dec %rcx + jnz .Llo1 + + mov %r12, %rcx + sar $2, %rcx + lea 24(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx - + mov -8(%rsi), %r8 + add -8(%rdx), %r8 + mov %r8, (%rdi) + lea 8(%rdi), %rdi + jmp .Laddx - - call __gmpn_add_n@PLT + + .align 16, 0x90 +.L0: +.L0m4: +.Llo0: mov (%r13,%r12,8), %rax + mov %r12, %r11 + mul %rbp + xor %r10d, %r10d + mov %rax, %r14 + mov %rdx, %rbx + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mul %rbp + add 16(%rsi,%r12,8), %r14 + adc %rax, %rbx + adc %rdx, %r10 + add %rbx, %r15 + imul %r8, %r15 + jmp .Le0 + + .align 16, 0x90 +.Lli0: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp + add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp + add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 +.Le0: mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli0 + +.Lle0: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + add $8, %rsi + mov %r15, %rbp + dec %rcx + jnz .Llo0 + + mov %r12, %rcx + sar $2, %rcx + clc + lea 16(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + jmp .Laddy + + + .align 16, 0x90 +.L3m4: +.Llo3: mov 
(%r13,%r12,8), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mul %rbp + add 16(%rsi,%r12,8), %rbx + mov $0, %ebx + mov %rbx, %r14 + adc %rax, %r10 + mov 16(%r13,%r12,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + add %r10, %r15 + mul %rbp + lea 3(%r12), %r11 + imul %r8, %r15 + + + .align 16, 0x90 +.Lli3: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp + add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp + add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli3 + +.Lle3: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + mov %r15, %rbp + lea 8(%rsi), %rsi + dec %rcx + jnz .Llo3 - lea 8(%rsp), %rsp + mov %r12, %rcx + sar $2, %rcx + lea 40(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + + mov -24(%rsi), %r8 + mov -16(%rsi), %r9 + mov -8(%rsi), %r10 + add -24(%rdx), %r8 + adc -16(%rdx), %r9 + adc -8(%rdx), %r10 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + lea 24(%rdi), %rdi + +.Laddx:inc %rcx + jz .Lad3 + +.Laddy:mov (%rsi), %r8 + mov 8(%rsi), %r9 + inc %rcx + jmp .Lmid + + +.Lal3: adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + mov %r8, (%rdi) + lea 32(%rsi), %rsi + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + inc %rcx + mov %r11, 24(%rdi) + lea 32(%rdx), %rdx + mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi +.Lmid: mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + jnz .Lal3 + +.Lae3: adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + mov %r11, 24(%rdi) + +.Lad3: mov %ecx, %eax + adc %eax, %eax .Lret: pop %r15 pop %r14 pop %r13 pop %r12 - pop %rbp pop %rbx + pop %rbp ret .size __gmpn_redc_1,.-__gmpn_redc_1 diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s index 7eebcc0aff..ac1323b3c6 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s @@ -47,15 +47,6 @@ - - - - - - - - - @@ -78,6 +69,9 @@ + + + @@ -86,29 +80,6 @@ .text - .align 16, 0x90 - .globl __gmpn_rsblsh1_nc - .type __gmpn_rsblsh1_nc,@function - -__gmpn_rsblsh1_nc: - - - - push %rbp - mov %r8, %rax - neg %rax - xor %ebp, %ebp - mov (%rdx), %r8 - shrd $63, %r8, %rbp - mov %ecx, %r9d - and $3, %r9d - je .Lb00 - cmp $2, %r9d - jc .Lb01 - je .Lb10 - jmp .Lb11 - .size __gmpn_rsblsh1_nc,.-__gmpn_rsblsh1_nc - .align 16, 0x90 .globl __gmpn_rsblsh1_n .type __gmpn_rsblsh1_n,@function @@ -117,96 +88,92 @@ __gmpn_rsblsh1_n: push %rbp - xor %ebp, %ebp + mov (%rdx), %r8 - shrd $63, %r8, %rbp mov %ecx, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + xor %ebp, %ebp and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - mov 16(%rdx), %r10 - shrd $63, %r10, %r9 - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - sbb 16(%rsi), %r9 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, %rbp - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi +.Lb11: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 sbb %eax, %eax - sub $3, %rcx 
- ja .Ltop - jmp .Lend - -.Lb01: add %eax, %eax - sbb (%rsi), %rbp - mov %rbp, (%rdi) - mov %r8, %rbp - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi + sub (%rsi,%rcx,8), %r8 + sbb 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb 16(%rsi,%rcx,8), %r10 + mov %r10, 16(%rdi,%rcx,8) + sbb %ebp, %ebp + add $3, %rcx + jmp .Lent + +.Lb10: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 sbb %eax, %eax - sub $1, %rcx - ja .Ltop - jmp .Lend - -.Lb10: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, %rbp - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi + sub (%rsi,%rcx,8), %r8 + sbb 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb %ebp, %ebp + add $2, %rcx + jmp .Lent + +.Lb01: add %r8, %r8 sbb %eax, %eax - sub $2, %rcx - ja .Ltop - jmp .Lend + sub (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + sbb %ebp, %ebp + inc %rcx +.Lent: jns .Lend .align 16, 0x90 -.Ltop: mov (%rdx), %r8 - shrd $63, %r8, %rbp -.Lb00: mov 8(%rdx), %r9 - shrd $63, %r9, %r8 - mov 16(%rdx), %r10 - shrd $63, %r10, %r9 - mov 24(%rdx), %r11 - shrd $63, %r11, %r10 - lea 32(%rdx), %rdx - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - sbb 16(%rsi), %r9 - sbb 24(%rsi), %r10 - lea 32(%rsi), %rsi - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, 24(%rdi) - mov %r11, %rbp - lea 32(%rdi), %rdi +.Ltop: add %eax, %eax + + mov (%rdx,%rcx,8), %r8 +.Lb00: adc %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + mov 24(%rdx,%rcx,8), %r11 + adc %r11, %r11 + sbb %eax, %eax - sub $4, %rcx - jnz .Ltop + add %ebp, %ebp + + sbb (%rsi,%rcx,8), %r8 + nop + sbb 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb 16(%rsi,%rcx,8), %r10 + sbb 24(%rsi,%rcx,8), %r11 + mov %r10, 16(%rdi,%rcx,8) + mov %r11, 24(%rdi,%rcx,8) + + sbb %ebp, %ebp + add $4, %rcx + js .Ltop + +.Lend: + + + sub %eax, %ebp + movslq %ebp, %rax -.Lend: shr $63, %rbp - add %eax, %eax - sbb $0, %rbp - mov %rbp, %rax pop %rbp ret .size __gmpn_rsblsh1_n,.-__gmpn_rsblsh1_n - diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s index fe7d1d3930..e9f079a236 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s @@ -47,10 +47,13 @@ - - - - + + + + + + + @@ -87,30 +90,11 @@ - .text - .align 16, 0x90 - .globl __gmpn_rsblsh2_nc - .type __gmpn_rsblsh2_nc,@function - -__gmpn_rsblsh2_nc: - - push %rbp - mov %r8, %rax - neg %rax - xor %ebp, %ebp - mov (%rdx), %r8 - shrd $62, %r8, %rbp - mov %ecx, %r9d - and $3, %r9d - je .Lb00 - cmp $2, %r9d - jc .Lb01 - je .Lb10 - jmp .Lb11 - .size __gmpn_rsblsh2_nc,.-__gmpn_rsblsh2_nc + + .text .align 16, 0x90 .globl __gmpn_rsblsh2_n .type __gmpn_rsblsh2_n,@function @@ -118,96 +102,102 @@ __gmpn_rsblsh2_nc: __gmpn_rsblsh2_n: - push %rbp - xor %ebp, %ebp + push %r12 + push %r13 + push %r14 + push %r15 + mov (%rdx), %r8 - shrd $62, %r8, %rbp + lea (,%r8,4), %r12 + shr $62, %r8 + mov %ecx, %eax - and $3, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + and $3, %al je .Lb00 - cmp $2, %eax + cmp $2, %al jc .Lb01 je .Lb10 -.Lb11: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - mov 16(%rdx), %r10 - shrd $62, %r10, %r9 - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - sbb 16(%rsi), %r9 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, %rbp - lea 
24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi - sbb %eax, %eax - sub $3, %rcx - ja .Ltop +.Lb11: mov 8(%rdx,%rcx,8), %r10 + lea (%r8,%r10,4), %r14 + shr $62, %r10 + mov 16(%rdx,%rcx,8), %r11 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + sub (%rsi,%rcx,8), %r12 + sbb 8(%rsi,%rcx,8), %r14 + sbb 16(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r14, 8(%rdi,%rcx,8) + mov %r15, 16(%rdi,%rcx,8) + add $3, %rcx + js .Ltop jmp .Lend -.Lb01: add %eax, %eax - sbb (%rsi), %rbp - mov %rbp, (%rdi) - mov %r8, %rbp - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - sbb %eax, %eax - sub $1, %rcx - ja .Ltop +.Lb01: mov %r8, %r11 + sub (%rsi,%rcx,8), %r12 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + add $1, %rcx + js .Ltop jmp .Lend -.Lb10: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, %rbp - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi - sbb %eax, %eax - sub $2, %rcx - ja .Ltop +.Lb10: mov 8(%rdx,%rcx,8), %r11 + lea (%r8,%r11,4), %r15 + shr $62, %r11 + sub (%rsi,%rcx,8), %r12 + sbb 8(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r15, 8(%rdi,%rcx,8) + add $2, %rcx + js .Ltop jmp .Lend +.Lb00: mov 8(%rdx,%rcx,8), %r9 + mov 16(%rdx,%rcx,8), %r10 + jmp .Le00 + .align 16, 0x90 -.Ltop: mov (%rdx), %r8 - shrd $62, %r8, %rbp -.Lb00: mov 8(%rdx), %r9 - shrd $62, %r9, %r8 - mov 16(%rdx), %r10 - shrd $62, %r10, %r9 - mov 24(%rdx), %r11 - shrd $62, %r11, %r10 - lea 32(%rdx), %rdx - add %eax, %eax - sbb (%rsi), %rbp - sbb 8(%rsi), %r8 - sbb 16(%rsi), %r9 - sbb 24(%rsi), %r10 - lea 32(%rsi), %rsi - mov %rbp, (%rdi) - mov %r8, 8(%rdi) - mov %r9, 16(%rdi) - mov %r10, 24(%rdi) - mov %r11, %rbp - lea 32(%rdi), %rdi - sbb %eax, %eax - sub $4, %rcx - jnz .Ltop - -.Lend: shr $62, %rbp - add %eax, %eax - sbb $0, %rbp - mov %rbp, %rax - pop %rbp +.Ltop: mov 16(%rdx,%rcx,8), %r10 + mov (%rdx,%rcx,8), %r8 + mov 8(%rdx,%rcx,8), %r9 + lea (%r11,%r8,4), %r12 + shr $62, %r8 +.Le00: lea (%r8,%r9,4), %r13 + shr $62, %r9 + mov 24(%rdx,%rcx,8), %r11 + lea (%r9,%r10,4), %r14 + shr $62, %r10 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + add %eax, %eax + sbb (%rsi,%rcx,8), %r12 + sbb 8(%rsi,%rcx,8), %r13 + sbb 16(%rsi,%rcx,8), %r14 + sbb 24(%rsi,%rcx,8), %r15 + mov %r12, (%rdi,%rcx,8) + mov %r13, 8(%rdi,%rcx,8) + mov %r14, 16(%rdi,%rcx,8) + sbb %eax, %eax + mov %r15, 24(%rdi,%rcx,8) + add $4, %rcx + js .Ltop +.Lend: + + + add %r11d, %eax + movslq %eax, %rax + + pop %r15 + pop %r14 + pop %r13 + pop %r12 ret .size __gmpn_rsblsh2_n,.-__gmpn_rsblsh2_n diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s index b64824b9f9..d439217a6c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s @@ -66,32 +66,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -103,7 +78,7 @@ .text - .align 32, 0x90 + .align 16, 0x90 .globl __gmpn_rsblsh_n .type __gmpn_rsblsh_n,@function @@ -111,142 +86,143 @@ __gmpn_rsblsh_n: + push %r12 + push %rbp + push %rbx + + mov (%rdx), %rax + + mov $0, %ebp + sub %rcx, %rbp + + lea -16(%rsi,%rcx,8), %rsi + lea -16(%rdi,%rcx,8), %rdi + lea 16(%rdx,%rcx,8), %r12 + + mov %rcx, %r9 + + mov %r8, %rcx + mov $1, %r8d + shl %cl, %r8 + + mul %r8 + + and $3, %r9d + jz .Lb0 + cmp $2, %r9d + jc .Lb1 + jz .Lb2 + +.Lb3: mov %rax, %r11 + sub 16(%rsi,%rbp,8), %r11 + mov -8(%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov (%r12,%rbp,8), %rax + 
mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $3, %rbp + jnz .Llo3 + jmp .Lcj3 + +.Lb2: mov %rax, %rbx + mov -8(%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $2, %rbp + jz .Lcj2 + mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + xor %ecx, %ecx + jmp .Llo2 + +.Lb1: mov %rax, %r9 + mov %rdx, %r10 + add $1, %rbp + jnz .Lgt1 + sub 8(%rsi,%rbp,8), %r9 + jmp .Lcj1 +.Lgt1: mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + sub 8(%rsi,%rbp,8), %r9 + sbb 16(%rsi,%rbp,8), %r10 + sbb 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + jmp .Llo1 + +.Lb0: mov %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + sub 16(%rsi,%rbp,8), %r10 + sbb 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jz .Lend - mov (%rdx), %r10 - - mov %ecx, %eax - shr $3, %rcx - xor %r9d, %r9d - sub %r8, %r9 - and $7, %eax - - lea .Ltab(%rip), %r11 - - movslq (%r11,%rax,4), %rax - add %r11, %rax - jmp *%rax - - -.L0: lea 32(%rsi), %rsi - lea 32(%rdx), %rdx - lea 32(%rdi), %rdi - xor %r11d, %r11d - jmp .Le0 - -.L7: mov %r10, %r11 - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea 24(%rdi), %rdi - xor %r10d, %r10d - jmp .Le7 - -.L6: lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea 16(%rdi), %rdi - xor %r11d, %r11d - jmp .Le6 - -.L5: mov %r10, %r11 - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - xor %r10d, %r10d - jmp .Le5 - -.Lend: sbb 24(%rsi), %rax - mov %rax, -40(%rdi) - .byte 0xc4,194,179,0xf7,195 - sbb %rcx, %rax + .align 8, 0x90 +.Ltop: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Llo3: mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %r11, -8(%rdi,%rbp,8) +.Llo2: mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add %ecx, %ecx + sbb (%rsi,%rbp,8), %rbx + sbb 8(%rsi,%rbp,8), %r9 + sbb 16(%rsi,%rbp,8), %r10 + sbb 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rbx, (%rdi,%rbp,8) +.Llo1: mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov %r9, 8(%rdi,%rbp,8) +.Llo0: mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jnz .Ltop + +.Lend: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Lcj3: mov %r11, -8(%rdi,%rbp,8) +.Lcj2: add %ecx, %ecx + sbb (%rsi,%rbp,8), %rbx + sbb 8(%rsi,%rbp,8), %r9 + mov %rbx, (%rdi,%rbp,8) +.Lcj1: mov %r9, 8(%rdi,%rbp,8) + mov %rdx, %rax + sbb $0, %rax + pop %rbx + pop %rbp + pop %r12 ret - - .align 32, 0x90 -.Ltop: jrcxz .Lend - mov -32(%rdx), %r10 - sbb 24(%rsi), %rax - lea 64(%rsi), %rsi - .byte 0xc4,66,179,0xf7,219 - mov %rax, -40(%rdi) -.Le0: dec %rcx - .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov -24(%rdx), %r11 - sbb -32(%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, -32(%rdi) -.Le7: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov -16(%rdx), %r10 - sbb -24(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, -24(%rdi) -.Le6: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov -8(%rdx), %r11 - sbb -16(%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, -16(%rdi) -.Le5: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - mov (%rdx), %r10 - sbb -8(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, -8(%rdi) -.Le4: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov 8(%rdx), %r11 - sbb (%rsi), %rax - .byte 0xc4,66,179,0xf7,210 - mov %rax, (%rdi) -.Le3: .byte 0xc4,194,185,0xf7,195 - 
lea (%r10,%rax), %rax - mov 16(%rdx), %r10 - sbb 8(%rsi), %rax - .byte 0xc4,66,179,0xf7,219 - mov %rax, 8(%rdi) -.Le2: .byte 0xc4,194,185,0xf7,194 - lea (%r11,%rax), %rax - mov 24(%rdx), %r11 - sbb 16(%rsi), %rax - lea 64(%rdx), %rdx - .byte 0xc4,66,179,0xf7,210 - mov %rax, 16(%rdi) - lea 64(%rdi), %rdi -.Le1: .byte 0xc4,194,185,0xf7,195 - lea (%r10,%rax), %rax - jmp .Ltop - -.L4: xor %r11d, %r11d - jmp .Le4 - -.L3: mov %r10, %r11 - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi - xor %r10d, %r10d - jmp .Le3 - -.L2: lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi - xor %r11d, %r11d - jmp .Le2 - -.L1: mov %r10, %r11 - lea -24(%rsi), %rsi - lea 40(%rdx), %rdx - lea 40(%rdi), %rdi - xor %r10d, %r10d - jmp .Le1 .size __gmpn_rsblsh_n,.-__gmpn_rsblsh_n - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab - diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s index c385f661fc..8554f6f047 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s @@ -56,6 +56,8 @@ + + @@ -72,7 +74,6 @@ .text - .align 16, 0x90 .globl __gmpn_rsh1add_nc .type __gmpn_rsh1add_nc,@function @@ -82,12 +83,11 @@ __gmpn_rsh1add_nc: push %rbx - push %rbp + xor %eax, %eax neg %r8 - mov (%rsi), %rbp - adc (%rdx), %rbp - + mov (%rsi), %rbx + adc (%rdx), %rbx jmp .Lent .size __gmpn_rsh1add_nc,.-__gmpn_rsh1add_nc @@ -99,14 +99,13 @@ __gmpn_rsh1add_n: push %rbx - push %rbp - mov (%rsi), %rbp - add (%rdx), %rbp + xor %eax, %eax + mov (%rsi), %rbx + add (%rdx), %rbx .Lent: - sbb %ebx, %ebx - mov %ebp, %eax - and $1, %eax + rcr %rbx + adc %eax, %eax mov %ecx, %r11d and $3, %r11d @@ -116,21 +115,20 @@ __gmpn_rsh1add_n: .Ln1: cmp $2, %r11d jne .Ln2 - add %ebx, %ebx + add %rbx, %rbx mov 8(%rsi), %r10 adc 8(%rdx), %r10 lea 8(%rsi), %rsi lea 8(%rdx), %rdx lea 8(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r10, %rbp - mov %rbp, -8(%rdi) + rcr %r10 + rcr %rbx + mov %rbx, -8(%rdi) jmp .Lcj1 .Ln2: cmp $3, %r11d jne .Ln3 - add %ebx, %ebx + add %rbx, %rbx mov 8(%rsi), %r9 mov 16(%rsi), %r10 adc 8(%rdx), %r9 @@ -138,14 +136,14 @@ __gmpn_rsh1add_n: lea 16(%rsi), %rsi lea 16(%rdx), %rdx lea 16(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r9, %rbp - mov %rbp, -16(%rdi) + rcr %r10 + rcr %r9 + rcr %rbx + mov %rbx, -16(%rdi) jmp .Lcj2 -.Ln3: dec %rcx - add %ebx, %ebx +.Ln3: dec %rcx + add %rbx, %rbx mov 8(%rsi), %r8 mov 16(%rsi), %r9 adc 8(%rdx), %r8 @@ -155,21 +153,20 @@ __gmpn_rsh1add_n: lea 24(%rsi), %rsi lea 24(%rdx), %rdx lea 24(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r8, %rbp - mov %rbp, -24(%rdi) - shrd $1, %r9, %r8 + rcr %r10 + rcr %r9 + rcr %r8 + rcr %rbx + mov %rbx, -24(%rdi) mov %r8, -16(%rdi) -.Lcj2: shrd $1, %r10, %r9 - mov %r9, -8(%rdi) -.Lcj1: mov %r10, %rbp +.Lcj2: mov %r9, -8(%rdi) +.Lcj1: mov %r10, %rbx .Ldo: - shr $2, %rcx + shr $2, %rcx je .Lend .align 16, 0x90 -.Ltop: add %ebx, %ebx +.Ltop: add %rbx, %rbx mov 8(%rsi), %r8 mov 16(%rsi), %r9 @@ -183,25 +180,23 @@ __gmpn_rsh1add_n: lea 32(%rsi), %rsi lea 32(%rdx), %rdx - sbb %ebx, %ebx + rcr %r11 + rcr %r10 + rcr %r9 + rcr %r8 - shrd $1, %r8, %rbp - mov %rbp, (%rdi) - shrd $1, %r9, %r8 + rcr %rbx + mov %rbx, (%rdi) mov %r8, 8(%rdi) - shrd $1, %r10, %r9 mov %r9, 16(%rdi) - shrd $1, %r11, %r10 mov %r10, 24(%rdi) + mov %r11, %rbx - dec %rcx - mov %r11, %rbp lea 32(%rdi), %rdi + dec %rcx jne .Ltop -.Lend: shrd $1, %rbx, 
%rbp - mov %rbp, (%rdi) - pop %rbp +.Lend: mov %rbx, (%rdi) pop %rbx ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s b/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s index 0d7ab328a6..ff06ece4bc 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s @@ -57,6 +57,8 @@ + + @@ -72,7 +74,6 @@ .text - .align 16, 0x90 .globl __gmpn_rsh1sub_nc .type __gmpn_rsh1sub_nc,@function @@ -82,12 +83,11 @@ __gmpn_rsh1sub_nc: push %rbx - push %rbp + xor %eax, %eax neg %r8 - mov (%rsi), %rbp - sbb (%rdx), %rbp - + mov (%rsi), %rbx + sbb (%rdx), %rbx jmp .Lent .size __gmpn_rsh1sub_nc,.-__gmpn_rsh1sub_nc @@ -99,14 +99,13 @@ __gmpn_rsh1sub_n: push %rbx - push %rbp - mov (%rsi), %rbp - sub (%rdx), %rbp + xor %eax, %eax + mov (%rsi), %rbx + sub (%rdx), %rbx .Lent: - sbb %ebx, %ebx - mov %ebp, %eax - and $1, %eax + rcr %rbx + adc %eax, %eax mov %ecx, %r11d and $3, %r11d @@ -116,21 +115,20 @@ __gmpn_rsh1sub_n: .Ln1: cmp $2, %r11d jne .Ln2 - add %ebx, %ebx + add %rbx, %rbx mov 8(%rsi), %r10 sbb 8(%rdx), %r10 lea 8(%rsi), %rsi lea 8(%rdx), %rdx lea 8(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r10, %rbp - mov %rbp, -8(%rdi) + rcr %r10 + rcr %rbx + mov %rbx, -8(%rdi) jmp .Lcj1 .Ln2: cmp $3, %r11d jne .Ln3 - add %ebx, %ebx + add %rbx, %rbx mov 8(%rsi), %r9 mov 16(%rsi), %r10 sbb 8(%rdx), %r9 @@ -138,14 +136,14 @@ __gmpn_rsh1sub_n: lea 16(%rsi), %rsi lea 16(%rdx), %rdx lea 16(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r9, %rbp - mov %rbp, -16(%rdi) + rcr %r10 + rcr %r9 + rcr %rbx + mov %rbx, -16(%rdi) jmp .Lcj2 -.Ln3: dec %rcx - add %ebx, %ebx +.Ln3: dec %rcx + add %rbx, %rbx mov 8(%rsi), %r8 mov 16(%rsi), %r9 sbb 8(%rdx), %r8 @@ -155,21 +153,20 @@ __gmpn_rsh1sub_n: lea 24(%rsi), %rsi lea 24(%rdx), %rdx lea 24(%rdi), %rdi - sbb %ebx, %ebx - - shrd $1, %r8, %rbp - mov %rbp, -24(%rdi) - shrd $1, %r9, %r8 + rcr %r10 + rcr %r9 + rcr %r8 + rcr %rbx + mov %rbx, -24(%rdi) mov %r8, -16(%rdi) -.Lcj2: shrd $1, %r10, %r9 - mov %r9, -8(%rdi) -.Lcj1: mov %r10, %rbp +.Lcj2: mov %r9, -8(%rdi) +.Lcj1: mov %r10, %rbx .Ldo: - shr $2, %rcx + shr $2, %rcx je .Lend .align 16, 0x90 -.Ltop: add %ebx, %ebx +.Ltop: add %rbx, %rbx mov 8(%rsi), %r8 mov 16(%rsi), %r9 @@ -183,25 +180,23 @@ __gmpn_rsh1sub_n: lea 32(%rsi), %rsi lea 32(%rdx), %rdx - sbb %ebx, %ebx + rcr %r11 + rcr %r10 + rcr %r9 + rcr %r8 - shrd $1, %r8, %rbp - mov %rbp, (%rdi) - shrd $1, %r9, %r8 + rcr %rbx + mov %rbx, (%rdi) mov %r8, 8(%rdi) - shrd $1, %r10, %r9 mov %r9, 16(%rdi) - shrd $1, %r11, %r10 mov %r10, 24(%rdi) + mov %r11, %rbx - dec %rcx - mov %r11, %rbp lea 32(%rdi), %rdi + dec %rcx jne .Ltop -.Lend: shrd $1, %rbx, %rbp - mov %rbp, (%rdi) - pop %rbp +.Lend: mov %rbx, (%rdi) pop %rbx ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/rshift.s b/ext/gmp/gen/x86_64-linux/mpn/rshift.s index 386eccd1ac..8ddd7b5557 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/rshift.s +++ b/ext/gmp/gen/x86_64-linux/mpn/rshift.s @@ -41,32 +41,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -89,142 +63,129 @@ .text - .align 64, 0x90 + .align 32, 0x90 .globl __gmpn_rshift .type __gmpn_rshift,@function __gmpn_rshift: - movd %ecx, %xmm4 - mov $64, %eax - sub %ecx, %eax - movd %eax, %xmm5 - - neg %ecx + neg %ecx mov (%rsi), %rax - shl %cl, %rax - - cmp $3, %rdx - jle .Lbc + shl %cl, %rax + neg %ecx - test $8, %dil - jz .Lrp_aligned - - - movq (%rsi), %xmm0 - movq 8(%rsi), %xmm1 - psrlq %xmm4, %xmm0 - psllq %xmm5, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, (%rdi) - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - dec %rdx - -.Lrp_aligned: lea 1(%rdx), %r8d - lea 
(%rsi,%rdx,8), %rsi - lea (%rdi,%rdx,8), %rdi - neg %rdx - and $6, %r8d - jz .Lbu0 - cmp $4, %r8d - jz .Lbu4 - jc .Lbu2 -.Lbu6: add $4, %rdx - jmp .Li56 -.Lbu0: add $6, %rdx - jmp .Li70 -.Lbu4: add $2, %rdx - jmp .Li34 -.Lbu2: add $8, %rdx - jge .Lend + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + neg %rdx + and $3, %r8d + je .Lrlx + + dec %r8d + jne .L1 + + mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + shl %cl, %r8 + or %r8, %r10 + mov %r10, 8(%rdi,%rdx,8) + inc %rdx + jmp .Lrll + +.L1: dec %r8d + je .L1x + + mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + shl %cl, %r8 + or %r8, %r10 + mov %r10, 8(%rdi,%rdx,8) + inc %rdx + neg %ecx +.L1x: + cmp $-1, %rdx + je .Last + mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + mov 16(%rsi,%rdx,8), %r11 + shr %cl, %r11 + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + mov 24(%rsi,%rdx,8), %r9 + shl %cl, %r8 + or %r8, %r10 + shl %cl, %r9 + or %r9, %r11 + mov %r10, 8(%rdi,%rdx,8) + mov %r11, 16(%rdi,%rdx,8) + add $2, %rdx + +.Lrll: neg %ecx +.Lrlx: mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + mov 16(%rsi,%rdx,8), %r11 + shr %cl, %r11 + + add $4, %rdx + jb .Lend .align 16, 0x90 -.Ltop: movdqu -64(%rsi,%rdx,8), %xmm1 - movdqu -56(%rsi,%rdx,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -64(%rdi,%rdx,8) -.Li70: - movdqu -48(%rsi,%rdx,8), %xmm1 - movdqu -40(%rsi,%rdx,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -48(%rdi,%rdx,8) -.Li56: - movdqu -32(%rsi,%rdx,8), %xmm1 - movdqu -24(%rsi,%rdx,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -32(%rdi,%rdx,8) -.Li34: - movdqu -16(%rsi,%rdx,8), %xmm1 - movdqu -8(%rsi,%rdx,8), %xmm0 - psllq %xmm5, %xmm0 - psrlq %xmm4, %xmm1 - por %xmm1, %xmm0 - movdqa %xmm0, -16(%rdi,%rdx,8) - add $8, %rdx - jl .Ltop - -.Lend: test $1, %dl - jnz .Le1 - - movdqu -16(%rsi), %xmm1 - movq -8(%rsi), %xmm0 - psrlq %xmm4, %xmm1 - psllq %xmm5, %xmm0 - por %xmm1, %xmm0 - movdqa %xmm0, -16(%rdi) +.Ltop: - ret - -.Le1: movq -8(%rsi), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, -8(%rdi) + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + mov -8(%rsi,%rdx,8), %r9 + shl %cl, %r8 + or %r8, %r10 + shl %cl, %r9 + or %r9, %r11 + mov %r10, -24(%rdi,%rdx,8) + mov %r11, -16(%rdi,%rdx,8) - ret + mov (%rsi,%rdx,8), %r8 + mov 8(%rsi,%rdx,8), %r9 + shl %cl, %r8 + shl %cl, %r9 - - .align 16, 0x90 -.Lbc: dec %edx - jnz 1f - movq (%rsi), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, (%rdi) - ret - -1: movq (%rsi), %xmm1 - movq 8(%rsi), %xmm0 - psrlq %xmm4, %xmm1 - psllq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, (%rdi) - dec %edx - jnz 1f - movq 8(%rsi), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, 8(%rdi) + neg %ecx + mov -8(%rsi,%rdx,8), %r10 + mov 0(%rsi,%rdx,8), %r11 + shr %cl, %r10 + or %r10, %r8 + shr %cl, %r11 + or %r11, %r9 + mov %r8, -8(%rdi,%rdx,8) + mov %r9, 0(%rdi,%rdx,8) - ret - -1: movq 8(%rsi), %xmm1 - movq 16(%rsi), %xmm0 - psrlq %xmm4, %xmm1 - psllq %xmm5, %xmm0 - por %xmm1, %xmm0 - movq %xmm0, 8(%rdi) - movq 16(%rsi), %xmm0 - psrlq %xmm4, %xmm0 - movq %xmm0, 16(%rdi) + mov 8(%rsi,%rdx,8), %r10 + mov 16(%rsi,%rdx,8), %r11 + shr %cl, %r10 + shr %cl, %r11 + + add $4, %rdx + jae .Ltop +.Lend: + neg %ecx + mov -8(%rsi), %r8 + shl %cl, %r8 + or %r8, %r10 + mov (%rsi), %r9 + shl %cl, %r9 + or %r9, %r11 + mov %r10, -16(%rdi) + mov %r11, -8(%rdi) + + neg %ecx +.Last: mov (%rsi), %r10 + shr %cl, %r10 + mov %r10, (%rdi) ret .size __gmpn_rshift,.-__gmpn_rshift - diff --git 
a/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s b/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s index 6e67f45c31..7a50a70410 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s @@ -62,21 +62,6 @@ - - - - - - - - - - - - - - - @@ -103,131 +88,103 @@ __gmpn_sec_tabselect: - - - - - - - movd %r8, %xmm8 - pshufd $0, %xmm8, %xmm8 - mov $1, %eax - movd %rax, %xmm9 - pshufd $0, %xmm9, %xmm9 + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 mov %rdx, %r9 - add $-8, %r9 + add $-4, %r9 js .Louter_end .Louter_top: - mov %rcx, %r10 - mov %rsi, %r11 - pxor %xmm1, %xmm1 - pxor %xmm4, %xmm4 - pxor %xmm5, %xmm5 - pxor %xmm6, %xmm6 - pxor %xmm7, %xmm7 + mov %rcx, %rbp + push %rsi + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d + mov %r8, %rbx + .align 16, 0x90 -.Ltop: movdqa %xmm8, %xmm0 - pcmpeqd %xmm1, %xmm0 - paddd %xmm9, %xmm1 - movdqu 0(%rsi), %xmm2 - movdqu 16(%rsi), %xmm3 - pand %xmm0, %xmm2 - pand %xmm0, %xmm3 - por %xmm2, %xmm4 - por %xmm3, %xmm5 - movdqu 32(%rsi), %xmm2 - movdqu 48(%rsi), %xmm3 - pand %xmm0, %xmm2 - pand %xmm0, %xmm3 - por %xmm2, %xmm6 - por %xmm3, %xmm7 +.Ltop: sub $1, %rbx + sbb %rax, %rax + mov 0(%rsi), %r10 + mov 8(%rsi), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r12 + or %r11, %r13 + mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r14 + or %r11, %r15 lea (%rsi,%rdx,8), %rsi - add $-1, %r10 + add $-1, %rbp jne .Ltop - movdqu %xmm4, 0(%rdi) - movdqu %xmm5, 16(%rdi) - movdqu %xmm6, 32(%rdi) - movdqu %xmm7, 48(%rdi) - - lea 64(%r11), %rsi - lea 64(%rdi), %rdi - add $-8, %r9 + mov %r12, 0(%rdi) + mov %r13, 8(%rdi) + mov %r14, 16(%rdi) + mov %r15, 24(%rdi) + pop %rsi + lea 32(%rsi), %rsi + lea 32(%rdi), %rdi + add $-4, %r9 jns .Louter_top .Louter_end: - test $4, %dl - je .Lb0xx -.Lb1xx:mov %rcx, %r10 - mov %rsi, %r11 - pxor %xmm1, %xmm1 - pxor %xmm4, %xmm4 - pxor %xmm5, %xmm5 + test $2, %dl + jz .Lb0x +.Lb1x: mov %rcx, %rbp + push %rsi + xor %r12d, %r12d + xor %r13d, %r13d + mov %r8, %rbx .align 16, 0x90 -.Ltp4: movdqa %xmm8, %xmm0 - pcmpeqd %xmm1, %xmm0 - paddd %xmm9, %xmm1 - movdqu 0(%rsi), %xmm2 - movdqu 16(%rsi), %xmm3 - pand %xmm0, %xmm2 - pand %xmm0, %xmm3 - por %xmm2, %xmm4 - por %xmm3, %xmm5 +.Ltp2: sub $1, %rbx + sbb %rax, %rax + mov 0(%rsi), %r10 + mov 8(%rsi), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r12 + or %r11, %r13 lea (%rsi,%rdx,8), %rsi - add $-1, %r10 - jne .Ltp4 - movdqu %xmm4, 0(%rdi) - movdqu %xmm5, 16(%rdi) - lea 32(%r11), %rsi - lea 32(%rdi), %rdi - -.Lb0xx:test $2, %dl - je .Lb00x -.Lb01x:mov %rcx, %r10 - mov %rsi, %r11 - pxor %xmm1, %xmm1 - pxor %xmm4, %xmm4 - .align 16, 0x90 -.Ltp2: movdqa %xmm8, %xmm0 - pcmpeqd %xmm1, %xmm0 - paddd %xmm9, %xmm1 - movdqu 0(%rsi), %xmm2 - pand %xmm0, %xmm2 - por %xmm2, %xmm4 - lea (%rsi,%rdx,8), %rsi - add $-1, %r10 + add $-1, %rbp jne .Ltp2 - movdqu %xmm4, 0(%rdi) - lea 16(%r11), %rsi + mov %r12, 0(%rdi) + mov %r13, 8(%rdi) + pop %rsi + lea 16(%rsi), %rsi lea 16(%rdi), %rdi -.Lb00x:test $1, %dl - je .Lb000 -.Lb001:mov %rcx, %r10 - mov %rsi, %r11 - pxor %xmm1, %xmm1 - pxor %xmm4, %xmm4 +.Lb0x: test $1, %dl + jz .Lb00 +.Lb01: mov %rcx, %rbp + xor %r12d, %r12d + mov %r8, %rbx .align 16, 0x90 -.Ltp1: movdqa %xmm8, %xmm0 - pcmpeqd %xmm1, %xmm0 - paddd %xmm9, %xmm1 - movq 0(%rsi), %xmm2 - pand %xmm0, %xmm2 - por %xmm2, %xmm4 +.Ltp1: sub $1, %rbx + sbb %rax, %rax + mov 0(%rsi), %r10 + and %rax, %r10 + or %r10, %r12 lea (%rsi,%rdx,8), %rsi - add $-1, %r10 + add $-1, 
%rbp jne .Ltp1 - movq %xmm4, 0(%rdi) - -.Lb000: - - - - - + mov %r12, 0(%rdi) + +.Lb00: pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx ret .size __gmpn_sec_tabselect,.-__gmpn_sec_tabselect - diff --git a/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s b/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s index 26efdaa53a..eb24851327 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s @@ -89,6 +89,11 @@ + + + + + @@ -103,746 +108,711 @@ __gmpn_sqr_basecase: + mov %edx, %ecx + mov %edx, %r11d + + add $-40, %rsp + + and $3, %ecx + cmp $4, %edx + lea 4(%rcx), %r8 - cmp $2, %rdx - jae .Lgt1 + mov %rbx, 32(%rsp) + mov %rbp, 24(%rsp) + mov %r12, 16(%rsp) + mov %r13, 8(%rsp) + mov %r14, (%rsp) + + cmovg %r8, %rcx + + lea .Ltab(%rip), %rax + movslq (%rax,%rcx,4), %r10 + add %r10, %rax + jmp *%rax + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .L4-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab + .long .L3-.Ltab + .long .L0m4-.Ltab + .long .L1m4-.Ltab + .long .L2m4-.Ltab + .long .L3m4-.Ltab + .text - mov (%rsi), %rdx - .byte 0xc4,226,251,0xf6,210 +.L1: mov (%rsi), %rax + mul %rax + add $40, %rsp mov %rax, (%rdi) mov %rdx, 8(%rdi) ret -.Lgt1: jne .Lgt2 - - mov (%rsi), %rdx - mov 8(%rsi), %rcx - .byte 0xc4,98,179,0xf6,209 - .byte 0xc4,98,251,0xf6,194 - mov %rcx, %rdx - .byte 0xc4,226,163,0xf6,210 - add %r9, %r9 - adc %r10, %r10 - adc $0, %rdx - add %r9, %r8 - adc %r11, %r10 - adc $0, %rdx +.L2: mov (%rsi), %rax + mov %rax, %r8 + mul %rax + mov 8(%rsi), %r11 mov %rax, (%rdi) - mov %r8, 8(%rdi) + mov %r11, %rax + mov %rdx, %r9 + mul %rax + add $40, %rsp + mov %rax, %r10 + mov %r11, %rax + mov %rdx, %r11 + mul %r8 + xor %r8, %r8 + add %rax, %r9 + adc %rdx, %r10 + adc %r8, %r11 + add %rax, %r9 + mov %r9, 8(%rdi) + adc %rdx, %r10 mov %r10, 16(%rdi) + adc %r8, %r11 + mov %r11, 24(%rdi) + + ret + +.L3: mov (%rsi), %rax + mov %rax, %r10 + mul %rax + mov 8(%rsi), %r11 + mov %rax, (%rdi) + mov %r11, %rax + mov %rdx, 8(%rdi) + mul %rax + mov 16(%rsi), %rcx + mov %rax, 16(%rdi) + mov %rcx, %rax mov %rdx, 24(%rdi) + mul %rax + mov %rax, 32(%rdi) + mov %rdx, 40(%rdi) + + mov %r11, %rax + mul %r10 + mov %rax, %r8 + mov %rcx, %rax + mov %rdx, %r9 + mul %r10 + xor %r10, %r10 + add %rax, %r9 + mov %r11, %rax + mov %r10, %r11 + adc %rdx, %r10 + + mul %rcx + add $40, %rsp + add %rax, %r10 + adc %r11, %rdx + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %rdx, %rdx + adc %r11, %r11 + add %r8, 8(%rdi) + adc %r9, 16(%rdi) + adc %r10, 24(%rdi) + adc %rdx, 32(%rdi) + adc %r11, 40(%rdi) ret -.Lgt2: cmp $4, %rdx - jae .Lgt3 - - push %rbx - mov (%rsi), %rdx - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xc4,98,187,0xf6,78,16 - add %r11, %r8 - mov 8(%rsi), %rdx - .byte 0xc4,98,251,0xf6,94,16 - adc %rax, %r9 - adc $0, %r11 - test %ebx, %ebx - mov (%rsi), %rdx - .byte 0xc4,226,227,0xf6,202 - mov %rbx, (%rdi) - mov 8(%rsi), %rdx - .byte 0xc4,226,251,0xf6,218 - mov 16(%rsi), %rdx - .byte 0xc4,226,203,0xf6,210 - .byte 0x66,77,0x0f,0x38,0xf6,210 - .byte 0x66,77,0x0f,0x38,0xf6,192 - .byte 0x66,77,0x0f,0x38,0xf6,201 - .byte 0x66,77,0x0f,0x38,0xf6,219 - .byte 0xf3,73,0x0f,0x38,0xf6,202 - .byte 0xf3,73,0x0f,0x38,0xf6,192 - .byte 0xf3,73,0x0f,0x38,0xf6,217 - .byte 0xf3,73,0x0f,0x38,0xf6,243 - mov $0, %r8d - .byte 0xf3,73,0x0f,0x38,0xf6,208 - .byte 0x66,73,0x0f,0x38,0xf6,208 - mov %rcx, 8(%rdi) +.L4: mov (%rsi), %rax + mov %rax, %r11 + mul %rax + mov 8(%rsi), %rbx + mov %rax, (%rdi) + mov %rbx, %rax + mov %rdx, 8(%rdi) + mul %rax mov %rax, 16(%rdi) - mov %rbx, 
24(%rdi) - mov %rsi, 32(%rdi) + mov %rdx, 24(%rdi) + mov 16(%rsi), %rax + mul %rax + mov %rax, 32(%rdi) mov %rdx, 40(%rdi) + mov 24(%rsi), %rax + mul %rax + mov %rax, 48(%rdi) + mov %rbx, %rax + mov %rdx, 56(%rdi) + + mul %r11 + add $32, %rsp + mov %rax, %r8 + mov %rdx, %r9 + mov 16(%rsi), %rax + mul %r11 + xor %r10, %r10 + add %rax, %r9 + adc %rdx, %r10 + mov 24(%rsi), %rax + mul %r11 + xor %r11, %r11 + add %rax, %r10 + adc %rdx, %r11 + mov 16(%rsi), %rax + mul %rbx + xor %rcx, %rcx + add %rax, %r10 + adc %rdx, %r11 + adc $0, %rcx + mov 24(%rsi), %rax + mul %rbx pop %rbx + add %rax, %r11 + adc %rdx, %rcx + mov 16(%rsi), %rdx + mov 24(%rsi), %rax + mul %rdx + add %rax, %rcx + adc $0, %rdx + + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %r11, %r11 + adc %rcx, %rcx + mov $0, %eax + adc %rdx, %rdx + + adc %rax, %rax + add %r8, 8(%rdi) + adc %r9, 16(%rdi) + adc %r10, 24(%rdi) + adc %r11, 32(%rdi) + adc %rcx, 40(%rdi) + adc %rdx, 48(%rdi) + adc %rax, 56(%rdi) ret -.Lgt3: push %rbx - - lea -3(%rdx), %ebx - lea 5(%rdx), %ecx - mov %edx, %eax - and $-8, %ebx - shr $3, %ecx - neg %rbx - and $7, %eax - mov (%rsi), %rdx +.L0m4: + lea -16(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea (%rsi,%r11,8), %rsi - lea .Lmtab(%rip), %r10 - movslq (%r10,%rax,4), %r8 - lea (%r8, %r10), %r10 - jmp *%r10 + lea -4(%r11), %r8 + xor %r9d, %r9d + sub %r11, %r9 -.Lmf0: .byte 0xc4,98,187,0xf6,202 - add %rdx, %rdx - .byte 0xc4,98,171,0xf6,94,8 - lea 64(%rsi), %rsi - add %r9, %r10 - jmp .Lmb0 - -.Lmf3: .byte 0xc4,98,171,0xf6,218 - add %rdx, %rdx - mov %r10, (%rdi) - .byte 0xc4,98,187,0xf6,78,8 - lea 24(%rsi), %rsi - lea 24(%rdi), %rdi - add %r11, %r8 - jmp .Lmb3 - -.Lmf4: .byte 0xc4,98,187,0xf6,202 - add %rdx, %rdx - .byte 0xc4,98,171,0xf6,94,8 - mov %r8, (%rdi) - lea 32(%rsi), %rsi - lea 32(%rdi), %rdi - add %r9, %r10 - jmp .Lmb4 - -.Lmf5: .byte 0xc4,98,171,0xf6,218 - add %rdx, %rdx - .byte 0xc4,98,187,0xf6,78,8 - mov %r10, (%rdi) - lea 40(%rsi), %rsi - lea 40(%rdi), %rdi - add %r11, %r8 - jmp .Lmb5 - -.Lmf6: .byte 0xc4,98,187,0xf6,202 - add %rdx, %rdx - .byte 0xc4,98,171,0xf6,94,8 - mov %r8, (%rdi) - lea 48(%rsi), %rsi - lea 48(%rdi), %rdi - add %r9, %r10 - jmp .Lmb6 - -.Lmf7: .byte 0xc4,98,171,0xf6,218 - add %rdx, %rdx - .byte 0xc4,98,187,0xf6,78,8 - mov %r10, (%rdi) - lea 56(%rsi), %rsi - lea 56(%rdi), %rdi - add %r11, %r8 - jmp .Lmb7 - -.Lmf1: .byte 0xc4,98,171,0xf6,218 - add %rdx, %rdx - .byte 0xc4,98,187,0xf6,78,8 - mov %r10, (%rdi) - lea 8(%rsi), %rsi - lea 8(%rdi), %rdi - add %r11, %r8 - jmp .Lmb1 - -.Lmf2: .byte 0xc4,98,187,0xf6,202 - add %rdx, %rdx - .byte 0xc4,98,171,0xf6,94,8 - mov %r8, (%rdi) - lea 16(%rsi), %rsi - lea 16(%rdi), %rdi - dec %ecx - add %r9, %r10 - .byte 0xc4,98,187,0xf6,14 - - .align 16, 0x90 -.Ltop: mov %r10, -8(%rdi) - adc %r11, %r8 -.Lmb1: .byte 0xc4,98,171,0xf6,94,8 - adc %r9, %r10 - lea 64(%rsi), %rsi -.Lmb0: mov %r8, (%rdi) - mov %r10, 8(%rdi) - .byte 0xc4,98,187,0xf6,78,208 - lea 64(%rdi), %rdi - adc %r11, %r8 -.Lmb7: .byte 0xc4,98,171,0xf6,94,216 - mov %r8, -48(%rdi) - adc %r9, %r10 -.Lmb6: mov %r10, -40(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - adc %r11, %r8 -.Lmb5: .byte 0xc4,98,171,0xf6,94,232 - mov %r8, -32(%rdi) - adc %r9, %r10 -.Lmb4: .byte 0xc4,98,187,0xf6,78,240 - mov %r10, -24(%rdi) - adc %r11, %r8 -.Lmb3: .byte 0xc4,98,171,0xf6,94,248 - adc %r9, %r10 - mov %r8, -16(%rdi) - dec %ecx - .byte 0xc4,98,187,0xf6,14 - jnz .Ltop - -.Lend: mov %r10, -8(%rdi) - adc %r11, %r8 - - - - - lea .Latab(%rip), %r10 - movslq (%r10,%rax,4), %r11 - lea (%r11, %r10), %r11 - - 
mov $63, %eax - jmp *%r11 - -.Led0: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf7: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea -64(%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov (%rsi), %r9 - mov 8(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - jmp .Lb7 - - .align 16, 0x90 -.Ltp0: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led0 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx -.Lb0: mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp0 - -.Led1: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf0: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea -64(%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -8(%rsi), %r11 - mov (%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - jmp .Lb0 + mul %r13 + xor %ebp, %ebp + mov %rax, %rbx + mov 16(%rsi,%r9,8), %rax + mov %rdx, %r10 + jmp .LL3 .align 16, 0x90 -.Ltp1: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led1 -.Lb1: .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp1 - -.Led2: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf1: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea 8(%rbx), %rbx - lea -56(%rdi,%rbx,8), %rdi - mov -16(%rsi), %r9 - mov -8(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 
0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jmp .Lb1 +.Lmul_1_m3_top: + add %rax, %rbp + mov %r10, (%r12,%r9,8) + mov (%rsi,%r9,8), %rax + adc %rdx, %rcx + xor %ebx, %ebx + mul %r13 + xor %r10d, %r10d + mov %rbp, 8(%r12,%r9,8) + add %rax, %rcx + adc %rdx, %rbx + mov 8(%rsi,%r9,8), %rax + mov %rcx, 16(%r12,%r9,8) + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %r10 +.LL3: xor %ecx, %ecx + mul %r13 + add %rax, %r10 + mov 24(%rsi,%r9,8), %rax + adc %rdx, %rbp + mov %rbx, 24(%r12,%r9,8) + mul %r13 + add $4, %r9 + js .Lmul_1_m3_top + + add %rax, %rbp + mov %r10, (%r12) + adc %rdx, %rcx + mov %rbp, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 + lea -8(%rsi), %rsi + jmp .Ldowhile + + +.L1m4: + lea 8(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea 8(%rsi,%r11,8), %rsi + + lea -3(%r11), %r8 + + lea -3(%r11), %r9 + neg %r9 + + mov %rax, %r14 + mul %r13 + mov %rdx, %rcx + xor %ebp, %ebp + mov %rax, 8(%rdi) + jmp .Lm0 .align 16, 0x90 -.Ltp2: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led2 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) -.Lb2: .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp2 - -.Led3: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf2: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - or %ebx, %ecx - jz .Lcor3 - lea -56(%rdi,%rbx,8), %rdi - mov -24(%rsi), %r11 - mov -16(%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - jmp .Lb2 +.Lmul_2_m0_top: + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov -24(%rsi,%r9,8), %rax + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rax, %rcx + mov %rbx, -24(%r12,%r9,8) + adc %rdx, %rbp +.Lm0: mov -16(%rsi,%r9,8), %rax + mul %r13 + mov $0, %r10d + add %rax, %rcx + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + adc $0, %r10d + mov $0, %ebx + mov %rcx, -16(%r12,%r9,8) + mul %r14 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mov $0, %ecx + mul %r13 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r14 + add %rax, %r10 + mov %rbp, -8(%r12,%r9,8) + adc %rdx, %rbx +.Lm2x: mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + add $4, %r9 + mov -32(%rsi,%r9,8), %rax + mov %r10, -32(%r12,%r9,8) + js .Lmul_2_m0_top + + mul 
%r14 + add %rax, %rbx + adc %rdx, %rcx + mov %rbx, -8(%r12) + mov %rcx, (%r12) + + lea -16(%rsi), %rsi + lea 0(%r12), %r12 + jmp .Ldowhile_end + + +.L2m4: + lea -16(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea (%rsi,%r11,8), %rsi + + lea -4(%r11), %r8 + + lea -2(%r11), %r9 + neg %r9 + + mul %r13 + mov %rax, %rbp + mov (%rsi,%r9,8), %rax + mov %rdx, %rcx + jmp .LL1 .align 16, 0x90 -.Ltp3: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led3 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) -.Lb3: .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp3 - -.Led4: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf3: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -32(%rsi), %r9 - mov -24(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,209 - jmp .Lb3 +.Lmul_1_m1_top: + add %rax, %rbp + mov %r10, (%r12,%r9,8) + mov (%rsi,%r9,8), %rax + adc %rdx, %rcx +.LL1: xor %ebx, %ebx + mul %r13 + xor %r10d, %r10d + mov %rbp, 8(%r12,%r9,8) + add %rax, %rcx + adc %rdx, %rbx + mov 8(%rsi,%r9,8), %rax + mov %rcx, 16(%r12,%r9,8) + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %r10 + xor %ecx, %ecx + mul %r13 + add %rax, %r10 + mov 24(%rsi,%r9,8), %rax + adc %rdx, %rbp + mov %rbx, 24(%r12,%r9,8) + mul %r13 + add $4, %r9 + js .Lmul_1_m1_top + + add %rax, %rbp + mov %r10, (%r12) + adc %rdx, %rcx + mov %rbp, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 + lea -8(%rsi), %rsi + jmp .Ldowhile_mid + + +.L3m4: + lea 8(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea 8(%rsi,%r11,8), %rsi + + lea -5(%r11), %r8 + + lea -1(%r11), %r9 + neg %r9 + + mov %rax, %r14 + mul %r13 + mov %rdx, %r10 + xor %ebx, %ebx + xor %ecx, %ecx + mov %rax, 8(%rdi) + jmp .Lm2 .align 16, 0x90 -.Ltp4: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led4 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) -.Lb4: .byte 0xc4,98,171,0xf6,94,232 - .byte 
0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp4 - -.Led5: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf4: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -40(%rsi), %r11 - mov -32(%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - jmp .Lb4 +.Lmul_2_m2_top: + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov -24(%rsi,%r9,8), %rax + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rax, %rcx + mov %rbx, -24(%r12,%r9,8) + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + mul %r13 + mov $0, %r10d + add %rax, %rcx + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + adc $0, %r10d + mov $0, %ebx + mov %rcx, -16(%r12,%r9,8) + mul %r14 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mov $0, %ecx + mul %r13 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r14 + add %rax, %r10 + mov %rbp, -8(%r12,%r9,8) + adc %rdx, %rbx +.Lm2: mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + add $4, %r9 + mov -32(%rsi,%r9,8), %rax + mov %r10, -32(%r12,%r9,8) + js .Lmul_2_m2_top + + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov %rbx, -8(%r12) + mov %rcx, (%r12) + + lea -16(%rsi), %rsi + jmp .Ldowhile_mid + +.Ldowhile: + + lea 4(%r8), %r9 + neg %r9 + + mov 16(%rsi,%r9,8), %r13 + mov 24(%rsi,%r9,8), %r14 + mov 24(%rsi,%r9,8), %rax + mul %r13 + xor %r10d, %r10d + add %rax, 24(%r12,%r9,8) + adc %rdx, %r10 + xor %ebx, %ebx + xor %ecx, %ecx + jmp .Lam2 .align 16, 0x90 -.Ltp5: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led5 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) -.Lb5: .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp5 - -.Led6: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf5: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -48(%rsi), %r9 - mov -40(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - .byte 
0x66,77,0x0f,0x38,0xf6,209 - jmp .Lb5 +.Laddmul_2_m2_top: + add %r10, (%r12,%r9,8) + adc %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rbx, 8(%r12,%r9,8) + adc %rax, %rcx + adc %rdx, %rbp + mov 16(%rsi,%r9,8), %rax + mov $0, %r10d + mul %r13 + add %rax, %rcx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r14 + add %rcx, 16(%r12,%r9,8) + adc %rax, %rbp + mov 24(%rsi,%r9,8), %rax + adc %rdx, %r10 + mul %r13 + mov $0, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %ecx + mov 24(%rsi,%r9,8), %rax + adc $0, %ebx + mul %r14 + add %rbp, 24(%r12,%r9,8) + adc %rax, %r10 + adc %rdx, %rbx +.Lam2: mov 32(%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + mov 32(%rsi,%r9,8), %rax + adc %rdx, %rbx + adc $0, %ecx + mul %r14 + add $4, %r9 + js .Laddmul_2_m2_top + + add %r10, (%r12) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 + + add $-2, %r8d + +.Ldowhile_mid: + + lea 2(%r8), %r9 + neg %r9 + + mov (%rsi,%r9,8), %r13 + mov 8(%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %rax + mul %r13 + xor %ecx, %ecx + add %rax, 8(%r12,%r9,8) + adc %rdx, %rcx + xor %ebp, %ebp + jmp .L20 .align 16, 0x90 -.Ltp6: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led6 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi -.Lb6: .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp6 - -.Led7: .byte 0xf3,76,0x0f,0x38,0xf6,7 - .byte 0xf3,76,0x0f,0x38,0xf6,201 -.Lf6: mov %r8, (%rdi) - adc %rcx, %r9 - mov %r9, 8(%rdi) - lea (%rsi,%rbx,8), %rsi - mov %ebx, %ecx - lea -56(%rdi,%rbx,8), %rdi - mov -56(%rsi), %r11 - mov -48(%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xc4,98,171,0xf6,94,216 - jmp .Lb6 +.Laddmul_2_m0_top: + add %r10, (%r12,%r9,8) + adc %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rbx, 8(%r12,%r9,8) + adc %rax, %rcx + adc %rdx, %rbp +.L20: mov 16(%rsi,%r9,8), %rax + mov $0, %r10d + mul %r13 + add %rax, %rcx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r14 + add %rcx, 16(%r12,%r9,8) + adc %rax, %rbp + mov 24(%rsi,%r9,8), %rax + adc %rdx, %r10 + mul %r13 + mov $0, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %ecx + mov 24(%rsi,%r9,8), %rax + adc $0, %ebx + mul %r14 + add %rbp, 24(%r12,%r9,8) + adc %rax, %r10 + adc %rdx, %rbx + mov 32(%rsi,%r9,8), %rax + mul 
%r13 + add %rax, %r10 + mov 32(%rsi,%r9,8), %rax + adc %rdx, %rbx + adc $0, %ecx + mul %r14 + add $4, %r9 + js .Laddmul_2_m0_top + + add %r10, (%r12) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 +.Ldowhile_end: + + add $-2, %r8d + jne .Ldowhile + + + mov -16(%rsi), %r13 + mov -8(%rsi), %r14 + mov -8(%rsi), %rax + mul %r13 + xor %r10d, %r10d + add %rax, -8(%r12) + adc %rdx, %r10 + xor %ebx, %ebx + xor %ecx, %ecx + mov (%rsi), %rax + mul %r13 + add %rax, %r10 + mov (%rsi), %rax + adc %rdx, %rbx + mul %r14 + add %r10, (%r12) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%r12) + mov %rcx, 16(%r12) + + + lea -4(%r11,%r11), %r9 + + mov 8(%rdi), %r11 + lea -8(%rsi), %rsi + lea (%rdi,%r9,8), %rdi + neg %r9 + mov (%rsi,%r9,4), %rax + mul %rax + test $2, %r9b + jnz .Lodd + +.Levn: add %r11, %r11 + sbb %ebx, %ebx + add %rdx, %r11 + mov %rax, (%rdi,%r9,8) + jmp .Ld0 + +.Lodd: add %r11, %r11 + sbb %ebp, %ebp + add %rdx, %r11 + mov %rax, (%rdi,%r9,8) + lea -2(%r9), %r9 + jmp .Ld1 .align 16, 0x90 -.Ltp7: .byte 0xf3,76,0x0f,0x38,0xf6,87,248 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, -8(%rdi) - jrcxz .Led7 - .byte 0xc4,98,171,0xf6,94,8 - .byte 0xf3,76,0x0f,0x38,0xf6,7 - lea 8(%rcx), %ecx - mov %r8, (%rdi) -.Lb7: .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,78,16 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,87,8 - mov %r10, 8(%rdi) - .byte 0xc4,98,171,0xf6,94,24 - lea 64(%rsi), %rsi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,16 - mov %r8, 16(%rdi) - .byte 0xc4,98,187,0xf6,78,224 - .byte 0xf3,76,0x0f,0x38,0xf6,87,24 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 24(%rdi) - .byte 0xc4,98,171,0xf6,94,232 - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xf3,76,0x0f,0x38,0xf6,71,32 - mov %r8, 32(%rdi) - .byte 0xc4,98,187,0xf6,78,240 - .byte 0xf3,76,0x0f,0x38,0xf6,87,40 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 40(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,48 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 48(%rdi) - lea 64(%rdi), %rdi - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,187,0xf6,14 - jmp .Ltp7 - -.Lcor3:lea -64(%rdi), %rdi - mov -24(%rsi), %r11 - mov -16(%rsi), %rdx - .byte 0xc4,66,251,0xf7,211 - .byte 0xc4,66,250,0xf7,219 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - lea (%r10,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,71,56 - .byte 0xc4,98,171,0xf6,94,248 - mov %r8, 56(%rdi) - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,98,227,0xf6,14 - .byte 0xf3,76,0x0f,0x38,0xf6,87,64 - .byte 0x66,73,0x0f,0x38,0xf6,219 - mov %r10, 64(%rdi) - .byte 0xf3,72,0x0f,0x38,0xf6,95,72 - .byte 0xf3,76,0x0f,0x38,0xf6,201 - adc %rcx, %r9 - mov %r9, 80(%rdi) - - mov -16(%rsi), %r9 - mov -8(%rsi), %rdx - .byte 0xc4,66,251,0xf7,193 - .byte 0xc4,66,250,0xf7,201 - and %rdx, %r9 - .byte 0xc4,98,171,0xf6,218 - lea (%r8,%rdx,2), %rdx - .byte 0x66,77,0x0f,0x38,0xf6,209 - .byte 0xc4,226,187,0xf6,6 - .byte 0xf3,76,0x0f,0x38,0xf6,211 - .byte 0x66,77,0x0f,0x38,0xf6,195 - mov %r10, 72(%rdi) - .byte 0xf3,76,0x0f,0x38,0xf6,71,80 - .byte 0xf3,72,0x0f,0x38,0xf6,193 - mov %r8, 80(%rdi) - adc %rcx, %rax - - mov -8(%rsi), %r11 - mov (%rsi), %rdx - sar $63, %r11 - and %rdx, %r11 - .byte 0xc4,98,187,0xf6,202 - .byte 0x66,77,0x0f,0x38,0xf6,195 - .byte 0xf3,76,0x0f,0x38,0xf6,192 - mov %r8, 88(%rdi) - .byte 0x66,76,0x0f,0x38,0xf6,201 - .byte 0xf3,76,0x0f,0x38,0xf6,201 - mov %r9, 96(%rdi) +.Ltop: mov (%rsi,%r9,4), %rax + mul %rax + add %ebp, %ebp + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, 
(%rdi,%r9,8) +.Ld0: mov %r11, 8(%rdi,%r9,8) + mov 16(%rdi,%r9,8), %r10 + adc %r10, %r10 + mov 24(%rdi,%r9,8), %r11 + adc %r11, %r11 + nop + sbb %ebp, %ebp + mov 8(%rsi,%r9,4), %rax + mul %rax + add %ebx, %ebx + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, 16(%rdi,%r9,8) +.Ld1: mov %r11, 24(%rdi,%r9,8) + mov 32(%rdi,%r9,8), %r10 + adc %r10, %r10 + mov 40(%rdi,%r9,8), %r11 + adc %r11, %r11 + sbb %ebx, %ebx + add $4, %r9 + js .Ltop + + mov (%rsi), %rax + mul %rax + add %ebp, %ebp + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, (%rdi) + mov %r11, 8(%rdi) + mov 16(%rdi), %r10 + adc %r10, %r10 + sbb %ebp, %ebp + neg %ebp + mov 8(%rsi), %rax + mul %rax + add %ebx, %ebx + adc %rax, %r10 + adc %rbp, %rdx + mov %r10, 16(%rdi) + mov %rdx, 24(%rdi) + pop %r14 + pop %r13 + pop %r12 + pop %rbp pop %rbx ret - - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Lmtab:.long .Lmf7-.Lmtab - .long .Lmf0-.Lmtab - .long .Lmf1-.Lmtab - .long .Lmf2-.Lmtab - .long .Lmf3-.Lmtab - .long .Lmf4-.Lmtab - .long .Lmf5-.Lmtab - .long .Lmf6-.Lmtab -.Latab:.long .Lf6-.Latab - .long .Lf7-.Latab - .long .Lf0-.Latab - .long .Lf1-.Latab - .long .Lf2-.Latab - .long .Lf3-.Latab - .long .Lf4-.Latab - .long .Lf5-.Latab - .text .size __gmpn_sqr_basecase,.-__gmpn_sqr_basecase diff --git a/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s b/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s index 7db64b894e..cbef8af042 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s @@ -189,20 +189,20 @@ __gmpn_sub_err1_n: .align 32, 0x90 .Lloop: - mov (%rsi,%r9,8), %r14 shr $1, %al mov -8(%r8), %r10 mov $0, %r13d + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 sbb (%rdx,%r9,8), %r14 cmovnc %r13, %r10 - mov %r14, (%rdi,%r9,8) - mov 8(%rsi,%r9,8), %r15 - mov 16(%rsi,%r9,8), %r14 sbb 8(%rdx,%r9,8), %r15 mov -16(%r8), %r11 + mov %r14, (%rdi,%r9,8) + mov 16(%rsi,%r9,8), %r14 + mov %r15, 8(%rdi,%r9,8) cmovnc %r13, %r11 mov -24(%r8), %r12 - mov %r15, 8(%rdi,%r9,8) sbb 16(%rdx,%r9,8), %r14 cmovnc %r13, %r12 mov 24(%rsi,%r9,8), %r15 @@ -215,12 +215,12 @@ __gmpn_sub_err1_n: adc $0, %rbp add %r12, %rbx adc $0, %rbp - lea -32(%r8), %r8 mov %r14, 16(%rdi,%r9,8) add %r13, %rbx + lea -32(%r8), %r8 adc $0, %rbp + mov %r15, 24(%rdi,%r9,8) add $4, %r9 - mov %r15, -8(%rdi,%r9,8) jnz .Lloop .Lend: diff --git a/ext/gmp/gen/x86_64-linux/mpn/sub_n.s b/ext/gmp/gen/x86_64-linux/mpn/sub_n.s index 2ae18233ca..8c1db0a02f 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sub_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sub_n.s @@ -94,20 +94,18 @@ __gmpn_sub_nc: - mov %ecx, %eax - shr $3, %rcx - and $7, %eax - - lea .Ltab(%rip), %r9 - neg %r8 + shr $2, %rcx + and $3, %eax + bt $0, %r8 + jrcxz .Llt4 - movslq (%r9,%rax,4), %rax - lea (%r9,%rax), %rax - jmp *%rax + mov (%rsi), %r8 + mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid .size __gmpn_sub_nc,.-__gmpn_sub_nc - .align 16, 0x90 .globl __gmpn_sub_n .type __gmpn_sub_n,@function @@ -115,159 +113,82 @@ __gmpn_sub_nc: __gmpn_sub_n: - mov %ecx, %eax - shr $3, %rcx - and $7, %eax - - lea .Ltab(%rip), %r9 + shr $2, %rcx + and $3, %eax + jrcxz .Llt4 - movslq (%r9,%rax,4), %rax - lea (%r9,%rax), %rax - jmp *%rax - - -.L0: mov (%rsi), %r8 + mov (%rsi), %r8 mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid + +.Llt4: dec %eax + mov (%rsi), %r8 + jnz .L2 sbb (%rdx), %r8 - jmp .Le0 + mov %r8, (%rdi) + adc %eax, %eax + + ret -.L4: mov (%rsi), %r8 +.L2: dec %eax mov 8(%rsi), %r9 + jnz .L3 sbb (%rdx), %r8 - lea -32(%rsi), %rsi - lea -32(%rdx), %rdx - lea -32(%rdi), %rdi - inc %rcx - jmp .Le4 - -.L5: mov (%rsi), %r11 - mov 
8(%rsi), %r8 - mov 16(%rsi), %r9 - sbb (%rdx), %r11 - lea -24(%rsi), %rsi - lea -24(%rdx), %rdx - lea -24(%rdi), %rdi - inc %rcx - jmp .Le5 - -.L6: mov (%rsi), %r10 - sbb (%rdx), %r10 - mov 8(%rsi), %r11 - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi - inc %rcx - jmp .Le6 - -.L7: mov (%rsi), %r9 - mov 8(%rsi), %r10 - sbb (%rdx), %r9 - sbb 8(%rdx), %r10 - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi - inc %rcx - jmp .Le7 + sbb 8(%rdx), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + adc %eax, %eax + + ret - .align 16, 0x90 -.Ltop: -.Le3: mov %r9, 40(%rdi) -.Le2: mov %r10, 48(%rdi) -.Le1: mov (%rsi), %r8 - mov 8(%rsi), %r9 +.L3: mov 16(%rsi), %r10 sbb (%rdx), %r8 - mov %r11, 56(%rdi) - lea 64(%rdi), %rdi -.Le0: mov 16(%rsi), %r10 sbb 8(%rdx), %r9 sbb 16(%rdx), %r10 mov %r8, (%rdi) -.Le7: mov 24(%rsi), %r11 mov %r9, 8(%rdi) -.Le6: mov 32(%rsi), %r8 - mov 40(%rsi), %r9 - sbb 24(%rdx), %r11 mov %r10, 16(%rdi) -.Le5: sbb 32(%rdx), %r8 - mov %r11, 24(%rdi) -.Le4: mov 48(%rsi), %r10 - mov 56(%rsi), %r11 - mov %r8, 32(%rdi) - lea 64(%rsi), %rsi - sbb 40(%rdx), %r9 - sbb 48(%rdx), %r10 - sbb 56(%rdx), %r11 - lea 64(%rdx), %rdx - dec %rcx - jnz .Ltop - -.Lend: mov %r9, 40(%rdi) - mov %r10, 48(%rdi) - mov %r11, 56(%rdi) - mov %ecx, %eax - adc %ecx, %eax + setc %al ret .align 16, 0x90 -.L3: mov (%rsi), %r9 - mov 8(%rsi), %r10 - mov 16(%rsi), %r11 - sbb (%rdx), %r9 - sbb 8(%rdx), %r10 - sbb 16(%rdx), %r11 - jrcxz .Lx3 - lea 24(%rsi), %rsi - lea 24(%rdx), %rdx - lea -40(%rdi), %rdi - jmp .Le3 -.Lx3: mov %r9, (%rdi) - mov %r10, 8(%rdi) - mov %r11, 16(%rdi) - mov %ecx, %eax - adc %ecx, %eax - - ret +.Ltop: sbb (%rdx), %r8 + sbb 8(%rdx), %r9 + sbb 16(%rdx), %r10 + sbb 24(%rdx), %r11 + mov %r8, (%rdi) + lea 32(%rsi), %rsi + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + dec %rcx + mov %r11, 24(%rdi) + lea 32(%rdx), %rdx + mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi +.Lmid: mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + jnz .Ltop - .align 16, 0x90 -.L1: mov (%rsi), %r11 - sbb (%rdx), %r11 - jrcxz .Lx1 - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea -56(%rdi), %rdi - jmp .Le1 -.Lx1: mov %r11, (%rdi) - mov %ecx, %eax - adc %ecx, %eax - - ret +.Lend: lea 32(%rsi), %rsi + sbb (%rdx), %r8 + sbb 8(%rdx), %r9 + sbb 16(%rdx), %r10 + sbb 24(%rdx), %r11 + lea 32(%rdx), %rdx + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + mov %r11, 24(%rdi) + lea 32(%rdi), %rdi - .align 16, 0x90 -.L2: mov (%rsi), %r10 - mov 8(%rsi), %r11 - sbb (%rdx), %r10 - sbb 8(%rdx), %r11 - jrcxz .Lx2 - lea 16(%rsi), %rsi - lea 16(%rdx), %rdx - lea -48(%rdi), %rdi - jmp .Le2 -.Lx2: mov %r10, (%rdi) - mov %r11, 8(%rdi) - mov %ecx, %eax - adc %ecx, %eax + inc %eax + dec %eax + jnz .Llt4 + adc %eax, %eax ret .size __gmpn_sub_n,.-__gmpn_sub_n - .section .data.rel.ro.local,"a",@progbits - .align 8, 0x90 -.Ltab: .long .L0-.Ltab - .long .L1-.Ltab - .long .L2-.Ltab - .long .L3-.Ltab - .long .L4-.Ltab - .long .L5-.Ltab - .long .L6-.Ltab - .long .L7-.Ltab diff --git a/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s b/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s index cabbb914a0..d257a0544b 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s @@ -58,14 +58,6 @@ - - - - - - - - @@ -76,7 +68,7 @@ .text - .align 8, 0x90 + .align 16, 0x90 .globl __gmpn_sublsh1_n .type __gmpn_sublsh1_n,@function @@ -84,107 +76,100 @@ __gmpn_sublsh1_n: push %rbx - push %r12 + push %rbp + mov (%rdx), %r8 mov %ecx, %eax - lea 24(%rsi,%rcx,8), %rsi - lea 24(%rdx,%rcx,8), %rdx - lea 24(%rdi,%rcx,8), %rdi + lea 
(%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx neg %rcx - - xor %r11d, %r11d - - mov -24(%rdx,%rcx,8), %r8 - shrd $63, %r8, %r11 - + xor %ebp, %ebp and $3, %eax - je .Lb0 + je .Lb00 cmp $2, %eax - jc .Lb1 - je .Lb2 - -.Lb3: mov -16(%rdx,%rcx,8), %r9 - shrd $63, %r9, %r8 - mov -8(%rdx,%rcx,8), %r10 - shrd $63, %r10, %r9 - mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - mov -8(%rsi,%rcx,8), %r12 - sbb %r9, %r12 - mov %r12, -8(%rdi,%rcx,8) - mov %r10, %r11 + jc .Lb01 + je .Lb10 + +.Lb11: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 sbb %eax, %eax + mov (%rsi,%rcx,8), %rbp + mov 8(%rsi,%rcx,8), %rbx + sub %r8, %rbp + sbb %r9, %rbx + mov %rbp, (%rdi,%rcx,8) + mov %rbx, 8(%rdi,%rcx,8) + mov 16(%rsi,%rcx,8), %rbp + sbb %r10, %rbp + mov %rbp, 16(%rdi,%rcx,8) + sbb %ebp, %ebp add $3, %rcx - js .Ltop - jmp .Lend + jmp .Lent -.Lb1: mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov %r8, %r11 - sbb %eax, %eax - inc %rcx - js .Ltop - jmp .Lend - -.Lb2: mov -16(%rdx,%rcx,8), %r9 - shrd $63, %r9, %r8 - mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - mov %r9, %r11 +.Lb10: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 sbb %eax, %eax + mov (%rsi,%rcx,8), %rbp + mov 8(%rsi,%rcx,8), %rbx + sub %r8, %rbp + sbb %r9, %rbx + mov %rbp, (%rdi,%rcx,8) + mov %rbx, 8(%rdi,%rcx,8) + sbb %ebp, %ebp add $2, %rcx - js .Ltop - jmp .Lend + jmp .Lent - .align 16, 0x90 -.Ltop: mov -24(%rdx,%rcx,8), %r8 - shrd $63, %r8, %r11 -.Lb0: mov -16(%rdx,%rcx,8), %r9 - shrd $63, %r9, %r8 - mov -8(%rdx,%rcx,8), %r10 - shrd $63, %r10, %r9 - mov (%rdx,%rcx,8), %rbx - shrd $63, %rbx, %r10 - - add %eax, %eax - - mov -24(%rsi,%rcx,8), %r12 - sbb %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - - mov -8(%rsi,%rcx,8), %r12 - sbb %r9, %r12 - mov %r12, -8(%rdi,%rcx,8) - - mov (%rsi,%rcx,8), %r12 - sbb %r10, %r12 - mov %r12, (%rdi,%rcx,8) - - mov %rbx, %r11 +.Lb01: add %r8, %r8 sbb %eax, %eax + mov (%rsi,%rcx,8), %rbp + sub %r8, %rbp + mov %rbp, (%rdi,%rcx,8) + sbb %ebp, %ebp + inc %rcx +.Lent: jns .Lend + + .align 16, 0x90 +.Ltop: add %eax, %eax + mov (%rdx,%rcx,8), %r8 +.Lb00: adc %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + mov 24(%rdx,%rcx,8), %r11 + adc %r11, %r11 + + sbb %eax, %eax + add %ebp, %ebp + + mov (%rsi,%rcx,8), %rbp + mov 8(%rsi,%rcx,8), %rbx + sbb %r8, %rbp + sbb %r9, %rbx + mov %rbp, (%rdi,%rcx,8) + mov %rbx, 8(%rdi,%rcx,8) + mov 16(%rsi,%rcx,8), %rbp + mov 24(%rsi,%rcx,8), %rbx + sbb %r10, %rbp + sbb %r11, %rbx + mov %rbp, 16(%rdi,%rcx,8) + mov %rbx, 24(%rdi,%rcx,8) + + sbb %ebp, %ebp add $4, %rcx js .Ltop -.Lend: shr $63, %r11 - pop %r12 - pop %rbx - sub %r11d, %eax +.Lend: add %ebp, %eax neg %eax + + pop %rbp + pop %rbx ret .size __gmpn_sublsh1_n,.-__gmpn_sublsh1_n - diff --git a/ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s b/ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s deleted file mode 100644 index d5bf3a7be3..0000000000 --- a/ext/gmp/gen/x86_64-linux/mpn/sublsh2_n.s +++ /dev/null @@ -1,190 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - .text - .align 8, 0x90 - .globl __gmpn_sublsh2_n - 
.type __gmpn_sublsh2_n,@function - -__gmpn_sublsh2_n: - - - push %rbx - push %r12 - - mov %ecx, %eax - lea 24(%rsi,%rcx,8), %rsi - lea 24(%rdx,%rcx,8), %rdx - lea 24(%rdi,%rcx,8), %rdi - neg %rcx - - xor %r11d, %r11d - - mov -24(%rdx,%rcx,8), %r8 - shrd $62, %r8, %r11 - - and $3, %eax - je .Lb0 - cmp $2, %eax - jc .Lb1 - je .Lb2 - -.Lb3: mov -16(%rdx,%rcx,8), %r9 - shrd $62, %r9, %r8 - mov -8(%rdx,%rcx,8), %r10 - shrd $62, %r10, %r9 - mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - mov -8(%rsi,%rcx,8), %r12 - sbb %r9, %r12 - mov %r12, -8(%rdi,%rcx,8) - mov %r10, %r11 - sbb %eax, %eax - add $3, %rcx - js .Ltop - jmp .Lend - -.Lb1: mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov %r8, %r11 - sbb %eax, %eax - inc %rcx - js .Ltop - jmp .Lend - -.Lb2: mov -16(%rdx,%rcx,8), %r9 - shrd $62, %r9, %r8 - mov -24(%rsi,%rcx,8), %r12 - sub %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - mov %r9, %r11 - sbb %eax, %eax - add $2, %rcx - js .Ltop - jmp .Lend - - .align 16, 0x90 -.Ltop: mov -24(%rdx,%rcx,8), %r8 - shrd $62, %r8, %r11 -.Lb0: mov -16(%rdx,%rcx,8), %r9 - shrd $62, %r9, %r8 - mov -8(%rdx,%rcx,8), %r10 - shrd $62, %r10, %r9 - mov (%rdx,%rcx,8), %rbx - shrd $62, %rbx, %r10 - - add %eax, %eax - - mov -24(%rsi,%rcx,8), %r12 - sbb %r11, %r12 - mov %r12, -24(%rdi,%rcx,8) - - mov -16(%rsi,%rcx,8), %r12 - sbb %r8, %r12 - mov %r12, -16(%rdi,%rcx,8) - - mov -8(%rsi,%rcx,8), %r12 - sbb %r9, %r12 - mov %r12, -8(%rdi,%rcx,8) - - mov (%rsi,%rcx,8), %r12 - sbb %r10, %r12 - mov %r12, (%rdi,%rcx,8) - - mov %rbx, %r11 - sbb %eax, %eax - - add $4, %rcx - js .Ltop - -.Lend: shr $62, %r11 - pop %r12 - pop %rbx - sub %r11d, %eax - neg %eax - - ret - .size __gmpn_sublsh2_n,.-__gmpn_sublsh2_n - diff --git a/ext/gmp/gen/x86_64-linux/mpn/submul_1.s b/ext/gmp/gen/x86_64-linux/mpn/submul_1.s index 07aaadb7bb..5e34932b8d 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/submul_1.s +++ b/ext/gmp/gen/x86_64-linux/mpn/submul_1.s @@ -68,6 +68,7 @@ + @@ -78,10 +79,8 @@ - - - - + + @@ -89,6 +88,7 @@ + .text .align 16, 0x90 @@ -97,115 +97,100 @@ __gmpn_submul_1: - + + + + + + mov (%rsi), %rax push %rbx - push %rbp - push %r12 - push %r13 - - mov %rdx, %rbp - mov %rcx, %rdx - - test $1, %bpl - jnz .Lbx1 - -.Lbx0: shr $2, %rbp - jc .Lb10 - -.Lb00: .byte 0xc4,98,147,0xf6,38 - .byte 0xc4,226,227,0xf6,70,8 - add %r12, %rbx - adc $0, %rax - mov (%rdi), %r12 - mov 8(%rdi), %rcx - .byte 0xc4,98,179,0xf6,70,16 - lea -16(%rdi), %rdi - lea 16(%rsi), %rsi - sub %r13, %r12 - jmp .Llo0 - -.Lbx1: shr $2, %rbp - jc .Lb11 - -.Lb01: .byte 0xc4,98,163,0xf6,22 - jnz .Lgt1 -.Ln1: sub %r11, (%rdi) - mov $0, %eax - adc %r10, %rax - jmp .Lret + mov %rdx, %rbx + + mul %rcx + mov %rbx, %r11 -.Lgt1: .byte 0xc4,98,147,0xf6,102,8 - .byte 0xc4,226,227,0xf6,70,16 - lea 24(%rsi), %rsi - add %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov (%rdi), %r10 - mov 8(%rdi), %r12 - mov 16(%rdi), %rcx - lea -8(%rdi), %rdi - sub %r11, %r10 - jmp .Llo1 - -.Lb11: .byte 0xc4,226,227,0xf6,6 - mov (%rdi), %rcx - .byte 0xc4,98,179,0xf6,70,8 - lea 8(%rsi), %rsi - lea -24(%rdi), %rdi - inc %rbp - sub %rbx, %rcx - jmp .Llo3 - -.Lb10: .byte 0xc4,98,179,0xf6,6 - .byte 0xc4,98,163,0xf6,86,8 - lea -32(%rdi), %rdi - mov $0, %eax - clc - jz .Lend + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jz .Lb2 + jg .Lb3 + +.Lb1: dec %r11 + jne .Lgt1 + sub %rax, (%rdi) + jmp .Lret +.Lgt1: lea 8(%rsi,%r11,8), %rsi + 
lea -8(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (%rsi,%r11,8), %rax + mov %rdx, %r8 + jmp .LL1 + +.Lb0: lea (%rsi,%r11,8), %rsi + lea -16(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp .LL0 + +.Lb3: lea -8(%rsi,%r11,8), %rsi + lea -24(%rdi,%r11,8), %rdi + neg %r11 + mov %rax, %rbx + mov %rdx, %r10 + jmp .LL3 + +.Lb2: lea -16(%rsi,%r11,8), %rsi + lea -32(%rdi,%r11,8), %rdi + neg %r11 + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %rdx, %r9 + jmp .LL2 .align 16, 0x90 -.Ltop: adc %rax, %r9 - lea 32(%rdi), %rdi - adc %r8, %r11 - .byte 0xc4,98,147,0xf6,102,16 - mov (%rdi), %r8 - .byte 0xc4,226,227,0xf6,70,24 - lea 32(%rsi), %rsi - adc %r10, %r13 - adc %r12, %rbx - adc $0, %rax - mov 8(%rdi), %r10 - mov 16(%rdi), %r12 - sub %r9, %r8 - mov 24(%rdi), %rcx - mov %r8, (%rdi) - sbb %r11, %r10 -.Llo1: .byte 0xc4,98,179,0xf6,6 - mov %r10, 8(%rdi) - sbb %r13, %r12 -.Llo0: mov %r12, 16(%rdi) - sbb %rbx, %rcx -.Llo3: .byte 0xc4,98,163,0xf6,86,8 - mov %rcx, 24(%rdi) - dec %rbp - jnz .Ltop - -.Lend: adc %rax, %r9 - adc %r8, %r11 - mov 32(%rdi), %r8 - mov %r10, %rax - adc $0, %rax - mov 40(%rdi), %r10 - sub %r9, %r8 - mov %r8, 32(%rdi) - sbb %r11, %r10 - mov %r10, 40(%rdi) - adc $0, %rax - -.Lret: pop %r13 - pop %r12 - pop %rbp +.Ltop: sub %r10, (%rdi,%r11,8) + adc %rax, %r9 + mov (%rsi,%r11,8), %rax + adc %rdx, %r8 + mov $0, %r10d +.LL1: mul %rcx + sub %r9, 8(%rdi,%r11,8) + adc %rax, %r8 + adc %rdx, %rbx +.LL0: mov 8(%rsi,%r11,8), %rax + mul %rcx + sub %r8, 16(%rdi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 +.LL3: mov 16(%rsi,%r11,8), %rax + mul %rcx + sub %rbx, 24(%rdi,%r11,8) + mov $0, %r8d + mov %r8, %rbx + adc %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %r8, %r9 + adc %rdx, %r9 +.LL2: mul %rcx + add $4, %r11 + js .Ltop + + sub %r10, (%rdi,%r11,8) + adc %rax, %r9 + adc %r8, %rdx + sub %r9, 8(%rdi,%r11,8) +.Lret: adc $0, %rdx + mov %rdx, %rax + pop %rbx - + + ret .size __gmpn_submul_1,.-__gmpn_submul_1 diff --git a/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s b/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s index 360b9b8869..4db0497767 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s @@ -92,7 +92,6 @@ - .text @@ -106,54 +105,46 @@ __gmpn_xnor_n: mov (%rdx), %r8 not %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: xor (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: xor (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 +.Ltop: mov (%rdx,%rcx,8), %r8 not %r8 -.Lb00: mov 8(%rdx), %r9 +.Lb00: mov 8(%rdx,%rcx,8), %r9 not %r9 - xor (%rsi), %r8 - xor 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 + xor (%rsi,%rcx,8), %r8 + xor 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 not %r8 -.Le10: mov 24(%rdx), %r9 +.Le10: mov 24(%rdx,%rcx,8), %r9 not %r9 - lea 32(%rdx), %rdx - 
xor 16(%rsi), %r8 - xor 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + xor 16(%rsi,%rcx,8), %r8 + xor 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret diff --git a/ext/gmp/gen/x86_64-linux/mpn/xor_n.s b/ext/gmp/gen/x86_64-linux/mpn/xor_n.s index 6889f2720a..8ef14d059c 100644 --- a/ext/gmp/gen/x86_64-linux/mpn/xor_n.s +++ b/ext/gmp/gen/x86_64-linux/mpn/xor_n.s @@ -90,7 +90,6 @@ - .text @@ -103,50 +102,42 @@ __gmpn_xor_n: mov (%rdx), %r8 mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx and $3, %eax je .Lb00 cmp $2, %eax jc .Lb01 je .Lb10 -.Lb11: xor (%rsi), %r8 - mov %r8, (%rdi) - inc %rcx - lea -8(%rsi), %rsi - lea -8(%rdx), %rdx - lea -8(%rdi), %rdi +.Lb11: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx jmp .Le11 -.Lb10: add $2, %rcx - lea -16(%rsi), %rsi - lea -16(%rdx), %rdx - lea -16(%rdi), %rdi +.Lb10: add $-2, %rcx jmp .Le10 -.Lb01: xor (%rsi), %r8 - mov %r8, (%rdi) - dec %rcx +.Lb01: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx jz .Lret - lea 8(%rsi), %rsi - lea 8(%rdx), %rdx - lea 8(%rdi), %rdi - - .align 16, 0x90 -.Ltop: mov (%rdx), %r8 -.Lb00: mov 8(%rdx), %r9 - xor (%rsi), %r8 - xor 8(%rsi), %r9 - mov %r8, (%rdi) - mov %r9, 8(%rdi) -.Le11: mov 16(%rdx), %r8 -.Le10: mov 24(%rdx), %r9 - lea 32(%rdx), %rdx - xor 16(%rsi), %r8 - xor 24(%rsi), %r9 - lea 32(%rsi), %rsi - mov %r8, 16(%rdi) - mov %r9, 24(%rdi) - lea 32(%rdi), %rdi - sub $4, %rcx - jnz .Ltop + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + xor (%rsi,%rcx,8), %r8 + xor 8(%rsi,%rcx,8), %r9 + nop + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + xor 16(%rsi,%rcx,8), %r8 + xor 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop .Lret: ret From ec02731ceab9e3b69e4479de11a167e104ed53c7 Mon Sep 17 00:00:00 2001 From: Santeri Hannula Date: Thu, 10 Oct 2024 18:42:35 +0300 Subject: [PATCH 2/3] ext: add generated gmp config header for each target --- ext/gmp/build.zig | 323 -------------- ext/gmp/gen/aarch64-linux/config.h | 668 +++++++++++++++++++++++++++++ ext/gmp/gen/aarch64-macos/config.h | 668 +++++++++++++++++++++++++++++ ext/gmp/gen/x86_64-linux/config.h | 668 +++++++++++++++++++++++++++++ ext/gmp/gen/x86_64-macos/config.h | 668 +++++++++++++++++++++++++++++ 5 files changed, 2672 insertions(+), 323 deletions(-) create mode 100644 ext/gmp/gen/aarch64-linux/config.h create mode 100644 ext/gmp/gen/aarch64-macos/config.h create mode 100644 ext/gmp/gen/x86_64-linux/config.h create mode 100644 ext/gmp/gen/x86_64-macos/config.h diff --git a/ext/gmp/build.zig b/ext/gmp/build.zig index cdfad41394..2444000e3d 100644 --- a/ext/gmp/build.zig +++ b/ext/gmp/build.zig @@ -18,328 +18,6 @@ pub fn build(b: *std.Build) void { lib.linkLibC(); - // TODO: The values here should be provided programmatically - const config_h = b.addConfigHeader(.{ - .style = .{ - .autoconf = dep_c.path("config.in"), - }, - .include_path = "config.h", - }, .{ - .GMP_MPARAM_H_SUGGEST = "./mpn/arm64/gmp-mparam.h", - .HAVE_ALARM = 1, - .HAVE_ALLOCA = 1, - .HAVE_ALLOCA_H = 1, - .HAVE_ATTRIBUTE_CONST = 1, - .HAVE_ATTRIBUTE_MALLOC = 1, - .HAVE_ATTRIBUTE_MODE = 1, - .HAVE_ATTRIBUTE_NORETURN = 1, - .HAVE_CLOCK = 1, - .HAVE_CLOCK_GETTIME = 1, - .HAVE_DECL_FGETC = 1, - .HAVE_DECL_FSCANF = 1, - .HAVE_DECL_OPTARG = 1, - 
.HAVE_DECL_SYS_ERRLIST = 1, - .HAVE_DECL_SYS_NERR = 1, - .HAVE_DECL_UNGETC = 1, - .HAVE_DECL_VFPRINTF = 1, - .HAVE_DLFCN_H = 1, - .HAVE_DOUBLE_IEEE_LITTLE_ENDIAN = 1, - .HAVE_FCNTL_H = 1, - .HAVE_FLOAT_H = 1, - .HAVE_GETPAGESIZE = 1, - .HAVE_GETRUSAGE = 1, - .HAVE_GETTIMEOFDAY = 1, - .HAVE_INTMAX_T = 1, - .HAVE_INTPTR_T = 1, - .HAVE_INTTYPES_H = 1, - .HAVE_LANGINFO_H = 1, - .HAVE_LIMB_LITTLE_ENDIAN = 1, - .HAVE_LOCALECONV = 1, - .HAVE_LOCALE_H = 1, - .HAVE_LONG_DOUBLE = 1, - .HAVE_LONG_LONG = 1, - .HAVE_MEMORY_H = 1, - .HAVE_MEMSET = 1, - .HAVE_MMAP = 1, - .HAVE_MPROTECT = 1, - .HAVE_NATIVE_mpn_add_n = 1, - .HAVE_NATIVE_mpn_add_nc = 1, - .HAVE_NATIVE_mpn_addlsh1_n = 1, - .HAVE_NATIVE_mpn_addlsh2_n = 1, - .HAVE_NATIVE_mpn_and_n = 1, - .HAVE_NATIVE_mpn_andn_n = 1, - .HAVE_NATIVE_mpn_bdiv_dbm1c = 1, - .HAVE_NATIVE_mpn_bdiv_q_1 = 1, - .HAVE_NATIVE_mpn_pi1_bdiv_q_1 = 1, - .HAVE_NATIVE_mpn_cnd_add_n = 1, - .HAVE_NATIVE_mpn_cnd_sub_n = 1, - .HAVE_NATIVE_mpn_com = 1, - .HAVE_NATIVE_mpn_copyd = 1, - .HAVE_NATIVE_mpn_copyi = 1, - .HAVE_NATIVE_mpn_gcd_11 = 1, - .HAVE_NATIVE_mpn_gcd_22 = 1, - .HAVE_NATIVE_mpn_hamdist = 1, - .HAVE_NATIVE_mpn_invert_limb = 1, - .HAVE_NATIVE_mpn_ior_n = 1, - .HAVE_NATIVE_mpn_iorn_n = 1, - .HAVE_NATIVE_mpn_lshift = 1, - .HAVE_NATIVE_mpn_lshiftc = 1, - .HAVE_NATIVE_mpn_mod_34lsub1 = 1, - .HAVE_NATIVE_mpn_mul_1 = 1, - .HAVE_NATIVE_mpn_mul_1c = 1, - .HAVE_NATIVE_mpn_nand_n = 1, - .HAVE_NATIVE_mpn_nior_n = 1, - .HAVE_NATIVE_mpn_popcount = 1, - .HAVE_NATIVE_mpn_rsblsh1_n = 1, - .HAVE_NATIVE_mpn_rsblsh2_n = 1, - .HAVE_NATIVE_mpn_rsh1add_n = 1, - .HAVE_NATIVE_mpn_rsh1sub_n = 1, - .HAVE_NATIVE_mpn_rshift = 1, - .HAVE_NATIVE_mpn_sqr_diag_addlsh1 = 1, - .HAVE_NATIVE_mpn_sub_n = 1, - .HAVE_NATIVE_mpn_sub_nc = 1, - .HAVE_NATIVE_mpn_sublsh1_n = 1, - .HAVE_NATIVE_mpn_sublsh2_n = 1, - .HAVE_NATIVE_mpn_xor_n = 1, - .HAVE_NATIVE_mpn_xnor_n = 1, - .HAVE_NL_LANGINFO = 1, - .HAVE_NL_TYPES_H = 1, - .HAVE_POPEN = 1, - .HAVE_PROCESSOR_INFO = 1, - .HAVE_PTRDIFF_T = 1, - .HAVE_QUAD_T = 1, - .HAVE_RAISE = 1, - .HAVE_SIGACTION = 1, - .HAVE_SIGALTSTACK = 1, - .HAVE_STACK_T = 1, - .HAVE_STDINT_H = 1, - .HAVE_STDLIB_H = 1, - .HAVE_STRCHR = 1, - .HAVE_STRERROR = 1, - .HAVE_STRINGS_H = 1, - .HAVE_STRING_H = 1, - .HAVE_STRNLEN = 1, - .HAVE_STRTOL = 1, - .HAVE_STRTOUL = 1, - .HAVE_SYSCONF = 1, - .HAVE_SYSCTL = 1, - .HAVE_SYSCTLBYNAME = 1, - .HAVE_SYS_MMAN_H = 1, - .HAVE_SYS_PARAM_H = 1, - .HAVE_SYS_RESOURCE_H = 1, - .HAVE_SYS_STAT_H = 1, - .HAVE_SYS_SYSCTL_H = 1, - .HAVE_SYS_TIMES_H = 1, - .HAVE_SYS_TIME_H = 1, - .HAVE_SYS_TYPES_H = 1, - .HAVE_TIMES = 1, - .HAVE_UINT_LEAST32_T = 1, - .HAVE_UNISTD_H = 1, - .HAVE_VSNPRINTF = 1, - .LSYM_PREFIX = "L", - .LT_OBJDIR = ".libs/", - .PACKAGE = "gmp", - .PACKAGE_BUGREPORT = "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html", - .PACKAGE_NAME = "GNU MP", - .PACKAGE_STRING = "GNU MP 6.2.1", - .PACKAGE_TARNAME = "gmp", - .PACKAGE_URL = "http://www.gnu.org/software/gmp/", - .PACKAGE_VERSION = "6.2.1", - .RETSIGTYPE = null, - .SIZEOF_MP_LIMB_T = 8, - .SIZEOF_UNSIGNED = 4, - .SIZEOF_UNSIGNED_LONG = 8, - .SIZEOF_UNSIGNED_SHORT = 2, - .SIZEOF_VOID_P = 8, - .STDC_HEADERS = 1, - .TIME_WITH_SYS_TIME = 1, - .TUNE_SQR_TOOM2_MAX = "SQR_TOOM2_MAX_GENERIC", - .VERSION = "6.2.1", - .WANT_FFT = 1, - .WANT_TMP_ALLOCA = 1, - .YYTEXT_POINTER = 1, - .restrict = .__restrict, - .AC_APPLE_UNIVERSAL_BUILD = null, - .HAVE_ATTR_GET = null, - .HAVE_CALLING_CONVENTIONS = null, - .HAVE_CPUTIME = null, - .HAVE_DOUBLE_IEEE_BIG_ENDIAN = null, - .HAVE_DOUBLE_IEEE_LITTLE_SWAPPED 
= null, - .HAVE_DOUBLE_VAX_D = null, - .HAVE_DOUBLE_VAX_G = null, - .HAVE_DOUBLE_CRAY_CFP = null, - .HAVE_GETSYSINFO = null, - .HAVE_HIDDEN_ALIAS = null, - .HAVE_HOST_CPU_FAMILY_alpha = null, - .HAVE_HOST_CPU_FAMILY_m68k = null, - .HAVE_HOST_CPU_FAMILY_power = null, - .HAVE_HOST_CPU_FAMILY_powerpc = null, - .HAVE_HOST_CPU_FAMILY_x86 = null, - .HAVE_HOST_CPU_FAMILY_x86_64 = null, - .HAVE_HOST_CPU_alphaev67 = null, - .HAVE_HOST_CPU_alphaev68 = null, - .HAVE_HOST_CPU_alphaev7 = null, - .HAVE_HOST_CPU_m68020 = null, - .HAVE_HOST_CPU_m68030 = null, - .HAVE_HOST_CPU_m68040 = null, - .HAVE_HOST_CPU_m68060 = null, - .HAVE_HOST_CPU_m68360 = null, - .HAVE_HOST_CPU_powerpc604 = null, - .HAVE_HOST_CPU_powerpc604e = null, - .HAVE_HOST_CPU_powerpc750 = null, - .HAVE_HOST_CPU_powerpc7400 = null, - .HAVE_HOST_CPU_supersparc = null, - .HAVE_HOST_CPU_i386 = null, - .HAVE_HOST_CPU_i586 = null, - .HAVE_HOST_CPU_i686 = null, - .HAVE_HOST_CPU_pentium = null, - .HAVE_HOST_CPU_pentiummmx = null, - .HAVE_HOST_CPU_pentiumpro = null, - .HAVE_HOST_CPU_pentium2 = null, - .HAVE_HOST_CPU_pentium3 = null, - .HAVE_HOST_CPU_pentium4 = null, - .HAVE_HOST_CPU_core2 = null, - .HAVE_HOST_CPU_nehalem = null, - .HAVE_HOST_CPU_westmere = null, - .HAVE_HOST_CPU_sandybridge = null, - .HAVE_HOST_CPU_ivybridge = null, - .HAVE_HOST_CPU_haswell = null, - .HAVE_HOST_CPU_broadwell = null, - .HAVE_HOST_CPU_skylake = null, - .HAVE_HOST_CPU_silvermont = null, - .HAVE_HOST_CPU_goldmont = null, - .HAVE_HOST_CPU_k8 = null, - .HAVE_HOST_CPU_k10 = null, - .HAVE_HOST_CPU_bulldozer = null, - .HAVE_HOST_CPU_piledriver = null, - .HAVE_HOST_CPU_steamroller = null, - .HAVE_HOST_CPU_excavator = null, - .HAVE_HOST_CPU_zen = null, - .HAVE_HOST_CPU_bobcat = null, - .HAVE_HOST_CPU_jaguar = null, - .HAVE_HOST_CPU_s390_z900 = null, - .HAVE_HOST_CPU_s390_z990 = null, - .HAVE_HOST_CPU_s390_z9 = null, - .HAVE_HOST_CPU_s390_z10 = null, - .HAVE_HOST_CPU_s390_z196 = null, - .HAVE_HOST_CPU_s390_zarch = null, - .HAVE_INVENT_H = null, - .HAVE_LIMB_BIG_ENDIAN = null, - .HAVE_MACHINE_HAL_SYSINFO_H = null, - .HAVE_NATIVE_mpn_add_n_sub_n = null, - .HAVE_NATIVE_mpn_addaddmul_1msb0 = null, - .HAVE_NATIVE_mpn_addlsh_n = null, - .HAVE_NATIVE_mpn_addlsh1_nc = null, - .HAVE_NATIVE_mpn_addlsh2_nc = null, - .HAVE_NATIVE_mpn_addlsh_nc = null, - .HAVE_NATIVE_mpn_addlsh1_n_ip1 = null, - .HAVE_NATIVE_mpn_addlsh2_n_ip1 = null, - .HAVE_NATIVE_mpn_addlsh_n_ip1 = null, - .HAVE_NATIVE_mpn_addlsh1_nc_ip1 = null, - .HAVE_NATIVE_mpn_addlsh2_nc_ip1 = null, - .HAVE_NATIVE_mpn_addlsh_nc_ip1 = null, - .HAVE_NATIVE_mpn_addlsh1_n_ip2 = null, - .HAVE_NATIVE_mpn_addlsh2_n_ip2 = null, - .HAVE_NATIVE_mpn_addlsh_n_ip2 = null, - .HAVE_NATIVE_mpn_addlsh1_nc_ip2 = null, - .HAVE_NATIVE_mpn_addlsh2_nc_ip2 = null, - .HAVE_NATIVE_mpn_addlsh_nc_ip2 = null, - .HAVE_NATIVE_mpn_addmul_1c = null, - .HAVE_NATIVE_mpn_addmul_2 = null, - .HAVE_NATIVE_mpn_addmul_3 = null, - .HAVE_NATIVE_mpn_addmul_4 = null, - .HAVE_NATIVE_mpn_addmul_5 = null, - .HAVE_NATIVE_mpn_addmul_6 = null, - .HAVE_NATIVE_mpn_addmul_7 = null, - .HAVE_NATIVE_mpn_addmul_8 = null, - .HAVE_NATIVE_mpn_addmul_2s = null, - .HAVE_NATIVE_mpn_div_qr_1n_pi1 = null, - .HAVE_NATIVE_mpn_div_qr_2 = null, - .HAVE_NATIVE_mpn_divexact_1 = null, - .HAVE_NATIVE_mpn_divexact_by3c = null, - .HAVE_NATIVE_mpn_divrem_1 = null, - .HAVE_NATIVE_mpn_divrem_1c = null, - .HAVE_NATIVE_mpn_divrem_2 = null, - .HAVE_NATIVE_mpn_gcd_1 = null, - .HAVE_NATIVE_mpn_lshsub_n = null, - .HAVE_NATIVE_mpn_mod_1 = null, - .HAVE_NATIVE_mpn_mod_1_1p = null, - .HAVE_NATIVE_mpn_mod_1c = null, - 
.HAVE_NATIVE_mpn_mod_1s_2p = null, - .HAVE_NATIVE_mpn_mod_1s_4p = null, - .HAVE_NATIVE_mpn_modexact_1_odd = null, - .HAVE_NATIVE_mpn_modexact_1c_odd = null, - .HAVE_NATIVE_mpn_mul_2 = null, - .HAVE_NATIVE_mpn_mul_3 = null, - .HAVE_NATIVE_mpn_mul_4 = null, - .HAVE_NATIVE_mpn_mul_5 = null, - .HAVE_NATIVE_mpn_mul_6 = null, - .HAVE_NATIVE_mpn_mul_basecase = null, - .HAVE_NATIVE_mpn_mullo_basecase = null, - .HAVE_NATIVE_mpn_preinv_divrem_1 = null, - .HAVE_NATIVE_mpn_preinv_mod_1 = null, - .HAVE_NATIVE_mpn_redc_1 = null, - .HAVE_NATIVE_mpn_redc_2 = null, - .HAVE_NATIVE_mpn_rsblsh_n = null, - .HAVE_NATIVE_mpn_rsblsh1_nc = null, - .HAVE_NATIVE_mpn_rsblsh2_nc = null, - .HAVE_NATIVE_mpn_rsblsh_nc = null, - .HAVE_NATIVE_mpn_rsh1add_nc = null, - .HAVE_NATIVE_mpn_rsh1sub_nc = null, - .HAVE_NATIVE_mpn_sbpi1_bdiv_r = null, - .HAVE_NATIVE_mpn_sqr_basecase = null, - .HAVE_NATIVE_mpn_sqr_diagonal = null, - .HAVE_NATIVE_mpn_sublsh_n = null, - .HAVE_NATIVE_mpn_sublsh1_nc = null, - .HAVE_NATIVE_mpn_sublsh2_nc = null, - .HAVE_NATIVE_mpn_sublsh_nc = null, - .HAVE_NATIVE_mpn_sublsh1_n_ip1 = null, - .HAVE_NATIVE_mpn_sublsh2_n_ip1 = null, - .HAVE_NATIVE_mpn_sublsh_n_ip1 = null, - .HAVE_NATIVE_mpn_sublsh1_nc_ip1 = null, - .HAVE_NATIVE_mpn_sublsh2_nc_ip1 = null, - .HAVE_NATIVE_mpn_sublsh_nc_ip1 = null, - .HAVE_NATIVE_mpn_submul_1c = null, - .HAVE_NATIVE_mpn_tabselect = null, - .HAVE_NATIVE_mpn_udiv_qrnnd = null, - .HAVE_NATIVE_mpn_udiv_qrnnd_r = null, - .HAVE_NATIVE_mpn_umul_ppmm = null, - .HAVE_NATIVE_mpn_umul_ppmm_r = null, - .HAVE_OBSTACK_VPRINTF = null, - .HAVE_PSP_ITICKSPERCLKTICK = null, - .HAVE_PSTAT_GETPROCESSOR = null, - .HAVE_READ_REAL_TIME = null, - .HAVE_SIGSTACK = null, - .HAVE_SPEED_CYCLECOUNTER = null, - .HAVE_SSTREAM = null, - .HAVE_STD__LOCALE = null, - .HAVE_SYSSGI = null, - .HAVE_SYS_ATTRIBUTES_H = null, - .HAVE_SYS_IOGRAPH_H = null, - .HAVE_SYS_PROCESSOR_H = null, - .HAVE_SYS_PSTAT_H = null, - .HAVE_SYS_SYSINFO_H = null, - .HAVE_SYS_SYSSGI_H = null, - .HAVE_SYS_SYSTEMCFG_H = null, - .HOST_DOS64 = null, - .NO_ASM = null, - .SSCANF_WRITABLE_INPUT = null, - .WANT_ASSERT = null, - .WANT_FAKE_CPUID = null, - .WANT_FAT_BINARY = null, - .WANT_OLD_FFT_FULL = null, - .WANT_PROFILING_GPROF = null, - .WANT_PROFILING_INSTRUMENT = null, - .WANT_PROFILING_PROF = null, - .WANT_TMP_REENTRANT = null, - .WANT_TMP_NOTREENTRANT = null, - .WANT_TMP_DEBUG = null, - .WORDS_BIGENDIAN = null, - .X86_ASM_MULX = null, - .@"inline" = null, - .@"volatile" = null, - }); - // TODO: Finish this const gmp_h = b.addConfigHeader(.{ .style = .{ @@ -357,7 +35,6 @@ pub fn build(b: *std.Build) void { .CFLAGS = "-O2 -pedantic -march=armv8-a", }); - lib.addConfigHeader(config_h); lib.addConfigHeader(gmp_h); // Static headers diff --git a/ext/gmp/gen/aarch64-linux/config.h b/ext/gmp/gen/aarch64-linux/config.h new file mode 100644 index 0000000000..d2e56c8c54 --- /dev/null +++ b/ext/gmp/gen/aarch64-linux/config.h @@ -0,0 +1,668 @@ +/* config.h. Generated from config.in by configure. */ +/* config.in. Generated from configure.ac by autoheader. */ + +/* + +Copyright 1996-2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. +*/ + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* The gmp-mparam.h file (a string) the tune program should suggest updating. + */ +#define GMP_MPARAM_H_SUGGEST "./mpn/arm64/gmp-mparam.h" + +/* Define to 1 if you have the `alarm' function. */ +#define HAVE_ALARM 1 + +/* Define to 1 if alloca() works (via gmp-impl.h). */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((const)) */ +#define HAVE_ATTRIBUTE_CONST 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((malloc)) */ +#define HAVE_ATTRIBUTE_MALLOC 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((mode (XX))) + */ +#define HAVE_ATTRIBUTE_MODE 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((noreturn)) */ +#define HAVE_ATTRIBUTE_NORETURN 1 + +/* Define to 1 if you have the `attr_get' function. */ +/* #undef HAVE_ATTR_GET */ + +/* Define to 1 if tests/libtests has calling conventions checking for the CPU + */ +/* #undef HAVE_CALLING_CONVENTIONS */ + +/* Define to 1 if you have the `clock' function. */ +#define HAVE_CLOCK 1 + +/* Define to 1 if you have the `clock_gettime' function */ +#define HAVE_CLOCK_GETTIME 1 + +/* Define to 1 if you have the `cputime' function. */ +/* #undef HAVE_CPUTIME */ + +/* Define to 1 if you have the declaration of `fgetc', and to 0 if you don't. + */ +#define HAVE_DECL_FGETC 1 + +/* Define to 1 if you have the declaration of `fscanf', and to 0 if you don't. + */ +#define HAVE_DECL_FSCANF 1 + +/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't. + */ +#define HAVE_DECL_OPTARG 1 + +/* Define to 1 if you have the declaration of `sys_errlist', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_ERRLIST 0 + +/* Define to 1 if you have the declaration of `sys_nerr', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_NERR 0 + +/* Define to 1 if you have the declaration of `ungetc', and to 0 if you don't. + */ +#define HAVE_DECL_UNGETC 1 + +/* Define to 1 if you have the declaration of `vfprintf', and to 0 if you + don't. */ +#define HAVE_DECL_VFPRINTF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define one of the following to 1 for the format of a `double'. + If your format is not among these choices, or you don't know what it is, + then leave all undefined. + IEEE_LITTLE_SWAPPED means little endian, but with the two 4-byte halves + swapped, as used by ARM CPUs in little endian mode. */ +/* #undef HAVE_DOUBLE_IEEE_BIG_ENDIAN */ +#define HAVE_DOUBLE_IEEE_LITTLE_ENDIAN 1 +/* #undef HAVE_DOUBLE_IEEE_LITTLE_SWAPPED */ +/* #undef HAVE_DOUBLE_VAX_D */ +/* #undef HAVE_DOUBLE_VAX_G */ +/* #undef HAVE_DOUBLE_CRAY_CFP */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FLOAT_H 1 + +/* Define to 1 if you have the `getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `getsysinfo' function. */ +/* #undef HAVE_GETSYSINFO */ + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((visibility)) + and __attribute__ ((alias)) */ +#define HAVE_HIDDEN_ALIAS 1 + +/* Define one of these to 1 for the host CPU family. + If your CPU is not in any of these families, leave all undefined. + For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */ +/* #undef HAVE_HOST_CPU_FAMILY_alpha */ +/* #undef HAVE_HOST_CPU_FAMILY_m68k */ +/* #undef HAVE_HOST_CPU_FAMILY_power */ +/* #undef HAVE_HOST_CPU_FAMILY_powerpc */ +/* #undef HAVE_HOST_CPU_FAMILY_x86 */ +/* #undef HAVE_HOST_CPU_FAMILY_x86_64 */ + +/* Define one of the following to 1 for the host CPU, as per the output of + ./config.guess. If your CPU is not listed here, leave all undefined. */ +/* #undef HAVE_HOST_CPU_alphaev67 */ +/* #undef HAVE_HOST_CPU_alphaev68 */ +/* #undef HAVE_HOST_CPU_alphaev7 */ +/* #undef HAVE_HOST_CPU_m68020 */ +/* #undef HAVE_HOST_CPU_m68030 */ +/* #undef HAVE_HOST_CPU_m68040 */ +/* #undef HAVE_HOST_CPU_m68060 */ +/* #undef HAVE_HOST_CPU_m68360 */ +/* #undef HAVE_HOST_CPU_powerpc604 */ +/* #undef HAVE_HOST_CPU_powerpc604e */ +/* #undef HAVE_HOST_CPU_powerpc750 */ +/* #undef HAVE_HOST_CPU_powerpc7400 */ +/* #undef HAVE_HOST_CPU_supersparc */ +/* #undef HAVE_HOST_CPU_i386 */ +/* #undef HAVE_HOST_CPU_i586 */ +/* #undef HAVE_HOST_CPU_i686 */ +/* #undef HAVE_HOST_CPU_pentium */ +/* #undef HAVE_HOST_CPU_pentiummmx */ +/* #undef HAVE_HOST_CPU_pentiumpro */ +/* #undef HAVE_HOST_CPU_pentium2 */ +/* #undef HAVE_HOST_CPU_pentium3 */ +/* #undef HAVE_HOST_CPU_pentium4 */ +/* #undef HAVE_HOST_CPU_core2 */ +/* #undef HAVE_HOST_CPU_nehalem */ +/* #undef HAVE_HOST_CPU_westmere */ +/* #undef HAVE_HOST_CPU_sandybridge */ +/* #undef HAVE_HOST_CPU_ivybridge */ +/* #undef HAVE_HOST_CPU_haswell */ +/* #undef HAVE_HOST_CPU_broadwell */ +/* #undef HAVE_HOST_CPU_skylake */ +/* #undef HAVE_HOST_CPU_silvermont */ +/* #undef HAVE_HOST_CPU_goldmont */ +/* #undef HAVE_HOST_CPU_k8 */ +/* #undef HAVE_HOST_CPU_k10 */ +/* #undef HAVE_HOST_CPU_bulldozer */ +/* #undef HAVE_HOST_CPU_piledriver */ +/* #undef HAVE_HOST_CPU_steamroller */ +/* #undef HAVE_HOST_CPU_excavator */ +/* #undef HAVE_HOST_CPU_zen */ +/* #undef HAVE_HOST_CPU_bobcat */ +/* #undef HAVE_HOST_CPU_jaguar */ +/* #undef HAVE_HOST_CPU_s390_z900 */ +/* #undef HAVE_HOST_CPU_s390_z990 */ +/* #undef HAVE_HOST_CPU_s390_z9 */ +/* #undef HAVE_HOST_CPU_s390_z10 */ +/* #undef HAVE_HOST_CPU_s390_z196 */ + +/* Define to 1 iff we have a s390 with 64-bit registers. */ +/* #undef HAVE_HOST_CPU_s390_zarch */ + +/* Define to 1 if the system has the type `intmax_t'. */ +#define HAVE_INTMAX_T 1 + +/* Define to 1 if the system has the type `intptr_t'. */ +#define HAVE_INTPTR_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LANGINFO_H 1 + +/* Define one of these to 1 for the endianness of `mp_limb_t'. + If the endianness is not a simple big or little, or you don't know what + it is, then leave both undefined. 
*/ +/* #undef HAVE_LIMB_BIG_ENDIAN */ +#define HAVE_LIMB_LITTLE_ENDIAN 1 + +/* Define to 1 if you have the `localeconv' function. */ +#define HAVE_LOCALECONV 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if the system has the type `long double'. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACHINE_HAL_SYSINFO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memset' function. */ +#define HAVE_MEMSET 1 + +/* Define to 1 if you have the `mmap' function. */ +#define HAVE_MMAP 1 + +/* Define to 1 if you have the `mprotect' function. */ +#define HAVE_MPROTECT 1 + +/* Define to 1 each of the following for which a native (ie. CPU specific) + implementation of the corresponding routine exists. */ +#define HAVE_NATIVE_mpn_add_n 1 +/* #undef HAVE_NATIVE_mpn_add_n_sub_n */ +#define HAVE_NATIVE_mpn_add_nc 1 +/* #undef HAVE_NATIVE_mpn_addaddmul_1msb0 */ +#define HAVE_NATIVE_mpn_addlsh1_n 1 +#define HAVE_NATIVE_mpn_addlsh2_n 1 +/* #undef HAVE_NATIVE_mpn_addlsh_n */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip1 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_n_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip2 */ +/* #undef HAVE_NATIVE_mpn_addmul_1c */ +/* #undef HAVE_NATIVE_mpn_addmul_2 */ +/* #undef HAVE_NATIVE_mpn_addmul_3 */ +/* #undef HAVE_NATIVE_mpn_addmul_4 */ +/* #undef HAVE_NATIVE_mpn_addmul_5 */ +/* #undef HAVE_NATIVE_mpn_addmul_6 */ +/* #undef HAVE_NATIVE_mpn_addmul_7 */ +/* #undef HAVE_NATIVE_mpn_addmul_8 */ +/* #undef HAVE_NATIVE_mpn_addmul_2s */ +#define HAVE_NATIVE_mpn_and_n 1 +#define HAVE_NATIVE_mpn_andn_n 1 +#define HAVE_NATIVE_mpn_bdiv_dbm1c 1 +#define HAVE_NATIVE_mpn_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_pi1_bdiv_q_1 1 +#define HAVE_NATIVE_mpn_cnd_add_n 1 +#define HAVE_NATIVE_mpn_cnd_sub_n 1 +#define HAVE_NATIVE_mpn_com 1 +#define HAVE_NATIVE_mpn_copyd 1 +#define HAVE_NATIVE_mpn_copyi 1 +/* #undef HAVE_NATIVE_mpn_div_qr_1n_pi1 */ +/* #undef HAVE_NATIVE_mpn_div_qr_2 */ +/* #undef HAVE_NATIVE_mpn_divexact_1 */ +/* #undef HAVE_NATIVE_mpn_divexact_by3c */ +/* #undef HAVE_NATIVE_mpn_divrem_1 */ +/* #undef HAVE_NATIVE_mpn_divrem_1c */ +/* #undef HAVE_NATIVE_mpn_divrem_2 */ +/* #undef HAVE_NATIVE_mpn_gcd_1 */ +#define HAVE_NATIVE_mpn_gcd_11 1 +#define HAVE_NATIVE_mpn_gcd_22 1 +#define HAVE_NATIVE_mpn_hamdist 1 +#define HAVE_NATIVE_mpn_invert_limb 1 +#define HAVE_NATIVE_mpn_ior_n 1 +#define HAVE_NATIVE_mpn_iorn_n 1 +#define HAVE_NATIVE_mpn_lshift 1 +#define HAVE_NATIVE_mpn_lshiftc 1 +/* #undef HAVE_NATIVE_mpn_lshsub_n */ +/* #undef HAVE_NATIVE_mpn_mod_1 */ +/* #undef HAVE_NATIVE_mpn_mod_1_1p */ +/* #undef HAVE_NATIVE_mpn_mod_1c */ +/* #undef HAVE_NATIVE_mpn_mod_1s_2p */ +/* #undef HAVE_NATIVE_mpn_mod_1s_4p */ +#define HAVE_NATIVE_mpn_mod_34lsub1 1 +/* #undef HAVE_NATIVE_mpn_modexact_1_odd */ +/* #undef HAVE_NATIVE_mpn_modexact_1c_odd */ +#define 
+
+/* Define to 1 if you have the `nl_langinfo' function. */
+#define HAVE_NL_LANGINFO 1
+
+/* Define to 1 if you have the <nl_types.h> header file. */
+#define HAVE_NL_TYPES_H 1
+
+/* Define to 1 if you have the `obstack_vprintf' function. */
+/* #define HAVE_OBSTACK_VPRINTF 1 */
+
+/* Define to 1 if you have the `popen' function. */
+#define HAVE_POPEN 1
+
+/* Define to 1 if you have the `processor_info' function. */
+/* #undef HAVE_PROCESSOR_INFO */
+
+/* Define to 1 if `struct pst_processor' exists and contains
+   `psp_iticksperclktick'. */
+/* #undef HAVE_PSP_ITICKSPERCLKTICK */
+
+/* Define to 1 if you have the `pstat_getprocessor' function. */
+/* #undef HAVE_PSTAT_GETPROCESSOR */
+
+/* Define to 1 if the system has the type `ptrdiff_t'. */
+#define HAVE_PTRDIFF_T 1
+
+/* Define to 1 if the system has the type `quad_t'. */
+#define HAVE_QUAD_T 1
+
+/* Define to 1 if you have the `raise' function. */
+#define HAVE_RAISE 1
+
+/* Define to 1 if you have the `read_real_time' function. */
+/* #undef HAVE_READ_REAL_TIME */
+
+/* Define to 1 if you have the `sigaction' function. */
+#define HAVE_SIGACTION 1
+
+/* Define to 1 if you have the `sigaltstack' function. */
+#define HAVE_SIGALTSTACK 1
+
+/* Define to 1 if you have the `sigstack' function. */
+#define HAVE_SIGSTACK 1
+
+/* Tune directory speed_cyclecounter, undef=none, 1=32bits, 2=64bits) */
+/* #undef HAVE_SPEED_CYCLECOUNTER */
+
+/* Define to 1 if you have the <sstream> header file. */
+/* #undef HAVE_SSTREAM */
+
+/* Define to 1 if the system has the type `stack_t'. */
+#define HAVE_STACK_T 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if the system has the type `std::locale'. */
+/* #undef HAVE_STD__LOCALE */
+
+/* Define to 1 if you have the `strchr' function. */
+#define HAVE_STRCHR 1
+
+/* Define to 1 if you have the `strerror' function. */
+#define HAVE_STRERROR 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strnlen' function. */
+#define HAVE_STRNLEN 1
+
+/* Define to 1 if you have the `strtol' function. */
+#define HAVE_STRTOL 1
+
+/* Define to 1 if you have the `strtoul' function. */
+#define HAVE_STRTOUL 1
+
+/* Define to 1 if you have the `sysconf' function. */
+#define HAVE_SYSCONF 1
+
+/* Define to 1 if you have the `sysctl' function. */
+/* #undef HAVE_SYSCTL */
+
+/* Define to 1 if you have the `sysctlbyname' function. */
+/* #undef HAVE_SYSCTLBYNAME */
+
+/* Define to 1 if you have the `syssgi' function. */
+/* #undef HAVE_SYSSGI */
+
+/* Define to 1 if you have the <sys/attributes.h> header file. */
+/* #undef HAVE_SYS_ATTRIBUTES_H */
+
+/* Define to 1 if you have the <sys/iograph.h> header file. */
+/* #undef HAVE_SYS_IOGRAPH_H */
+
+/* Define to 1 if you have the <sys/mman.h> header file. */
+#define HAVE_SYS_MMAN_H 1
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#define HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the <sys/processor.h> header file. */
+/* #undef HAVE_SYS_PROCESSOR_H */
+
+/* Define to 1 if you have the <sys/pstat.h> header file. */
+/* #undef HAVE_SYS_PSTAT_H */
+
+/* Define to 1 if you have the <sys/resource.h> header file. */
+#define HAVE_SYS_RESOURCE_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/sysctl.h> header file. */
+/* #undef HAVE_SYS_SYSCTL_H */
+
+/* Define to 1 if you have the <sys/sysinfo.h> header file. */
+#define HAVE_SYS_SYSINFO_H 1
+
+/* Define to 1 if you have the <sys/syssgi.h> header file. */
+/* #undef HAVE_SYS_SYSSGI_H */
+
+/* Define to 1 if you have the <sys/systemcfg.h> header file. */
+/* #undef HAVE_SYS_SYSTEMCFG_H */
+
+/* Define to 1 if you have the <sys/times.h> header file. */
+#define HAVE_SYS_TIMES_H 1
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#define HAVE_SYS_TIME_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the `times' function. */
+#define HAVE_TIMES 1
+
+/* Define to 1 if the system has the type `uint_least32_t'. */
+#define HAVE_UINT_LEAST32_T 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if you have the `vsnprintf' function and it works properly. */
+#define HAVE_VSNPRINTF 1
+
+/* Define to 1 for Windos/64 */
+/* #undef HOST_DOS64 */
+
+/* Assembler local label prefix */
+#define LSYM_PREFIX ".L"
+
+/* Define to the sub-directory where libtool stores uninstalled libraries. */
+#define LT_OBJDIR ".libs/"
+
+/* Define to 1 to disable the use of inline assembly */
+/* #undef NO_ASM */
+
+/* Name of package */
+#define PACKAGE "gmp"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "GNU MP"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "GNU MP 6.2.1"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "gmp"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL "http://www.gnu.org/software/gmp/"
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "6.2.1"
+
+/* Define as the return type of signal handlers (`int' or `void'). */
+#define RETSIGTYPE void
+
+/* The size of `mp_limb_t', as computed by sizeof. */
+#define SIZEOF_MP_LIMB_T 8
+
+/* The size of `unsigned', as computed by sizeof. */
+#define SIZEOF_UNSIGNED 4
+
+/* The size of `unsigned long', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_LONG 8
+
+/* The size of `unsigned short', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_SHORT 2
+
+/* The size of `void *', as computed by sizeof. */
+#define SIZEOF_VOID_P 8
+
+/* Define to 1 if sscanf requires writable inputs */
+/* #undef SSCANF_WRITABLE_INPUT */
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
+#define TIME_WITH_SYS_TIME 1
+
+/* Maximum size the tune program can test for SQR_TOOM2_THRESHOLD */
+#define TUNE_SQR_TOOM2_MAX SQR_TOOM2_MAX_GENERIC
+
+/* Version number of package */
+#define VERSION "6.2.1"
+
+/* Define to 1 to enable ASSERT checking, per --enable-assert */
+/* #undef WANT_ASSERT */
+
+/* Define to 1 to enable GMP_CPU_TYPE faking cpuid, per --enable-fake-cpuid */
+/* #undef WANT_FAKE_CPUID */
+
+/* Define to 1 when building a fat binary. */
+/* #undef WANT_FAT_BINARY */
+
+/* Define to 1 to enable FFTs for multiplication, per --enable-fft */
+#define WANT_FFT 1
+
+/* Define to 1 to enable old mpn_mul_fft_full for multiplication, per
+   --enable-old-fft-full */
+/* #undef WANT_OLD_FFT_FULL */
+
+/* Define to 1 if --enable-profiling=gprof */
+/* #undef WANT_PROFILING_GPROF */
+
+/* Define to 1 if --enable-profiling=instrument */
+/* #undef WANT_PROFILING_INSTRUMENT */
+
+/* Define to 1 if --enable-profiling=prof */
+/* #undef WANT_PROFILING_PROF */
+
+/* Define one of these to 1 for the desired temporary memory allocation
+   method, per --enable-alloca. */
+#define WANT_TMP_ALLOCA 1
+/* #undef WANT_TMP_REENTRANT */
+/* #undef WANT_TMP_NOTREENTRANT */
+/* #undef WANT_TMP_DEBUG */
+
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+   significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+#  define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+/* # undef WORDS_BIGENDIAN */
+# endif
+#endif
+
+/* Define to 1 if the assembler understands the mulx instruction */
+/* #undef X86_ASM_MULX */
+
+/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a
+   `char[]'. */
+/* #undef YYTEXT_POINTER */
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name. */
+#ifndef __cplusplus
+/* #undef inline */
+#endif
+
+/* Define to the equivalent of the C99 'restrict' keyword, or to
+   nothing if this is not supported. Do not define if restrict is
+   supported directly. */
+#define restrict __restrict
+/* Work around a bug in Sun C++: it does not support _Restrict or
+   __restrict__, even though the corresponding Sun C compiler ends up with
+   "#define restrict _Restrict" or "#define restrict __restrict__" in the
+   previous line.  Perhaps some future version of Sun C++ will work with
+   restrict; if so, hopefully it defines __RESTRICT like Sun C does. */
+#if defined __SUNPRO_CC && !defined __RESTRICT
+# define _Restrict
+# define __restrict__
+#endif
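[Note on the restrict mapping above: it lets GMP's pointer-heavy mpn interfaces keep the C99 no-aliasing promise on compilers that only know the vendor spelling. A minimal sketch, assuming a compiler that accepts __restrict; add_rows is a hypothetical name:

    #include <stddef.h>

    /* With "#define restrict __restrict" in effect, stores through dst
       cannot invalidate cached loads from src, so the loop vectorizes
       and keeps values in registers. */
    static void add_rows (double *restrict dst, const double *restrict src,
                          size_t n)
    {
      for (size_t i = 0; i < n; i++)
        dst[i] += src[i];
    }
]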
+
+/* Define to empty if the keyword `volatile' does not work. Warning: valid
+   code using `volatile' can become incorrect without. Disable with care. */
+/* #undef volatile */
diff --git a/ext/gmp/gen/aarch64-macos/config.h b/ext/gmp/gen/aarch64-macos/config.h
new file mode 100644
index 0000000000..dd1ca7f842
--- /dev/null
+++ b/ext/gmp/gen/aarch64-macos/config.h
@@ -0,0 +1,668 @@
+/* config.h.  Generated from config.in by configure. */
+/* config.in.  Generated from configure.ac by autoheader. */
+
+/*
+
+Copyright 1996-2020 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+*/
+
+/* Define if building universal (internal helper macro) */
+/* #undef AC_APPLE_UNIVERSAL_BUILD */
+
+/* The gmp-mparam.h file (a string) the tune program should suggest updating.
+   */
+#define GMP_MPARAM_H_SUGGEST "./mpn/arm64/gmp-mparam.h"
+
+/* Define to 1 if you have the `alarm' function. */
+#define HAVE_ALARM 1
+
+/* Define to 1 if alloca() works (via gmp-impl.h). */
+#define HAVE_ALLOCA 1
+
+/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
+   */
+#define HAVE_ALLOCA_H 1
+
+/* Define to 1 if the compiler accepts gcc style __attribute__ ((const)) */
+#define HAVE_ATTRIBUTE_CONST 1
+
+/* Define to 1 if the compiler accepts gcc style __attribute__ ((malloc)) */
+#define HAVE_ATTRIBUTE_MALLOC 1
+
+/* Define to 1 if the compiler accepts gcc style __attribute__ ((mode (XX)))
+   */
+#define HAVE_ATTRIBUTE_MODE 1
+
+/* Define to 1 if the compiler accepts gcc style __attribute__ ((noreturn)) */
+#define HAVE_ATTRIBUTE_NORETURN 1
+
+/* Define to 1 if you have the `attr_get' function. */
+/* #undef HAVE_ATTR_GET */
+
+/* Define to 1 if tests/libtests has calling conventions checking for the CPU
+   */
+/* #undef HAVE_CALLING_CONVENTIONS */
+
+/* Define to 1 if you have the `clock' function. */
+#define HAVE_CLOCK 1
+
+/* Define to 1 if you have the `clock_gettime' function */
+#define HAVE_CLOCK_GETTIME 1
+
+/* Define to 1 if you have the `cputime' function. */
+/* #undef HAVE_CPUTIME */
+
+/* Define to 1 if you have the declaration of `fgetc', and to 0 if you don't.
+   */
+#define HAVE_DECL_FGETC 1
+
+/* Define to 1 if you have the declaration of `fscanf', and to 0 if you don't.
+   */
+#define HAVE_DECL_FSCANF 1
+
+/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't.
+   */
+#define HAVE_DECL_OPTARG 1
+
+/* Define to 1 if you have the declaration of `sys_errlist', and to 0 if you
+   don't. */
+#define HAVE_DECL_SYS_ERRLIST 1
+
+/* Define to 1 if you have the declaration of `sys_nerr', and to 0 if you
+   don't. */
+#define HAVE_DECL_SYS_NERR 1
+
+/* Define to 1 if you have the declaration of `ungetc', and to 0 if you don't.
+   */
+#define HAVE_DECL_UNGETC 1
+
+/* Define to 1 if you have the declaration of `vfprintf', and to 0 if you
+   don't. */
+#define HAVE_DECL_VFPRINTF 1
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define one of the following to 1 for the format of a `double'.
+   If your format is not among these choices, or you don't know what it is,
+   then leave all undefined.
+   IEEE_LITTLE_SWAPPED means little endian, but with the two 4-byte halves
+   swapped, as used by ARM CPUs in little endian mode. */
+/* #undef HAVE_DOUBLE_IEEE_BIG_ENDIAN */
+#define HAVE_DOUBLE_IEEE_LITTLE_ENDIAN 1
+/* #undef HAVE_DOUBLE_IEEE_LITTLE_SWAPPED */
+/* #undef HAVE_DOUBLE_VAX_D */
+/* #undef HAVE_DOUBLE_VAX_G */
+/* #undef HAVE_DOUBLE_CRAY_CFP */
+
+/* Define to 1 if you have the <fcntl.h> header file. */
+#define HAVE_FCNTL_H 1
+
+/* Define to 1 if you have the <float.h> header file. */
+#define HAVE_FLOAT_H 1
+
+/* Define to 1 if you have the `getpagesize' function. */
+#define HAVE_GETPAGESIZE 1
+
+/* Define to 1 if you have the `getrusage' function. */
+#define HAVE_GETRUSAGE 1
+
+/* Define to 1 if you have the `getsysinfo' function. */
+/* #undef HAVE_GETSYSINFO */
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Define to 1 if the compiler accepts gcc style __attribute__ ((visibility))
+   and __attribute__ ((alias)) */
+/* #undef HAVE_HIDDEN_ALIAS */
+
+/* Define one of these to 1 for the host CPU family.
+   If your CPU is not in any of these families, leave all undefined.
+   For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */
+/* #undef HAVE_HOST_CPU_FAMILY_alpha */
+/* #undef HAVE_HOST_CPU_FAMILY_m68k */
+/* #undef HAVE_HOST_CPU_FAMILY_power */
+/* #undef HAVE_HOST_CPU_FAMILY_powerpc */
+/* #undef HAVE_HOST_CPU_FAMILY_x86 */
+/* #undef HAVE_HOST_CPU_FAMILY_x86_64 */
+
+/* Define one of the following to 1 for the host CPU, as per the output of
+   ./config.guess.  If your CPU is not listed here, leave all undefined. */
+/* #undef HAVE_HOST_CPU_alphaev67 */
+/* #undef HAVE_HOST_CPU_alphaev68 */
+/* #undef HAVE_HOST_CPU_alphaev7 */
+/* #undef HAVE_HOST_CPU_m68020 */
+/* #undef HAVE_HOST_CPU_m68030 */
+/* #undef HAVE_HOST_CPU_m68040 */
+/* #undef HAVE_HOST_CPU_m68060 */
+/* #undef HAVE_HOST_CPU_m68360 */
+/* #undef HAVE_HOST_CPU_powerpc604 */
+/* #undef HAVE_HOST_CPU_powerpc604e */
+/* #undef HAVE_HOST_CPU_powerpc750 */
+/* #undef HAVE_HOST_CPU_powerpc7400 */
+/* #undef HAVE_HOST_CPU_supersparc */
+/* #undef HAVE_HOST_CPU_i386 */
+/* #undef HAVE_HOST_CPU_i586 */
+/* #undef HAVE_HOST_CPU_i686 */
+/* #undef HAVE_HOST_CPU_pentium */
+/* #undef HAVE_HOST_CPU_pentiummmx */
+/* #undef HAVE_HOST_CPU_pentiumpro */
+/* #undef HAVE_HOST_CPU_pentium2 */
+/* #undef HAVE_HOST_CPU_pentium3 */
+/* #undef HAVE_HOST_CPU_pentium4 */
+/* #undef HAVE_HOST_CPU_core2 */
+/* #undef HAVE_HOST_CPU_nehalem */
+/* #undef HAVE_HOST_CPU_westmere */
+/* #undef HAVE_HOST_CPU_sandybridge */
+/* #undef HAVE_HOST_CPU_ivybridge */
+/* #undef HAVE_HOST_CPU_haswell */
+/* #undef HAVE_HOST_CPU_broadwell */
+/* #undef HAVE_HOST_CPU_skylake */
+/* #undef HAVE_HOST_CPU_silvermont */
+/* #undef HAVE_HOST_CPU_goldmont */
+/* #undef HAVE_HOST_CPU_k8 */
+/* #undef HAVE_HOST_CPU_k10 */
+/* #undef HAVE_HOST_CPU_bulldozer */
+/* #undef HAVE_HOST_CPU_piledriver */
+/* #undef HAVE_HOST_CPU_steamroller */
+/* #undef HAVE_HOST_CPU_excavator */
+/* #undef HAVE_HOST_CPU_zen */
+/* #undef HAVE_HOST_CPU_bobcat */
+/* #undef HAVE_HOST_CPU_jaguar */
+/* #undef HAVE_HOST_CPU_s390_z900 */
+/* #undef HAVE_HOST_CPU_s390_z990 */
+/* #undef HAVE_HOST_CPU_s390_z9 */
+/* #undef HAVE_HOST_CPU_s390_z10 */
+/* #undef HAVE_HOST_CPU_s390_z196 */
+
+/* Define to 1 iff we have a s390 with 64-bit registers. */
+/* #undef HAVE_HOST_CPU_s390_zarch */
+
+/* Define to 1 if the system has the type `intmax_t'. */
+#define HAVE_INTMAX_T 1
+
+/* Define to 1 if the system has the type `intptr_t'. */
+#define HAVE_INTPTR_T 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the <invent.h> header file. */
+/* #undef HAVE_INVENT_H */
+
+/* Define to 1 if you have the <langinfo.h> header file. */
+#define HAVE_LANGINFO_H 1
+
+/* Define one of these to 1 for the endianness of `mp_limb_t'.
+   If the endianness is not a simple big or little, or you don't know what
+   it is, then leave both undefined. */
+/* #undef HAVE_LIMB_BIG_ENDIAN */
+#define HAVE_LIMB_LITTLE_ENDIAN 1
+
+/* Define to 1 if you have the `localeconv' function. */
+#define HAVE_LOCALECONV 1
+
+/* Define to 1 if you have the <locale.h> header file. */
+#define HAVE_LOCALE_H 1
+
+/* Define to 1 if the system has the type `long double'. */
+#define HAVE_LONG_DOUBLE 1
+
+/* Define to 1 if the system has the type `long long'. */
+#define HAVE_LONG_LONG 1
+
+/* Define to 1 if you have the <machine/hal/sysinfo.h> header file. */
+/* #undef HAVE_MACHINE_HAL_SYSINFO_H */
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the `memset' function. */
+#define HAVE_MEMSET 1
+
+/* Define to 1 if you have the `mmap' function. */
+#define HAVE_MMAP 1
+
+/* Define to 1 if you have the `mprotect' function. */
+#define HAVE_MPROTECT 1
+
+/* Define to 1 each of the following for which a native (ie. CPU specific)
+   implementation of the corresponding routine exists. */
+#define HAVE_NATIVE_mpn_add_n 1
+/* #undef HAVE_NATIVE_mpn_add_n_sub_n */
+#define HAVE_NATIVE_mpn_add_nc 1
+/* #undef HAVE_NATIVE_mpn_addaddmul_1msb0 */
+#define HAVE_NATIVE_mpn_addlsh1_n 1
+#define HAVE_NATIVE_mpn_addlsh2_n 1
+/* #undef HAVE_NATIVE_mpn_addlsh_n */
+/* #undef HAVE_NATIVE_mpn_addlsh1_nc */
+/* #undef HAVE_NATIVE_mpn_addlsh2_nc */
+/* #undef HAVE_NATIVE_mpn_addlsh_nc */
+/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh_n_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip2 */
+/* #undef HAVE_NATIVE_mpn_addmul_1c */
+/* #undef HAVE_NATIVE_mpn_addmul_2 */
+/* #undef HAVE_NATIVE_mpn_addmul_3 */
+/* #undef HAVE_NATIVE_mpn_addmul_4 */
+/* #undef HAVE_NATIVE_mpn_addmul_5 */
+/* #undef HAVE_NATIVE_mpn_addmul_6 */
+/* #undef HAVE_NATIVE_mpn_addmul_7 */
+/* #undef HAVE_NATIVE_mpn_addmul_8 */
+/* #undef HAVE_NATIVE_mpn_addmul_2s */
+#define HAVE_NATIVE_mpn_and_n 1
+#define HAVE_NATIVE_mpn_andn_n 1
+#define HAVE_NATIVE_mpn_bdiv_dbm1c 1
+#define HAVE_NATIVE_mpn_bdiv_q_1 1
+#define HAVE_NATIVE_mpn_pi1_bdiv_q_1 1
+#define HAVE_NATIVE_mpn_cnd_add_n 1
+#define HAVE_NATIVE_mpn_cnd_sub_n 1
+#define HAVE_NATIVE_mpn_com 1
+#define HAVE_NATIVE_mpn_copyd 1
+#define HAVE_NATIVE_mpn_copyi 1
+/* #undef HAVE_NATIVE_mpn_div_qr_1n_pi1 */
+/* #undef HAVE_NATIVE_mpn_div_qr_2 */
+/* #undef HAVE_NATIVE_mpn_divexact_1 */
+/* #undef HAVE_NATIVE_mpn_divexact_by3c */
+/* #undef HAVE_NATIVE_mpn_divrem_1 */
+/* #undef HAVE_NATIVE_mpn_divrem_1c */
+/* #undef HAVE_NATIVE_mpn_divrem_2 */
+/* #undef HAVE_NATIVE_mpn_gcd_1 */
+#define HAVE_NATIVE_mpn_gcd_11 1
+#define HAVE_NATIVE_mpn_gcd_22 1
+#define HAVE_NATIVE_mpn_hamdist 1
+#define HAVE_NATIVE_mpn_invert_limb 1
+#define HAVE_NATIVE_mpn_ior_n 1
+#define HAVE_NATIVE_mpn_iorn_n 1
+#define HAVE_NATIVE_mpn_lshift 1
+#define HAVE_NATIVE_mpn_lshiftc 1
+/* #undef HAVE_NATIVE_mpn_lshsub_n */
+/* #undef HAVE_NATIVE_mpn_mod_1 */
+/* #undef HAVE_NATIVE_mpn_mod_1_1p */
+/* #undef HAVE_NATIVE_mpn_mod_1c */
+/* #undef HAVE_NATIVE_mpn_mod_1s_2p */
+/* #undef HAVE_NATIVE_mpn_mod_1s_4p */
+#define HAVE_NATIVE_mpn_mod_34lsub1 1
+/* #undef HAVE_NATIVE_mpn_modexact_1_odd */
+/* #undef HAVE_NATIVE_mpn_modexact_1c_odd */
+#define HAVE_NATIVE_mpn_mul_1 1
+#define HAVE_NATIVE_mpn_mul_1c 1
+/* #undef HAVE_NATIVE_mpn_mul_2 */
+/* #undef HAVE_NATIVE_mpn_mul_3 */
+/* #undef HAVE_NATIVE_mpn_mul_4 */
+/* #undef HAVE_NATIVE_mpn_mul_5 */
+/* #undef HAVE_NATIVE_mpn_mul_6 */
+/* #undef HAVE_NATIVE_mpn_mul_basecase */
+/* #undef HAVE_NATIVE_mpn_mullo_basecase */
+#define HAVE_NATIVE_mpn_nand_n 1
+#define HAVE_NATIVE_mpn_nior_n 1
+#define HAVE_NATIVE_mpn_popcount 1
+/* #undef HAVE_NATIVE_mpn_preinv_divrem_1 */
+/* #undef HAVE_NATIVE_mpn_preinv_mod_1 */
+/* #undef HAVE_NATIVE_mpn_redc_1 */
+/* #undef HAVE_NATIVE_mpn_redc_2 */
+#define HAVE_NATIVE_mpn_rsblsh1_n 1
+#define HAVE_NATIVE_mpn_rsblsh2_n 1
+/* #undef HAVE_NATIVE_mpn_rsblsh_n */
+/* #undef HAVE_NATIVE_mpn_rsblsh1_nc */
+/* #undef HAVE_NATIVE_mpn_rsblsh2_nc */
+/* #undef HAVE_NATIVE_mpn_rsblsh_nc */
+#define HAVE_NATIVE_mpn_rsh1add_n 1
+/* #undef HAVE_NATIVE_mpn_rsh1add_nc */
+#define HAVE_NATIVE_mpn_rsh1sub_n 1
+/* #undef HAVE_NATIVE_mpn_rsh1sub_nc */
+#define HAVE_NATIVE_mpn_rshift 1
+/* #undef HAVE_NATIVE_mpn_sbpi1_bdiv_r */
+/* #undef HAVE_NATIVE_mpn_sqr_basecase */
+/* #undef HAVE_NATIVE_mpn_sqr_diagonal */
+#define HAVE_NATIVE_mpn_sqr_diag_addlsh1 1
+#define HAVE_NATIVE_mpn_sub_n 1
+#define HAVE_NATIVE_mpn_sub_nc 1
+#define HAVE_NATIVE_mpn_sublsh1_n 1
+#define HAVE_NATIVE_mpn_sublsh2_n 1
+/* #undef HAVE_NATIVE_mpn_sublsh_n */
+/* #undef HAVE_NATIVE_mpn_sublsh1_nc */
+/* #undef HAVE_NATIVE_mpn_sublsh2_nc */
+/* #undef HAVE_NATIVE_mpn_sublsh_nc */
+/* #undef HAVE_NATIVE_mpn_sublsh1_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh2_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh1_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_submul_1c */
+/* #undef HAVE_NATIVE_mpn_tabselect */
+/* #undef HAVE_NATIVE_mpn_udiv_qrnnd */
+/* #undef HAVE_NATIVE_mpn_udiv_qrnnd_r */
+/* #undef HAVE_NATIVE_mpn_umul_ppmm */
+/* #undef HAVE_NATIVE_mpn_umul_ppmm_r */
+#define HAVE_NATIVE_mpn_xor_n 1
+#define HAVE_NATIVE_mpn_xnor_n 1
+
+/* Define to 1 if you have the `nl_langinfo' function. */
+#define HAVE_NL_LANGINFO 1
+
+/* Define to 1 if you have the <nl_types.h> header file. */
+#define HAVE_NL_TYPES_H 1
+
+/* Define to 1 if you have the `obstack_vprintf' function. */
+/* #undef HAVE_OBSTACK_VPRINTF */
+
+/* Define to 1 if you have the `popen' function. */
+#define HAVE_POPEN 1
+
+/* Define to 1 if you have the `processor_info' function. */
+#define HAVE_PROCESSOR_INFO 1
+
+/* Define to 1 if `struct pst_processor' exists and contains
+   `psp_iticksperclktick'. */
+/* #undef HAVE_PSP_ITICKSPERCLKTICK */
+
+/* Define to 1 if you have the `pstat_getprocessor' function. */
+/* #undef HAVE_PSTAT_GETPROCESSOR */
+
+/* Define to 1 if the system has the type `ptrdiff_t'. */
+#define HAVE_PTRDIFF_T 1
+
+/* Define to 1 if the system has the type `quad_t'. */
+#define HAVE_QUAD_T 1
+
+/* Define to 1 if you have the `raise' function. */
+#define HAVE_RAISE 1
+
+/* Define to 1 if you have the `read_real_time' function. */
+/* #undef HAVE_READ_REAL_TIME */
+
+/* Define to 1 if you have the `sigaction' function. */
+#define HAVE_SIGACTION 1
+
+/* Define to 1 if you have the `sigaltstack' function. */
+#define HAVE_SIGALTSTACK 1
+
+/* Define to 1 if you have the `sigstack' function. */
+/* #undef HAVE_SIGSTACK */
+
+/* Tune directory speed_cyclecounter, undef=none, 1=32bits, 2=64bits) */
+/* #undef HAVE_SPEED_CYCLECOUNTER */
+
+/* Define to 1 if you have the <sstream> header file. */
+/* #undef HAVE_SSTREAM */
+
+/* Define to 1 if the system has the type `stack_t'. */
+#define HAVE_STACK_T 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if the system has the type `std::locale'. */
+/* #undef HAVE_STD__LOCALE */
+
+/* Define to 1 if you have the `strchr' function. */
+#define HAVE_STRCHR 1
+
+/* Define to 1 if you have the `strerror' function. */
+#define HAVE_STRERROR 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strnlen' function. */
+#define HAVE_STRNLEN 1
+
+/* Define to 1 if you have the `strtol' function. */
+#define HAVE_STRTOL 1
+
+/* Define to 1 if you have the `strtoul' function. */
+#define HAVE_STRTOUL 1
+
+/* Define to 1 if you have the `sysconf' function. */
+#define HAVE_SYSCONF 1
+
+/* Define to 1 if you have the `sysctl' function. */
+#define HAVE_SYSCTL 1
+
+/* Define to 1 if you have the `sysctlbyname' function. */
+#define HAVE_SYSCTLBYNAME 1
+
+/* Define to 1 if you have the `syssgi' function. */
+/* #undef HAVE_SYSSGI */
+
+/* Define to 1 if you have the <sys/attributes.h> header file. */
+/* #undef HAVE_SYS_ATTRIBUTES_H */
+
+/* Define to 1 if you have the <sys/iograph.h> header file. */
+/* #undef HAVE_SYS_IOGRAPH_H */
+
+/* Define to 1 if you have the <sys/mman.h> header file. */
+#define HAVE_SYS_MMAN_H 1
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#define HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the <sys/processor.h> header file. */
+/* #undef HAVE_SYS_PROCESSOR_H */
+
+/* Define to 1 if you have the <sys/pstat.h> header file. */
+/* #undef HAVE_SYS_PSTAT_H */
+
+/* Define to 1 if you have the <sys/resource.h> header file. */
+#define HAVE_SYS_RESOURCE_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/sysctl.h> header file. */
+#define HAVE_SYS_SYSCTL_H 1
+
+/* Define to 1 if you have the <sys/sysinfo.h> header file. */
+/* #undef HAVE_SYS_SYSINFO_H */
+
+/* Define to 1 if you have the <sys/syssgi.h> header file. */
+/* #undef HAVE_SYS_SYSSGI_H */
+
+/* Define to 1 if you have the <sys/systemcfg.h> header file. */
+/* #undef HAVE_SYS_SYSTEMCFG_H */
+
+/* Define to 1 if you have the <sys/times.h> header file. */
+#define HAVE_SYS_TIMES_H 1
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#define HAVE_SYS_TIME_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the `times' function. */
+#define HAVE_TIMES 1
+
+/* Define to 1 if the system has the type `uint_least32_t'. */
+#define HAVE_UINT_LEAST32_T 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if you have the `vsnprintf' function and it works properly. */
+#define HAVE_VSNPRINTF 1
+
+/* Define to 1 for Windos/64 */
+/* #undef HOST_DOS64 */
+
+/* Assembler local label prefix */
+#define LSYM_PREFIX "L"
+
+/* Define to the sub-directory where libtool stores uninstalled libraries. */
+#define LT_OBJDIR ".libs/"
+
+/* Define to 1 to disable the use of inline assembly */
+/* #undef NO_ASM */
+
+/* Name of package */
+#define PACKAGE "gmp"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "GNU MP"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "GNU MP 6.2.1"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "gmp"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL "http://www.gnu.org/software/gmp/"
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "6.2.1"
+
+/* Define as the return type of signal handlers (`int' or `void'). */
+#define RETSIGTYPE void
+
+/* The size of `mp_limb_t', as computed by sizeof. */
+#define SIZEOF_MP_LIMB_T 8
+
+/* The size of `unsigned', as computed by sizeof. */
+#define SIZEOF_UNSIGNED 4
+
+/* The size of `unsigned long', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_LONG 8
+
+/* The size of `unsigned short', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_SHORT 2
+
+/* The size of `void *', as computed by sizeof. */
+#define SIZEOF_VOID_P 8
+
+/* Define to 1 if sscanf requires writable inputs */
+/* #undef SSCANF_WRITABLE_INPUT */
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
+#define TIME_WITH_SYS_TIME 1
+
+/* Maximum size the tune program can test for SQR_TOOM2_THRESHOLD */
+#define TUNE_SQR_TOOM2_MAX SQR_TOOM2_MAX_GENERIC
+
+/* Version number of package */
+#define VERSION "6.2.1"
+
+/* Define to 1 to enable ASSERT checking, per --enable-assert */
+/* #undef WANT_ASSERT */
+
+/* Define to 1 to enable GMP_CPU_TYPE faking cpuid, per --enable-fake-cpuid */
+/* #undef WANT_FAKE_CPUID */
+
+/* Define to 1 when building a fat binary. */
+/* #undef WANT_FAT_BINARY */
+
+/* Define to 1 to enable FFTs for multiplication, per --enable-fft */
+#define WANT_FFT 1
+
+/* Define to 1 to enable old mpn_mul_fft_full for multiplication, per
+   --enable-old-fft-full */
+/* #undef WANT_OLD_FFT_FULL */
+
+/* Define to 1 if --enable-profiling=gprof */
+/* #undef WANT_PROFILING_GPROF */
+
+/* Define to 1 if --enable-profiling=instrument */
+/* #undef WANT_PROFILING_INSTRUMENT */
+
+/* Define to 1 if --enable-profiling=prof */
+/* #undef WANT_PROFILING_PROF */
+
+/* Define one of these to 1 for the desired temporary memory allocation
+   method, per --enable-alloca. */
+#define WANT_TMP_ALLOCA 1
+/* #undef WANT_TMP_REENTRANT */
+/* #undef WANT_TMP_NOTREENTRANT */
+/* #undef WANT_TMP_DEBUG */
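[Note on WANT_TMP_ALLOCA above: it selects stack-based scratch allocation, the cheapest of the --enable-alloca methods, since temporaries vanish when the function returns. A rough sketch of the kind of switch this setting controls; TEMP_ALLOC/TEMP_FREE and scratch_demo are hypothetical names, and GMP itself routes this choice through the TMP_* macros of its internal gmp-impl.h:

    #include <alloca.h>
    #include <stdlib.h>
    #include <string.h>

    #ifdef WANT_TMP_ALLOCA
    #  define TEMP_ALLOC(n) alloca (n)   /* reclaimed automatically on return */
    #  define TEMP_FREE(p)  ((void) 0)
    #else
    #  define TEMP_ALLOC(n) malloc (n)   /* e.g. --enable-alloca=malloc-reentrant */
    #  define TEMP_FREE(p)  free (p)
    #endif

    static void scratch_demo (size_t n)
    {
      unsigned long *tp = TEMP_ALLOC (n * sizeof (unsigned long));
      memset (tp, 0, n * sizeof (unsigned long));
      /* ... use tp as n limbs of scratch ... */
      TEMP_FREE (tp);
    }
]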
+
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+   significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+#  define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+/* # undef WORDS_BIGENDIAN */
+# endif
+#endif
+
+/* Define to 1 if the assembler understands the mulx instruction */
+/* #undef X86_ASM_MULX */
+
+/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a
+   `char[]'. */
+#define YYTEXT_POINTER 1
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name. */
+#ifndef __cplusplus
+/* #undef inline */
+#endif
+
+/* Define to the equivalent of the C99 'restrict' keyword, or to
+   nothing if this is not supported. Do not define if restrict is
+   supported directly. */
+#define restrict __restrict
+/* Work around a bug in Sun C++: it does not support _Restrict or
+   __restrict__, even though the corresponding Sun C compiler ends up with
+   "#define restrict _Restrict" or "#define restrict __restrict__" in the
+   previous line.  Perhaps some future version of Sun C++ will work with
+   restrict; if so, hopefully it defines __RESTRICT like Sun C does. */
+#if defined __SUNPRO_CC && !defined __RESTRICT
+# define _Restrict
+# define __restrict__
+#endif
+
+/* Define to empty if the keyword `volatile' does not work. Warning: valid
+   code using `volatile' can become incorrect without. Disable with care. */
+/* #undef volatile */
diff --git a/ext/gmp/gen/x86_64-linux/config.h b/ext/gmp/gen/x86_64-linux/config.h
new file mode 100644
index 0000000000..47840ffb13
--- /dev/null
+++ b/ext/gmp/gen/x86_64-linux/config.h
@@ -0,0 +1,668 @@
+/* config.h.  Generated from config.in by configure. */
+/* config.in.  Generated from configure.ac by autoheader. */
+
+/*
+
+Copyright 1996-2020 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+*/
+
+/* Define if building universal (internal helper macro) */
+/* #undef AC_APPLE_UNIVERSAL_BUILD */
+
+/* The gmp-mparam.h file (a string) the tune program should suggest updating.
+   */
+#define GMP_MPARAM_H_SUGGEST "./mpn/x86_64/k8/gmp-mparam.h"
+
+/* Define to 1 if you have the `alarm' function. */
+#define HAVE_ALARM 1
+
+/* Define to 1 if alloca() works (via gmp-impl.h). */
+#define HAVE_ALLOCA 1
+
+/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
+   */
+#define HAVE_ALLOCA_H 1
+
+/* Define to 1 if the compiler accepts gcc style __attribute__ ((const)) */
+#define HAVE_ATTRIBUTE_CONST 1
+
+/* Define to 1 if the compiler accepts gcc style __attribute__ ((malloc)) */
+#define HAVE_ATTRIBUTE_MALLOC 1
+
+/* Define to 1 if the compiler accepts gcc style __attribute__ ((mode (XX)))
+   */
+#define HAVE_ATTRIBUTE_MODE 1
+
+/* Define to 1 if the compiler accepts gcc style __attribute__ ((noreturn)) */
+#define HAVE_ATTRIBUTE_NORETURN 1
+
+/* Define to 1 if you have the `attr_get' function. */
+/* #undef HAVE_ATTR_GET */
+
+/* Define to 1 if tests/libtests has calling conventions checking for the CPU
+   */
+#define HAVE_CALLING_CONVENTIONS 1
+
+/* Define to 1 if you have the `clock' function. */
+#define HAVE_CLOCK 1
+
+/* Define to 1 if you have the `clock_gettime' function */
+#define HAVE_CLOCK_GETTIME 1
+
+/* Define to 1 if you have the `cputime' function. */
+/* #undef HAVE_CPUTIME */
+
+/* Define to 1 if you have the declaration of `fgetc', and to 0 if you don't.
+   */
+#define HAVE_DECL_FGETC 1
+
+/* Define to 1 if you have the declaration of `fscanf', and to 0 if you don't.
+   */
+#define HAVE_DECL_FSCANF 1
+
+/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't.
+   */
+#define HAVE_DECL_OPTARG 1
+
+/* Define to 1 if you have the declaration of `sys_errlist', and to 0 if you
+   don't. */
+#define HAVE_DECL_SYS_ERRLIST 0
+
+/* Define to 1 if you have the declaration of `sys_nerr', and to 0 if you
+   don't. */
+#define HAVE_DECL_SYS_NERR 0
+
+/* Define to 1 if you have the declaration of `ungetc', and to 0 if you don't.
+   */
+#define HAVE_DECL_UNGETC 1
+
+/* Define to 1 if you have the declaration of `vfprintf', and to 0 if you
+   don't. */
+#define HAVE_DECL_VFPRINTF 1
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define one of the following to 1 for the format of a `double'.
+   If your format is not among these choices, or you don't know what it is,
+   then leave all undefined.
+   IEEE_LITTLE_SWAPPED means little endian, but with the two 4-byte halves
+   swapped, as used by ARM CPUs in little endian mode. */
+/* #undef HAVE_DOUBLE_IEEE_BIG_ENDIAN */
+#define HAVE_DOUBLE_IEEE_LITTLE_ENDIAN 1
+/* #undef HAVE_DOUBLE_IEEE_LITTLE_SWAPPED */
+/* #undef HAVE_DOUBLE_VAX_D */
+/* #undef HAVE_DOUBLE_VAX_G */
+/* #undef HAVE_DOUBLE_CRAY_CFP */
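[Note on the double-format choice above: it can be checked empirically. On an IEEE little-endian target such as this one, the high word of 1.0 (0x3ff00000) lands in the second 4-byte half, while the IEEE_LITTLE_SWAPPED format of old little-endian ARM would put it in the first. A small probe, illustration only:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main (void)
    {
      double d = 1.0;
      uint32_t half[2];
      memcpy (half, &d, sizeof d);
      /* IEEE little endian prints: low half 00000000, high half 3ff00000 */
      printf ("low half %08x, high half %08x\n", half[0], half[1]);
      return 0;
    }
]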
+
+/* Define to 1 if you have the <fcntl.h> header file. */
+#define HAVE_FCNTL_H 1
+
+/* Define to 1 if you have the <float.h> header file. */
+#define HAVE_FLOAT_H 1
+
+/* Define to 1 if you have the `getpagesize' function. */
+#define HAVE_GETPAGESIZE 1
+
+/* Define to 1 if you have the `getrusage' function. */
+#define HAVE_GETRUSAGE 1
+
+/* Define to 1 if you have the `getsysinfo' function. */
+/* #undef HAVE_GETSYSINFO */
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Define to 1 if the compiler accepts gcc style __attribute__ ((visibility))
+   and __attribute__ ((alias)) */
+#define HAVE_HIDDEN_ALIAS 1
+
+/* Define one of these to 1 for the host CPU family.
+   If your CPU is not in any of these families, leave all undefined.
+   For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */
+/* #undef HAVE_HOST_CPU_FAMILY_alpha */
+/* #undef HAVE_HOST_CPU_FAMILY_m68k */
+/* #undef HAVE_HOST_CPU_FAMILY_power */
+/* #undef HAVE_HOST_CPU_FAMILY_powerpc */
+/* #undef HAVE_HOST_CPU_FAMILY_x86 */
+#define HAVE_HOST_CPU_FAMILY_x86_64 1
+
+/* Define one of the following to 1 for the host CPU, as per the output of
+   ./config.guess.  If your CPU is not listed here, leave all undefined. */
+/* #undef HAVE_HOST_CPU_alphaev67 */
+/* #undef HAVE_HOST_CPU_alphaev68 */
+/* #undef HAVE_HOST_CPU_alphaev7 */
+/* #undef HAVE_HOST_CPU_m68020 */
+/* #undef HAVE_HOST_CPU_m68030 */
+/* #undef HAVE_HOST_CPU_m68040 */
+/* #undef HAVE_HOST_CPU_m68060 */
+/* #undef HAVE_HOST_CPU_m68360 */
+/* #undef HAVE_HOST_CPU_powerpc604 */
+/* #undef HAVE_HOST_CPU_powerpc604e */
+/* #undef HAVE_HOST_CPU_powerpc750 */
+/* #undef HAVE_HOST_CPU_powerpc7400 */
+/* #undef HAVE_HOST_CPU_supersparc */
+/* #undef HAVE_HOST_CPU_i386 */
+/* #undef HAVE_HOST_CPU_i586 */
+/* #undef HAVE_HOST_CPU_i686 */
+/* #undef HAVE_HOST_CPU_pentium */
+/* #undef HAVE_HOST_CPU_pentiummmx */
+/* #undef HAVE_HOST_CPU_pentiumpro */
+/* #undef HAVE_HOST_CPU_pentium2 */
+/* #undef HAVE_HOST_CPU_pentium3 */
+/* #undef HAVE_HOST_CPU_pentium4 */
+/* #undef HAVE_HOST_CPU_core2 */
+/* #undef HAVE_HOST_CPU_nehalem */
+/* #undef HAVE_HOST_CPU_westmere */
+/* #undef HAVE_HOST_CPU_sandybridge */
+/* #undef HAVE_HOST_CPU_ivybridge */
+/* #undef HAVE_HOST_CPU_haswell */
+/* #undef HAVE_HOST_CPU_broadwell */
+/* #undef HAVE_HOST_CPU_skylake */
+/* #undef HAVE_HOST_CPU_silvermont */
+/* #undef HAVE_HOST_CPU_goldmont */
+/* #undef HAVE_HOST_CPU_k8 */
+/* #undef HAVE_HOST_CPU_k10 */
+/* #undef HAVE_HOST_CPU_bulldozer */
+/* #undef HAVE_HOST_CPU_piledriver */
+/* #undef HAVE_HOST_CPU_steamroller */
+/* #undef HAVE_HOST_CPU_excavator */
+/* #undef HAVE_HOST_CPU_zen */
+/* #undef HAVE_HOST_CPU_bobcat */
+/* #undef HAVE_HOST_CPU_jaguar */
+/* #undef HAVE_HOST_CPU_s390_z900 */
+/* #undef HAVE_HOST_CPU_s390_z990 */
+/* #undef HAVE_HOST_CPU_s390_z9 */
+/* #undef HAVE_HOST_CPU_s390_z10 */
+/* #undef HAVE_HOST_CPU_s390_z196 */
+
+/* Define to 1 iff we have a s390 with 64-bit registers. */
+/* #undef HAVE_HOST_CPU_s390_zarch */
+
+/* Define to 1 if the system has the type `intmax_t'. */
+#define HAVE_INTMAX_T 1
+
+/* Define to 1 if the system has the type `intptr_t'. */
+#define HAVE_INTPTR_T 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the <invent.h> header file. */
+/* #undef HAVE_INVENT_H */
+
+/* Define to 1 if you have the <langinfo.h> header file. */
+#define HAVE_LANGINFO_H 1
+
+/* Define one of these to 1 for the endianness of `mp_limb_t'.
+   If the endianness is not a simple big or little, or you don't know what
+   it is, then leave both undefined. */
+/* #undef HAVE_LIMB_BIG_ENDIAN */
+#define HAVE_LIMB_LITTLE_ENDIAN 1
+
+/* Define to 1 if you have the `localeconv' function. */
+#define HAVE_LOCALECONV 1
+
+/* Define to 1 if you have the <locale.h> header file. */
+#define HAVE_LOCALE_H 1
+
+/* Define to 1 if the system has the type `long double'. */
+#define HAVE_LONG_DOUBLE 1
+
+/* Define to 1 if the system has the type `long long'. */
+#define HAVE_LONG_LONG 1
+
+/* Define to 1 if you have the <machine/hal/sysinfo.h> header file. */
+/* #undef HAVE_MACHINE_HAL_SYSINFO_H */
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the `memset' function. */
+#define HAVE_MEMSET 1
+
+/* Define to 1 if you have the `mmap' function. */
+#define HAVE_MMAP 1
+
+/* Define to 1 if you have the `mprotect' function. */
+#define HAVE_MPROTECT 1
+
+/* Define to 1 each of the following for which a native (ie. CPU specific)
+   implementation of the corresponding routine exists. */
+#define HAVE_NATIVE_mpn_add_n 1
+/* #undef HAVE_NATIVE_mpn_add_n_sub_n */
+#define HAVE_NATIVE_mpn_add_nc 1
+#define HAVE_NATIVE_mpn_addaddmul_1msb0 1
+#define HAVE_NATIVE_mpn_addlsh1_n 1
+#define HAVE_NATIVE_mpn_addlsh2_n 1
+#define HAVE_NATIVE_mpn_addlsh_n 1
+/* #undef HAVE_NATIVE_mpn_addlsh1_nc */
+/* #undef HAVE_NATIVE_mpn_addlsh2_nc */
+/* #undef HAVE_NATIVE_mpn_addlsh_nc */
+/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh_n_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip2 */
+/* #undef HAVE_NATIVE_mpn_addmul_1c */
+#define HAVE_NATIVE_mpn_addmul_2 1
+/* #undef HAVE_NATIVE_mpn_addmul_3 */
+/* #undef HAVE_NATIVE_mpn_addmul_4 */
+/* #undef HAVE_NATIVE_mpn_addmul_5 */
+/* #undef HAVE_NATIVE_mpn_addmul_6 */
+/* #undef HAVE_NATIVE_mpn_addmul_7 */
+/* #undef HAVE_NATIVE_mpn_addmul_8 */
+/* #undef HAVE_NATIVE_mpn_addmul_2s */
+#define HAVE_NATIVE_mpn_and_n 1
+#define HAVE_NATIVE_mpn_andn_n 1
+#define HAVE_NATIVE_mpn_bdiv_dbm1c 1
+#define HAVE_NATIVE_mpn_bdiv_q_1 1
+#define HAVE_NATIVE_mpn_pi1_bdiv_q_1 1
+#define HAVE_NATIVE_mpn_cnd_add_n 1
+#define HAVE_NATIVE_mpn_cnd_sub_n 1
+#define HAVE_NATIVE_mpn_com 1
+#define HAVE_NATIVE_mpn_copyd 1
+#define HAVE_NATIVE_mpn_copyi 1
+#define HAVE_NATIVE_mpn_div_qr_1n_pi1 1
+/* #undef HAVE_NATIVE_mpn_div_qr_2 */
+#define HAVE_NATIVE_mpn_divexact_1 1
+/* #undef HAVE_NATIVE_mpn_divexact_by3c */
+#define HAVE_NATIVE_mpn_divrem_1 1
+/* #undef HAVE_NATIVE_mpn_divrem_1c */
+#define HAVE_NATIVE_mpn_divrem_2 1
+/* #undef HAVE_NATIVE_mpn_gcd_1 */
+#define HAVE_NATIVE_mpn_gcd_11 1
+#define HAVE_NATIVE_mpn_gcd_22 1
+#define HAVE_NATIVE_mpn_hamdist 1
+#define HAVE_NATIVE_mpn_invert_limb 1
+#define HAVE_NATIVE_mpn_ior_n 1
+#define HAVE_NATIVE_mpn_iorn_n 1
+#define HAVE_NATIVE_mpn_lshift 1
+#define HAVE_NATIVE_mpn_lshiftc 1
+/* #undef HAVE_NATIVE_mpn_lshsub_n */
+/* #undef HAVE_NATIVE_mpn_mod_1 */
+#define HAVE_NATIVE_mpn_mod_1_1p 1
+/* #undef HAVE_NATIVE_mpn_mod_1c */
+#define HAVE_NATIVE_mpn_mod_1s_2p 1
+#define HAVE_NATIVE_mpn_mod_1s_4p 1
+#define HAVE_NATIVE_mpn_mod_34lsub1 1
+#define HAVE_NATIVE_mpn_modexact_1_odd 1
+#define HAVE_NATIVE_mpn_modexact_1c_odd 1
+#define HAVE_NATIVE_mpn_mul_1 1
+#define HAVE_NATIVE_mpn_mul_1c 1
+#define HAVE_NATIVE_mpn_mul_2 1
+/* #undef HAVE_NATIVE_mpn_mul_3 */
+/* #undef HAVE_NATIVE_mpn_mul_4 */
+/* #undef HAVE_NATIVE_mpn_mul_5 */
+/* #undef HAVE_NATIVE_mpn_mul_6 */
+#define HAVE_NATIVE_mpn_mul_basecase 1
+#define HAVE_NATIVE_mpn_mullo_basecase 1
+#define HAVE_NATIVE_mpn_nand_n 1
+#define HAVE_NATIVE_mpn_nior_n 1
+#define HAVE_NATIVE_mpn_popcount 1
+#define HAVE_NATIVE_mpn_preinv_divrem_1 1
+/* #undef HAVE_NATIVE_mpn_preinv_mod_1 */
+#define HAVE_NATIVE_mpn_redc_1 1
+/* #undef HAVE_NATIVE_mpn_redc_2 */
+#define HAVE_NATIVE_mpn_rsblsh1_n 1
+#define HAVE_NATIVE_mpn_rsblsh2_n 1
+#define HAVE_NATIVE_mpn_rsblsh_n 1
+/* #undef HAVE_NATIVE_mpn_rsblsh1_nc */
+/* #undef HAVE_NATIVE_mpn_rsblsh2_nc */
+/* #undef HAVE_NATIVE_mpn_rsblsh_nc */
+#define HAVE_NATIVE_mpn_rsh1add_n 1
+#define HAVE_NATIVE_mpn_rsh1add_nc 1
+#define HAVE_NATIVE_mpn_rsh1sub_n 1
+#define HAVE_NATIVE_mpn_rsh1sub_nc 1
+#define HAVE_NATIVE_mpn_rshift 1
+/* #undef HAVE_NATIVE_mpn_sbpi1_bdiv_r */
+#define HAVE_NATIVE_mpn_sqr_basecase 1
+/* #undef HAVE_NATIVE_mpn_sqr_diagonal */
+#define HAVE_NATIVE_mpn_sqr_diag_addlsh1 1
+#define HAVE_NATIVE_mpn_sub_n 1
+#define HAVE_NATIVE_mpn_sub_nc 1
+#define HAVE_NATIVE_mpn_sublsh1_n 1
+/* #undef HAVE_NATIVE_mpn_sublsh2_n */
+/* #undef HAVE_NATIVE_mpn_sublsh_n */
+/* #undef HAVE_NATIVE_mpn_sublsh1_nc */
+/* #undef HAVE_NATIVE_mpn_sublsh2_nc */
+/* #undef HAVE_NATIVE_mpn_sublsh_nc */
+/* #undef HAVE_NATIVE_mpn_sublsh1_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh2_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh1_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_submul_1c */
+/* #undef HAVE_NATIVE_mpn_tabselect */
+/* #undef HAVE_NATIVE_mpn_udiv_qrnnd */
+/* #undef HAVE_NATIVE_mpn_udiv_qrnnd_r */
+/* #undef HAVE_NATIVE_mpn_umul_ppmm */
+/* #undef HAVE_NATIVE_mpn_umul_ppmm_r */
+#define HAVE_NATIVE_mpn_xor_n 1
+#define HAVE_NATIVE_mpn_xnor_n 1
+
+/* Define to 1 if you have the `nl_langinfo' function. */
+#define HAVE_NL_LANGINFO 1
+
+/* Define to 1 if you have the <nl_types.h> header file. */
+#define HAVE_NL_TYPES_H 1
+
+/* Define to 1 if you have the `obstack_vprintf' function. */
+/* #define HAVE_OBSTACK_VPRINTF 1 */
+
+/* Define to 1 if you have the `popen' function. */
+#define HAVE_POPEN 1
+
+/* Define to 1 if you have the `processor_info' function. */
+/* #undef HAVE_PROCESSOR_INFO */
+
+/* Define to 1 if `struct pst_processor' exists and contains
+   `psp_iticksperclktick'. */
+/* #undef HAVE_PSP_ITICKSPERCLKTICK */
+
+/* Define to 1 if you have the `pstat_getprocessor' function. */
+/* #undef HAVE_PSTAT_GETPROCESSOR */
+
+/* Define to 1 if the system has the type `ptrdiff_t'. */
+#define HAVE_PTRDIFF_T 1
+
+/* Define to 1 if the system has the type `quad_t'. */
+#define HAVE_QUAD_T 1
+
+/* Define to 1 if you have the `raise' function. */
+#define HAVE_RAISE 1
+
+/* Define to 1 if you have the `read_real_time' function. */
+/* #undef HAVE_READ_REAL_TIME */
+
+/* Define to 1 if you have the `sigaction' function. */
+#define HAVE_SIGACTION 1
+
+/* Define to 1 if you have the `sigaltstack' function. */
+#define HAVE_SIGALTSTACK 1
+
+/* Define to 1 if you have the `sigstack' function. */
+#define HAVE_SIGSTACK 1
+
+/* Tune directory speed_cyclecounter, undef=none, 1=32bits, 2=64bits) */
+#define HAVE_SPEED_CYCLECOUNTER 2
+
+/* Define to 1 if you have the <sstream> header file. */
+/* #undef HAVE_SSTREAM */
+
+/* Define to 1 if the system has the type `stack_t'. */
+#define HAVE_STACK_T 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if the system has the type `std::locale'. */
+/* #undef HAVE_STD__LOCALE */
+
+/* Define to 1 if you have the `strchr' function. */
+#define HAVE_STRCHR 1
+
+/* Define to 1 if you have the `strerror' function. */
+#define HAVE_STRERROR 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strnlen' function. */
+#define HAVE_STRNLEN 1
+
+/* Define to 1 if you have the `strtol' function. */
+#define HAVE_STRTOL 1
+
+/* Define to 1 if you have the `strtoul' function. */
+#define HAVE_STRTOUL 1
+
+/* Define to 1 if you have the `sysconf' function. */
+#define HAVE_SYSCONF 1
+
+/* Define to 1 if you have the `sysctl' function. */
+/* #undef HAVE_SYSCTL */
+
+/* Define to 1 if you have the `sysctlbyname' function. */
+/* #undef HAVE_SYSCTLBYNAME */
+
+/* Define to 1 if you have the `syssgi' function. */
+/* #undef HAVE_SYSSGI */
+
+/* Define to 1 if you have the <sys/attributes.h> header file. */
+/* #undef HAVE_SYS_ATTRIBUTES_H */
+
+/* Define to 1 if you have the <sys/iograph.h> header file. */
+/* #undef HAVE_SYS_IOGRAPH_H */
+
+/* Define to 1 if you have the <sys/mman.h> header file. */
+#define HAVE_SYS_MMAN_H 1
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#define HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the <sys/processor.h> header file. */
+/* #undef HAVE_SYS_PROCESSOR_H */
+
+/* Define to 1 if you have the <sys/pstat.h> header file. */
+/* #undef HAVE_SYS_PSTAT_H */
+
+/* Define to 1 if you have the <sys/resource.h> header file. */
+#define HAVE_SYS_RESOURCE_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/sysctl.h> header file. */
+/* #undef HAVE_SYS_SYSCTL_H */
+
+/* Define to 1 if you have the <sys/sysinfo.h> header file. */
+#define HAVE_SYS_SYSINFO_H 1
+
+/* Define to 1 if you have the <sys/syssgi.h> header file. */
+/* #undef HAVE_SYS_SYSSGI_H */
+
+/* Define to 1 if you have the <sys/systemcfg.h> header file. */
+/* #undef HAVE_SYS_SYSTEMCFG_H */
+
+/* Define to 1 if you have the <sys/times.h> header file. */
+#define HAVE_SYS_TIMES_H 1
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#define HAVE_SYS_TIME_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the `times' function. */
+#define HAVE_TIMES 1
+
+/* Define to 1 if the system has the type `uint_least32_t'. */
+#define HAVE_UINT_LEAST32_T 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if you have the `vsnprintf' function and it works properly. */
+#define HAVE_VSNPRINTF 1
+
+/* Define to 1 for Windos/64 */
+/* #undef HOST_DOS64 */
+
+/* Assembler local label prefix */
+#define LSYM_PREFIX ".L"
+
+/* Define to the sub-directory where libtool stores uninstalled libraries. */
+#define LT_OBJDIR ".libs/"
+
+/* Define to 1 to disable the use of inline assembly */
+/* #undef NO_ASM */
+
+/* Name of package */
+#define PACKAGE "gmp"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "GNU MP"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "GNU MP 6.2.1"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "gmp"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL "http://www.gnu.org/software/gmp/"
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "6.2.1"
+
+/* Define as the return type of signal handlers (`int' or `void'). */
+#define RETSIGTYPE void
+
+/* The size of `mp_limb_t', as computed by sizeof. */
+#define SIZEOF_MP_LIMB_T 8
+
+/* The size of `unsigned', as computed by sizeof. */
+#define SIZEOF_UNSIGNED 4
+
+/* The size of `unsigned long', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_LONG 8
+
+/* The size of `unsigned short', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_SHORT 2
+
+/* The size of `void *', as computed by sizeof. */
+#define SIZEOF_VOID_P 8
+
+/* Define to 1 if sscanf requires writable inputs */
+/* #undef SSCANF_WRITABLE_INPUT */
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
+#define TIME_WITH_SYS_TIME 1
+
+/* Maximum size the tune program can test for SQR_TOOM2_THRESHOLD */
+/* #undef TUNE_SQR_TOOM2_MAX */
+
+/* Version number of package */
+#define VERSION "6.2.1"
+
+/* Define to 1 to enable ASSERT checking, per --enable-assert */
+/* #undef WANT_ASSERT */
+
+/* Define to 1 to enable GMP_CPU_TYPE faking cpuid, per --enable-fake-cpuid */
+/* #undef WANT_FAKE_CPUID */
+
+/* Define to 1 when building a fat binary. */
+/* #undef WANT_FAT_BINARY */
+
+/* Define to 1 to enable FFTs for multiplication, per --enable-fft */
+#define WANT_FFT 1
+
+/* Define to 1 to enable old mpn_mul_fft_full for multiplication, per
+   --enable-old-fft-full */
+/* #undef WANT_OLD_FFT_FULL */
+
+/* Define to 1 if --enable-profiling=gprof */
+/* #undef WANT_PROFILING_GPROF */
+
+/* Define to 1 if --enable-profiling=instrument */
+/* #undef WANT_PROFILING_INSTRUMENT */
+
+/* Define to 1 if --enable-profiling=prof */
+/* #undef WANT_PROFILING_PROF */
+
+/* Define one of these to 1 for the desired temporary memory allocation
+   method, per --enable-alloca. */
+#define WANT_TMP_ALLOCA 1
+/* #undef WANT_TMP_REENTRANT */
+/* #undef WANT_TMP_NOTREENTRANT */
+/* #undef WANT_TMP_DEBUG */
+
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+   significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+#  define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+/* # undef WORDS_BIGENDIAN */
+# endif
+#endif
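[Note on the conditional above: when AC_APPLE_UNIVERSAL_BUILD is set, the endianness decision is deferred to compile time, so each slice of a fat binary re-derives WORDS_BIGENDIAN from the compiler's __BIG_ENDIAN__. A sketch of how a consumer typically branches on the result; HIGH_LIMB_HALF is a hypothetical name:

    /* Index of the 32-bit half holding the most significant bits of a
       64-bit word, resolved per architecture slice at compile time. */
    #ifdef WORDS_BIGENDIAN
    #  define HIGH_LIMB_HALF 0   /* big endian: high half stored first */
    #else
    #  define HIGH_LIMB_HALF 1   /* little endian: high half stored second */
    #endif
]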
*/ +#define restrict __restrict +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. */ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif + +/* Define to empty if the keyword `volatile' does not work. Warning: valid + code using `volatile' can become incorrect without. Disable with care. */ +/* #undef volatile */ diff --git a/ext/gmp/gen/x86_64-macos/config.h b/ext/gmp/gen/x86_64-macos/config.h new file mode 100644 index 0000000000..1fbed06fdb --- /dev/null +++ b/ext/gmp/gen/x86_64-macos/config.h @@ -0,0 +1,668 @@ +/* config.h. Generated from config.in by configure. */ +/* config.in. Generated from configure.ac by autoheader. */ + +/* + +Copyright 1996-2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. +*/ + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* The gmp-mparam.h file (a string) the tune program should suggest updating. + */ +#define GMP_MPARAM_H_SUGGEST "./mpn/x86_64/skylake/gmp-mparam.h" + +/* Define to 1 if you have the `alarm' function. */ +#define HAVE_ALARM 1 + +/* Define to 1 if alloca() works (via gmp-impl.h). */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((const)) */ +#define HAVE_ATTRIBUTE_CONST 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((malloc)) */ +#define HAVE_ATTRIBUTE_MALLOC 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((mode (XX))) + */ +#define HAVE_ATTRIBUTE_MODE 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((noreturn)) */ +#define HAVE_ATTRIBUTE_NORETURN 1 + +/* Define to 1 if you have the `attr_get' function. */ +/* #undef HAVE_ATTR_GET */ + +/* Define to 1 if tests/libtests has calling conventions checking for the CPU + */ +#define HAVE_CALLING_CONVENTIONS 1 + +/* Define to 1 if you have the `clock' function. */ +#define HAVE_CLOCK 1 + +/* Define to 1 if you have the `clock_gettime' function */ +#define HAVE_CLOCK_GETTIME 1 + +/* Define to 1 if you have the `cputime' function. */ +/* #undef HAVE_CPUTIME */ + +/* Define to 1 if you have the declaration of `fgetc', and to 0 if you don't. 
+ */ +#define HAVE_DECL_FGETC 1 + +/* Define to 1 if you have the declaration of `fscanf', and to 0 if you don't. + */ +#define HAVE_DECL_FSCANF 1 + +/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't. + */ +#define HAVE_DECL_OPTARG 1 + +/* Define to 1 if you have the declaration of `sys_errlist', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_ERRLIST 1 + +/* Define to 1 if you have the declaration of `sys_nerr', and to 0 if you + don't. */ +#define HAVE_DECL_SYS_NERR 1 + +/* Define to 1 if you have the declaration of `ungetc', and to 0 if you don't. + */ +#define HAVE_DECL_UNGETC 1 + +/* Define to 1 if you have the declaration of `vfprintf', and to 0 if you + don't. */ +#define HAVE_DECL_VFPRINTF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define one of the following to 1 for the format of a `double'. + If your format is not among these choices, or you don't know what it is, + then leave all undefined. + IEEE_LITTLE_SWAPPED means little endian, but with the two 4-byte halves + swapped, as used by ARM CPUs in little endian mode. */ +/* #undef HAVE_DOUBLE_IEEE_BIG_ENDIAN */ +#define HAVE_DOUBLE_IEEE_LITTLE_ENDIAN 1 +/* #undef HAVE_DOUBLE_IEEE_LITTLE_SWAPPED */ +/* #undef HAVE_DOUBLE_VAX_D */ +/* #undef HAVE_DOUBLE_VAX_G */ +/* #undef HAVE_DOUBLE_CRAY_CFP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FLOAT_H 1 + +/* Define to 1 if you have the `getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getrusage' function. */ +#define HAVE_GETRUSAGE 1 + +/* Define to 1 if you have the `getsysinfo' function. */ +/* #undef HAVE_GETSYSINFO */ + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if the compiler accepts gcc style __attribute__ ((visibility)) + and __attribute__ ((alias)) */ +/* #undef HAVE_HIDDEN_ALIAS */ + +/* Define one of these to 1 for the host CPU family. + If your CPU is not in any of these families, leave all undefined. + For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */ +/* #undef HAVE_HOST_CPU_FAMILY_alpha */ +/* #undef HAVE_HOST_CPU_FAMILY_m68k */ +/* #undef HAVE_HOST_CPU_FAMILY_power */ +/* #undef HAVE_HOST_CPU_FAMILY_powerpc */ +/* #undef HAVE_HOST_CPU_FAMILY_x86 */ +#define HAVE_HOST_CPU_FAMILY_x86_64 1 + +/* Define one of the following to 1 for the host CPU, as per the output of + ./config.guess. If your CPU is not listed here, leave all undefined. 
+/* #undef HAVE_HOST_CPU_alphaev67 */
+/* #undef HAVE_HOST_CPU_alphaev68 */
+/* #undef HAVE_HOST_CPU_alphaev7 */
+/* #undef HAVE_HOST_CPU_m68020 */
+/* #undef HAVE_HOST_CPU_m68030 */
+/* #undef HAVE_HOST_CPU_m68040 */
+/* #undef HAVE_HOST_CPU_m68060 */
+/* #undef HAVE_HOST_CPU_m68360 */
+/* #undef HAVE_HOST_CPU_powerpc604 */
+/* #undef HAVE_HOST_CPU_powerpc604e */
+/* #undef HAVE_HOST_CPU_powerpc750 */
+/* #undef HAVE_HOST_CPU_powerpc7400 */
+/* #undef HAVE_HOST_CPU_supersparc */
+/* #undef HAVE_HOST_CPU_i386 */
+/* #undef HAVE_HOST_CPU_i586 */
+/* #undef HAVE_HOST_CPU_i686 */
+/* #undef HAVE_HOST_CPU_pentium */
+/* #undef HAVE_HOST_CPU_pentiummmx */
+/* #undef HAVE_HOST_CPU_pentiumpro */
+/* #undef HAVE_HOST_CPU_pentium2 */
+/* #undef HAVE_HOST_CPU_pentium3 */
+/* #undef HAVE_HOST_CPU_pentium4 */
+/* #undef HAVE_HOST_CPU_core2 */
+/* #undef HAVE_HOST_CPU_nehalem */
+/* #undef HAVE_HOST_CPU_westmere */
+/* #undef HAVE_HOST_CPU_sandybridge */
+/* #undef HAVE_HOST_CPU_ivybridge */
+/* #undef HAVE_HOST_CPU_haswell */
+/* #undef HAVE_HOST_CPU_broadwell */
+/* #undef HAVE_HOST_CPU_skylake */
+/* #undef HAVE_HOST_CPU_silvermont */
+/* #undef HAVE_HOST_CPU_goldmont */
+/* #undef HAVE_HOST_CPU_k8 */
+/* #undef HAVE_HOST_CPU_k10 */
+/* #undef HAVE_HOST_CPU_bulldozer */
+/* #undef HAVE_HOST_CPU_piledriver */
+/* #undef HAVE_HOST_CPU_steamroller */
+/* #undef HAVE_HOST_CPU_excavator */
+/* #undef HAVE_HOST_CPU_zen */
+/* #undef HAVE_HOST_CPU_bobcat */
+/* #undef HAVE_HOST_CPU_jaguar */
+/* #undef HAVE_HOST_CPU_s390_z900 */
+/* #undef HAVE_HOST_CPU_s390_z990 */
+/* #undef HAVE_HOST_CPU_s390_z9 */
+/* #undef HAVE_HOST_CPU_s390_z10 */
+/* #undef HAVE_HOST_CPU_s390_z196 */
+
+/* Define to 1 iff we have a s390 with 64-bit registers. */
+/* #undef HAVE_HOST_CPU_s390_zarch */
+
+/* Define to 1 if the system has the type `intmax_t'. */
+#define HAVE_INTMAX_T 1
+
+/* Define to 1 if the system has the type `intptr_t'. */
+#define HAVE_INTPTR_T 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the <invent.h> header file. */
+/* #undef HAVE_INVENT_H */
+
+/* Define to 1 if you have the <langinfo.h> header file. */
+#define HAVE_LANGINFO_H 1
+
+/* Define one of these to 1 for the endianness of `mp_limb_t'.
+   If the endianness is not a simple big or little, or you don't know what
+   it is, then leave both undefined. */
+/* #undef HAVE_LIMB_BIG_ENDIAN */
+#define HAVE_LIMB_LITTLE_ENDIAN 1
+
+/* Define to 1 if you have the `localeconv' function. */
+#define HAVE_LOCALECONV 1
+
+/* Define to 1 if you have the <locale.h> header file. */
+#define HAVE_LOCALE_H 1
+
+/* Define to 1 if the system has the type `long double'. */
+#define HAVE_LONG_DOUBLE 1
+
+/* Define to 1 if the system has the type `long long'. */
+#define HAVE_LONG_LONG 1
+
+/* Define to 1 if you have the <machine/hal_sysinfo.h> header file. */
+/* #undef HAVE_MACHINE_HAL_SYSINFO_H */
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the `memset' function. */
+#define HAVE_MEMSET 1
+
+/* Define to 1 if you have the `mmap' function. */
+#define HAVE_MMAP 1
+
+/* Define to 1 if you have the `mprotect' function. */
+#define HAVE_MPROTECT 1
+
+/* Define to 1 each of the following for which a native (ie. CPU specific)
+   implementation of the corresponding routine exists. */
+#define HAVE_NATIVE_mpn_add_n 1
+/* #undef HAVE_NATIVE_mpn_add_n_sub_n */
+#define HAVE_NATIVE_mpn_add_nc 1
+#define HAVE_NATIVE_mpn_addaddmul_1msb0 1
+#define HAVE_NATIVE_mpn_addlsh1_n 1
+#define HAVE_NATIVE_mpn_addlsh2_n 1
+#define HAVE_NATIVE_mpn_addlsh_n 1
+#define HAVE_NATIVE_mpn_addlsh1_nc 1
+#define HAVE_NATIVE_mpn_addlsh2_nc 1
+/* #undef HAVE_NATIVE_mpn_addlsh_nc */
+/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_addlsh1_n_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh2_n_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh_n_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh1_nc_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh2_nc_ip2 */
+/* #undef HAVE_NATIVE_mpn_addlsh_nc_ip2 */
+/* #undef HAVE_NATIVE_mpn_addmul_1c */
+#define HAVE_NATIVE_mpn_addmul_2 1
+/* #undef HAVE_NATIVE_mpn_addmul_3 */
+/* #undef HAVE_NATIVE_mpn_addmul_4 */
+/* #undef HAVE_NATIVE_mpn_addmul_5 */
+/* #undef HAVE_NATIVE_mpn_addmul_6 */
+/* #undef HAVE_NATIVE_mpn_addmul_7 */
+/* #undef HAVE_NATIVE_mpn_addmul_8 */
+/* #undef HAVE_NATIVE_mpn_addmul_2s */
+#define HAVE_NATIVE_mpn_and_n 1
+#define HAVE_NATIVE_mpn_andn_n 1
+#define HAVE_NATIVE_mpn_bdiv_dbm1c 1
+#define HAVE_NATIVE_mpn_bdiv_q_1 1
+#define HAVE_NATIVE_mpn_pi1_bdiv_q_1 1
+#define HAVE_NATIVE_mpn_cnd_add_n 1
+#define HAVE_NATIVE_mpn_cnd_sub_n 1
+#define HAVE_NATIVE_mpn_com 1
+#define HAVE_NATIVE_mpn_copyd 1
+#define HAVE_NATIVE_mpn_copyi 1
+#define HAVE_NATIVE_mpn_div_qr_1n_pi1 1
+/* #undef HAVE_NATIVE_mpn_div_qr_2 */
+#define HAVE_NATIVE_mpn_divexact_1 1
+/* #undef HAVE_NATIVE_mpn_divexact_by3c */
+#define HAVE_NATIVE_mpn_divrem_1 1
+/* #undef HAVE_NATIVE_mpn_divrem_1c */
+#define HAVE_NATIVE_mpn_divrem_2 1
+/* #undef HAVE_NATIVE_mpn_gcd_1 */
+#define HAVE_NATIVE_mpn_gcd_11 1
+#define HAVE_NATIVE_mpn_gcd_22 1
+#define HAVE_NATIVE_mpn_hamdist 1
+#define HAVE_NATIVE_mpn_invert_limb 1
+#define HAVE_NATIVE_mpn_ior_n 1
+#define HAVE_NATIVE_mpn_iorn_n 1
+#define HAVE_NATIVE_mpn_lshift 1
+#define HAVE_NATIVE_mpn_lshiftc 1
+/* #undef HAVE_NATIVE_mpn_lshsub_n */
+/* #undef HAVE_NATIVE_mpn_mod_1 */
+#define HAVE_NATIVE_mpn_mod_1_1p 1
+/* #undef HAVE_NATIVE_mpn_mod_1c */
+#define HAVE_NATIVE_mpn_mod_1s_2p 1
+#define HAVE_NATIVE_mpn_mod_1s_4p 1
+#define HAVE_NATIVE_mpn_mod_34lsub1 1
+#define HAVE_NATIVE_mpn_modexact_1_odd 1
+#define HAVE_NATIVE_mpn_modexact_1c_odd 1
+#define HAVE_NATIVE_mpn_mul_1 1
+/* #undef HAVE_NATIVE_mpn_mul_1c */
+#define HAVE_NATIVE_mpn_mul_2 1
+/* #undef HAVE_NATIVE_mpn_mul_3 */
+/* #undef HAVE_NATIVE_mpn_mul_4 */
+/* #undef HAVE_NATIVE_mpn_mul_5 */
+/* #undef HAVE_NATIVE_mpn_mul_6 */
+#define HAVE_NATIVE_mpn_mul_basecase 1
+#define HAVE_NATIVE_mpn_mullo_basecase 1
+#define HAVE_NATIVE_mpn_nand_n 1
+#define HAVE_NATIVE_mpn_nior_n 1
+#define HAVE_NATIVE_mpn_popcount 1
+#define HAVE_NATIVE_mpn_preinv_divrem_1 1
+/* #undef HAVE_NATIVE_mpn_preinv_mod_1 */
+#define HAVE_NATIVE_mpn_redc_1 1
+/* #undef HAVE_NATIVE_mpn_redc_2 */
+#define HAVE_NATIVE_mpn_rsblsh1_n 1
+#define HAVE_NATIVE_mpn_rsblsh2_n 1
+#define HAVE_NATIVE_mpn_rsblsh_n 1
+#define HAVE_NATIVE_mpn_rsblsh1_nc 1
+/* #undef HAVE_NATIVE_mpn_rsblsh2_nc */
+/* #undef HAVE_NATIVE_mpn_rsblsh_nc */
+#define HAVE_NATIVE_mpn_rsh1add_n 1
+#define HAVE_NATIVE_mpn_rsh1add_nc 1
+#define HAVE_NATIVE_mpn_rsh1sub_n 1
+#define HAVE_NATIVE_mpn_rsh1sub_nc 1
+#define HAVE_NATIVE_mpn_rshift 1
+/* #undef HAVE_NATIVE_mpn_sbpi1_bdiv_r */
+#define HAVE_NATIVE_mpn_sqr_basecase 1
+/* #undef HAVE_NATIVE_mpn_sqr_diagonal */
+#define HAVE_NATIVE_mpn_sqr_diag_addlsh1 1
+#define HAVE_NATIVE_mpn_sub_n 1
+#define HAVE_NATIVE_mpn_sub_nc 1
+#define HAVE_NATIVE_mpn_sublsh1_n 1
+#define HAVE_NATIVE_mpn_sublsh2_n 1
+/* #undef HAVE_NATIVE_mpn_sublsh_n */
+/* #undef HAVE_NATIVE_mpn_sublsh1_nc */
+/* #undef HAVE_NATIVE_mpn_sublsh2_nc */
+/* #undef HAVE_NATIVE_mpn_sublsh_nc */
+/* #undef HAVE_NATIVE_mpn_sublsh1_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh2_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh_n_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh1_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh2_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_sublsh_nc_ip1 */
+/* #undef HAVE_NATIVE_mpn_submul_1c */
+/* #undef HAVE_NATIVE_mpn_tabselect */
+/* #undef HAVE_NATIVE_mpn_udiv_qrnnd */
+/* #undef HAVE_NATIVE_mpn_udiv_qrnnd_r */
+/* #undef HAVE_NATIVE_mpn_umul_ppmm */
+/* #undef HAVE_NATIVE_mpn_umul_ppmm_r */
+#define HAVE_NATIVE_mpn_xor_n 1
+#define HAVE_NATIVE_mpn_xnor_n 1
+
+/* Define to 1 if you have the `nl_langinfo' function. */
+#define HAVE_NL_LANGINFO 1
+
+/* Define to 1 if you have the <nl_types.h> header file. */
+#define HAVE_NL_TYPES_H 1
+
+/* Define to 1 if you have the `obstack_vprintf' function. */
+/* #undef HAVE_OBSTACK_VPRINTF */
+
+/* Define to 1 if you have the `popen' function. */
+#define HAVE_POPEN 1
+
+/* Define to 1 if you have the `processor_info' function. */
+#define HAVE_PROCESSOR_INFO 1
+
+/* Define to 1 if `struct pst_processor' exists and contains
+   `psp_iticksperclktick'. */
+/* #undef HAVE_PSP_ITICKSPERCLKTICK */
+
+/* Define to 1 if you have the `pstat_getprocessor' function. */
+/* #undef HAVE_PSTAT_GETPROCESSOR */
+
+/* Define to 1 if the system has the type `ptrdiff_t'. */
+#define HAVE_PTRDIFF_T 1
+
+/* Define to 1 if the system has the type `quad_t'. */
+#define HAVE_QUAD_T 1
+
+/* Define to 1 if you have the `raise' function. */
+#define HAVE_RAISE 1
+
+/* Define to 1 if you have the `read_real_time' function. */
+/* #undef HAVE_READ_REAL_TIME */
+
+/* Define to 1 if you have the `sigaction' function. */
+#define HAVE_SIGACTION 1
+
+/* Define to 1 if you have the `sigaltstack' function. */
+#define HAVE_SIGALTSTACK 1
+
+/* Define to 1 if you have the `sigstack' function. */
+/* #undef HAVE_SIGSTACK */
+
+/* Tune directory speed_cyclecounter (undef=none, 1=32bits, 2=64bits) */
+#define HAVE_SPEED_CYCLECOUNTER 2
+
+/* Define to 1 if you have the <sstream> header file. */
+/* #undef HAVE_SSTREAM */
+
+/* Define to 1 if the system has the type `stack_t'. */
+#define HAVE_STACK_T 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if the system has the type `std::locale'. */
+/* #undef HAVE_STD__LOCALE */
+
+/* Define to 1 if you have the `strchr' function. */
+#define HAVE_STRCHR 1
+
+/* Define to 1 if you have the `strerror' function. */
+#define HAVE_STRERROR 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strnlen' function. */
+#define HAVE_STRNLEN 1
+
+/* Define to 1 if you have the `strtol' function. */
+#define HAVE_STRTOL 1
+
+/* Define to 1 if you have the `strtoul' function. */
+#define HAVE_STRTOUL 1
+
+/* Define to 1 if you have the `sysconf' function. */
+#define HAVE_SYSCONF 1
+
+/* Define to 1 if you have the `sysctl' function. */
+#define HAVE_SYSCTL 1
+
+/* Define to 1 if you have the `sysctlbyname' function. */
+#define HAVE_SYSCTLBYNAME 1
+
+/* Define to 1 if you have the `syssgi' function. */
+/* #undef HAVE_SYSSGI */
+
+/* Define to 1 if you have the <sys/attributes.h> header file. */
+/* #undef HAVE_SYS_ATTRIBUTES_H */
+
+/* Define to 1 if you have the <sys/iograph.h> header file. */
+/* #undef HAVE_SYS_IOGRAPH_H */
+
+/* Define to 1 if you have the <sys/mman.h> header file. */
+#define HAVE_SYS_MMAN_H 1
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#define HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the <sys/processor.h> header file. */
+/* #undef HAVE_SYS_PROCESSOR_H */
+
+/* Define to 1 if you have the <sys/pstat.h> header file. */
+/* #undef HAVE_SYS_PSTAT_H */
+
+/* Define to 1 if you have the <sys/resource.h> header file. */
+#define HAVE_SYS_RESOURCE_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/sysctl.h> header file. */
+#define HAVE_SYS_SYSCTL_H 1
+
+/* Define to 1 if you have the <sys/sysinfo.h> header file. */
+/* #undef HAVE_SYS_SYSINFO_H */
+
+/* Define to 1 if you have the <sys/syssgi.h> header file. */
+/* #undef HAVE_SYS_SYSSGI_H */
+
+/* Define to 1 if you have the <sys/systemcfg.h> header file. */
+/* #undef HAVE_SYS_SYSTEMCFG_H */
+
+/* Define to 1 if you have the <sys/times.h> header file. */
+#define HAVE_SYS_TIMES_H 1
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#define HAVE_SYS_TIME_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the `times' function. */
+#define HAVE_TIMES 1
+
+/* Define to 1 if the system has the type `uint_least32_t'. */
+#define HAVE_UINT_LEAST32_T 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if you have the `vsnprintf' function and it works properly. */
+#define HAVE_VSNPRINTF 1
+
+/* Define to 1 for Windows/64 */
+/* #undef HOST_DOS64 */
+
+/* Assembler local label prefix */
+#define LSYM_PREFIX "L"
+
+/* Define to the sub-directory where libtool stores uninstalled libraries. */
+#define LT_OBJDIR ".libs/"
+
+/* Define to 1 to disable the use of inline assembly */
+/* #undef NO_ASM */
+
+/* Name of package */
+#define PACKAGE "gmp"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "gmp-bugs@gmplib.org, see https://gmplib.org/manual/Reporting-Bugs.html"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "GNU MP"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "GNU MP 6.2.1"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "gmp"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL "http://www.gnu.org/software/gmp/"
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "6.2.1"
+
+/* Define as the return type of signal handlers (`int' or `void'). */
+#define RETSIGTYPE void
+
+/* The size of `mp_limb_t', as computed by sizeof. */
+#define SIZEOF_MP_LIMB_T 8
+
+/* The size of `unsigned', as computed by sizeof. */
+#define SIZEOF_UNSIGNED 4
+
+/* The size of `unsigned long', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_LONG 8
+
+/* The size of `unsigned short', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_SHORT 2
+
+/* The size of `void *', as computed by sizeof. */
+#define SIZEOF_VOID_P 8
+
+/* Define to 1 if sscanf requires writable inputs */
+/* #undef SSCANF_WRITABLE_INPUT */
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
+#define TIME_WITH_SYS_TIME 1
+
+/* Maximum size the tune program can test for SQR_TOOM2_THRESHOLD */
+/* #undef TUNE_SQR_TOOM2_MAX */
+
+/* Version number of package */
+#define VERSION "6.2.1"
+
+/* Define to 1 to enable ASSERT checking, per --enable-assert */
+/* #undef WANT_ASSERT */
+
+/* Define to 1 to enable GMP_CPU_TYPE faking cpuid, per --enable-fake-cpuid */
+/* #undef WANT_FAKE_CPUID */
+
+/* Define to 1 when building a fat binary. */
+/* #undef WANT_FAT_BINARY */
+
+/* Define to 1 to enable FFTs for multiplication, per --enable-fft */
+#define WANT_FFT 1
+
+/* Define to 1 to enable old mpn_mul_fft_full for multiplication, per
+   --enable-old-fft-full */
+/* #undef WANT_OLD_FFT_FULL */
+
+/* Define to 1 if --enable-profiling=gprof */
+/* #undef WANT_PROFILING_GPROF */
+
+/* Define to 1 if --enable-profiling=instrument */
+/* #undef WANT_PROFILING_INSTRUMENT */
+
+/* Define to 1 if --enable-profiling=prof */
+/* #undef WANT_PROFILING_PROF */
+
+/* Define one of these to 1 for the desired temporary memory allocation
+   method, per --enable-alloca. */
+#define WANT_TMP_ALLOCA 1
+/* #undef WANT_TMP_REENTRANT */
+/* #undef WANT_TMP_NOTREENTRANT */
+/* #undef WANT_TMP_DEBUG */
+
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+   significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+#  define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+/* #  undef WORDS_BIGENDIAN */
+# endif
+#endif
+
+/* Define to 1 if the assembler understands the mulx instruction */
+#define X86_ASM_MULX 1
+
+/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a
+   `char[]'. */
+#define YYTEXT_POINTER 1
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name. */
+#ifndef __cplusplus
+/* #undef inline */
+#endif
+
+/* Define to the equivalent of the C99 'restrict' keyword, or to
+   nothing if this is not supported. Do not define if restrict is
+   supported directly. */
+#define restrict __restrict
+/* Work around a bug in Sun C++: it does not support _Restrict or
+   __restrict__, even though the corresponding Sun C compiler ends up with
+   "#define restrict _Restrict" or "#define restrict __restrict__" in the
+   previous line. Perhaps some future version of Sun C++ will work with
+   restrict; if so, hopefully it defines __RESTRICT like Sun C does. */
+#if defined __SUNPRO_CC && !defined __RESTRICT
+# define _Restrict
+# define __restrict__
+#endif
+
+/* Define to empty if the keyword `volatile' does not work. Warning: valid
+   code using `volatile' can become incorrect without. Disable with care. */
+/* #undef volatile */

From 6453dafa461cd28799192343f796ddf97ea214d7 Mon Sep 17 00:00:00 2001
From: Santeri Hannula
Date: Thu, 10 Oct 2024 19:38:55 +0300
Subject: [PATCH 3/3] ci: update path filters

---
 .github/workflows/develop.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml
index 6f9acb4cb0..36aedc0d60 100644
--- a/.github/workflows/develop.yml
+++ b/.github/workflows/develop.yml
@@ -5,13 +5,13 @@ on:
     branches:
       - develop
     paths:
-      - '*.bazel'
-      - '.bazelrc'
-      - '.github/workflows/**.yml'
-      - 'PACE'
-      - 'VERSION'
-      - 'bazel/**'
+      - 'build.zig'
+      - 'build.zig.zon'
+      - 'ext/**'
+      - '!ext/**.md'
       - 'pkg/**'
+      - '.github/workflows/**.yml'
+      - '*.sh'
 
 jobs:
   urbit:
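
For reference, a minimal sketch of how the HAVE_NATIVE_mpn_* macros in the
generated config.h above are meant to be consumed; it is illustrative only and
not part of the patch series. The mp_limb_t typedef is a stand-in consistent
with SIZEOF_MP_LIMB_T 8 (real code gets mp_limb_t from gmp.h), and the
fallback body is a simplified generic version, not GMP's actual C source:

    /* When the build links a native routine such as mpn/add_n.s, configure
       defines HAVE_NATIVE_mpn_add_n to 1 and this generic fallback is
       compiled out; an undefined macro evaluates to 0 in #if. */
    #include <stddef.h>
    #include "config.h"

    typedef unsigned long mp_limb_t;  /* stand-in; 8 bytes per config.h */

    #if !HAVE_NATIVE_mpn_add_n
    /* Limb-wise add with carry propagation; returns the carry out. */
    mp_limb_t
    __gmpn_add_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                  size_t n)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          mp_limb_t t = up[i] + vp[i];  /* may wrap modulo 2^64 */
          mp_limb_t c1 = t < up[i];     /* carry out of u + v */
          mp_limb_t s = t + cy;
          mp_limb_t c2 = s < t;         /* carry out of adding cy in */
          rp[i] = s;
          cy = c1 | c2;                 /* at most one of c1, c2 is set */
        }
      return cy;
    }
    #endif

On the x86_64 targets in this series the macro is defined to 1 above, so only
the hand-written assembly is linked and the C branch never compiles; the
fallback path exists for targets without a native add_n.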