diff --git a/sm9/bn256/gfp_amd64.s b/sm9/bn256/gfp_amd64.s index 2dabf75a..2a71bf4f 100644 --- a/sm9/bn256/gfp_amd64.s +++ b/sm9/bn256/gfp_amd64.s @@ -136,198 +136,192 @@ TEXT ·gfpSub(SB),0,$0-24 RET TEXT ·gfpMul(SB),0,$0-24 - MOVQ res+0(FP), res_ptr MOVQ in1+8(FP), x_ptr MOVQ in2+16(FP), y_ptr - CMPB ·hasBMI2(SB), $0 - JE nobmi2Mul + CMPB ·supportADX(SB), $0 + JE noAdxMul + XORQ acc5, acc5 + XORQ res_ptr, res_ptr // x * y[0] MOVQ (8*0)(y_ptr), DX MULXQ (8*0)(x_ptr), acc0, acc1 MULXQ (8*1)(x_ptr), AX, acc2 - ADDQ AX, acc1 - ADCQ $0, acc2 + ADCXQ AX, acc1 MULXQ (8*2)(x_ptr), AX, acc3 - ADDQ AX, acc2 - ADCQ $0, acc3 + ADCXQ AX, acc2 MULXQ (8*3)(x_ptr), AX, acc4 - ADDQ AX, acc3 - ADCQ $0, acc4 - - XORQ acc5, acc5 + ADCXQ AX, acc3 + ADCXQ acc5, acc4 // First reduction step MOVQ acc0, DX MULXQ ·np+0x00(SB), DX, AX - MULXQ ·p2+0x00(SB), AX, t1 - ADDQ AX, acc0 - ADCQ t1, acc1 + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc0 MULXQ ·p2+0x08(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc1 - ADCQ t1, acc2 + ADCXQ t0, AX + ADOXQ AX, acc1 - MULXQ ·p2+0x10(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ t1, AX + ADOXQ AX, acc2 MULXQ ·p2+0x18(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 - ADCQ $0, acc5 + ADCXQ t0, AX + ADOXQ AX, acc3 + + ADCXQ res_ptr, t1 + ADOXQ t1, acc4 + ADOXQ res_ptr, acc5 + XORQ acc0, acc0 // x * y[1] MOVQ (8*1)(y_ptr), DX - MULXQ (8*0)(x_ptr), AX, t1 - ADDQ AX, acc1 - ADCQ t1, acc2 + MULXQ (8*0)(x_ptr), AX, t0 + ADOXQ AX, acc1 MULXQ (8*1)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 + ADCXQ t0, AX + ADOXQ AX, acc2 - MULXQ (8*2)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 + MULXQ (8*2)(x_ptr), AX, t0 + ADCXQ t1, AX + ADOXQ AX, acc3 - MULXQ (8*3)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 - ADCQ $0, acc0 + MULXQ (8*3)(x_ptr), AX, t1 + ADCXQ t0, AX + ADOXQ AX, acc4 + + ADCXQ acc0, t1 + ADOXQ t1, acc5 + ADOXQ res_ptr, acc0 // Second reduction step MOVQ acc1, DX 
MULXQ ·np+0x00(SB), DX, AX - MULXQ ·p2+0x00(SB), AX, t1 - ADDQ AX, acc1 - ADCQ t1, acc2 + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc1 MULXQ ·p2+0x08(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 + ADCXQ t0, AX + ADOXQ AX, acc2 - MULXQ ·p2+0x10(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ t1, AX + ADOXQ AX, acc3 MULXQ ·p2+0x18(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 - ADCQ $0, acc0 + ADCXQ t0, AX + ADOXQ AX, acc4 + + ADCXQ res_ptr, t1 + ADOXQ t1, acc5 + ADOXQ res_ptr, acc0 + XORQ acc1, acc1 // x * y[2] MOVQ (8*2)(y_ptr), DX - MULXQ (8*0)(x_ptr), AX, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 + MULXQ (8*0)(x_ptr), AX, t0 + ADOXQ AX, acc2 MULXQ (8*1)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 + ADCXQ t0, AX + ADOXQ AX, acc3 - MULXQ (8*2)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 + MULXQ (8*2)(x_ptr), AX, t0 + ADCXQ t1, AX + ADOXQ AX, acc4 - MULXQ (8*3)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc5 - ADCQ t1, acc0 - ADCQ $0, acc1 + MULXQ (8*3)(x_ptr), AX, t1 + ADCXQ t0, AX + ADOXQ AX, acc5 + + ADCXQ res_ptr, t1 + ADOXQ t1, acc0 + ADOXQ res_ptr, acc1 // Third reduction step MOVQ acc2, DX MULXQ ·np+0x00(SB), DX, AX - MULXQ ·p2+0x00(SB), AX, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc2 MULXQ ·p2+0x08(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 + ADCXQ t0, AX + ADOXQ AX, acc3 - MULXQ ·p2+0x10(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ t1, AX + ADOXQ AX, acc4 MULXQ ·p2+0x18(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc5 - ADCQ t1, acc0 - ADCQ $0, acc1 + ADCXQ t0, AX + ADOXQ AX, acc5 + + ADCXQ res_ptr, t1 + ADOXQ t1, acc0 + ADOXQ res_ptr, acc1 + XORQ acc2, acc2 // x * y[3] MOVQ (8*3)(y_ptr), DX - MULXQ (8*0)(x_ptr), AX, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 + MULXQ (8*0)(x_ptr), AX, t0 + ADOXQ AX, acc3 MULXQ (8*1)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 + 
ADCXQ t0, AX + ADOXQ AX, acc4 - MULXQ (8*2)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc5 - ADCQ t1, acc0 + MULXQ (8*2)(x_ptr), AX, t0 + ADCXQ t1, AX + ADOXQ AX, acc5 - MULXQ (8*3)(x_ptr), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc0 - ADCQ t1, acc1 - ADCQ $0, acc2 + MULXQ (8*3)(x_ptr), AX, t1 + ADCXQ t0, AX + ADOXQ AX, acc0 + + ADCXQ res_ptr, t1 + ADOXQ t1, acc1 + ADOXQ res_ptr, acc2 // Last reduction step MOVQ acc3, DX MULXQ ·np+0x00(SB), DX, AX - MULXQ ·p2+0x00(SB), AX, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc3 MULXQ ·p2+0x08(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc4 - ADCQ t1, acc5 + ADCXQ t0, AX + ADOXQ AX, acc4 - MULXQ ·p2+0x10(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc5 - ADCQ t1, acc0 + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ t1, AX + ADOXQ AX, acc5 MULXQ ·p2+0x18(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc0 - ADCQ t1, acc1 - ADCQ $0, acc2 + ADCXQ t0, AX + ADOXQ AX, acc0 + + ADCXQ res_ptr, t1 + ADOXQ t1, acc1 + ADOXQ res_ptr, acc2 // Copy result [255:0] gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2) + MOVQ res+0(FP), res_ptr storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr)) - RET -nobmi2Mul: + +noAdxMul: // x * y[0] MOVQ (8*0)(y_ptr), t0 @@ -588,6 +582,7 @@ nobmi2Mul: ADCQ $0, acc2 // Copy result [255:0] gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,t1,acc2) + MOVQ res+0(FP), res_ptr storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr)) RET @@ -598,175 +593,171 @@ TEXT ·gfpSqr(SB),NOSPLIT,$0 MOVQ in+8(FP), x_ptr MOVQ n+16(FP), BX - CMPB ·hasBMI2(SB), $0 + CMPB ·supportADX(SB), $0 JE gfpSqrLoop -gfpSqrLoopBMI2: +gfpSqrLoopAdx: + XORQ acc0, acc0 + XORQ y_ptr, y_ptr // y[1:] * y[0] MOVQ (8*0)(x_ptr), DX MULXQ (8*1)(x_ptr), acc1, acc2 MULXQ (8*2)(x_ptr), AX, acc3 - ADDQ AX, acc2 - ADCQ $0, acc3 + ADOXQ AX, acc2 MULXQ (8*3)(x_ptr), AX, acc4 - ADDQ AX, acc3 - ADCQ $0, acc4 + ADOXQ AX, acc3 + ADOXQ y_ptr, acc4 // y[2:] * y[1] MOVQ (8*1)(x_ptr), DX MULXQ (8*2)(x_ptr), AX, t1 - ADDQ AX, acc3 - ADCQ t1, acc4 + ADOXQ AX, acc3 MULXQ (8*3)(x_ptr), AX, acc5 - 
ADCQ $0, acc5 - ADDQ AX, acc4 - ADCQ $0, acc5 + ADCXQ t1, AX + ADOXQ AX, acc4 + ADCXQ y_ptr, acc5 // y[3] * y[2] MOVQ (8*2)(x_ptr), DX MULXQ (8*3)(x_ptr), AX, y_ptr - ADDQ AX, acc5 - ADCQ $0, y_ptr + ADOXQ AX, acc5 + ADOXQ acc0, y_ptr XORQ t1, t1 // *2 - ADDQ acc1, acc1 - ADCQ acc2, acc2 - ADCQ acc3, acc3 - ADCQ acc4, acc4 - ADCQ acc5, acc5 - ADCQ y_ptr, y_ptr - ADCQ $0, t1 + ADOXQ acc1, acc1 + ADOXQ acc2, acc2 + ADOXQ acc3, acc3 + ADOXQ acc4, acc4 + ADOXQ acc5, acc5 + ADOXQ y_ptr, y_ptr + ADOXQ acc0, t1 // Missing products MOVQ (8*0)(x_ptr), DX MULXQ DX, acc0, t0 - ADDQ t0, acc1 + ADCXQ t0, acc1 MOVQ (8*1)(x_ptr), DX MULXQ DX, AX, t0 - ADCQ AX, acc2 - ADCQ t0, acc3 + ADCXQ AX, acc2 + ADCXQ t0, acc3 MOVQ (8*2)(x_ptr), DX MULXQ DX, AX, t0 - ADCQ AX, acc4 - ADCQ t0, acc5 + ADCXQ AX, acc4 + ADCXQ t0, acc5 MOVQ (8*3)(x_ptr), DX MULXQ DX, AX, x_ptr - ADCQ AX, y_ptr - ADCQ t1, x_ptr + ADCXQ AX, y_ptr + ADCXQ t1, x_ptr // First reduction step MOVQ acc0, DX MULXQ ·np+0x00(SB), DX, AX - MULXQ ·p2+0x00(SB), AX, t1 - ADDQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0 - ADCQ t1, acc1 + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc0 // acc0 += low half of DX * p2[0]; carry out goes to OF MULXQ ·p2+0x08(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc1 - ADCQ t1, acc2 + ADCXQ t0, AX + ADOXQ AX, acc1 - MULXQ ·p2+0x10(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ t1, AX + ADOXQ AX, acc2 MULXQ ·p2+0x18(SB), AX, acc0 - ADCQ $0, acc0 - ADDQ AX, acc3 - ADCQ $0, acc0 + ADCXQ t0, AX + ADOXQ AX, acc3 + MOVQ $0, t0 + ADCXQ t0, acc0 + ADOXQ t0, acc0 // Second reduction step MOVQ acc1, DX MULXQ ·np+0x00(SB), DX, AX - MULXQ ·p2+0x00(SB), AX, t1 - ADDQ AX, acc1 - ADCQ t1, acc2 + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc1 MULXQ ·p2+0x08(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 + ADCXQ t0, AX + ADOXQ AX, acc2 - MULXQ ·p2+0x10(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc3 - ADCQ t1, acc0 + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ t1, AX + ADOXQ AX, acc3 MULXQ
·p2+0x18(SB), AX, acc1 - ADCQ $0, acc1 - ADDQ AX, acc0 - ADCQ $0, acc1 + ADCXQ t0, AX + ADOXQ AX, acc4 + MOVQ $0, t0 + ADCXQ t0, acc1 + ADOXQ t0, acc1 // Third reduction step MOVQ acc2, DX MULXQ ·np+0x00(SB), DX, AX - MULXQ ·p2+0x00(SB), AX, t1 - ADDQ AX, acc2 - ADCQ t1, acc3 + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc2 MULXQ ·p2+0x08(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc3 - ADCQ t1, acc0 + ADCXQ t0, AX + ADOXQ AX, acc3 - MULXQ ·p2+0x10(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc0 - ADCQ t1, acc1 + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ t1, AX + ADOXQ AX, acc0 MULXQ ·p2+0x18(SB), AX, acc2 - ADCQ $0, acc2 - ADDQ AX, acc1 - ADCQ $0, acc2 + ADCXQ t0, AX + ADOXQ AX, acc1 + MOVQ $0, t0 + ADCXQ t0, acc2 + ADOXQ t0, acc2 // Last reduction step MOVQ acc3, DX MULXQ ·np+0x00(SB), DX, AX - MULXQ ·p2+0x00(SB), AX, t1 - ADDQ AX, acc3 - ADCQ t1, acc0 + MULXQ ·p2+0x00(SB), AX, t0 + ADOXQ AX, acc3 MULXQ ·p2+0x08(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc0 - ADCQ t1, acc1 + ADCXQ t0, AX + ADOXQ AX, acc0 - MULXQ ·p2+0x10(SB), AX, t1 - ADCQ $0, t1 - ADDQ AX, acc1 - ADCQ t1, acc2 + MULXQ ·p2+0x10(SB), AX, t0 + ADCXQ t1, AX + ADOXQ AX, acc1 MULXQ ·p2+0x18(SB), AX, acc3 - ADCQ $0, acc3 - ADDQ AX, acc2 - ADCQ $0, acc3 + ADCXQ t0, AX + ADOXQ AX, acc2 + MOVQ $0, t0 + ADCXQ t0, acc3 + ADOXQ t0, acc3 - XORQ t0, t0 + XORQ t1, t1 // Add bits [511:256] of the sqr result - ADDQ acc4, acc0 - ADCQ acc5, acc1 - ADCQ y_ptr, acc2 - ADCQ x_ptr, acc3 - ADCQ $0, t0 + ADCXQ acc4, acc0 + ADCXQ acc5, acc1 + ADCXQ y_ptr, acc2 + ADCXQ x_ptr, acc3 + ADCXQ t1, t0 gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,t1,t0) storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr)) MOVQ res_ptr, x_ptr DECQ BX - JNE gfpSqrLoopBMI2 + JNE gfpSqrLoopAdx RET diff --git a/sm9/bn256/gfp_decl.go b/sm9/bn256/gfp_decl.go index 37923855..7c0a4a13 100644 --- a/sm9/bn256/gfp_decl.go +++ b/sm9/bn256/gfp_decl.go @@ -10,7 +10,10 @@ import ( "golang.org/x/sys/cpu" ) -var hasBMI2 = cpu.X86.HasBMI2 +// amd64 assembly uses ADCX/ADOX/MULX if ADX is available 
to run two carry +// chains in the flags in parallel across the whole operation, and aggressively +// unrolls loops. arm64 processes four words at a time. +var supportADX = cpu.X86.HasADX && cpu.X86.HasBMI2 // Set c = p - a, if c == p, then c = 0 // diff --git a/sm9/bn256/gfp_generic.go b/sm9/bn256/gfp_generic.go index fbbf00d2..8a57f63e 100644 --- a/sm9/bn256/gfp_generic.go +++ b/sm9/bn256/gfp_generic.go @@ -134,5 +134,15 @@ func gfpSqr(res, in *gfP, n int) { } func gfpFromMont(res, in *gfP) { - gfpMul(res, in, &gfP{1}) + var T [8]uint64 + var carry uint64 + copy(T[:], in[:]) + for i := 0; i < 4; i++ { + Y := T[i] * np[0] + c2 := addMulVVW(T[i:4+i], p2[:], Y) + T[4+i], carry = bits.Add64(uint64(0), c2, carry) + } + + *res = gfP{T[4], T[5], T[6], T[7]} + gfpCarry(res, carry) } diff --git a/sm9/bn256/gfp_test.go b/sm9/bn256/gfp_test.go index da3dbef4..85d1123e 100644 --- a/sm9/bn256/gfp_test.go +++ b/sm9/bn256/gfp_test.go @@ -48,6 +48,28 @@ func Test_gfpBasicOperations(t *testing.T) { } } +func Test_gfpSqr(t *testing.T) { + // p - 1 + pMinusOne := new(big.Int).Sub(p, big.NewInt(1)) + x := fromBigInt(pMinusOne) + ret := &gfP{} + gfpSqr(ret, x, 1) + pMinusOne.Mul(pMinusOne, pMinusOne) + pMinusOne.Mod(pMinusOne, p) + if *ret != *fromBigInt(pMinusOne) { + t.Errorf("bad sqr") + } + // p + 1 + pPlusOne := new(big.Int).Add(p, big.NewInt(1)) + x = fromBigInt(pPlusOne) + gfpSqr(ret, x, 1) + pPlusOne.Mul(pPlusOne, pPlusOne) + pPlusOne.Mod(pPlusOne, p) + if *ret != *fromBigInt(pPlusOne) { + t.Errorf("bad sqr") + } +} + func TestFromMont(t *testing.T) { x := fromBigInt(bigFromHex("85AEF3D078640C98597B6027B441A01FF1DD2C190F5E93C454806C11D8806141")) ret1, ret2 := &gfP{}, &gfP{}