From 4d019006ea774d5581efb91701cecc9ce8ec6858 Mon Sep 17 00:00:00 2001 From: armfazh Date: Fri, 10 Feb 2023 20:48:53 -0800 Subject: [PATCH 1/9] Detecting successful compilation as a plugin. --- .etc/all_imports.go | 49 ++++++++++++++++++++++++++++++++ .github/workflows/ci-actions.yml | 15 ++++++++++ Makefile | 14 +++++++++ abe/cpabe/doc.go | 2 ++ abe/doc.go | 2 ++ cipher/doc.go | 2 ++ ot/doc.go | 2 ++ tss/doc.go | 1 + zk/doc.go | 2 ++ 9 files changed, 89 insertions(+) create mode 100644 .etc/all_imports.go create mode 100644 abe/cpabe/doc.go create mode 100644 abe/doc.go create mode 100644 cipher/doc.go create mode 100644 ot/doc.go create mode 100644 tss/doc.go create mode 100644 zk/doc.go diff --git a/.etc/all_imports.go b/.etc/all_imports.go new file mode 100644 index 000000000..0ba0c7b24 --- /dev/null +++ b/.etc/all_imports.go @@ -0,0 +1,49 @@ +//go:build ignore +// +build ignore + +// Generates a Go program with all the public imports of CIRCL. It is used to +// test compilation using static (buildmode=default) and dynamic linking +// (buildmode=plugin). +package main + +import ( + "flag" + "fmt" + "io/fs" + "os" + "strings" +) + +func main() { + outputFileName := flag.String("out", "circl.go", "name of the output file.") + flag.Parse() + + f, err := os.Create(*outputFileName) + if err != nil { + panic(err) + } + defer f.Close() + + skipDirs := []string{".", "testdata", "internal", "templates"} + circl := "github.com/cloudflare/circl/" + + fmt.Fprintln(f, "package main") + err = fs.WalkDir(os.DirFS("."), ".", func(path string, d fs.DirEntry, err error) error { + if err != nil { + panic(err) + } + if d.IsDir() { + for _, sd := range skipDirs { + if strings.Contains(path, sd) { + return nil + } + } + fmt.Fprintf(f, "import _ \"%v%v\"\n", circl, path) + } + return nil + }) + if err != nil { + panic(err) + } + fmt.Fprintln(f, "func main() {}") +} diff --git a/.github/workflows/ci-actions.yml b/.github/workflows/ci-actions.yml index f610283d9..ca09f630f 100644 --- a/.github/workflows/ci-actions.yml +++ b/.github/workflows/ci-actions.yml @@ -62,6 +62,21 @@ jobs: - name: Testing run: | docker run --rm -v `pwd`:`pwd` -w `pwd` ${{matrix.CFG[1]}}/golang:${{matrix.CFG[2]}} go test -v ./... + build_modes: + needs: [amd64_job] + runs-on: ubuntu-22.04 + name: Testing Build Modes + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Setup Go + uses: actions/setup-go@v3 + with: + go-version: '1.20' + - name: Build as Static + run: make circl_static + - name: Build as Plugin + run: make circl_plugin coverage_amd64_job: needs: [amd64_job] if: github.event_name == 'push' diff --git a/Makefile b/Makefile index 59c191c93..5d53d65cb 100644 --- a/Makefile +++ b/Makefile @@ -54,3 +54,17 @@ bootstrap: clean: rm -rf $(GOPATH_BUILD) + +.INTERMEDIATE: circl.go circl_static.exe circl_plugin.so +circl_static: circl_static.exe +circl_static.exe: circl.go + go clean -cache -modcache + go build -buildmode=default -o $@ $^ + +circl_plugin: circl_plugin.so +circl_plugin.so: circl.go + go clean -cache -modcache + go build -buildmode=plugin -o $@ $^ + +circl.go: + go run .etc/all_imports.go -out $@ diff --git a/abe/cpabe/doc.go b/abe/cpabe/doc.go new file mode 100644 index 000000000..c73b86da6 --- /dev/null +++ b/abe/cpabe/doc.go @@ -0,0 +1,2 @@ +// Package cpabe provides Ciphertext-Policy Attribute-based Encryption algorithms. +package cpabe diff --git a/abe/doc.go b/abe/doc.go new file mode 100644 index 000000000..58e2d76f8 --- /dev/null +++ b/abe/doc.go @@ -0,0 +1,2 @@ +// Package abe provides Attribute-based data encryption algorithms. +package abe diff --git a/cipher/doc.go b/cipher/doc.go new file mode 100644 index 000000000..cbc3ced2c --- /dev/null +++ b/cipher/doc.go @@ -0,0 +1,2 @@ +// Package cipher provides data encryption algorithms. +package cipher diff --git a/ot/doc.go b/ot/doc.go new file mode 100644 index 000000000..6e1da00ca --- /dev/null +++ b/ot/doc.go @@ -0,0 +1,2 @@ +// Package ot provides oblivious-transfer protocols. +package ot diff --git a/tss/doc.go b/tss/doc.go new file mode 100644 index 000000000..547e8491e --- /dev/null +++ b/tss/doc.go @@ -0,0 +1 @@ +package tss diff --git a/zk/doc.go b/zk/doc.go new file mode 100644 index 000000000..3aae91c27 --- /dev/null +++ b/zk/doc.go @@ -0,0 +1,2 @@ +// Package zk provides primitives for zero-knowledge proofs of knowledge. +package zk From 383b28e2db0052deedd070af95b7b97457077b0d Mon Sep 17 00:00:00 2001 From: armfazh Date: Sat, 25 Feb 2023 22:22:08 -0800 Subject: [PATCH 2/9] math/fp25519: Workaround to remove R15 from integerMulAdx. See bug in the compiler (issue #58632). --- math/fp25519/fp_amd64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/math/fp25519/fp_amd64.h b/math/fp25519/fp_amd64.h index 7b25f16e4..b884b584a 100644 --- a/math/fp25519/fp_amd64.h +++ b/math/fp25519/fp_amd64.h @@ -99,6 +99,7 @@ // Uses: AX, DX, R8-R15, FLAGS // Instr: x86_64, bmi2, adx #define integerMulAdx(z,x,y) \ + MOVL $0,R15; \ MOVQ 0+y, DX; XORL AX, AX; \ MULXQ 0+x, AX, R8; MOVQ AX, 0+z; \ MULXQ 8+x, AX, R9; ADCXQ AX, R8; \ From b16121428bdff9029c59da3b703dd54ddee2264c Mon Sep 17 00:00:00 2001 From: armfazh Date: Sat, 25 Feb 2023 22:25:00 -0800 Subject: [PATCH 3/9] math/fp448: Workaround to remove R15 from integerMulAdx. See bug in the compiler (issue #58632). --- math/fp448/fp_amd64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/math/fp448/fp_amd64.h b/math/fp448/fp_amd64.h index 0b7dea174..536fe5bdf 100644 --- a/math/fp448/fp_amd64.h +++ b/math/fp448/fp_amd64.h @@ -158,6 +158,7 @@ // Uses: AX, DX, R8-R15, FLAGS // Instr: x86_64, bmi2, adx #define integerMulAdx(z,x,y) \ + MOVL $0,R15; \ MOVQ 0+y, DX; XORL AX, AX; MOVQ $0, R8; \ MULXQ 0+x, AX, R9; MOVQ AX, 0+z; \ MULXQ 8+x, AX, R10; ADCXQ AX, R9; \ From d583f49c7b9128a0ded742b24f0101a20cb705e5 Mon Sep 17 00:00:00 2001 From: armfazh Date: Sat, 25 Feb 2023 22:28:07 -0800 Subject: [PATCH 4/9] ecc/fourq: Workaround to remove R15 from fpMulBmi2. See bug in the compiler (issue #58632). --- ecc/fourq/fq_amd64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/ecc/fourq/fq_amd64.h b/ecc/fourq/fq_amd64.h index 746a99979..241b1d237 100644 --- a/ecc/fourq/fq_amd64.h +++ b/ecc/fourq/fq_amd64.h @@ -9,6 +9,7 @@ _fpSub(16+c,16+a,16+b) #define _fqMulBmi2(c, a, b) \ + MOVL $0, R15 \ \ // T0 = a0 * b0, R11:R10:R9:R8 <- 0+ra:8+ra * 0+rb:8+rb MOVQ 0+b, DX \ MULXQ 0+a, R8, R9 \ From 09dba53207da7dcac9b881d45ead36a95c6068d9 Mon Sep 17 00:00:00 2001 From: armfazh Date: Tue, 21 Feb 2023 19:07:24 -0800 Subject: [PATCH 5/9] ecc/p384: Remove R15 from arith p384. See bug in the compiler (issue #58632). --- ecc/p384/arith_amd64.s | 171 +++++++++++++++++++---------------------- 1 file changed, 78 insertions(+), 93 deletions(-) diff --git a/ecc/p384/arith_amd64.s b/ecc/p384/arith_amd64.s index 7866bca4a..5f53c637d 100644 --- a/ecc/p384/arith_amd64.s +++ b/ecc/p384/arith_amd64.s @@ -289,7 +289,7 @@ \ // m = (T * P') mod R, store m in R8:R9:R10:R11:R12:R13 MOVQ ·pp+0(SB), AX \ MULQ 0+stack \ - MOVQ AX, R8 \ + MOVQ AX, R8 ; MOVQ R8, 96+stack\ MOVQ DX, R9 \ MOVQ ·pp+0(SB), AX \ MULQ 8+stack \ @@ -324,10 +324,10 @@ MOVQ ·pp+16(SB), AX \ MULQ 0+stack \ MOVQ AX, R14 \ - MOVQ DX, R15 \ + MOVQ DX, R8 \ MOVQ ·pp+16(SB), AX \ MULQ 8+stack \ - ADDQ AX, R15 \ + ADDQ AX, R8 \ ADCQ $0, DX \ MOVQ DX, BX \ MOVQ ·pp+16(SB), AX \ @@ -340,17 +340,17 @@ ADDQ AX, CX \ \ ADDQ R14, R10 \ - ADCQ R15, R11 \ + ADCQ R8, R11 \ ADCQ BX, R12 \ ADCQ CX, R13 \ \ MOVQ ·pp+24(SB), AX \ MULQ 0+stack \ MOVQ AX, R14 \ - MOVQ DX, R15 \ + MOVQ DX, R8 \ MOVQ ·pp+24(SB), AX \ MULQ 8+stack \ - ADDQ AX, R15 \ + ADDQ AX, R8 \ ADCQ $0, DX \ MOVQ DX, BX \ MOVQ ·pp+24(SB), AX \ @@ -358,33 +358,35 @@ ADDQ AX, BX \ \ ADDQ R14, R11 \ - ADCQ R15, R12 \ + ADCQ R8, R12 \ ADCQ BX, R13 \ \ MOVQ ·pp+32(SB), AX \ MULQ 0+stack \ MOVQ AX, R14 \ - MOVQ DX, R15 \ + MOVQ DX, R8 \ MOVQ ·pp+32(SB), AX \ MULQ 8+stack \ - ADDQ AX, R15 \ + ADDQ AX, R8 \ \ ADDQ R14, R12 \ - ADCQ R15, R13 \ + ADCQ R8, R13 \ \ MOVQ ·pp+40(SB), AX \ MULQ 0+stack \ ADDQ AX, R13 \ \ + MOVQ 96+stack, R8 \ + \ storeBlock(R8,R9,R10,R11,R12,R13, 96+stack) \ \ \ // m * P mul(·p+0(SB),·p+8(SB),·p+16(SB),·p+24(SB),·p+32(SB),·p+40(SB), 96+stack, 144+stack) \ \ \ // Add the 768-bit intermediate to m*N - MOVQ $0, DI \ + MOVQ $0, R15 \ loadBlock(144+stack, R8,R9,R10,R11,R12,R13) \ - loadBlock(192+stack, R14,R15,AX,BX,CX,DX) \ + loadBlock(192+stack, R14,SI,AX,BX,CX,DX) \ \ ADDQ 0+stack, R8 \ ADCQ 8+stack, R9 \ @@ -393,18 +395,18 @@ ADCQ 32+stack, R12 \ ADCQ 40+stack, R13 \ ADCQ 48+stack, R14 \ - ADCQ 56+stack, R15 \ + ADCQ 56+stack, SI \ ADCQ 64+stack, AX \ ADCQ 72+stack, BX \ ADCQ 80+stack, CX \ ADCQ 88+stack, DX \ - ADCQ $0, DI \ + ADCQ $0, R15 \ \ - fp384Carry(R14,R15,AX,BX,CX,DX,DI, R8,R9,R10,R11,R12,R13,SI) + fp384Carry(R14,SI,AX,BX,CX,DX,R15, R8,R9,R10,R11,R12,R13,DI) #define mulBMI2(a0,a1,a2,a3,a4,a5, rb, stack) \ MOVQ a0, DX \ - MULXQ 0+rb, R8, R9 \ + MULXQ 0+rb, R8, R9; MOVQ R8, 0+stack; MOVQ $0, R8 \ MULXQ 8+rb, AX, R10 \ ADDQ AX, R9 \ MULXQ 16+rb, AX, R11 \ @@ -417,13 +419,9 @@ ADCQ AX, R13 \ ADCQ $0, R14 \ \ - MOVQ R8, 0+stack \ - MOVQ $0, R15 \ - MOVQ $0, R8 \ - \ MOVQ a1, DX \ MULXQ 0+rb, AX, BX \ - ADDQ AX, R9 \ + ADDQ AX, R9; MOVQ R9, 8+stack; MOVL $0, R9 \ ADCQ BX, R10 \ MULXQ 16+rb, AX, BX \ ADCQ AX, R11 \ @@ -431,7 +429,7 @@ MULXQ 32+rb, AX, BX \ ADCQ AX, R13 \ ADCQ BX, R14 \ - ADCQ $0, R15 \ + ADCQ $0, R8 \ MULXQ 8+rb, AX, BX \ ADDQ AX, R10 \ ADCQ BX, R11 \ @@ -440,23 +438,20 @@ ADCQ BX, R13 \ MULXQ 40+rb, AX, BX \ ADCQ AX, R14 \ - ADCQ BX, R15 \ - ADCQ $0, R8 \ - \ - MOVQ R9, 8+stack \ - MOVQ $0, R9 \ + ADCQ BX, R8 \ + ADCQ $0, R9 \ \ MOVQ a2, DX \ MULXQ 0+rb, AX, BX \ - ADDQ AX, R10 \ + ADDQ AX, R10; MOVQ R10, 16+stack; MOVL $0, R10 \ ADCQ BX, R11 \ MULXQ 16+rb, AX, BX \ ADCQ AX, R12 \ ADCQ BX, R13 \ MULXQ 32+rb, AX, BX \ ADCQ AX, R14 \ - ADCQ BX, R15 \ - ADCQ $0, R8 \ + ADCQ BX, R8 \ + ADCQ $0, R9 \ MULXQ 8+rb, AX, BX \ ADDQ AX, R11 \ ADCQ BX, R12 \ @@ -464,84 +459,74 @@ ADCQ AX, R13 \ ADCQ BX, R14 \ MULXQ 40+rb, AX, BX \ - ADCQ AX, R15 \ - ADCQ BX, R8 \ - ADCQ $0, R9 \ - \ - MOVQ R10, 16+stack \ - MOVQ $0, R10 \ + ADCQ AX, R8 \ + ADCQ BX, R9 \ + ADCQ $0, R10 \ \ MOVQ a3, DX \ MULXQ 0+rb, AX, BX \ - ADDQ AX, R11 \ + ADDQ AX, R11; MOVQ R11, 24+stack; MOVL $0, R11 \ ADCQ BX, R12 \ MULXQ 16+rb, AX, BX \ ADCQ AX, R13 \ ADCQ BX, R14 \ MULXQ 32+rb, AX, BX \ - ADCQ AX, R15 \ - ADCQ BX, R8 \ - ADCQ $0, R9 \ + ADCQ AX, R8 \ + ADCQ BX, R9 \ + ADCQ $0, R10 \ MULXQ 8+rb, AX, BX \ ADDQ AX, R12 \ ADCQ BX, R13 \ MULXQ 24+rb, AX, BX \ ADCQ AX, R14 \ - ADCQ BX, R15 \ + ADCQ BX, R8 \ MULXQ 40+rb, AX, BX \ - ADCQ AX, R8 \ - ADCQ BX, R9 \ - ADCQ $0, R10 \ - \ - MOVQ R11, 24+stack \ - MOVQ $0, R11 \ + ADCQ AX, R9 \ + ADCQ BX, R10 \ + ADCQ $0, R11 \ \ MOVQ a4, DX \ MULXQ 0+rb, AX, BX \ - ADDQ AX, R12 \ + ADDQ AX, R12; MOVQ R12, 32+stack; MOVL $0, R12 \ ADCQ BX, R13 \ MULXQ 16+rb, AX, BX \ ADCQ AX, R14 \ - ADCQ BX, R15 \ + ADCQ BX, R8 \ MULXQ 32+rb, AX, BX \ - ADCQ AX, R8 \ - ADCQ BX, R9 \ - ADCQ $0, R10 \ + ADCQ AX, R9 \ + ADCQ BX, R10 \ + ADCQ $0, R11 \ MULXQ 8+rb, AX, BX \ ADDQ AX, R13 \ ADCQ BX, R14 \ MULXQ 24+rb, AX, BX \ - ADCQ AX, R15 \ - ADCQ BX, R8 \ + ADCQ AX, R8 \ + ADCQ BX, R9 \ MULXQ 40+rb, AX, BX \ - ADCQ AX, R9 \ - ADCQ BX, R10 \ - ADCQ $0, R11 \ - \ - MOVQ R12, 32+stack \ + ADCQ AX, R10 \ + ADCQ BX, R11 \ + ADCQ $0, R12 \ \ MOVQ a5, DX \ MULXQ 0+rb, AX, BX \ - ADDQ AX, R13 \ + ADDQ AX, R13; MOVQ R13, 40+stack \ ADCQ BX, R14 \ MULXQ 16+rb, AX, BX \ - ADCQ AX, R15 \ - ADCQ BX, R8 \ + ADCQ AX, R8 \ + ADCQ BX, R9 \ MULXQ 32+rb, AX, BX \ - ADCQ AX, R9 \ - ADCQ BX, R10 \ - ADCQ $0, R11 \ + ADCQ AX, R10 \ + ADCQ BX, R11 \ + ADCQ $0, R12 \ MULXQ 8+rb, AX, BX \ ADDQ AX, R14 \ - ADCQ BX, R15 \ + ADCQ BX, R8 \ MULXQ 24+rb, AX, BX \ - ADCQ AX, R8 \ - ADCQ BX, R9 \ + ADCQ AX, R9 \ + ADCQ BX, R10 \ MULXQ 40+rb, AX, BX \ - ADCQ AX, R10 \ - ADCQ BX, R11 \ - \ - MOVQ R13, 40+stack + ADCQ AX, R11 \ + ADCQ BX, R12 #define fp384ReduceBMI2(stack) \ \ // m = (T * P') mod R, store m in R8:R9:R10:R11:R12:R13 @@ -604,24 +589,24 @@ mulBMI2(·p+0(SB),·p+8(SB),·p+16(SB),·p+24(SB),·p+32(SB),·p+40(SB), 96+stack, 144+stack) \ \ \ // Add the 768-bit intermediate to m*N - MOVQ $0, AX \ - loadBlock(144+stack, R12,R13,BX,CX,DX,DI) \ + loadBlock(144+stack, AX,R13,BX,CX,DX,DI) \ \ - ADDQ 0+stack, R12 \ + ADDQ 0+stack, AX \ ADCQ 8+stack, R13 \ ADCQ 16+stack, BX \ ADCQ 24+stack, CX \ ADCQ 32+stack, DX \ ADCQ 40+stack, DI \ ADCQ 48+stack, R14 \ - ADCQ 56+stack, R15 \ - ADCQ 64+stack, R8 \ - ADCQ 72+stack, R9 \ - ADCQ 80+stack, R10 \ - ADCQ 88+stack, R11 \ - ADCQ $0, AX \ + ADCQ 56+stack, R8 \ + ADCQ 64+stack, R9 \ + ADCQ 72+stack, R10 \ + ADCQ 80+stack, R11 \ + ADCQ 88+stack, R12 \ + MOVQ $0, 0+stack \ + ADCQ $0, 0+stack \ \ - fp384Carry(R14,R15,R8,R9,R10,R11,AX, R12,R13,BX,CX,DX,DI,SI) + fp384Carry(R14,R8,R9,R10,R11,R12, 0+stack, AX,R13,BX,CX,DX,DI,SI) TEXT ·fp384Neg(SB), NOSPLIT, $0-16 MOVQ ·p+0(SB), R8 @@ -639,8 +624,8 @@ TEXT ·fp384Neg(SB), NOSPLIT, $0-16 SBBQ 32(DI), R12 SBBQ 40(DI), R13 - MOVQ $0, R14 - fp384Carry(R8,R9,R10,R11,R12,R13,R14, R15,AX,BX,CX,DX,DI,SI) + MOVQ $0, R15 + fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI) MOVQ c+0(FP), DI storeBlock(R8,R9,R10,R11,R12,R13, 0(DI)) @@ -651,7 +636,7 @@ TEXT ·fp384Add(SB), NOSPLIT, $0-24 MOVQ b+16(FP), SI loadBlock(0(DI), R8,R9,R10,R11,R12,R13) - MOVQ $0, R14 + MOVQ $0, R15 ADDQ 0(SI), R8 ADCQ 8(SI), R9 @@ -659,9 +644,9 @@ TEXT ·fp384Add(SB), NOSPLIT, $0-24 ADCQ 24(SI), R11 ADCQ 32(SI), R12 ADCQ 40(SI), R13 - ADCQ $0, R14 + ADCQ $0, R15 - fp384Carry(R8,R9,R10,R11,R12,R13,R14, R15,AX,BX,CX,DX,DI,SI) + fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI) MOVQ c+0(FP), DI storeBlock(R8,R9,R10,R11,R12,R13, 0(DI)) @@ -683,7 +668,7 @@ TEXT ·fp384Sub(SB), NOSPLIT, $0-24 SBBQ 32(DI), R12 SBBQ 40(DI), R13 - MOVQ $0, R14 + MOVQ $0, R15 MOVQ a+8(FP), DI ADDQ 0(DI), R8 ADCQ 8(DI), R9 @@ -691,9 +676,9 @@ TEXT ·fp384Sub(SB), NOSPLIT, $0-24 ADCQ 24(DI), R11 ADCQ 32(DI), R12 ADCQ 40(DI), R13 - ADCQ $0, R14 + ADCQ $0, R15 - fp384Carry(R8,R9,R10,R11,R12,R13,R14, R15,AX,BX,CX,DX,DI,SI) + fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI) MOVQ c+0(FP), DI storeBlock(R8,R9,R10,R11,R12,R13, 0(DI)) @@ -709,13 +694,13 @@ TEXT ·fp384Mul(SB), NOSPLIT, $240-24 // T = a * b mulBMI2(0(DI),8(DI),16(DI),24(DI),32(DI),40(DI), 0(SI), 0(SP)) - storeBlock(R14,R15,R8,R9,R10,R11, 48(SP)) + storeBlock(R14,R8,R9,R10,R11,R12, 48(SP)) // Reduce T. fp384ReduceBMI2(0(SP)) MOVQ c+0(FP), DI - storeBlock(R14,R15,R8,R9,R10,R11, 0(DI)) + storeBlock(R14,R8,R9,R10,R11,R12, 0(DI)) JMP end nobmi2Mul: @@ -726,7 +711,7 @@ nobmi2Mul: fp384Reduce(0(SP)) MOVQ c+0(FP), DI - storeBlock(R14,R15,AX,BX,CX,DX, 0(DI)) + storeBlock(R14,SI,AX,BX,CX,DX, 0(DI)) end: RET From f50097140c569dd94f52d98560ef8db9ecb23f9c Mon Sep 17 00:00:00 2001 From: armfazh Date: Sun, 26 Feb 2023 00:30:57 -0800 Subject: [PATCH 6/9] dh/csidh: Avoid reference to global variable with MULX for mulBmiAsm. See bug in the compiler (issue #58735). --- dh/csidh/fp511_amd64.s | 63 +++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/dh/csidh/fp511_amd64.s b/dh/csidh/fp511_amd64.s index c248c0a3a..612a0c5f3 100644 --- a/dh/csidh/fp511_amd64.s +++ b/dh/csidh/fp511_amd64.s @@ -101,7 +101,7 @@ TEXT ·mulBmiAsm(SB),NOSPLIT,$8-24 XORQ R12, R12 XORQ R13, R13 XORQ R14, R14 - XORQ R15, R15 + XORQ CX, CX MOVQ BP, 0(SP) // push: BP is Callee-save. XORQ BP, BP @@ -113,42 +113,43 @@ TEXT ·mulBmiAsm(SB),NOSPLIT,$8-24 #define MULS_MULX_512(idx, r0, r1, r2, r3, r4, r5, r6, r7, r8) \ \ // Reduction step MOVQ ( 0)(SI), DX \ - MULXQ ( 8*idx)(DI), DX, CX \ + MULXQ ( 8*idx)(DI), DX, AX \ ADDQ r0, DX \ - MULXQ ·pNegInv(SB), DX, CX \ + MOVQ ·pNegInv(SB), AX \ + MULXQ AX, DX, AX \ \ - XORQ AX, AX \ - MULXQ ·p+ 0(SB), AX, BX; ; ADOXQ AX, r0 \ - MULXQ ·p+ 8(SB), AX, CX; ADCXQ BX, r1; ADOXQ AX, r1 \ - MULXQ ·p+16(SB), AX, BX; ADCXQ CX, r2; ADOXQ AX, r2 \ - MULXQ ·p+24(SB), AX, CX; ADCXQ BX, r3; ADOXQ AX, r3 \ - MULXQ ·p+32(SB), AX, BX; ADCXQ CX, r4; ADOXQ AX, r4 \ - MULXQ ·p+40(SB), AX, CX; ADCXQ BX, r5; ADOXQ AX, r5 \ - MULXQ ·p+48(SB), AX, BX; ADCXQ CX, r6; ADOXQ AX, r6 \ - MULXQ ·p+56(SB), AX, CX; ADCXQ BX, r7; ADOXQ AX, r7 \ - MOVQ $0, AX ; ADCXQ CX, r8; ADOXQ AX, r8 \ + XORQ AX, AX; \ + MOVQ ·p+ 0(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r0; ADCXQ BX, r1 \ + MOVQ ·p+ 8(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r1; ADCXQ BX, r2 \ + MOVQ ·p+16(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r2; ADCXQ BX, r3 \ + MOVQ ·p+24(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r3; ADCXQ BX, r4 \ + MOVQ ·p+32(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r4; ADCXQ BX, r5 \ + MOVQ ·p+40(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r5; ADCXQ BX, r6 \ + MOVQ ·p+48(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r6; ADCXQ BX, r7 \ + MOVQ ·p+56(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r7; ADCXQ BX, r8 \ + MOVQ $0, AX; ;;;;;;;;;;;;;;;;;;;;;;; ADOXQ AX, r8; \ \ // Multiplication step MOVQ (8*idx)(DI), DX \ \ XORQ AX, AX \ - MULXQ ( 0)(SI), AX, BX; ADOXQ AX, r0 \ - MULXQ ( 8)(SI), AX, CX; ADCXQ BX, r1; ADOXQ AX, r1 \ - MULXQ (16)(SI), AX, BX; ADCXQ CX, r2; ADOXQ AX, r2 \ - MULXQ (24)(SI), AX, CX; ADCXQ BX, r3; ADOXQ AX, r3 \ - MULXQ (32)(SI), AX, BX; ADCXQ CX, r4; ADOXQ AX, r4 \ - MULXQ (40)(SI), AX, CX; ADCXQ BX, r5; ADOXQ AX, r5 \ - MULXQ (48)(SI), AX, BX; ADCXQ CX, r6; ADOXQ AX, r6 \ - MULXQ (56)(SI), AX, CX; ADCXQ BX, r7; ADOXQ AX, r7 \ - MOVQ $0, AX ; ADCXQ CX, r8; ADOXQ AX, r8 - - MULS_MULX_512(0, R8, R9, R10, R11, R12, R13, R14, R15, BP) - MULS_MULX_512(1, R9, R10, R11, R12, R13, R14, R15, BP, R8) - MULS_MULX_512(2, R10, R11, R12, R13, R14, R15, BP, R8, R9) - MULS_MULX_512(3, R11, R12, R13, R14, R15, BP, R8, R9, R10) - MULS_MULX_512(4, R12, R13, R14, R15, BP, R8, R9, R10, R11) - MULS_MULX_512(5, R13, R14, R15, BP, R8, R9, R10, R11, R12) - MULS_MULX_512(6, R14, R15, BP, R8, R9, R10, R11, R12, R13) - MULS_MULX_512(7, R15, BP, R8, R9, R10, R11, R12, R13, R14) + MULXQ ( 0)(SI), AX, BX; ADOXQ AX, r0; ADCXQ BX, r1 \ + MULXQ ( 8)(SI), AX, BX; ADOXQ AX, r1; ADCXQ BX, r2 \ + MULXQ (16)(SI), AX, BX; ADOXQ AX, r2; ADCXQ BX, r3 \ + MULXQ (24)(SI), AX, BX; ADOXQ AX, r3; ADCXQ BX, r4 \ + MULXQ (32)(SI), AX, BX; ADOXQ AX, r4; ADCXQ BX, r5 \ + MULXQ (40)(SI), AX, BX; ADOXQ AX, r5; ADCXQ BX, r6 \ + MULXQ (48)(SI), AX, BX; ADOXQ AX, r6; ADCXQ BX, r7 \ + MULXQ (56)(SI), AX, BX; ADOXQ AX, r7; ADCXQ BX, r8 \ + MOVQ $0, AX ; ADOXQ AX, r8; + + MULS_MULX_512(0, R8, R9, R10, R11, R12, R13, R14, CX, BP) + MULS_MULX_512(1, R9, R10, R11, R12, R13, R14, CX, BP, R8) + MULS_MULX_512(2, R10, R11, R12, R13, R14, CX, BP, R8, R9) + MULS_MULX_512(3, R11, R12, R13, R14, CX, BP, R8, R9, R10) + MULS_MULX_512(4, R12, R13, R14, CX, BP, R8, R9, R10, R11) + MULS_MULX_512(5, R13, R14, CX, BP, R8, R9, R10, R11, R12) + MULS_MULX_512(6, R14, CX, BP, R8, R9, R10, R11, R12, R13) + MULS_MULX_512(7, CX, BP, R8, R9, R10, R11, R12, R13, R14) #undef MULS_MULX_512 MOVQ res+0(FP), DI From 2265077d15cf0b2298cff258e280c29ab82f1508 Mon Sep 17 00:00:00 2001 From: armfazh Date: Sun, 26 Feb 2023 00:59:36 -0800 Subject: [PATCH 7/9] dh/sidh: Avoid reference to global variable with MULX for rdcP434. See bug in the compiler (issue #58735). --- dh/sidh/internal/p434/arith_amd64.s | 64 +++++++++++++++++++---------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/dh/sidh/internal/p434/arith_amd64.s b/dh/sidh/internal/p434/arith_amd64.s index 29e281e21..785997b5d 100644 --- a/dh/sidh/internal/p434/arith_amd64.s +++ b/dh/sidh/internal/p434/arith_amd64.s @@ -29,27 +29,36 @@ // |-128-| x |--- 256 ---| = |------ 384 ------| // Assuming the first digit multiplication was already performed. #define MULX128x256(I1, M1, T1, T2, T3, T4, T5) \ - MULXQ M1+ 8(SB), T4, T2 \ + MOVQ M1+ 8(SB), AX \ + MULXQ AX, T4, T2 \ XORQ AX, AX \ - MULXQ M1+16(SB), T5, T3 \ + MOVQ M1+16(SB), AX \ + MULXQ AX, T5, T3 \ ADOXQ T4, T1 \ // T1: interm1 ADOXQ T5, T2 \ // T2: interm2 - MULXQ M1+24(SB), T5, T4 \ + MOVQ M1+24(SB), AX \ + MULXQ AX, T5, T4 \ ADOXQ T5, T3 \ // T3: interm3 + MOVL $0, AX \ ADOXQ AX, T4 \ // T4: interm4 \ XORQ AX, AX \ MOVQ I1, DX \ - MULXQ M1+ 0(SB), T5, I1 \ // T0 <- C0 + MOVQ M1+ 0(SB), AX \ + MULXQ AX, T5, I1 \ // T0 <- C0 ADCXQ T5, T1 \ ADCXQ I1, T2 \ // T1 <- C1 - MULXQ M1+ 8(SB), I1, T5 \ + MOVQ M1+ 8(SB), AX \ + MULXQ AX, I1, T5 \ ADCXQ T5, T3 \ ADOXQ I1, T2 \ // T2 <- C2 - MULXQ M1+16(SB), I1, T5 \ + MOVQ M1+16(SB), AX \ + MULXQ AX, I1, T5 \ ADCXQ T5, T4 \ ADOXQ I1, T3 \ // T3 <- C3 - MULXQ M1+24(SB), I1, T5 \ + MOVQ M1+24(SB), AX \ + MULXQ AX, I1, T5 \ + MOVL $0, AX \ ADCXQ AX, T5 \ ADOXQ I1, T4 \ // T4 <- C4 ADOXQ AX, T5 // T5 <- C5 @@ -63,13 +72,17 @@ // |64| x |--- 256 ---| = |----- 320 ----| // Assuming the first digit multiplication was already performed. #define MULX64x256(M1, T1, T2, T3, T4, T5) \ - MULXQ M1+ 8(SB), T4, T2 \ + MOVQ M1+ 8(SB), AX \ + MULXQ AX, T4, T2 \ XORQ AX, AX \ - MULXQ M1+16(SB), T5, T3 \ + MOVQ M1+16(SB), AX \ + MULXQ AX, T5, T3 \ ADOXQ T4, T1 \ // T1 <- C1 ADOXQ T5, T2 \ // T2 <- C2 - MULXQ M1+24(SB), T5, T4 \ + MOVQ M1+24(SB), AX \ + MULXQ AX, T5, T4 \ ADOXQ T5, T3 \ // T3 <- C3 + MOVL $0, AX \ ADOXQ AX, T4 // T4 <- C4 // Performs schoolbook multiplication of two 192-bit numbers @@ -284,7 +297,8 @@ #define REDC_MULX(P1, MUL01, MUL23, MUL45, MUL67) \ MOVQ 0x0(DI), DX \ MOVQ 0x8(DI), R14 \ - MULXQ P1, R8, R9 \ + MOVQ P1, AX \ + MULXQ AX, R8, R9 \ MUL01 \ MOVQ 0x10(DI), DX \ MOVQ 0x48(DI), CX \ @@ -295,12 +309,14 @@ ADCQ 0x38(DI), R12 \ ADCQ 0x40(DI), R13 \ ADCQ $0, CX \ - MULXQ P1, BX, BP \ + MOVQ P1, AX \ + MULXQ AX, BX, BP \ MOVQ R9, 0x0(SI) \ MOVQ R10, 0x8(SI) \ MOVQ R11, 0x10(SI) \ MOVQ R12, 0x18(SI) \ MOVQ R13, 0x20(SI) \ + MOVQ CX, 0x28(SI) \ MOVQ 0x50(DI), R9 \ MOVQ 0x58(DI), R10 \ MOVQ 0x60(DI), R11 \ @@ -315,11 +331,14 @@ ADCQ 0x10(SI), BP \ ADCQ 0x18(SI), R12 \ ADCQ 0x20(SI), R13 \ - ADCQ CX, R14 \ + ADCQ 0x28(SI), R14 \ + MOVQ R14, 0x18(SI) \ + MOVQ CX, R14 \ MOVQ $0, CX \ - ADCQ R9, R15 \ + ADCQ R9, R14 \ ADCQ R10, CX \ - MULXQ P1, R8, R9 \ + MOVQ P1, AX \ + MULXQ AX, R8, R9 \ MOVQ BP, 0x0(SI) \ MOVQ R12, 0x8(SI) \ MOVQ R13, 0x10(SI) \ @@ -329,22 +348,23 @@ MOVQ 0x0(SI), DX \ ADDQ 0x8(SI), R8 \ ADCQ 0x10(SI), R9 \ - ADCQ R14, R10 \ - ADCQ R15, BP \ + ADCQ 0x18(SI), R10 \ + ADCQ R14, BP \ ADCQ CX, R12 \ ADCQ R11, R13 \ ADCQ $0, DI \ - MULXQ P1, R14, R15 \ + MOVQ P1, AX \ + MULXQ AX, R14, BX \ MOVQ R8, 0x0(SI) \ MOVQ R9, 0x8(SI) \ MUL67 \ ADDQ R10, R14 \ - ADCQ BP, R15 \ + ADCQ BP, BX \ ADCQ R12, R8 \ ADCQ R13, R9 \ ADCQ DI, R11 \ MOVQ R14, 0x10(SI) \ - MOVQ R15, 0x18(SI) \ + MOVQ BX, 0x18(SI) \ MOVQ R8, 0x20(SI) \ MOVQ R9, 0x28(SI) \ MOVQ R11, 0x30(SI) @@ -1314,9 +1334,9 @@ TEXT ·rdcP434(SB),$0-16 // available on Broadwell micro-architectures and newer. redc_bdw: #define MULX01 MULX128x256(R14,·P434p1+(8*P434_P1_ZEROS),R9 ,R10,R11,R12,R13) -#define MULX23 MULX128x256(R8 ,·P434p1+(8*P434_P1_ZEROS),BP ,R12,R13,R14,R15) +#define MULX23 MULX128x256(R8 ,·P434p1+(8*P434_P1_ZEROS),BP ,R12,R13,R14,CX ) #define MULX45 MULX128x256(BX ,·P434p1+(8*P434_P1_ZEROS),R9 ,R10,BP ,R12,R13) -#define MULX67 MULX64x256 ( ·P434p1+(8*P434_P1_ZEROS),R15,R8 ,R9 ,R11,CX ) +#define MULX67 MULX64x256 ( ·P434p1+(8*P434_P1_ZEROS),BX ,R8 ,R9 ,R11,CX ) REDC_MULX(·P434p1+(8*P434_P1_ZEROS)+0(SB), MULX01, MULX23, MULX45, MULX67) #undef MULX01 #undef MULX23 From 1f6e82be8e14eb062bf4a1715b183284f8736420 Mon Sep 17 00:00:00 2001 From: armfazh Date: Sun, 26 Feb 2023 01:29:36 -0800 Subject: [PATCH 8/9] dh/sidh: Avoid reference to global variable with MULX for rdcP503. See bug in the compiler (issue #58735). --- dh/sidh/internal/p503/arith_amd64.s | 46 +++++++++++++++++++---------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/dh/sidh/internal/p503/arith_amd64.s b/dh/sidh/internal/p503/arith_amd64.s index 08303e281..2845cbd17 100644 --- a/dh/sidh/internal/p503/arith_amd64.s +++ b/dh/sidh/internal/p503/arith_amd64.s @@ -193,29 +193,41 @@ #define MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, add1, add2, adc1, adc2) \ \ // Column 0 MOVQ I0, DX \ - MULXQ I1+24(SB), T0, T1 \ - MULXQ I1+32(SB), T4, T2 \ + MOVQ I1+24(SB), AX \ + MULXQ AX, T0, T1 \ + MOVQ I1+32(SB), AX \ + MULXQ AX, T4, T2 \ + MOVQ I1+40(SB), AX \ + MULXQ AX, T5, T3 \ XORQ AX, AX \ - MULXQ I1+40(SB), T5, T3 \ add1 T4, T1 \ adc1 T5, T2 \ - MULXQ I1+48(SB), T7, T4 \ + MOVQ I1+48(SB), AX \ + MULXQ AX, T7, T4 \ adc1 T7, T3 \ - MULXQ I1+56(SB), T6, T5 \ + MOVQ I1+56(SB), AX \ + MULXQ AX, T6, T5 \ adc1 T6, T4 \ + MOVL $0, AX \ adc1 AX, T5 \ \ // Column 1 MOVQ 8+I0, DX \ - MULXQ I1+24(SB), T6, T7 \ + MOVQ I1+24(SB), AX \ + MULXQ AX, T6, T7 \ add2 T6, T1 \ adc2 T7, T2 \ - MULXQ I1+32(SB), T8, T6 \ + MOVQ I1+32(SB), AX \ + MULXQ AX, T8, T6 \ adc2 T6, T3 \ - MULXQ I1+40(SB), T7, T9 \ + MOVQ I1+40(SB), AX \ + MULXQ AX, T7, T9 \ adc2 T9, T4 \ - MULXQ I1+48(SB), T9, T6 \ + MOVQ I1+48(SB), AX \ + MULXQ AX, T9, T6 \ adc2 T6, T5 \ - MULXQ I1+56(SB), DX, T6 \ + MOVQ I1+56(SB), AX \ + MULXQ AX, DX, T6 \ + MOVL $0, AX \ adc2 AX, T6 \ \ // Output XORQ AX, AX \ @@ -361,7 +373,7 @@ // * MULS: either MULS_128x320_MULX or MULS_128x320_MULX_ADCX_ADOX // Output: OUT 512-bit #define REDC(OUT, IN, MULS) \ - MULS(0(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \ + MULS(0(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, BP) \ XORQ R15, R15 \ ADDQ (24)(IN), R8 \ ADCQ (32)(IN), R9 \ @@ -395,7 +407,7 @@ MOVQ R11, (112)(IN) \ MOVQ R12, (120)(IN) \ \ - MULS(16(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \ + MULS(16(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, BP) \ XORQ R15, R15 \ ADDQ (40)(IN), R8 \ ADCQ (48)(IN), R9 \ @@ -423,7 +435,7 @@ MOVQ R9, (112)(IN) \ MOVQ R10, (120)(IN) \ \ - MULS(32(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \ + MULS(32(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, BP) \ XORQ R15, R15 \ XORQ BX, BX \ ADDQ ( 56)(IN), R8 \ @@ -445,7 +457,7 @@ MOVQ BX, (120)(IN) \ MOVQ R9, ( 0)(OUT) \ // Result: OUT[0] \ - MULS(48(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \ + MULS(48(IN), ·P503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, BP) \ ADDQ ( 72)(IN), R8 \ ADCQ ( 80)(IN), R9 \ ADCQ ( 88)(IN), R10 \ @@ -1218,7 +1230,7 @@ mul_with_mulx: MUL(CX, REG_P1, REG_P2, MULS256_MULX) RET -TEXT ·rdcP503(SB), $0-16 +TEXT ·rdcP503(SB), $8-16 MOVQ z+0(FP), REG_P2 MOVQ x+8(FP), REG_P1 @@ -1536,13 +1548,17 @@ redc_with_mulx_adcx_adox: // Implementation of the Montgomery reduction for CPUs // supporting two independent carry chain (ADOX/ADCX) // instructions and carry-less MULX multiplier + MOVQ BP, 0(SP) // push: BP is Callee-save. REDC(REG_P2, REG_P1, MULS_128x320_MULX_ADCX_ADOX) + MOVQ 0(SP), BP // pop: BP is Callee-save. RET redc_with_mulx: // Implementation of the Montgomery reduction for CPUs // supporting carry-less MULX multiplier. + MOVQ BP, 0(SP) // push: BP is Callee-save. REDC(REG_P2, REG_P1, MULS_128x320_MULX) + MOVQ 0(SP), BP // pop: BP is Callee-save. RET TEXT ·adlP503(SB), NOSPLIT, $0-24 From 183d74fe2f46235a2b4f9d0e077b0e4139d3a2e7 Mon Sep 17 00:00:00 2001 From: armfazh Date: Sun, 26 Feb 2023 03:15:27 -0800 Subject: [PATCH 9/9] dh/sidh: Avoid reference to global variable with MULX for rdcP751. See bug in the compiler (issue #58735). --- dh/sidh/internal/p751/arith_amd64.s | 211 ++++++++++++++++++---------- 1 file changed, 134 insertions(+), 77 deletions(-) diff --git a/dh/sidh/internal/p751/arith_amd64.s b/dh/sidh/internal/p751/arith_amd64.s index 22d6de1b2..59f5fa601 100644 --- a/dh/sidh/internal/p751/arith_amd64.s +++ b/dh/sidh/internal/p751/arith_amd64.s @@ -1431,44 +1431,58 @@ TEXT ·mulP751(SB), $96-24 // C points to the place to store the result and should be at least 192 bits. // This should only be used when the BMI2 and ADX instruction set extensions // are available. -#define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ +#define mul256x448bmi2adx(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \ MOVQ 0+M0, DX \ - MULXQ M1+40(SB), T1, T0 \ - MULXQ M1+48(SB), T3, T2 \ + MOVQ M1+40(SB), AX \ + MULXQ AX, T1, T0 \ + MOVQ M1+48(SB), AX \ + MULXQ AX, T3, T2 \ MOVQ T1, 0+C \ // C0_final - XORQ AX, AX \ - MULXQ M1+56(SB), T5, T4 \ + MOVQ M1+56(SB), AX \ + MULXQ AX, T5, T4 \ ADOXQ T3, T0 \ ADOXQ T5, T2 \ - MULXQ M1+64(SB), T3, T1 \ + MOVQ M1+64(SB), AX \ + MULXQ AX, T3, T1 \ ADOXQ T3, T4 \ - MULXQ M1+72(SB), T6, T5 \ + MOVQ M1+72(SB), AX \ + MULXQ AX, T6, T5 \ ADOXQ T6, T1 \ - MULXQ M1+80(SB), T7, T3 \ + MOVQ M1+80(SB), AX \ + MULXQ AX, T7, T3 \ ADOXQ T7, T5 \ - MULXQ M1+88(SB), T8, T6 \ + MOVQ M1+88(SB), AX \ + MULXQ AX, T8, T6 \ ADOXQ T8, T3 \ + MOVL $0, AX \ ADOXQ AX, T6 \ \ MOVQ 8+M0, DX \ - MULXQ M1+40(SB), T7, T8 \ - XORQ AX, AX \ + MOVQ M1+40(SB), AX \ + MULXQ AX, T7, T8 \ ADCXQ T7, T0 \ MOVQ T0, 8+C \ // C1_final ADCXQ T8, T2 \ - MULXQ M1+48(SB), T8, T7 \ + MOVQ M1+48(SB), AX \ + MULXQ AX, T8, T7 \ ADOXQ T8, T2 \ ADCXQ T7, T4 \ - MULXQ M1+56(SB), T8, T0 \ + MOVQ M1+56(SB), AX \ + MULXQ AX, T8, T0 \ ADOXQ T8, T4 \ ADCXQ T1, T0 \ - MULXQ M1+64(SB), T7, T1 \ + MOVQ M1+64(SB), AX \ + MULXQ AX, T7, T1 \ ADCXQ T5, T1 \ - MULXQ M1+72(SB), T8, T5 \ + MOVQ M1+72(SB), AX \ + MULXQ AX, T8, T5 \ ADCXQ T5, T3 \ - MULXQ M1+80(SB), T9, T5 \ + MOVQ M1+80(SB), AX \ + MULXQ AX, T9, T5 \ ADCXQ T5, T6 \ - MULXQ M1+88(SB), DX, T5 \ + MOVQ M1+88(SB), AX \ + MULXQ AX, DX, T5 \ + MOVL $0, AX \ ADCXQ AX, T5 \ \ ADOXQ T7, T0 \ @@ -1478,24 +1492,31 @@ TEXT ·mulP751(SB), $96-24 ADOXQ AX, T5 \ \ MOVQ 16+M0, DX \ - MULXQ M1+40(SB), T7, T8 \ - XORQ AX, AX \ + MOVQ M1+40(SB), AX \ + MULXQ AX, T7, T8 \ ADCXQ T7, T2 \ MOVQ T2, 16+C \ // C2_final ADCXQ T8, T4 \ - MULXQ M1+48(SB), T7, T8 \ + MOVQ M1+48(SB), AX \ + MULXQ AX, T7, T8 \ ADOXQ T7, T4 \ ADCXQ T8, T0 \ - MULXQ M1+56(SB), T8, T2 \ + MOVQ M1+56(SB), AX \ + MULXQ AX, T8, T2 \ ADOXQ T8, T0 \ ADCXQ T2, T1 \ - MULXQ M1+64(SB), T7, T2 \ + MOVQ M1+64(SB), AX \ + MULXQ AX, T7, T2 \ ADCXQ T2, T3 \ - MULXQ M1+72(SB), T8, T2 \ + MOVQ M1+72(SB), AX \ + MULXQ AX, T8, T2 \ ADCXQ T2, T6 \ - MULXQ M1+80(SB), T9, T2 \ + MOVQ M1+80(SB), AX \ + MULXQ AX, T9, T2 \ ADCXQ T2, T5 \ - MULXQ M1+88(SB), DX, T2 \ + MOVQ M1+88(SB), AX \ + MULXQ AX, DX, T2 \ + MOVL $0, AX \ ADCXQ AX, T2 \ \ ADOXQ T7, T1 \ @@ -1505,26 +1526,33 @@ TEXT ·mulP751(SB), $96-24 ADOXQ AX, T2 \ \ MOVQ 24+M0, DX \ - MULXQ M1+40(SB), T7, T8 \ - XORQ AX, AX \ + MOVQ M1+40(SB), AX \ + MULXQ AX, T7, T8 \ ADCXQ T4, T7 \ ADCXQ T8, T0 \ - MULXQ M1+48(SB), T10, T8 \ - ADOXQ T10, T0 \ + MOVQ M1+48(SB), AX \ + MULXQ AX, T9, T8 \ + ADOXQ T9, T0 \ ADCXQ T8, T1 \ - MULXQ M1+56(SB), T8, T4 \ + MOVQ M1+56(SB), AX \ + MULXQ AX, T8, T4 \ ADOXQ T8, T1 \ ADCXQ T4, T3 \ - MULXQ M1+64(SB), T10, T4 \ + MOVQ M1+64(SB), AX \ + MULXQ AX, AX, T4 \ ADCXQ T4, T6 \ - MULXQ M1+72(SB), T8, T4 \ + ADOXQ AX, T3 \ + MOVQ M1+72(SB), AX \ + MULXQ AX, T8, T4 \ ADCXQ T4, T5 \ - MULXQ M1+80(SB), T9, T4 \ + MOVQ M1+80(SB), AX \ + MULXQ AX, T9, T4 \ ADCXQ T4, T2 \ - MULXQ M1+88(SB), DX, T4 \ + MOVQ M1+88(SB), AX \ + MULXQ AX, DX, T4 \ + MOVL $0, AX \ ADCXQ AX, T4 \ \ - ADOXQ T10, T3 \ ADOXQ T8, T6 \ ADOXQ T9, T5 \ ADOXQ DX, T2 \ @@ -1535,44 +1563,57 @@ TEXT ·mulP751(SB), $96-24 // C points to the place to store the result and should be at least 192 bits. // This should only be used when the BMI2 instruction set extension is // available. -#define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10) \ +#define mul256x448bmi2(M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \ MOVQ 0+M0, DX \ - MULXQ M1+40(SB), T1, T0 \ - MULXQ M1+48(SB), T3, T2 \ + MOVQ M1+40(SB), AX \ + MULXQ AX, T1, T0 \ + MOVQ M1+48(SB), AX \ + MULXQ AX, T3, T2 \ MOVQ T1, 0+C \ // C0_final - XORQ AX, AX \ - MULXQ M1+56(SB), T5, T4 \ + MOVQ M1+56(SB), AX \ + MULXQ AX, T5, T4 \ ADDQ T3, T0 \ ADCQ T5, T2 \ - MULXQ M1+64(SB), T3, T1 \ + MOVQ M1+64(SB), AX \ + MULXQ AX, T3, T1 \ ADCQ T3, T4 \ - MULXQ M1+72(SB), T6, T5 \ + MOVQ M1+72(SB), AX \ + MULXQ AX, T6, T5 \ ADCQ T6, T1 \ - MULXQ M1+80(SB), T7, T3 \ + MOVQ M1+80(SB), AX \ + MULXQ AX, T7, T3 \ ADCQ T7, T5 \ - MULXQ M1+88(SB), T8, T6 \ + MOVQ M1+88(SB), AX \ + MULXQ AX, T8, T6 \ ADCQ T8, T3 \ - ADCQ AX, T6 \ + ADCQ $0, T6 \ \ MOVQ 8+M0, DX \ - MULXQ M1+40(SB), T7, T8 \ + MOVQ M1+40(SB), AX \ + MULXQ AX, T7, T8 \ ADDQ T7, T0 \ MOVQ T0, 8+C \ // C1_final ADCQ T8, T2 \ - MULXQ M1+48(SB), T8, T7 \ + MOVQ M1+48(SB), AX \ + MULXQ AX, T8, T7 \ MOVQ T8, 32+C \ ADCQ T7, T4 \ - MULXQ M1+56(SB), T8, T0 \ - MOVQ T8, 40+C \ + MOVQ M1+56(SB), AX \ + MULXQ AX, T8, T0 \ + MOVQ T8, 40+C \ ADCQ T1, T0 \ - MULXQ M1+64(SB), T7, T1 \ + MOVQ M1+64(SB), AX \ + MULXQ AX, T7, T1 \ ADCQ T5, T1 \ - MULXQ M1+72(SB), T8, T5 \ + MOVQ M1+72(SB), AX \ + MULXQ AX, T8, T5 \ ADCQ T5, T3 \ - MULXQ M1+80(SB), T9, T5 \ + MOVQ M1+80(SB), AX \ + MULXQ AX, T9, T5 \ ADCQ T5, T6 \ - MULXQ M1+88(SB), DX, T5 \ - ADCQ AX, T5 \ + MOVQ M1+88(SB), AX \ + MULXQ AX, DX, T5 \ + ADCQ $0, T5 \ \ XORQ AX, AX \ ADDQ 32+C, T2 \ @@ -1584,24 +1625,31 @@ TEXT ·mulP751(SB), $96-24 ADCQ AX, T5 \ \ MOVQ 16+M0, DX \ - MULXQ M1+40(SB), T7, T8 \ + MOVQ M1+40(SB), AX \ + MULXQ AX, T7, T8 \ ADDQ T7, T2 \ MOVQ T2, 16+C \ // C2_final ADCQ T8, T4 \ - MULXQ M1+48(SB), T7, T8 \ + MOVQ M1+48(SB), AX \ + MULXQ AX, T7, T8 \ MOVQ T7, 32+C \ ADCQ T8, T0 \ - MULXQ M1+56(SB), T8, T2 \ + MOVQ M1+56(SB), AX \ + MULXQ AX, T8, T2 \ MOVQ T8, 40+C \ ADCQ T2, T1 \ - MULXQ M1+64(SB), T7, T2 \ + MOVQ M1+64(SB), AX \ + MULXQ AX, T7, T2 \ ADCQ T2, T3 \ - MULXQ M1+72(SB), T8, T2 \ + MOVQ M1+72(SB), AX \ + MULXQ AX, T8, T2 \ ADCQ T2, T6 \ - MULXQ M1+80(SB), T9, T2 \ + MOVQ M1+80(SB), AX \ + MULXQ AX, T9, T2 \ ADCQ T2, T5 \ - MULXQ M1+88(SB), DX, T2 \ - ADCQ AX, T2 \ + MOVQ M1+88(SB), AX \ + MULXQ AX, DX, T2 \ + ADCQ $0, T2 \ \ XORQ AX, AX \ ADDQ 32+C, T4 \ @@ -1613,32 +1661,41 @@ TEXT ·mulP751(SB), $96-24 ADCQ AX, T2 \ \ MOVQ 24+M0, DX \ - MULXQ M1+40(SB), T7, T8 \ + MOVQ M1+40(SB), AX \ + MULXQ AX, T7, T8 \ ADDQ T4, T7 \ + MOVQ T7, 8(SP) /* push T7 */ \ ADCQ T8, T0 \ - MULXQ M1+48(SB), T10, T8 \ - MOVQ T10, 32+C \ + MOVQ M1+48(SB), AX \ + MULXQ AX, T9, T8 \ + MOVQ T9, 32+C \ ADCQ T8, T1 \ - MULXQ M1+56(SB), T8, T4 \ + MOVQ M1+56(SB), AX \ + MULXQ AX, T8, T4 \ MOVQ T8, 40+C \ ADCQ T4, T3 \ - MULXQ M1+64(SB), T10, T4 \ + MOVQ M1+64(SB), AX \ + MULXQ AX, T7, T4 \ ADCQ T4, T6 \ - MULXQ M1+72(SB), T8, T4 \ + MOVQ M1+72(SB), AX \ + MULXQ AX, T8, T4 \ ADCQ T4, T5 \ - MULXQ M1+80(SB), T9, T4 \ + MOVQ M1+80(SB), AX \ + MULXQ AX, T9, T4 \ ADCQ T4, T2 \ - MULXQ M1+88(SB), DX, T4 \ - ADCQ AX, T4 \ + MOVQ M1+88(SB), AX \ + MULXQ AX, DX, T4 \ + ADCQ $0, T4 \ \ XORQ AX, AX \ ADDQ 32+C, T0 \ ADCQ 40+C, T1 \ - ADCQ T10, T3 \ + ADCQ T7, T3 \ ADCQ T8, T6 \ ADCQ T9, T5 \ ADCQ DX, T2 \ - ADCQ AX, T4 + ADCQ AX, T4 \ + MOVQ 8(SP), T7 /* pop T7 */ // Template for calculating the Montgomery reduction algorithm described in // section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be @@ -1651,7 +1708,7 @@ TEXT ·mulP751(SB), $96-24 // Output: OUT 768-bit #define REDC(C, M0, MULS) \ \ // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - MULS(M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \ + MULS(M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX) \ XORQ R15, R15 \ MOVQ 48+C, AX \ MOVQ 56+C, DX \ @@ -1702,7 +1759,7 @@ TEXT ·mulP751(SB), $96-24 MOVQ R13, 176+M0 \ MOVQ R14, 184+M0 \ \ // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - MULS(32+M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \ + MULS(32+M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX) \ XORQ R15, R15 \ MOVQ 48+C, AX \ MOVQ 56+C, DX \ @@ -1741,7 +1798,7 @@ TEXT ·mulP751(SB), $96-24 MOVQ R13, 176+M0 \ MOVQ R14, 184+M0 \ \ // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - MULS(64+M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \ + MULS(64+M0, ·P751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX) \ MOVQ 48+C, AX \ // Final result c1:c11 MOVQ 56+C, DX \ MOVQ 64+C, BX \ @@ -1768,7 +1825,7 @@ TEXT ·mulP751(SB), $96-24 MOVQ R13, 80+C \ MOVQ R14, 88+C -TEXT ·rdcP751(SB), $8-16 +TEXT ·rdcP751(SB), $16-16 MOVQ z+0(FP), REG_P2 MOVQ x+8(FP), REG_P1