From 9ec8d3bc04328d1da821eb6187309b9e23fe8497 Mon Sep 17 00:00:00 2001
From: Sun Yimin <emmansun@users.noreply.github.com>
Date: Tue, 11 Jul 2023 17:30:48 +0800
Subject: [PATCH] sm9/bn256: add double & triple, fix gfpNeg issue, use Square
 where possible

---
 sm9/bn256/bn_pair.go     | 28 +++++++--------
 sm9/bn256/curve.go       | 46 ++++++++++++------------
 sm9/bn256/gfp12.go       | 18 +++++-----
 sm9/bn256/gfp2.go        | 41 ++++++++++-----------
 sm9/bn256/gfp2_test.go   | 29 ++++++++++++++-
 sm9/bn256/gfp4.go        |  6 ++++
 sm9/bn256/gfp_amd64.s    | 45 +++++++++++++++++++++++
 sm9/bn256/gfp_arm64.s    | 77 ++++++++++++++++++++++++++++++++++++++++
 sm9/bn256/gfp_decl.go    | 12 ++++++-
 sm9/bn256/gfp_generic.go |  9 +++++
 sm9/bn256/gfp_test.go    | 61 +++++++++++++++++++++++++++++++
 sm9/bn256/twist.go       | 22 ++++++------
 12 files changed, 313 insertions(+), 81 deletions(-)

diff --git a/sm9/bn256/bn_pair.go b/sm9/bn256/bn_pair.go
index 3a40cf77..ff9fd81e 100644
--- a/sm9/bn256/bn_pair.go
+++ b/sm9/bn256/bn_pair.go
@@ -12,8 +12,8 @@ func lineFunctionAdd(r, p, rOut *twistPoint, q *curvePoint, r2, a, b, c *gfP2) {
 	H := (&gfP2{}).Sub(B, &r.x) // H = Xp * Zr^2 - Xr
 	I := (&gfP2{}).SquareNC(H)  // I = (Xp * Zr^2 - Xr)^2 = Xp^2*Zr^4 + Xr^2 - 2Xr*Xp*Zr^2
 
-	E := (&gfP2{}).Add(I, I) // E = 2*(Xp * Zr^2 - Xr)^2
-	E.Add(E, E)              // E = 4*(Xp * Zr^2 - Xr)^2
+	E := (&gfP2{}).Double(I) // E = 2*(Xp * Zr^2 - Xr)^2
+	E.Double(E)              // E = 4*(Xp * Zr^2 - Xr)^2
 
 	J := (&gfP2{}).MulNC(H, E) // J =  4*(Xp * Zr^2 - Xr)^3
 
@@ -29,7 +29,7 @@ func lineFunctionAdd(r, p, rOut *twistPoint, q *curvePoint, r2, a, b, c *gfP2) {
 	t := (&gfP2{}).Sub(V, &rOut.x) // t = V - rOut.x
 	t.Mul(t, L1)                   // t = L1*(V-rOut.x)
 	t2 := (&gfP2{}).MulNC(&r.y, J)
-	t2.Add(t2, t2)    // t2 = 2Yr * J
+	t2.Double(t2)     // t2 = 2Yr * J
 	rOut.y.Sub(t, t2) // rOut.y = L1*(V-rOut.x) - 2Yr*J
 
 	rOut.t.SquareNC(&rOut.z)
@@ -38,14 +38,14 @@ func lineFunctionAdd(r, p, rOut *twistPoint, q *curvePoint, r2, a, b, c *gfP2) {
 	t.Add(&p.y, &rOut.z).Square(t).Sub(t, r2).Sub(t, &rOut.t)
 
 	t2.Mul(L1, &p.x)
-	t2.Add(t2, t2) // t2 = 2 L1 * Xp
-	a.Sub(t2, t)   // a =  2 L1 * Xp - 2 Yp * rOut.z = 2 L1 * Xp - (Yp + rOut.Z)^2 + Yp^2 + rOut.Z^2
+	t2.Double(t2) // t2 = 2 L1 * Xp
+	a.Sub(t2, t)  // a =  2 L1 * Xp - 2 Yp * rOut.z = 2 L1 * Xp - (Yp + rOut.Z)^2 + Yp^2 + rOut.Z^2
 
 	c.MulScalar(&rOut.z, &q.y) // c = rOut.z * Yq
-	c.Add(c, c)                // c = 2 * rOut.z * Yq
+	c.Double(c)                // c = 2 * rOut.z * Yq
 
 	b.Neg(L1)                      // b= -L1
-	b.MulScalar(b, &q.x).Add(b, b) // b = -2 * L1 * Xq
+	b.MulScalar(b, &q.x).Double(b) // b = -2 * L1 * Xq
 }
 
 func lineFunctionDouble(r, rOut *twistPoint, q *curvePoint, a, b, c *gfP2) {
@@ -56,9 +56,9 @@ func lineFunctionDouble(r, rOut *twistPoint, q *curvePoint, a, b, c *gfP2) {
 	C := (&gfP2{}).SquareNC(B) // C = Yr ^ 4
 
 	D := (&gfP2{}).Add(&r.x, B)
-	D.Square(D).Sub(D, A).Sub(D, C).Add(D, D)
+	D.Square(D).Sub(D, A).Sub(D, C).Double(D)
 
-	E := (&gfP2{}).Add(A, A) //
+	E := (&gfP2{}).Double(A) //
 	E.Add(E, A)              // E = 3 * Xr ^ 2
 
 	G := (&gfP2{}).SquareNC(E) // G = 9 * Xr^4
@@ -68,23 +68,23 @@ func lineFunctionDouble(r, rOut *twistPoint, q *curvePoint, a, b, c *gfP2) {
 	rOut.z.Add(&r.y, &r.z).Square(&rOut.z).Sub(&rOut.z, B).Sub(&rOut.z, &r.t) // Z3 = (Yr + Zr)^2 - Yr^2 - Zr^2 = 2Yr*Zr
 
 	rOut.y.Sub(D, &rOut.x).Mul(&rOut.y, E)
-	t := (&gfP2{}).Add(C, C) // t = 2 * r.y ^ 4
-	t.Add(t, t).Add(t, t)    // t = 8 * Yr ^ 4
+	t := (&gfP2{}).Double(C) // t = 2 * r.y ^ 4
+	t.Double(t).Double(t)    // t = 8 * Yr ^ 4
 	rOut.y.Sub(&rOut.y, t)
 
 	rOut.t.SquareNC(&rOut.z)
 
-	t.Mul(E, &r.t).Add(t, t) // t = 2(E * Tr)
+	t.Mul(E, &r.t).Double(t) // t = 2(E * Tr)
 	b.Neg(t)                 // b = -2(E * Tr)
 	b.MulScalar(b, &q.x)     // b = -2(E * Tr * Xq)
 
 	a.Add(&r.x, E)                  // a = Xr + E
 	a.Square(a).Sub(a, A).Sub(a, G) // a = (Xr + E) ^ 2 - A - G
-	t.Add(B, B).Add(t, t)           // t = 4B
+	t.Double(B).Double(t)           // t = 4B
 	a.Sub(a, t)                     // a = (Xr + E) ^ 2 - A - G - 4B
 
 	c.Mul(&rOut.z, &r.t)           // c = rOut.z * Tr
-	c.Add(c, c).MulScalar(c, &q.y) // c = 2 rOut.z * Tr * Yq
+	c.Double(c).MulScalar(c, &q.y) // c = 2 rOut.z * Tr * Yq
 }
 
 // (ret.z + ret.y*w + ret.x*w^2)* ((cv+a) + b*w^2)
diff --git a/sm9/bn256/curve.go b/sm9/bn256/curve.go
index b0acbfb8..a2d23288 100644
--- a/sm9/bn256/curve.go
+++ b/sm9/bn256/curve.go
@@ -38,7 +38,7 @@ func (c *curvePoint) Set(a *curvePoint) {
 
 func (c *curvePoint) polynomial(x *gfP) *gfP {
 	x3 := &gfP{}
-	gfpMul(x3, x, x)
+	gfpSqr(x3, x, 1)
 	gfpMul(x3, x3, x)
 	gfpAdd(x3, x3, curveB)
 	return x3
@@ -52,7 +52,7 @@ func (c *curvePoint) IsOnCurve() bool {
 	}
 
 	y2 := &gfP{}
-	gfpMul(y2, &c.y, &c.y)
+	gfpSqr(y2, &c.y, 1)
 
 	x3 := c.polynomial(&c.x)
 
@@ -98,8 +98,8 @@ func (c *curvePoint) Add(a, b *curvePoint) {
 	// by [u1:s1:z1·z2] and [u2:s2:z1·z2]
 	// where u1 = x1·z2², s1 = y1·z2³ and u1 = x2·z1², s2 = y2·z1³
 	z12, z22 := &gfP{}, &gfP{}
-	gfpMul(z12, &a.z, &a.z)
-	gfpMul(z22, &b.z, &b.z)
+	gfpSqr(z12, &a.z, 1)
+	gfpSqr(z22, &b.z, 1)
 
 	u1, u2 := &gfP{}, &gfP{}
 	gfpMul(u1, &a.x, z22)
@@ -123,10 +123,10 @@ func (c *curvePoint) Add(a, b *curvePoint) {
 	h := &gfP{}
 	gfpSub(h, u2, u1)
 
-	gfpAdd(t, h, h)
+	gfpDouble(t, h)
 	// i = 4h²
 	i := &gfP{}
-	gfpMul(i, t, t)
+	gfpSqr(i, t, 1)
 	// j = 4h³
 	j := &gfP{}
 	gfpMul(j, h, i)
@@ -138,15 +138,15 @@ func (c *curvePoint) Add(a, b *curvePoint) {
 		return
 	}
 	r := &gfP{}
-	gfpAdd(r, t, t)
+	gfpDouble(r, t)
 
 	v := &gfP{}
 	gfpMul(v, u1, i)
 
 	// t4 = 4(s2-s1)²
 	t4, t6 := &gfP{}, &gfP{}
-	gfpMul(t4, r, r)
-	gfpAdd(t, v, v)
+	gfpSqr(t4, r, 1)
+	gfpDouble(t, v)
 	gfpSub(t6, t4, j)
 
 	gfpSub(&c.x, t6, t)
@@ -156,13 +156,13 @@ func (c *curvePoint) Add(a, b *curvePoint) {
 	// y = - 2·s1·j - (s2-s1)(2x - 2i·u1) = r(v-x) - 2·s1·j
 	gfpSub(t, v, &c.x) // t7
 	gfpMul(t4, s1, j)  // t8
-	gfpAdd(t6, t4, t4) // t9
+	gfpDouble(t6, t4)  // t9 = 2·s1·j
 	gfpMul(t4, r, t)   // t10
 	gfpSub(&c.y, t4, t6)
 
 	// Set z = 2(u2-u1)·z1·z2 = 2h·z1·z2
 	gfpAdd(t, &a.z, &b.z) // t11
-	gfpMul(t4, t, t)      // t12
+	gfpSqr(t4, t, 1)      // t12
 	gfpSub(t, t4, z12)    // t13
 	gfpSub(t4, t, z22)    // t14
 	gfpMul(&c.z, t4, h)
@@ -171,31 +171,31 @@ func (c *curvePoint) Add(a, b *curvePoint) {
 func (c *curvePoint) Double(a *curvePoint) {
 	// See http://hyperelliptic.org/EFD/g1p/auto-code/shortw/jacobian-0/doubling/dbl-2009-l.op3
 	A, B, C := &gfP{}, &gfP{}, &gfP{}
-	gfpMul(A, &a.x, &a.x)
-	gfpMul(B, &a.y, &a.y)
-	gfpMul(C, B, B)
+	gfpSqr(A, &a.x, 1)
+	gfpSqr(B, &a.y, 1)
+	gfpSqr(C, B, 1)
 
 	t, t2 := &gfP{}, &gfP{}
 	gfpAdd(t, &a.x, B)
-	gfpMul(t2, t, t)
+	gfpSqr(t2, t, 1)
 	gfpSub(t, t2, A)
 	gfpSub(t2, t, C)
 
 	d, e, f := &gfP{}, &gfP{}, &gfP{}
 	gfpAdd(d, t2, t2)
-	gfpAdd(t, A, A)
+	gfpDouble(t, A)
 	gfpAdd(e, t, A)
-	gfpMul(f, e, e)
+	gfpSqr(f, e, 1)
 
-	gfpAdd(t, d, d)
+	gfpDouble(t, d)
 	gfpSub(&c.x, f, t)
 
 	gfpMul(&c.z, &a.y, &a.z)
-	gfpAdd(&c.z, &c.z, &c.z)
+	gfpDouble(&c.z, &c.z)
 
-	gfpAdd(t, C, C)
-	gfpAdd(t2, t, t)
-	gfpAdd(t, t2, t2)
+	gfpDouble(t, C)
+	gfpDouble(t2, t)
+	gfpDouble(t, t2)
 	gfpSub(&c.y, d, &c.x)
 	gfpMul(t2, e, &c.y)
 	gfpSub(&c.y, t2, t)
@@ -232,7 +232,7 @@ func (c *curvePoint) MakeAffine() {
 
 	t, zInv2 := &gfP{}, &gfP{}
 	gfpMul(t, &c.y, zInv)
-	gfpMul(zInv2, zInv, zInv)
+	gfpSqr(zInv2, zInv, 1)
 
 	gfpMul(&c.x, &c.x, zInv2)
 	gfpMul(&c.y, t, zInv2)
diff --git a/sm9/bn256/gfp12.go b/sm9/bn256/gfp12.go
index cf3765fe..cd51aa67 100644
--- a/sm9/bn256/gfp12.go
+++ b/sm9/bn256/gfp12.go
@@ -258,11 +258,11 @@ func (e *gfP12) SpecialSquares(a *gfP12, n int) *gfP12 {
 	ty.Triple(v1)
 	tz.Triple(v2)
 
-	v0.Add(&a.x, &a.x) // (f12, f01)
+	v0.Double(&a.x) // (f12, f01)
 	v0.y.Neg(&v0.y)
-	v1.Add(&a.y, &a.y) // (f02, f10)
+	v1.Double(&a.y) // (f02, f10)
 	v1.x.Neg(&v1.x)
-	v2.Add(&a.z, &a.z) // (f11, f00)
+	v2.Double(&a.z) // (f11, f00)
 	v2.y.Neg(&v2.y)
 
 	v0.Add(ty, v0)
@@ -285,11 +285,11 @@ func (e *gfP12) SpecialSquares(a *gfP12, n int) *gfP12 {
 		ty.Triple(v1)
 		tz.Triple(v2)
 
-		v0.Add(&in.x, &in.x) // (f12, f01)
+		v0.Double(&in.x) // (f12, f01)
 		v0.y.Neg(&v0.y)
-		v1.Add(&in.y, &in.y) // (f02, f10)
+		v1.Double(&in.y) // (f02, f10)
 		v1.x.Neg(&v1.x)
-		v2.Add(&in.z, &in.z) // (f11, f00)
+		v2.Double(&in.z) // (f11, f00)
 		v2.y.Neg(&v2.y)
 
 		v0.Add(ty, v0)
@@ -321,11 +321,11 @@ func (e *gfP12) SpecialSquareNC(a *gfP12) *gfP12 {
 	ty.Triple(v1)
 	tz.Triple(v2)
 
-	v0.Add(&a.x, &a.x) // (f12, f01)
+	v0.Double(&a.x) // (f12, f01)
 	v0.y.Neg(&v0.y)
-	v1.Add(&a.y, &a.y) // (f02, f10)
+	v1.Double(&a.y) // (f02, f10)
 	v1.x.Neg(&v1.x)
-	v2.Add(&a.z, &a.z) // (f11, f00)
+	v2.Double(&a.z) // (f11, f00)
 	v2.y.Neg(&v2.y)
 
 	v0.Add(ty, v0)
diff --git a/sm9/bn256/gfp2.go b/sm9/bn256/gfp2.go
index 90b3865b..0c46e5b0 100644
--- a/sm9/bn256/gfp2.go
+++ b/sm9/bn256/gfp2.go
@@ -75,13 +75,13 @@ func (e *gfP2) IsOne() bool {
 
 func (e *gfP2) Conjugate(a *gfP2) *gfP2 {
 	e.y.Set(&a.y)
-	gfpSub(&e.x, zero, &a.x)
+	gfpNeg(&e.x, &a.x)
 	return e
 }
 
 func (e *gfP2) Neg(a *gfP2) *gfP2 {
-	gfpSub(&e.x, zero, &a.x)
-	gfpSub(&e.y, zero, &a.y)
+	gfpNeg(&e.x, &a.x)
+	gfpNeg(&e.y, &a.y)
 	return e
 }
 
@@ -98,17 +98,14 @@ func (e *gfP2) Sub(a, b *gfP2) *gfP2 {
 }
 
 func (e *gfP2) Double(a *gfP2) *gfP2 {
-	gfpAdd(&e.x, &a.x, &a.x)
-	gfpAdd(&e.y, &a.y, &a.y)
+	gfpDouble(&e.x, &a.x)
+	gfpDouble(&e.y, &a.y)
 	return e
 }
 
 func (e *gfP2) Triple(a *gfP2) *gfP2 {
-	gfpAdd(&e.x, &a.x, &a.x)
-	gfpAdd(&e.y, &a.y, &a.y)
-
-	gfpAdd(&e.x, &e.x, &a.x)
-	gfpAdd(&e.y, &e.y, &a.y)
+	gfpTriple(&e.x, &a.x)
+	gfpTriple(&e.y, &a.y)
 	return e
 }
 
@@ -172,8 +169,8 @@ func (e *gfP2) MulUNC(a, b *gfP2) *gfP2 {
 	gfpMul(ty, tx, ty)
 	gfpSub(ty, ty, v0)
 	gfpSub(ty, ty, v1)
-	gfpAdd(ty, ty, ty)
-	gfpSub(ty, zero, ty)
+	gfpDouble(ty, ty)
+	gfpNeg(ty, ty)
 
 	gfpSub(tx, v0, v1)
 	gfpSub(tx, tx, v1)
@@ -187,8 +184,8 @@ func (e *gfP2) MulUNC(a, b *gfP2) *gfP2 {
 // c0 = -2a1
 func (e *gfP2) MulU1(a *gfP2) *gfP2 {
 	t := &gfP{}
-	gfpAdd(t, &a.x, &a.x)
-	gfpSub(t, zero, t)
+	gfpDouble(t, &a.x)
+	gfpNeg(t, t)
 
 	gfpCopy(&e.x, &a.y)
 	gfpCopy(&e.y, t)
@@ -212,12 +209,12 @@ func (e *gfP2) SquareNC(a *gfP2) *gfP2 {
 	ty := &e.y
 
 	gfpAdd(ty, &a.x, &a.y)
-	gfpAdd(tx, &a.x, &a.x)
+	gfpDouble(tx, &a.x)
 	gfpSub(tx, &a.y, tx)
 	gfpMul(ty, tx, ty)
 	gfpMul(tx, &a.x, &a.y)
 	gfpAdd(ty, tx, ty)
-	gfpAdd(tx, tx, tx)
+	gfpDouble(tx, tx)
 
 	return e
 }
@@ -240,14 +237,14 @@ func (e *gfP2) SquareUNC(a *gfP2) *gfP2 {
 	ty := &e.y
 
 	gfpAdd(tx, &a.x, &a.y)
-	gfpAdd(ty, &a.x, &a.x)
+	gfpDouble(ty, &a.x)
 	gfpSub(ty, &a.y, ty)
 	gfpMul(tx, tx, ty)
 	gfpMul(ty, &a.x, &a.y)
 	gfpAdd(tx, tx, ty)
-	gfpAdd(ty, ty, ty)
-	gfpAdd(ty, ty, ty)
-	gfpSub(ty, zero, ty)
+	gfpDouble(ty, ty)
+	gfpDouble(ty, ty)
+	gfpNeg(ty, ty)
 
 	return e
 }
@@ -263,14 +260,14 @@ func (e *gfP2) Invert(a *gfP2) *gfP2 {
 	// ftp://136.206.11.249/pub/crypto/pairings.pdf
 	t1, t2, t3 := &gfP{}, &gfP{}, &gfP{}
 	gfpSqr(t1, &a.x, 1)
-	gfpAdd(t3, t1, t1)
+	gfpDouble(t3, t1)
 	gfpSqr(t2, &a.y, 1)
 	gfpAdd(t3, t3, t2)
 
 	inv := &gfP{}
 	inv.Invert(t3) // inv = (2 * a.x ^ 2 + a.y ^ 2) ^ (-1)
 
-	gfpSub(t1, zero, &a.x)
+	gfpNeg(t1, &a.x)
 
 	gfpMul(&e.x, t1, inv)   // x = - a.x * inv
 	gfpMul(&e.y, &a.y, inv) // y = a.y * inv
diff --git a/sm9/bn256/gfp2_test.go b/sm9/bn256/gfp2_test.go
index 92d17bb5..10ad6a6b 100644
--- a/sm9/bn256/gfp2_test.go
+++ b/sm9/bn256/gfp2_test.go
@@ -152,10 +152,11 @@ func BenchmarkGfP2MulU(b *testing.B) {
 		*fromBigInt(bigFromHex("17509B092E845C1266BA0D262CBEE6ED0736A96FA347C8BD856DC76B84EBEB96")),
 		*fromBigInt(bigFromHex("A7CF28D519BE3DA65F3170153D278FF247EFBA98A71A08116215BBA5C999A7C7")),
 	}
+
+	t := &gfP2{}
 	b.ReportAllocs()
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		t := &gfP2{}
 		t.MulU(x, y)
 	}
 }
@@ -184,6 +185,32 @@ func BenchmarkGfP2SquareU(b *testing.B) {
 	}
 }
 
+func BenchmarkGfP2Neg(b *testing.B) {
+	v := &gfP2{ // operand; both coordinates are negated in place every iteration
+		*fromBigInt(bigFromHex("85AEF3D078640C98597B6027B441A01FF1DD2C190F5E93C454806C11D8806141")),
+		*fromBigInt(bigFromHex("3722755292130B08D2AAB97FD34EC120EE265948D19C17ABF9B7213BAF82D65B")),
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for n := b.N; n > 0; n-- {
+		gfpNeg(&v.x, &v.x) // dedicated negation routine
+		gfpNeg(&v.y, &v.y)
+	}
+}
+
+func BenchmarkGfP2Neg2(b *testing.B) {
+	v := &gfP2{ // same operand as BenchmarkGfP2Neg so the two results are comparable
+		*fromBigInt(bigFromHex("85AEF3D078640C98597B6027B441A01FF1DD2C190F5E93C454806C11D8806141")),
+		*fromBigInt(bigFromHex("3722755292130B08D2AAB97FD34EC120EE265948D19C17ABF9B7213BAF82D65B")),
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for n := b.N; n > 0; n-- {
+		gfpSub(&v.x, zero, &v.x) // baseline: negate as 0 - v
+		gfpSub(&v.y, zero, &v.y)
+	}
+}
+
 /*
 func Test_gfP2QuadraticResidue(t *testing.T) {
 	x := &gfP2{
diff --git a/sm9/bn256/gfp4.go b/sm9/bn256/gfp4.go
index 4f4b828a..eb9bc0de 100644
--- a/sm9/bn256/gfp4.go
+++ b/sm9/bn256/gfp4.go
@@ -73,6 +73,12 @@ func (e *gfP4) Add(a, b *gfP4) *gfP4 {
 	return e
 }
 
+func (e *gfP4) Double(a *gfP4) *gfP4 {
+	e.x.Double(&a.x) // component-wise: e.x = 2 * a.x
+	e.y.Double(&a.y) // and e.y = 2 * a.y
+	return e // the receiver is returned so calls can be chained
+}
+
 func (e *gfP4) Triple(a *gfP4) *gfP4 {
 	e.x.Triple(&a.x)
 	e.y.Triple(&a.y)
diff --git a/sm9/bn256/gfp_amd64.s b/sm9/bn256/gfp_amd64.s
index 5fdc2250..8acdc305 100644
--- a/sm9/bn256/gfp_amd64.s
+++ b/sm9/bn256/gfp_amd64.s
@@ -104,6 +104,51 @@ TEXT ·gfpAdd(SB),0,$0-24
 	storeBlock(R8,R9,R10,R11, 0(DI))
 	RET
 
+TEXT ·gfpDouble(SB),0,$0-16 // func gfpDouble(c, a *gfP)
+	MOVQ c+0(FP), DI // DI = destination c; name and offset must match the Go declaration for go vet
+	MOVQ a+8(FP), SI // SI = source a
+
+	loadBlock(0(SI), R8,R9,R10,R11)
+	XORQ R12, R12 // R12 will receive the carry out of the top limb
+
+	ADDQ  R8, R8 // a + a, limb by limb with carry propagation
+	ADCQ  R9, R9
+	ADCQ  R10, R10
+	ADCQ  R11, R11
+	ADCQ  $0, R12 // carry word handed to gfpCarry (macro outside this hunk; presumably reduces mod p - confirm)
+
+	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)
+
+	storeBlock(R8,R9,R10,R11, 0(DI))
+	RET
+
+TEXT ·gfpTriple(SB),0,$0-16 // func gfpTriple(c, a *gfP)
+	MOVQ c+0(FP), DI // DI = destination c; name and offset must match the Go declaration for go vet
+	MOVQ a+8(FP), SI // SI = source a
+
+	loadBlock(0(SI), R8,R9,R10,R11)
+	XORQ R12, R12 // R12 will receive the carry out of the top limb
+
+	ADDQ  R8, R8 // first step: a + a
+	ADCQ  R9, R9
+	ADCQ  R10, R10
+	ADCQ  R11, R11
+	ADCQ $0, R12 // carry word handed to gfpCarry (macro outside this hunk; presumably reduces mod p - confirm)
+
+	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)
+
+	XORQ R12, R12 // fresh carry word for the second addition
+	ADDQ  0(SI), R8 // second step: add a again, read from memory; c is only stored at the end, so c may alias a
+	ADCQ  8(SI), R9
+	ADCQ 16(SI), R10
+	ADCQ 24(SI), R11
+	ADCQ $0, R12
+
+	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)
+
+	storeBlock(R8,R9,R10,R11, 0(DI))
+	RET
+
 TEXT ·gfpSub(SB),0,$0-24
 	MOVQ a+8(FP), DI
 	MOVQ b+16(FP), SI
diff --git a/sm9/bn256/gfp_arm64.s b/sm9/bn256/gfp_arm64.s
index dc7f23b4..96b8b25c 100644
--- a/sm9/bn256/gfp_arm64.s
+++ b/sm9/bn256/gfp_arm64.s
@@ -109,6 +109,83 @@ TEXT ·gfpAdd(SB),0,$0-24
 	storeBlock(R1,R2,R3,R4, 0(R0))
 	RET
 
+TEXT ·gfpDouble(SB),0,$0-16 // func gfpDouble(c, a *gfP)
+	MOVD a+8(FP), R0 // R0 = source pointer a
+	loadBlock(0(R0), R1,R2,R3,R4)
+	loadModulus(R9,R10,R11,R12)
+	MOVD ZR, R0 // R0 is reused as the carry-out word
+
+	ADDS R1, R1 // a + a across four limbs, carry chained
+	ADCS R2, R2
+	ADCS R3, R3
+	ADCS R4, R4
+	ADCS ZR, R0 // R0 = carry out of the top limb
+
+	SUBS  R9, R1, R5 // trial subtraction of R9..R12 (set by loadModulus; presumably the modulus p - confirm)
+	SBCS R10, R2, R6
+	SBCS R11, R3, R7
+	SBCS R12, R4, R8
+	SBCS  ZR, R0, R0 // include the carry word; C stays set when there was no borrow
+
+	CSEL CS, R5, R1, R1 // no borrow: keep the subtracted value, otherwise keep the plain sum
+	CSEL CS, R6, R2, R2
+	CSEL CS, R7, R3, R3
+	CSEL CS, R8, R4, R4
+
+	MOVD c+0(FP), R0 // R0 = destination pointer c
+	storeBlock(R1,R2,R3,R4, 0(R0))
+	RET
+
+TEXT ·gfpTriple(SB),0,$0-16 // func gfpTriple(c, a *gfP)
+	MOVD a+8(FP), R0 // R0 = source pointer a
+	loadBlock(0(R0), R1,R2,R3,R4)
+	MOVD R1, R19 // keep a copy of a in R19..R22 for the second addition
+	MOVD R2, R20
+	MOVD R3, R21
+	MOVD R4, R22
+	loadModulus(R9,R10,R11,R12)
+	MOVD ZR, R0 // R0 is reused as the carry-out word
+
+	ADDS R1, R1 // first step: a + a
+	ADCS R2, R2
+	ADCS R3, R3
+	ADCS R4, R4
+	ADCS ZR, R0 // R0 = carry out of the top limb
+
+	SUBS  R9, R1, R5 // trial subtraction of R9..R12 (set by loadModulus; presumably the modulus p - confirm)
+	SBCS R10, R2, R6
+	SBCS R11, R3, R7
+	SBCS R12, R4, R8
+	SBCS  ZR, R0, R0 // include the carry word; C stays set when there was no borrow
+
+	CSEL CS, R5, R1, R1 // no borrow: keep the subtracted value, otherwise keep the plain sum
+	CSEL CS, R6, R2, R2
+	CSEL CS, R7, R3, R3
+	CSEL CS, R8, R4, R4
+
+	MOVD ZR, R0 // clear the carry word for the second addition
+
+	ADDS R19, R1 // second step: add the saved copy of a
+	ADCS R20, R2
+	ADCS R21, R3
+	ADCS R22, R4
+	ADCS ZR, R0
+
+	SUBS  R9, R1, R5 // same trial subtraction and select as above
+	SBCS R10, R2, R6
+	SBCS R11, R3, R7
+	SBCS R12, R4, R8
+	SBCS  ZR, R0, R0
+
+	CSEL CS, R5, R1, R1
+	CSEL CS, R6, R2, R2
+	CSEL CS, R7, R3, R3
+	CSEL CS, R8, R4, R4
+
+	MOVD c+0(FP), R0 // R0 = destination pointer c
+	storeBlock(R1,R2,R3,R4, 0(R0))
+	RET
+
 TEXT ·gfpSub(SB),0,$0-24
 	MOVD a+8(FP), R0
 	loadBlock(0(R0), R1,R2,R3,R4)
diff --git a/sm9/bn256/gfp_decl.go b/sm9/bn256/gfp_decl.go
index 1e2ff8fe..589bd3f6 100644
--- a/sm9/bn256/gfp_decl.go
+++ b/sm9/bn256/gfp_decl.go
@@ -18,7 +18,7 @@ var supportADX = cpu.X86.HasADX && cpu.X86.HasBMI2
 // Set c = p - a, if c == p, then c = 0
 // It seems this function's performance is worse than gfpSub with zero.
 //
-// go:noescape
+//go:noescape
 func gfpNeg(c, a *gfP)
 
 // Set c = a + b, if c >= p, then c = c - p
@@ -26,6 +26,16 @@ func gfpNeg(c, a *gfP)
 //go:noescape
 func gfpAdd(c, a, b *gfP)
 
+// Set c = a + a, if c >= p, then c = c - p
+//
+//go:noescape
+func gfpDouble(c, a *gfP)
+
+// Set c = a + a + a, subtracting p after each addition when the sum is >= p
+//
+//go:noescape
+func gfpTriple(c, a *gfP)
+
 // Set c = a - b, if c is negative, then c = c + p
 //
 //go:noescape
diff --git a/sm9/bn256/gfp_generic.go b/sm9/bn256/gfp_generic.go
index f62ed09e..c6c2f8a3 100644
--- a/sm9/bn256/gfp_generic.go
+++ b/sm9/bn256/gfp_generic.go
@@ -42,6 +42,15 @@ func gfpAdd(c, a, b *gfP) {
 	gfpCarry(c, carry)
 }
 
+func gfpDouble(c, a *gfP) { gfpAdd(c, a, a) } // c = a + a via the generic modular add
+
+// gfpTriple sets c = a + a + a using two modular additions; c may alias a.
+func gfpTriple(c, a *gfP) {
+	t := *a // snapshot a: if c aliases a, the first add overwrites it and the result would be 4a
+	gfpAdd(c, a, a)
+	gfpAdd(c, c, &t)
+}
+
 func gfpSub(c, a, b *gfP) {
 	t := &gfP{}
 
diff --git a/sm9/bn256/gfp_test.go b/sm9/bn256/gfp_test.go
index 5c9eb73f..016e2ab0 100644
--- a/sm9/bn256/gfp_test.go
+++ b/sm9/bn256/gfp_test.go
@@ -225,3 +225,64 @@ func BenchmarkGfPSqr(b *testing.B) {
 		gfpSqr(ret, x, 1)
 	}
 }
+
+func BenchmarkGfPTriple(b *testing.B) {
+	x := fromBigInt(bigFromHex("9093a2b979e6186f43a9b28d41ba644d533377f2ede8c66b19774bf4a9c7a596"))
+	ret := &gfP{} // set up before ResetTimer so it stays outside the measured region
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gfpTriple(ret, x)
+	}
+}
+
+func BenchmarkGfPTriple2(b *testing.B) {
+	x := fromBigInt(bigFromHex("9093a2b979e6186f43a9b28d41ba644d533377f2ede8c66b19774bf4a9c7a596"))
+	ret := &gfP{} // set up before ResetTimer so it stays outside the measured region
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gfpAdd(ret, x, x) // baseline for gfpTriple: two plain additions
+		gfpAdd(ret, ret, x)
+	}
+}
+
+func BenchmarkGfPDouble(b *testing.B) {
+	x := fromBigInt(bigFromHex("9093a2b979e6186f43a9b28d41ba644d533377f2ede8c66b19774bf4a9c7a596"))
+	ret := &gfP{} // set up before ResetTimer so it stays outside the measured region
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gfpDouble(ret, x)
+	}
+}
+
+func BenchmarkGfPDouble2(b *testing.B) {
+	x := fromBigInt(bigFromHex("9093a2b979e6186f43a9b28d41ba644d533377f2ede8c66b19774bf4a9c7a596"))
+	ret := &gfP{} // set up before ResetTimer so it stays outside the measured region
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gfpAdd(ret, x, x) // baseline for gfpDouble: a plain addition
+	}
+}
+
+func BenchmarkGfPNeg(b *testing.B) {
+	x := fromBigInt(bigFromHex("9093a2b979e6186f43a9b28d41ba644d533377f2ede8c66b19774bf4a9c7a596"))
+	ret := &gfP{} // set up before ResetTimer so it stays outside the measured region
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gfpNeg(ret, x)
+	}
+}
+
+func BenchmarkGfPNeg2(b *testing.B) {
+	x := fromBigInt(bigFromHex("9093a2b979e6186f43a9b28d41ba644d533377f2ede8c66b19774bf4a9c7a596"))
+	ret := &gfP{} // set up before ResetTimer so it stays outside the measured region
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gfpSub(ret, zero, x) // baseline for gfpNeg: negate as 0 - x
+	}
+}
diff --git a/sm9/bn256/twist.go b/sm9/bn256/twist.go
index 0c93f86e..1f760ade 100644
--- a/sm9/bn256/twist.go
+++ b/sm9/bn256/twist.go
@@ -114,7 +114,7 @@ func (c *twistPoint) Add(a, b *twistPoint) {
 	h := (&gfP2{}).Sub(u2, u1)
 	xEqual := h.IsZero()
 
-	t.Add(h, h)
+	t.Double(h)
 	i := (&gfP2{}).SquareNC(t)
 	j := (&gfP2{}).MulNC(h, i)
 
@@ -124,18 +124,18 @@ func (c *twistPoint) Add(a, b *twistPoint) {
 		c.Double(a)
 		return
 	}
-	r := (&gfP2{}).Add(t, t)
+	r := (&gfP2{}).Double(t)
 
 	v := (&gfP2{}).MulNC(u1, i)
 
 	t4 := (&gfP2{}).SquareNC(r)
-	t.Add(v, v)
+	t.Double(v)
 	t6 := (&gfP2{}).Sub(t4, j)
 	c.x.Sub(t6, t)
 
 	t.Sub(v, &c.x) // t7
 	t4.Mul(s1, j)  // t8
-	t6.Add(t4, t4) // t9
+	t6.Double(t4)  // t9
 	t4.Mul(r, t)   // t10
 	c.y.Sub(t4, t6)
 
@@ -156,20 +156,20 @@ func (c *twistPoint) Double(a *twistPoint) {
 	t2 := (&gfP2{}).SquareNC(t)
 	t.Sub(t2, A)
 	t2.Sub(t, C)
-	d := (&gfP2{}).Add(t2, t2)
-	t.Add(A, A)
+	d := (&gfP2{}).Double(t2)
+	t.Double(A)
 	e := (&gfP2{}).Add(t, A)
 	f := (&gfP2{}).SquareNC(e)
 
-	t.Add(d, d)
+	t.Double(d)
 	c.x.Sub(f, t)
 
 	c.z.Mul(&a.y, &a.z)
-	c.z.Add(&c.z, &c.z)
+	c.z.Double(&c.z)
 
-	t.Add(C, C)
-	t2.Add(t, t)
-	t.Add(t2, t2)
+	t.Double(C)
+	t2.Double(t)
+	t.Double(t2)
 	c.y.Sub(d, &c.x)
 	t2.Mul(e, &c.y)
 	c.y.Sub(t2, t)