From fc2f105dd26f8c69469f9650f65f006899509f82 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Thu, 13 Jul 2023 08:47:05 +0800 Subject: [PATCH] sm4: fix AVX version use AVX2 inst. issue --- sm4/aesni_macros_amd64.s | 56 +++++++++++++++++++++++++++++++++++ sm4/asm_amd64.s | 60 ++++++++------------------------------ sm4/cbc_cipher_asm_amd64.s | 52 +++++---------------------------- sm4/gcm_amd64.s | 48 ++++++++++-------------------- 4 files changed, 92 insertions(+), 124 deletions(-) diff --git a/sm4/aesni_macros_amd64.s b/sm4/aesni_macros_amd64.s index e7679481..1e4d334d 100644 --- a/sm4/aesni_macros_amd64.s +++ b/sm4/aesni_macros_amd64.s @@ -264,6 +264,25 @@ GLOBL fk_mask<>(SB), 8, $16 VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w15, w7, w27, w19, w11, w3] r3 = [w15, w11, w7, w3] VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w30, w22, w14, w6, w26, w18, w10, w2] r2 = [w14, w10, w6, w2] +// SM4 round function, AVX version, handle 128 bits +// t0 ^= tao_l1(t1^t2^t3^xk) +// parameters: +// - index: round key index immediate number +// - x: 128 bits temp register +// - y: 128 bits temp register +// - t0: 128 bits register for data as result +// - t1: 128 bits register for data +// - t2: 128 bits register for data +// - t3: 128 bits register for data +#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \ + MOVL (index * 4)(RK)(IND*1), x; \ + VPSHUFD $0, x, x; \ + VPXOR t1, x, x; \ + VPXOR t2, x, x; \ + VPXOR t3, x, x; \ + AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \ + VPXOR x, t0, t0 + // SM4 sbox function, AVX2 version // parameters: // - x: 256 bits register as sbox input/output data @@ -321,3 +340,40 @@ GLOBL fk_mask<>(SB), 8, $16 VPSHUFB z, x, z; \ VPXOR y, x, x; \ VPXOR x, z, x + +// SM4 round function, AVX2 version, handle 256 bits +// t0 ^= tao_l1(t1^t2^t3^xk) +// parameters: +// - index: round key index immediate number +// - x: 256 bits temp register, MUST use XDWORD! +// - y: 256 bits temp register, MUST use YDWORD! +// - t0: 256 bits register for data as result +// - t1: 256 bits register for data +// - t2: 256 bits register for data +// - t3: 256 bits register for data +#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \ + VPBROADCASTD (index * 4)(RK)(IND*1), x; \ + VPXOR t1, x, x; \ + VPXOR t2, x, x; \ + VPXOR t3, x, x; \ + AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \ + VPXOR x, t0, t0 + +// SM4 round function, AVX version, handle 128 bits +// t0 ^= tao_l1(t1^t2^t3^xk) +// parameters: +// - index: round key index immediate number +// - x: 128 bits temp register +// - y: 128 bits temp register +// - t0: 128 bits register for data as result +// - t1: 128 bits register for data +// - t2: 128 bits register for data +// - t3: 128 bits register for data +#define AVX2_SM4_ROUND_4BLOCKS(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \ + VPBROADCASTD (index * 4)(RK)(IND*1), x; \ + VPSHUFD $0, x, x; \ + VPXOR t1, x, x; \ + VPXOR t2, x, x; \ + VPXOR t3, x, x; \ + AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \ + VPXOR x, t0, t0 diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s index 12839223..35ead144 100644 --- a/sm4/asm_amd64.s +++ b/sm4/asm_amd64.s @@ -88,42 +88,6 @@ #define XWORD X8 #define YWORD X9 -// SM4 round function, AVX2 version, handle 256 bits -// t0 ^= tao_l1(t1^t2^t3^xk) -// parameters: -// - index: round key index immediate number -// - x: 256 bits temp register, MUST use XDWORD! -// - y: 256 bits temp register, MUST use YDWORD! 
-// - t0: 256 bits register for data as result -// - t1: 256 bits register for data -// - t2: 256 bits register for data -// - t3: 256 bits register for data -#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \ - VPBROADCASTD (index * 4)(AX)(CX*1), x; \ - VPXOR t1, x, x; \ - VPXOR t2, x, x; \ - VPXOR t3, x, x; \ - AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \ - VPXOR x, t0, t0 - -// SM4 round function, AVX version, handle 128 bits -// t0 ^= tao_l1(t1^t2^t3^xk) -// parameters: -// - index: round key index immediate number -// - x: 128 bits temp register -// - y: 128 bits temp register -// - t0: 128 bits register for data as result -// - t1: 128 bits register for data -// - t2: 128 bits register for data -// - t3: 128 bits register for data -#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \ - VPBROADCASTD (index * 4)(AX)(CX*1), x; \ - VPXOR t1, x, x; \ - VPXOR t2, x, x; \ - VPXOR t3, x, x; \ - AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0); \ - VPXOR x, t0, t0 - // func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int) TEXT ·expandKeyAsm(SB),NOSPLIT,$0 MOVQ key+0(FP), AX @@ -225,10 +189,10 @@ avx: XORL CX, CX avx_loop: - AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3) - AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0) - AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1) - AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2) + AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3) + AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0) + AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1) + AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2) ADDL $16, CX CMPL CX, $4*32 @@ -274,10 +238,10 @@ avx2_8blocks: XORL CX, CX avx2_loop: - AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2) ADDL $16, CX CMPL CX, $4*32 @@ -317,10 +281,10 @@ avx2_4blocks: XORL CX, CX avx2_4blocks_loop: - AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3) - AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0) - AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1) - AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2) + AVX2_SM4_ROUND_4BLOCKS(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3) + AVX2_SM4_ROUND_4BLOCKS(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0) + AVX2_SM4_ROUND_4BLOCKS(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1) + AVX2_SM4_ROUND_4BLOCKS(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2) ADDL $16, CX CMPL CX, $4*32 diff --git a/sm4/cbc_cipher_asm_amd64.s b/sm4/cbc_cipher_asm_amd64.s index e46e2326..c1aa62ee 100644 --- a/sm4/cbc_cipher_asm_amd64.s +++ b/sm4/cbc_cipher_asm_amd64.s @@ -106,42 +106,6 @@ done_sm4: #define XWORD X8 #define YWORD X9 -// SM4 round function, AVX2 
version, handle 256 bits -// t0 ^= tao_l1(t1^t2^t3^xk) -// parameters: -// - index: round key index immediate number -// - x: 256 bits temp register -// - y: 256 bits temp register -// - t0: 256 bits register for data as result -// - t1: 256 bits register for data -// - t2: 256 bits register for data -// - t3: 256 bits register for data -#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \ - VPBROADCASTD (index * 4)(AX)(CX*1), x; \ - VPXOR t1, x, x; \ - VPXOR t2, x, x; \ - VPXOR t3, x, x; \ - AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \ - VPXOR x, t0, t0 - -// SM4 round function, AVX version, handle 128 bits -// t0 ^= tao_l1(t1^t2^t3^xk) -// parameters: -// - index: round key index immediate number -// - x: 128 bits temp register -// - y: 128 bits temp register -// - t0: 128 bits register for data as result -// - t1: 128 bits register for data -// - t2: 128 bits register for data -// - t3: 128 bits register for data -#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \ - VPBROADCASTD (index * 4)(AX)(CX*1), x; \ - VPXOR t1, x, x; \ - VPXOR t2, x, x; \ - VPXOR t3, x, x; \ - AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0); \ - VPXOR x, t0, t0 - // func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte) TEXT ·decryptBlocksChain(SB),NOSPLIT,$0 MOVQ xk+0(FP), AX @@ -217,10 +181,10 @@ avx: XORL CX, CX avx_loop: - AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3) - AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0) - AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1) - AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2) + AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3) + AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0) + AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1) + AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2) ADDL $16, CX CMPL CX, $4*32 @@ -269,10 +233,10 @@ avx2_8blocks: XORL CX, CX avx2_loop: - AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2) ADDL $16, CX CMPL CX, $4*32 diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s index 4823fdf0..07646c58 100644 --- a/sm4/gcm_amd64.s +++ b/sm4/gcm_amd64.s @@ -263,22 +263,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 PSHUFB BSWAP, t1; \ PSHUFB BSWAP, t0 -#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \ - VPBROADCASTD (index * 4)(RK)(IND*1), x; \ - VPXOR t1, x, x; \ - VPXOR t2, x, x; \ - VPXOR t3, x, x; \ - AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \ - VPXOR x, t0, t0 - -#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \ - VPBROADCASTD (index * 4)(RK)(IND*1), x; \ - VPXOR t1, x, x; \ - VPXOR t2, x, x; \ - VPXOR t3, x, x; \ - AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \ - VPXOR x, t0, t0 - // func gcmSm4Init(productTable *[256]byte, rk []uint32) TEXT 
·gcmSm4Init(SB),NOSPLIT,$0 #define dst DI @@ -1614,10 +1598,10 @@ avx2GcmSm4EncNibbles: VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK avx2GcmSm4Enc4Loop2: - AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) - AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) - AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) - AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) + AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) + AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) + AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) + AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) ADDL $16, BX CMPL BX, $4*32 @@ -1676,10 +1660,10 @@ avx2GcmSm4EncSingles: VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK avx2GcmSm4Enc4Loop1: - AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) - AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) - AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) - AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) + AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) + AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) + AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) + AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) ADDL $16, BX CMPL BX, $4*32 @@ -2472,10 +2456,10 @@ avx2GcmSm4DecNibbles: VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK avx2GcmSm4Dec4Loop2: - AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) - AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) - AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) - AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) + AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) + AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) + AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) + AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) ADDL $16, BX CMPL BX, $4*32 @@ -2538,10 +2522,10 @@ avx2GcmSm4DecSingles: VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK avx2GcmSm4Dec4Loop1: - AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) - AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) - AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) - AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) + AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3) + AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0) + AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1) + AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2) ADDL $16, BX CMPL BX, $4*32
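
All of the round macros touched here compute the same per-round update, t0 ^= tao_l1(t1^t2^t3^rk); they differ only in how the round-key word reaches the vector register. The 128-bit AVX path previously broadcast it with VPBROADCASTD, which is an AVX2 instruction, so the fixed AVX_SM4_ROUND now loads the word with MOVL and splats it with VPSHUFD $0, neither of which needs AVX2, while the new AVX2_SM4_ROUND_4BLOCKS keeps VPBROADCASTD for call sites that already require AVX2, such as the avx2GcmSm4* loops above. For reference, a minimal plain-Go sketch of the scalar round the macros vectorize is shown below; the helper names are illustrative rather than this package's actual API, and the standard SM4 S-box table is assumed but not reproduced.

    package sm4ref

    import "math/bits"

    // sbox is the standard SM4 S-box; the 256-byte table is omitted here.
    var sbox [256]byte

    // tau applies the S-box to each byte of x (the non-linear step).
    func tau(x uint32) uint32 {
            return uint32(sbox[x>>24])<<24 |
                    uint32(sbox[(x>>16)&0xff])<<16 |
                    uint32(sbox[(x>>8)&0xff])<<8 |
                    uint32(sbox[x&0xff])
    }

    // l is the linear transform of the data path; tao_l1(x) = l(tau(x)).
    func l(b uint32) uint32 {
            return b ^ bits.RotateLeft32(b, 2) ^ bits.RotateLeft32(b, 10) ^
                    bits.RotateLeft32(b, 18) ^ bits.RotateLeft32(b, 24)
    }

    // round is one SM4 round for a single block: t0 ^= tao_l1(t1^t2^t3^rk).
    func round(t0, t1, t2, t3, rk uint32) uint32 {
            return t0 ^ l(tau(t1^t2^t3^rk))
    }

The vector macros do the same work for four (XMM) or eight (YMM) blocks in parallel, with tao_l1 provided by the AVX_SM4_TAO_L1/AVX2_SM4_TAO_L1 macros; broadcasting the round-key word across lanes is the one step where the AVX and AVX2 paths legitimately differ.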