sm4: fix AVX code path mistakenly using an AVX2 instruction
emmansun authored Jul 13, 2023
1 parent fc287b6 commit fc2f105
Showing 4 changed files with 92 additions and 124 deletions.
56 changes: 56 additions & 0 deletions sm4/aesni_macros_amd64.s
@@ -264,6 +264,25 @@ GLOBL fk_mask<>(SB), 8, $16
VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w31, w27, w23, w19, w15, w11, w7, w3]    r3 = [w15, w11, w7, w3]
VPUNPCKLQDQ r2, tmp2, r2  // r2 = [w30, w26, w22, w18, w14, w10, w6, w2]    r2 = [w14, w10, w6, w2]
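
The unpack sequence above completes a dword-matrix transpose: block words
stored row-major across the input registers come out column-major, so each
output register holds one word index from every block. A plain-Go equivalent
of the single-lane 4x4 case, with illustrative names:

    // transpose4x4 mirrors what one 128-bit lane of the unpack
    // sequence computes: after the swaps, row i holds word i of
    // every block.
    func transpose4x4(m *[4][4]uint32) {
        for i := 0; i < 4; i++ {
            for j := i + 1; j < 4; j++ {
                m[i][j], m[j][i] = m[j][i], m[i][j]
            }
        }
    }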

// SM4 round function, AVX version, handle 128 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - RK: round key base address register
// - IND: round key offset register
// - x: 128 bits temp register
// - y: 128 bits temp register
// - tmp: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
MOVL (index * 4)(RK)(IND*1), x; \
VPSHUFD $0, x, x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
VPXOR x, t0, t0
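
This macro is the core of the fix: VPBROADCASTD requires AVX2 even when its
source is a memory operand, so the pure-AVX path cannot use it. Loading the
32-bit round key with MOVL and splatting it with VPSHUFD $0, both encodable
under plain AVX, produces the same all-lanes broadcast. A minimal standalone
sketch of the idiom; the helper name is hypothetical:

    // func splatRK(rk *uint32) [4]uint32
    // Hypothetical helper, shown only to isolate the AVX-safe broadcast.
    TEXT ·splatRK(SB), NOSPLIT, $0-24
        MOVQ rk+0(FP), AX
        MOVL (AX), X0       // assembles to MOVD: key into dword lane 0
        VPSHUFD $0, X0, X0  // copy lane 0 to all four lanes (AVX encoding)
        VMOVDQU X0, ret+8(FP)
        RET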

// SM4 sbox function, AVX2 version
// parameters:
// - x: 256 bits register as sbox input/output data
@@ -321,3 +340,40 @@ GLOBL fk_mask<>(SB), 8, $16
VPSHUFB z, x, z; \
VPXOR y, x, x; \
VPXOR x, z, x

// SM4 round function, AVX2 version, handle 256 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - RK: round key base address register
// - IND: round key offset register
// - x: 256 bits temp register, MUST use XDWORD!
// - y: 256 bits temp register, MUST use YDWORD!
// - xw: 128 bits temp register
// - yw: 128 bits temp register
// - tmp: 256 bits temp register
// - t0: 256 bits register for data as result
// - t1: 256 bits register for data
// - t2: 256 bits register for data
// - t3: 256 bits register for data
#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
VPXOR x, t0, t0
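
The AVX and AVX2 round macros compute the same function,
t0 ^= L(tau(t1 ^ t2 ^ t3 ^ rk)), where tau applies the SM4 S-box to each
byte and L is the linear transform
L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24). A scalar Go
sketch of one round; the 256-byte S-box table is assumed, not shown:

    import "math/bits"

    // sm4Round is a scalar model of what each SIMD round macro computes
    // per 32-bit word; sbox is assumed to be the standard SM4 S-box.
    func sm4Round(t0, t1, t2, t3, rk uint32, sbox *[256]byte) uint32 {
        x := t1 ^ t2 ^ t3 ^ rk
        // tau: S-box applied to each byte of x.
        x = uint32(sbox[x>>24])<<24 | uint32(sbox[x>>16&0xff])<<16 |
            uint32(sbox[x>>8&0xff])<<8 | uint32(sbox[x&0xff])
        // L: the SM4 linear transform for encryption rounds.
        x ^= bits.RotateLeft32(x, 2) ^ bits.RotateLeft32(x, 10) ^
            bits.RotateLeft32(x, 18) ^ bits.RotateLeft32(x, 24)
        return t0 ^ x
    }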

// SM4 round function, AVX2 version, handle 128 bits (4 blocks)
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - RK: round key base address register
// - IND: round key offset register
// - x: 128 bits temp register
// - y: 128 bits temp register
// - tmp: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define AVX2_SM4_ROUND_4BLOCKS(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPSHUFD $0, x, x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
VPXOR x, t0, t0
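
The three macros keep each instruction set on its own path: AVX_SM4_ROUND is
safe on AVX-only CPUs, while AVX2_SM4_ROUND and AVX2_SM4_ROUND_4BLOCKS are
reached only when AVX2 is present and may therefore use VPBROADCASTD. (The
VPSHUFD $0 following VPBROADCASTD above appears redundant, since the
broadcast already fills every lane, but it is harmless.) A sketch of the
feature detection that separates the paths, using golang.org/x/sys/cpu; the
variable names are illustrative, not this package's actual wiring:

    import "golang.org/x/sys/cpu"

    // Illustrative dispatch flags; treat the names as assumptions.
    var (
        useAVX2 = cpu.X86.HasAVX2 // 8-block and 4-block AVX2 paths
        useAVX  = cpu.X86.HasAVX  // pure-AVX path: no VPBROADCASTD allowed
    )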
60 changes: 12 additions & 48 deletions sm4/asm_amd64.s
@@ -88,42 +88,6 @@
#define XWORD X8
#define YWORD X9

// SM4 round function, AVX2 version, handle 256 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 256 bits temp register, MUST use XDWORD!
// - y: 256 bits temp register, MUST use YDWORD!
// - t0: 256 bits register for data as result
// - t1: 256 bits register for data
// - t2: 256 bits register for data
// - t3: 256 bits register for data
#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \
VPXOR x, t0, t0

// SM4 round function, AVX version, handle 128 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 128 bits temp register
// - y: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0); \
VPXOR x, t0, t0

// func expandKeyAsm(key *byte, ck, enc, dec *uint32, inst int)
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
MOVQ key+0(FP), AX
@@ -225,10 +189,10 @@ avx:
XORL CX, CX

avx_loop:
AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)

ADDL $16, CX
CMPL CX, $4*32
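
Each pass of the loop issues four rounds with the t0..t3 roles rotated
between calls, so no data ever moves between registers; CX advances 16 bytes
(four 4-byte round keys) per pass until all 4*32 key bytes are consumed. A
scalar Go sketch of the same schedule, reusing sm4Round from above; the
final word reversal follows the SM4 specification:

    // encryptWords runs all 32 rounds; the result role rotates through
    // t0..t3 exactly as the macro arguments rotate above.
    func encryptWords(t0, t1, t2, t3 uint32, rk *[32]uint32, sbox *[256]byte) (uint32, uint32, uint32, uint32) {
        for i := 0; i < 32; i += 4 {
            t0 = sm4Round(t0, t1, t2, t3, rk[i+0], sbox)
            t1 = sm4Round(t1, t2, t3, t0, rk[i+1], sbox)
            t2 = sm4Round(t2, t3, t0, t1, rk[i+2], sbox)
            t3 = sm4Round(t3, t0, t1, t2, rk[i+3], sbox)
        }
        return t3, t2, t1, t0 // reverse word order per the SM4 spec
    }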
@@ -274,10 +238,10 @@ avx2_8blocks:
XORL CX, CX

avx2_loop:
AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

ADDL $16, CX
CMPL CX, $4*32
@@ -317,10 +281,10 @@ avx2_4blocks:
XORL CX, CX

avx2_4blocks_loop:
AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
AVX2_SM4_ROUND_4BLOCKS(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
AVX2_SM4_ROUND_4BLOCKS(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
AVX2_SM4_ROUND_4BLOCKS(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
AVX2_SM4_ROUND_4BLOCKS(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)

ADDL $16, CX
CMPL CX, $4*32
52 changes: 8 additions & 44 deletions sm4/cbc_cipher_asm_amd64.s
@@ -106,42 +106,6 @@ done_sm4:
#define XWORD X8
#define YWORD X9

// SM4 round function, AVX2 version, handle 256 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 256 bits temp register
// - y: 256 bits temp register
// - t0: 256 bits register for data as result
// - t1: 256 bits register for data
// - t2: 256 bits register for data
// - t3: 256 bits register for data
#define AVX2_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, XDWTMP0, XWORD, YWORD, X_NIBBLE_MASK, NIBBLE_MASK); \
VPXOR x, t0, t0

// SM4 round function, AVX version, handle 128 bits
// t0 ^= tao_l1(t1^t2^t3^xk)
// parameters:
// - index: round key index immediate number
// - x: 128 bits temp register
// - y: 128 bits temp register
// - t0: 128 bits register for data as result
// - t1: 128 bits register for data
// - t2: 128 bits register for data
// - t3: 128 bits register for data
#define AVX_SM4_ROUND(index, x, y, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(AX)(CX*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, XWTMP0); \
VPXOR x, t0, t0

// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
MOVQ xk+0(FP), AX
@@ -217,10 +181,10 @@ avx:
XORL CX, CX

avx_loop:
AVX_SM4_ROUND(0, XWORD, YWORD, XWORD0, XWORD1, XWORD2, XWORD3)
AVX_SM4_ROUND(1, XWORD, YWORD, XWORD1, XWORD2, XWORD3, XWORD0)
AVX_SM4_ROUND(2, XWORD, YWORD, XWORD2, XWORD3, XWORD0, XWORD1)
AVX_SM4_ROUND(3, XWORD, YWORD, XWORD3, XWORD0, XWORD1, XWORD2)
AVX_SM4_ROUND(0, AX, CX, XWORD, YWORD, XWTMP0, XWORD0, XWORD1, XWORD2, XWORD3)
AVX_SM4_ROUND(1, AX, CX, XWORD, YWORD, XWTMP0, XWORD1, XWORD2, XWORD3, XWORD0)
AVX_SM4_ROUND(2, AX, CX, XWORD, YWORD, XWTMP0, XWORD2, XWORD3, XWORD0, XWORD1)
AVX_SM4_ROUND(3, AX, CX, XWORD, YWORD, XWTMP0, XWORD3, XWORD0, XWORD1, XWORD2)

ADDL $16, CX
CMPL CX, $4*32
@@ -269,10 +233,10 @@ avx2_8blocks:
XORL CX, CX

avx2_loop:
AVX2_SM4_ROUND(0, XDWORD, YDWORD, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
AVX2_SM4_ROUND(1, XDWORD, YDWORD, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
AVX2_SM4_ROUND(2, XDWORD, YDWORD, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
AVX2_SM4_ROUND(3, XDWORD, YDWORD, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
AVX2_SM4_ROUND(0, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
AVX2_SM4_ROUND(1, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
AVX2_SM4_ROUND(2, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
AVX2_SM4_ROUND(3, AX, CX, XDWORD, YDWORD, XWORD, YWORD, XDWTMP0, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

ADDL $16, CX
CMPL CX, $4*32
48 changes: 16 additions & 32 deletions sm4/gcm_amd64.s
@@ -263,22 +263,6 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0
PSHUFB BSWAP, t1; \
PSHUFB BSWAP, t0

#define AVX2_SM4_ROUND(index, RK, IND, x, y, xw, yw, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
VPXOR x, t0, t0

#define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
VPBROADCASTD (index * 4)(RK)(IND*1), x; \
VPXOR t1, x, x; \
VPXOR t2, x, x; \
VPXOR t3, x, x; \
AVX_SM4_TAO_L1(x, y, X_NIBBLE_MASK, tmp); \
VPXOR x, t0, t0

// func gcmSm4Init(productTable *[256]byte, rk []uint32)
TEXT ·gcmSm4Init(SB),NOSPLIT,$0
#define dst DI
@@ -1614,10 +1598,10 @@ avx2GcmSm4EncNibbles:
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK

avx2GcmSm4Enc4Loop2:
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)

ADDL $16, BX
CMPL BX, $4*32
@@ -1676,10 +1660,10 @@ avx2GcmSm4EncSingles:
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK

avx2GcmSm4Enc4Loop1:
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)

ADDL $16, BX
CMPL BX, $4*32
@@ -2472,10 +2456,10 @@ avx2GcmSm4DecNibbles:
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK

avx2GcmSm4Dec4Loop2:
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)

ADDL $16, BX
CMPL BX, $4*32
@@ -2538,10 +2522,10 @@ avx2GcmSm4DecSingles:
VMOVDQU nibble_mask<>(SB), X_NIBBLE_MASK

avx2GcmSm4Dec4Loop1:
AVX_SM4_ROUND(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX_SM4_ROUND(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX_SM4_ROUND(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX_SM4_ROUND(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)
AVX2_SM4_ROUND_4BLOCKS(0, rk, BX, B4, B5, B6, B0, B1, B2, B3)
AVX2_SM4_ROUND_4BLOCKS(1, rk, BX, B4, B5, B6, B1, B2, B3, B0)
AVX2_SM4_ROUND_4BLOCKS(2, rk, BX, B4, B5, B6, B2, B3, B0, B1)
AVX2_SM4_ROUND_4BLOCKS(3, rk, BX, B4, B5, B6, B3, B0, B1, B2)

ADDL $16, BX
CMPL BX, $4*32