-
Notifications
You must be signed in to change notification settings - Fork 60
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
324 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
//go:build (amd64 || arm64) && !purego | ||
//go:build (amd64 || arm64 || ppc64 || ppc64le) && !purego | ||
|
||
package sm4 | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
//go:build (amd64 || arm64) && !purego | ||
//go:build (amd64 || arm64 || ppc64 || ppc64le) && !purego | ||
|
||
package sm4 | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,314 @@ | ||
// Copyright 2024 Sun Yimin. All rights reserved. | ||
// Use of this source code is governed by a MIT-style | ||
// license that can be found in the LICENSE file. | ||
|
||
//go:build (ppc64 || ppc64le) && !purego | ||
|
||
#include "textflag.h" | ||
|
||
// Scratch vector registers. decryptBlocksChain uses TMP0..TMP3 to hold the
// chaining (previous ciphertext) blocks in its 8-block loop; the included
// macros presumably use them as temporaries too (verify in
// aesni_macros_ppc64x.s).
#define TMP0               V10
#define TMP1               V11
#define TMP2               V12
#define TMP3               V13

// CBC initialization vector, loaded once from the iv argument.
#define IV                 V18

// Named constant registers. Apart from V_FOUR and ESPERMW they are not
// referenced by name in this file, so they are presumably set up by
// LOAD_CONSTS and consumed by the round/transpose macros (verify in
// aesni_macros_ppc64x.s).
#define REVERSE_WORDS      V19
#define M1L                V20
#define M1H                V21
#define M2L                V22
#define M2H                V23
#define V_FOUR             V24 // every word set to 4 (VSPLTISW $4)
#define M0                 V25
#define M1                 V26
#define M2                 V27
#define M3                 V28
#define NIBBLE_MASK        V29
#define INVERSE_SHIFT_ROWS V30

// For instruction emulation
#define ESPERMW            V31 // Endian swapping permute into BE
|
||
#include "aesni_macros_ppc64x.s" | ||
|
||
// func decryptBlocksChain(xk *uint32, dst, src []byte, iv *byte)
//
// CBC-decrypts src into dst with the round keys at xk, then replaces *iv with
// the last 16 bytes of src (the chaining value for a following call).
//
// Arguments (frame offsets):
//   xk      +0   pointer to the round keys; 8 x 16 bytes are read from it
//   dst     +8   destination slice (only the base pointer is read)
//   src     +32  source slice base pointer, src_len at +40
//   iv      +56  pointer to the 16-byte IV; read at entry, written at exit
//
// Blocks are handled from the END of src towards its start: batches of
// 8 blocks while at least 9 remain, then one batch of 4 if more than 4
// remain, and finally the leading 1..4 blocks, whose first block is chained
// with IV. The last ciphertext block is captured in V14 before anything is
// written, and every batch reads its chaining blocks before storing over
// them, so dst and src may be the same buffer.
//
// NOTE(review): the length is not validated here. The code assumes len(src)
// is a non-zero multiple of 16 and len(dst) >= len(src); presumably the Go
// caller guarantees this - confirm.
TEXT ·decryptBlocksChain(SB),NOSPLIT,$0
#define dstPtr R3
#define srcPtr R4
#define rk R5
#define srcLen R6
	// prepare/load constants
	// R3/R4 serve as scratch here; they only become dstPtr/srcPtr below.
	VSPLTISW	$4, V_FOUR
#ifdef NEEDS_PERMW
	MOVD	$·rcon(SB), R4
	LVX	(R4), ESPERMW
#endif
	MOVD	$·rcon+0x10(SB), R4
	LOAD_CONSTS(R4, R3)

	// Load IV
	MOVD	iv+56(FP), R7
	PPC64X_LXVW4X(R7, R0, IV)

	MOVD	xk+0(FP), rk
	MOVD	dst+8(FP), dstPtr
	MOVD	src+32(FP), srcPtr
	MOVD	src_len+40(FP), srcLen

	// Byte offsets of blocks 1..7 within a batch; also used to index the
	// eight 16-byte round-key groups at rk.
	MOVD	$16, R7
	MOVD	$32, R8
	MOVD	$48, R9
	MOVD	$64, R10
	MOVD	$80, R11
	MOVD	$96, R12
	MOVD	$112, R14

	// Save the final ciphertext block now: it becomes the new IV and may be
	// overwritten by the stores below when dst aliases src.
	ADD	srcPtr, srcLen, R15
	ADD	$-16, R15, R15
	LXVD2X	(R15)(R0), V14 // Load last 16 bytes of src into V14

	CMP	srcLen, $144 // 9 blocks
	BLT	lessThan9blocks

	PCALIGN	$16
loop8blocks:
	// Take the last 8 unprocessed blocks.
	//   R15 = src + srcLen        first ciphertext block of the batch
	//   R16 = R15 - 16            ciphertext block preceding the batch
	//   R17 = dst + srcLen        where the batch's plaintext goes
	ADD	$-128, srcLen
	ADD	srcPtr, srcLen, R15
	ADD	$-16, R15, R16
	ADD	dstPtr, srcLen, R17
	PPC64X_LXVW4X(R15, R0, V0)
	PPC64X_LXVW4X(R15, R7, V1)
	PPC64X_LXVW4X(R15, R8, V2)
	PPC64X_LXVW4X(R15, R9, V3)
	PPC64X_LXVW4X(R15, R10, V4)
	PPC64X_LXVW4X(R15, R11, V5)
	PPC64X_LXVW4X(R15, R12, V6)
	PPC64X_LXVW4X(R15, R14, V7)
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	PRE_TRANSPOSE_MATRIX(V4, V5, V6, V7)

	// 8 round-key groups, one 4-round step each.
	LXVW4X	(rk)(R0), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R7), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R8), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R9), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R10), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R11), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R12), V8
	PROCESS_8BLOCKS_4ROUND
	LXVW4X	(rk)(R14), V8
	PROCESS_8BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	TRANSPOSE_MATRIX(V4, V5, V6, V7)

	// Chain and store (CBC_STXVW4X is defined outside this file section;
	// presumably it XORs the chaining block into the data and stores it).
	//
	// The UPPER four blocks are finished first. Their chaining blocks sit at
	// R16+64..R16+112, i.e. R15+48..R15+96, and must be read before block 3's
	// plaintext is stored at R17+48: with dst == src that store lands on the
	// very ciphertext block 4 still needs.
	LXVW4X	(R16)(R10), TMP0
	LXVW4X	(R16)(R11), TMP1
	LXVW4X	(R16)(R12), TMP2
	LXVW4X	(R16)(R14), TMP3
	CBC_STXVW4X(V4, TMP0, R17, R10)
	CBC_STXVW4X(V5, TMP1, R17, R11)
	CBC_STXVW4X(V6, TMP2, R17, R12)
	CBC_STXVW4X(V7, TMP3, R17, R14)

	// Lower four blocks: their chaining blocks (R16+0..R16+48) lie below
	// every address written so far in this batch.
	LXVW4X	(R16)(R0), TMP0
	LXVW4X	(R16)(R7), TMP1
	LXVW4X	(R16)(R8), TMP2
	LXVW4X	(R16)(R9), TMP3
	CBC_STXVW4X(V0, TMP0, R17, R0)
	CBC_STXVW4X(V1, TMP1, R17, R7)
	CBC_STXVW4X(V2, TMP2, R17, R8)
	CBC_STXVW4X(V3, TMP3, R17, R9)

	// Keep looping only while at least one block would be left for the tail,
	// which is the part that consumes IV.
	CMP	srcLen, $144 // 9 blocks
	BGE	loop8blocks

lessThan9blocks:
	CMP	srcLen, $64
	BLE	ble4blocks

	// More than 4 blocks remain: decrypt the last 4 of them.
	ADD	$-64, srcLen
	ADD	srcPtr, srcLen, R15
	ADD	$-16, R15, R16
	ADD	dstPtr, srcLen, R17
	PPC64X_LXVW4X(R15, R0, V0)
	PPC64X_LXVW4X(R15, R7, V1)
	PPC64X_LXVW4X(R15, R8, V2)
	PPC64X_LXVW4X(R15, R9, V3)
	// Keep ciphertext copies of blocks 0..2; they chain blocks 1..3.
	VOR	V0, V0, V5
	VOR	V1, V1, V6
	VOR	V2, V2, V7

	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X	(rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	// V4 = ciphertext block preceding this batch, read before any store.
	PPC64X_LXVW4X(R16, R0, V4)
	VXOR	V0, V4, V0
	VXOR	V1, V5, V1
	VXOR	V2, V6, V2
	VXOR	V3, V7, V3
	PPC64X_STXVW4X(V0, R17, R0)
	PPC64X_STXVW4X(V1, R17, R7)
	PPC64X_STXVW4X(V2, R17, R8)
	PPC64X_STXVW4X(V3, R17, R9)

ble4blocks:
	// Leading blocks of src; the first one is chained with IV.
	// 48/32/16 bytes branch away; any other length falls through and is
	// treated as exactly 4 blocks.
	CMPU	srcLen, $48, CR1
	CMPU	srcLen, $32, CR2
	CMPU	srcLen, $16, CR3
	BEQ	CR1, eq3blocks
	BEQ	CR2, eq2blocks
	BEQ	CR3, eq1block

	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	PPC64X_LXVW4X(srcPtr, R8, V2)
	PPC64X_LXVW4X(srcPtr, R9, V3)
	VOR	V0, V0, V4
	VOR	V1, V1, V5
	VOR	V2, V2, V6
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X	(rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	VXOR	V0, IV, V0
	VXOR	V1, V4, V1
	VXOR	V2, V5, V2
	VXOR	V3, V6, V3
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	PPC64X_STXVW4X(V2, dstPtr, R8)
	PPC64X_STXVW4X(V3, dstPtr, R9)
	BR	done

eq3blocks:
	// Only V0..V2 are loaded; V3 goes through the 4-block macros with
	// whatever it held and its result is never stored.
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	PPC64X_LXVW4X(srcPtr, R8, V2)
	VOR	V0, V0, V4
	VOR	V1, V1, V5
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)

	LXVW4X	(rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	VXOR	V0, IV, V0
	VXOR	V1, V4, V1
	VXOR	V2, V5, V2
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	PPC64X_STXVW4X(V2, dstPtr, R8)
	BR	done

eq2blocks:
	// Only V0 and V1 are loaded; the V2/V3 results are never stored.
	PPC64X_LXVW4X(srcPtr, R0, V0)
	PPC64X_LXVW4X(srcPtr, R7, V1)
	VOR	V0, V0, V4
	PRE_TRANSPOSE_MATRIX(V0, V1, V2, V3)
	LXVW4X	(rk)(R0), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R7), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R8), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R9), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R10), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R11), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R12), V8
	PROCESS_4BLOCKS_4ROUND
	LXVW4X	(rk)(R14), V8
	PROCESS_4BLOCKS_4ROUND

	TRANSPOSE_MATRIX(V0, V1, V2, V3)
	VXOR	V0, IV, V0
	VXOR	V1, V4, V1
	PPC64X_STXVW4X(V0, dstPtr, R0)
	PPC64X_STXVW4X(V1, dstPtr, R7)
	BR	done

eq1block:
	// Single block: V1, V2, V3 are V0 rotated left by 4, 8 and 12 bytes,
	// the layout the single-block round macro is fed with.
	PPC64X_LXVW4X(srcPtr, R0, V0)
	VSLDOI	$4, V0, V0, V1
	VSLDOI	$4, V1, V1, V2
	VSLDOI	$4, V2, V2, V3
	LXVW4X	(rk)(R0), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R7), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R8), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R9), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R10), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R11), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R12), V8
	PROCESS_SINGLEBLOCK_4ROUND
	LXVW4X	(rk)(R14), V8
	PROCESS_SINGLEBLOCK_4ROUND
	// Gather the output words from V3, V2, V1, V0 into V0.
	VSLDOI	$4, V3, V3, V3
	VSLDOI	$4, V3, V2, V2
	VSLDOI	$4, V2, V1, V1
	VSLDOI	$4, V1, V0, V0
	VXOR	V0, IV, V0
	PPC64X_STXVW4X(V0, dstPtr, R0)

done:
	// R7 was reused as the 16-byte offset, so fetch the iv pointer again and
	// store the ciphertext block saved at entry as the next IV.
	MOVD	iv+56(FP), R7
	STXVD2X	V14, (R7)(R0)
	RET