From 2f3ec65c88e5901c0a1e51a3a1f77a30bda0b266 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Mon, 30 Sep 2024 11:36:06 +0800 Subject: [PATCH] zuc: eia128 ppc64x --- zuc/eia.go | 50 ++++++++++++++------- zuc/eia_asm.go | 2 +- zuc/eia_asm_ppc64x.s | 101 +++++++++++++++++++++++++++++++++++++++++++ zuc/eia_generic.go | 2 +- 4 files changed, 137 insertions(+), 18 deletions(-) create mode 100644 zuc/eia_asm_ppc64x.s diff --git a/zuc/eia.go b/zuc/eia.go index ccdb9a23..1ed2a3bd 100644 --- a/zuc/eia.go +++ b/zuc/eia.go @@ -10,14 +10,14 @@ const ( ) type ZUC128Mac struct { - zucState32 - k0 [8]uint32 - t uint32 - x [chunk]byte - nx int - len uint64 - tagSize int - initState zucState32 + zucState32 // current zuc state + k0 [8]uint32 // keywords + t uint32 // tag + x [chunk]byte //buffer + nx int // remaining data in x + len uint64 // total data length + tagSize int // tag size + initState zucState32 // initial state for reset } // NewHash create hash for zuc-128 eia, with arguments key and iv. @@ -94,17 +94,22 @@ func (m *ZUC128Mac) Reset() { } func blockGeneric(m *ZUC128Mac, p []byte) { + // use 64 bits to shift left 2 keywords var k64, t64 uint64 t64 = uint64(m.t) << 32 for len(p) >= chunk { + // generate next 4 keywords m.genKeywords(m.k0[4:]) k64 = uint64(m.k0[0])<<32 | uint64(m.k0[1]) + // process first 32 bits w := binary.BigEndian.Uint32(p[0:4]) for j := 0; j < 32; j++ { + // t64 ^= (w >> 31) ? 
k64 : 0 t64 ^= ^(uint64(w>>31) - 1) & k64 w <<= 1 k64 <<= 1 } + // process second 32 bits k64 = uint64(m.k0[1])<<32 | uint64(m.k0[2]) w = binary.BigEndian.Uint32(p[4:8]) for j := 0; j < 32; j++ { @@ -112,6 +117,7 @@ func blockGeneric(m *ZUC128Mac, p []byte) { w <<= 1 k64 <<= 1 } + // process third 32 bits k64 = uint64(m.k0[2])<<32 | uint64(m.k0[3]) w = binary.BigEndian.Uint32(p[8:12]) for j := 0; j < 32; j++ { @@ -119,6 +125,7 @@ func blockGeneric(m *ZUC128Mac, p []byte) { w <<= 1 k64 <<= 1 } + // process fourth 32 bits k64 = uint64(m.k0[3])<<32 | uint64(m.k0[4]) w = binary.BigEndian.Uint32(p[12:16]) for j := 0; j < 32; j++ { @@ -126,6 +133,7 @@ func blockGeneric(m *ZUC128Mac, p []byte) { w <<= 1 k64 <<= 1 } + // Move the new keywords to the first 4 copy(m.k0[:4], m.k0[4:]) p = p[chunk:] } @@ -164,12 +172,16 @@ func (m *ZUC128Mac) checkSum(additionalBits int, b byte) [4]byte { var k64, t64 uint64 t64 = uint64(m.t) << 32 m.x[m.nx] = b + // total bits to handle nRemainBits := 8*m.nx + additionalBits if nRemainBits > 2*32 { + // generate next 2 keywords m.genKeywords(m.k0[4:6]) } - words := (nRemainBits + 31) / 32 - for i := 0; i < words-1; i++ { + // nwords <= 4 + nwords := (nRemainBits + 31) / 32 + // process 32 bits at a time for first complete words + for i := 0; i < nwords-1; i++ { k64 = uint64(m.k0[i])<<32 | uint64(m.k0[i+1]) w := binary.BigEndian.Uint32(m.x[i*4:]) for j := 0; j < 32; j++ { @@ -178,18 +190,21 @@ func (m *ZUC128Mac) checkSum(additionalBits int, b byte) [4]byte { k64 <<= 1 } } - nRemainBits -= (words - 1) * 32 - kIdx = words - 1 + nRemainBits -= (nwords - 1) * 32 + // current key word index, 0 <= kIdx <= 3 + kIdx = nwords - 1 + // process remaining bits less than 32 if nRemainBits > 0 { k64 = uint64(m.k0[kIdx])<<32 | uint64(m.k0[kIdx+1]) - w := binary.BigEndian.Uint32(m.x[(words-1)*4:]) + w := binary.BigEndian.Uint32(m.x[(nwords-1)*4:]) for j := 0; j < nRemainBits; j++ { t64 ^= ^(uint64(w>>31) - 1) & k64 w <<= 1 k64 <<= 1 } - m.k0[kIdx] = 
uint32(k64 >> 32) - m.k0[kIdx+1] = m.k0[kIdx+2] + // Reset for final computation + m.k0[kIdx] = uint32(k64 >> 32) // key[LENGTH] + m.k0[kIdx+1] = m.k0[kIdx+2] // Last key word } m.t = uint32(t64 >> 32) } @@ -201,8 +216,10 @@ func (m *ZUC128Mac) checkSum(additionalBits int, b byte) [4]byte { return digest } -// Finish this function hash nbits data in p and return mac value +// Finish this function hash nbits data in p and return mac value, after this function call, +// the hash state will be reset. // In general, we will use byte level function, this is just for test/verify. +// nbits: number of bits to hash in p. func (m *ZUC128Mac) Finish(p []byte, nbits int) []byte { if len(p) < (nbits+7)/8 { panic("invalid p length") @@ -217,6 +234,7 @@ func (m *ZUC128Mac) Finish(p []byte, nbits int) []byte { b = p[nbytes] } digest := m.checkSum(nRemainBits, b) + m.Reset() return digest[:] } diff --git a/zuc/eia_asm.go b/zuc/eia_asm.go index 8f10dd61..bf8f6147 100644 --- a/zuc/eia_asm.go +++ b/zuc/eia_asm.go @@ -1,4 +1,4 @@ -//go:build (amd64 || arm64) && !purego +//go:build (amd64 || arm64 || ppc64 || ppc64le) && !purego package zuc diff --git a/zuc/eia_asm_ppc64x.s b/zuc/eia_asm_ppc64x.s new file mode 100644 index 00000000..c7f320fc --- /dev/null +++ b/zuc/eia_asm_ppc64x.s @@ -0,0 +1,101 @@ +// Copyright 2024 Sun Yimin. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+ +//go:build (ppc64 || ppc64le) && !purego + +#include "textflag.h" + +DATA ·rcon+0x00(SB)/8, $0x0706050403020100 // Permute for vector doubleword endian swap +DATA ·rcon+0x08(SB)/8, $0x0f0e0d0c0b0a0908 +DATA ·rcon+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f // bit_reverse_and_table +DATA ·rcon+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f +DATA ·rcon+0x20(SB)/8, $0x0008040c020a060e // bit_reverse_table_l +DATA ·rcon+0x28(SB)/8, $0x0109050d030b070f // bit_reverse_table_l +DATA ·rcon+0x30(SB)/8, $0x0000000010111213 // data mask +DATA ·rcon+0x38(SB)/8, $0x0000000014151617 // data mask +DATA ·rcon+0x40(SB)/8, $0x0000000018191a1b // data mask +DATA ·rcon+0x48(SB)/8, $0x000000001c1d1e1f // data mask +DATA ·rcon+0x50(SB)/8, $0x0405060708090a0b // ks mask +DATA ·rcon+0x58(SB)/8, $0x0001020304050607 // ks mask +GLOBL ·rcon(SB), RODATA, $96 + +#define XTMP1 V0 +#define XTMP2 V1 +#define XTMP3 V2 +#define XTMP4 V3 +#define XDATA V6 +#define XDIGEST V7 +#define KS_L V8 +#define KS_M1 V9 +#define BIT_REV_TAB_L V12 +#define BIT_REV_TAB_H V13 +#define BIT_REV_AND_TAB V14 + +#define PTR R7 + +// func eia3Round16B(t *uint32, keyStream *uint32, p *byte, tagSize int) +TEXT ·eia3Round16B(SB),NOSPLIT,$0 + MOVD t+0(FP), R3 + MOVD ks+8(FP), R4 + MOVD p+16(FP), R5 + +#ifndef GOARCH_ppc64le + MOVD $·rcon(SB), PTR // PTR points to rcon addr + LVX (PTR), XTMP1 + ADD $0x10, PTR +#else + MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector) +#endif + + LXVD2X (R5)(R0), XDATA +#ifndef GOARCH_ppc64le + VPERM XDATA, XDATA, XTMP1, XDATA +#endif + + LXVD2X (PTR)(R0), BIT_REV_AND_TAB + VAND BIT_REV_AND_TAB, XDATA, XTMP3 + VSPLTISB $4, XTMP2; + VSRW XDATA, XTMP2, XTMP1 + VAND BIT_REV_AND_TAB, XTMP1, XTMP1 + + MOVD $0x20, PTR + LXVD2X (PTR)(R0), BIT_REV_TAB_L + VSLB BIT_REV_TAB_L, XTMP2, BIT_REV_TAB_H + VPERM BIT_REV_TAB_L, BIT_REV_TAB_L, XTMP3, XTMP3 + VPERM BIT_REV_TAB_H, BIT_REV_TAB_H, XTMP1, XTMP1 + VXOR XTMP1, XTMP3, XTMP3 // XTMP3 - bit reverse data bytes + + // ZUC authentication part, 4x32 
data bits + // setup data + VSPLTISB $0, XTMP2 + MOVD $0x30, PTR + LXVD2X (PTR)(R0), XTMP4 + VPERM XTMP2, XTMP3, XTMP4, XTMP1 + MOVD $0x40, PTR + LXVD2X (PTR)(R0), XTMP4 + VPERM XTMP2, XTMP3, XTMP4, XTMP2 + + // setup KS + LXVW4X (R4), KS_L + MOVD $8, PTR + LXVW4X (PTR)(R4), KS_M1 + MOVD $0x50, PTR + LXVD2X (PTR)(R0), XTMP1 + VPERM KS_L, KS_L, XTMP1, KS_L + VPERM KS_M1, KS_M1, XTMP1, KS_M1 + + // clmul + // xor the results from 4 32-bit words together + // Calculate lower 32 bits of tag + VPMSUMD XTMP1, KS_L, XTMP3 + VPMSUMD XTMP2, KS_M1, XTMP4 + VXOR XTMP3, XTMP4, XTMP3 + VSPLTW $2, XTMP3, XDIGEST + + MFVSRWZ XDIGEST, PTR + MOVWZ (R3), R6 + XOR R6, PTR, R6 + MOVW R6, (R3) + + RET diff --git a/zuc/eia_generic.go b/zuc/eia_generic.go index 51e8bde0..6bcca2c7 100644 --- a/zuc/eia_generic.go +++ b/zuc/eia_generic.go @@ -1,4 +1,4 @@ -//go:build purego || !(amd64 || arm64) +//go:build purego || !(amd64 || arm64 || ppc64 || ppc64le) package zuc