From 7a4318c939ec887692242864083b9d9b3b8441e1 Mon Sep 17 00:00:00 2001 From: Bas Westerbaan Date: Wed, 19 Apr 2023 00:23:19 +0200 Subject: [PATCH] Add KangarooTwelve draft -10 --- internal/sha3/sha3.go | 4 + internal/sha3/shake.go | 4 + xof/k12/k12.go | 377 +++++++++++++++++++++++++++++++++++++++++ xof/k12/k12_test.go | 96 +++++++++++ 4 files changed, 481 insertions(+) create mode 100644 xof/k12/k12.go create mode 100644 xof/k12/k12_test.go diff --git a/internal/sha3/sha3.go b/internal/sha3/sha3.go index 01806d7d1..a0df5aa6c 100644 --- a/internal/sha3/sha3.go +++ b/internal/sha3/sha3.go @@ -194,3 +194,7 @@ func (d *State) Sum(in []byte) []byte { _, _ = dup.Read(hash) return append(in, hash...) } + +func (d *State) IsAbsorbing() bool { + return d.state == spongeAbsorbing +} diff --git a/internal/sha3/shake.go b/internal/sha3/shake.go index 2a14d78a3..77817f758 100644 --- a/internal/sha3/shake.go +++ b/internal/sha3/shake.go @@ -113,3 +113,7 @@ func TurboShakeSum256(hash, data []byte, D byte) { _, _ = h.Write(data) _, _ = h.Read(hash) } + +func (d *State) SwitchDS(D byte) { + d.dsbyte = D +} diff --git a/xof/k12/k12.go b/xof/k12/k12.go new file mode 100644 index 000000000..ba8567744 --- /dev/null +++ b/xof/k12/k12.go @@ -0,0 +1,377 @@ +// k12 implements the KangarooTwelve XOF. +// +// KangarooTwelve is being standardised at the CFRG working group +// of the IRTF. This package implements draft 10. +// +// https://datatracker.ietf.org/doc/draft-irtf-cfrg-kangarootwelve/10/ +package k12 + +import ( + "encoding/binary" + + "github.com/cloudflare/circl/internal/sha3" + "github.com/cloudflare/circl/simd/keccakf1600" +) + +const chunkSize = 8192 // aka B + +// KangarooTwelve splits the message into chunks of 8192 bytes each. +// The first chunk is absorbed directly in a TurboSHAKE128 instance, which +// we call the stalk. The subsequent chunks aren't absorbed directly, but +// instead their hash is absorbed: they're like leaves on a stalk. 
+// If we have a fast TurboSHAKE128 available, we buffer chunks until we have +// enough to do the parallel TurboSHAKE128. If not, we absorb directly into +// a separate TurboSHAKE128 state. + +type State struct { + initialTodo int // Bytes left to absorb for the first chunk. + + stalk sha3.State + + context []byte // context string "C" provided by the user + + // buffer of incoming data so we can do parallel TurboSHAKE128: + // nil when we haven't absorbed the first chunk yet; + // empty if we have, but we do not have a fast parallel TurboSHAKE128; + // and chunkSize*lanes in length if we have. + buf []byte + + offset int // offset in buf or bytes written to leaf + + // Number of chunk hashes ("CV_i") absorbed into the stalk. + chunk uint + + // TurboSHAKE128 instance to compute the leaf in case we don't have + // a fast parallel TurboSHAKE128, viz when lanes == 1. + leaf *sha3.State + + lanes uint8 // number of TurboSHAKE128s to compute in parallel +} + +// NewDraft10 creates a new instance of KangarooTwelve draft version -10. +func NewDraft10(c []byte) State { + var lanes byte = 1 + + if keccakf1600.IsEnabledX4() { + lanes = 4 + } else if keccakf1600.IsEnabledX2() { + lanes = 2 + } + + return newDraft10(c, lanes) +} + +func newDraft10(c []byte, lanes byte) State { + return State{ + initialTodo: chunkSize, + stalk: sha3.NewTurboShake128(0x07), + context: c, + lanes: lanes, + } +} + +func (s *State) Reset() { + s.initialTodo = chunkSize + s.stalk.Reset() + s.stalk.SwitchDS(0x07) + s.buf = nil + s.offset = 0 + s.chunk = 0 +} + +func Draft10Sum(hash []byte, msg []byte, c []byte) { + // TODO Tweak number of lanes depending on the length of the message + s := NewDraft10(c) + _, _ = s.Write(msg) + _, _ = s.Read(hash) +} + +func (s *State) Write(p []byte) (int, error) { + written := len(p) + + // The first chunk is written directly to the stalk. 
+ if s.initialTodo > 0 { + taken := s.initialTodo + if len(p) < taken { + taken = len(p) + } + headP := p[:taken] + _, _ = s.stalk.Write(headP) + s.initialTodo -= taken + p = p[taken:] + } + + if len(p) == 0 { + return written, nil + } + + // If this is the first bit of data written after the initial chunk, + // we're out of the fast-path and allocate some buffers. + if s.buf == nil { + if s.lanes != 1 { + s.buf = make([]byte, int(s.lanes)*chunkSize) + } else { + // We create the buffer to signal we're past the first chunk, + // but do not use it. + s.buf = make([]byte, 0) + h := sha3.NewTurboShake128(0x0B) + s.leaf = &h + } + _, _ = s.stalk.Write([]byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}) + s.stalk.SwitchDS(0x06) + } + + // If we're just using one lane, we don't need to cache in a buffer + // for parallel hashing. Instead, we feed directly to TurboSHAKE. + if s.lanes == 1 { + for len(p) > 0 { + // Write to current leaf. + to := chunkSize - s.offset + if len(p) < to { + to = len(p) + } + _, _ = s.leaf.Write(p[:to]) + p = p[to:] + s.offset += to + + // Did we fill the chunk? + if s.offset == chunkSize { + var cv [32]byte + _, _ = s.leaf.Read(cv[:]) + _, _ = s.stalk.Write(cv[:]) + s.leaf.Reset() + s.offset = 0 + s.chunk++ + } + } + + return written, nil + } + + // If we can't fill all our lanes or the buffer isn't empty, we write the + // data to the buffer. + if s.offset != 0 || len(p) < len(s.buf) { + to := len(s.buf) - s.offset + if len(p) < to { + to = len(p) + } + p2 := p[:to] + p = p[to:] + copy(s.buf[s.offset:], p2) + s.offset += to + } + + // Absorb the buffer if we filled it + if s.offset == len(s.buf) { + s.writeX(s.buf) + s.offset = 0 + } + + // Note that at this point we may assume that s.offset = 0 if len(p) != 0 + if len(p) != 0 && s.offset != 0 { + panic("shouldn't happen") + } + + // Absorb a bunch of chunks at the same time. + if len(p) >= int(s.lanes)*chunkSize { + p = s.writeX(p) + } + + // Put the remainder in the buffer. 
+ if len(p) > 0 { + copy(s.buf, p) + s.offset = len(p) + } + + return written, nil +} + +// Absorb a multiple of a multiple of lanes * chunkSize. +// Returns the remainder. +func (s *State) writeX(p []byte) []byte { + switch s.lanes { + case 4: + return s.writeX4(p) + default: + return s.writeX2(p) + } +} + +func (s *State) writeX4(p []byte) []byte { + for len(p) >= 4*chunkSize { + var x4 keccakf1600.StateX4 + a := x4.Initialize(true) + + for offset := 0; offset < 48*168; offset += 168 { + for i := 0; i < 21; i++ { + a[i*4] ^= binary.LittleEndian.Uint64( + p[8*i+offset:], + ) + a[i*4+1] ^= binary.LittleEndian.Uint64( + p[chunkSize+8*i+offset:], + ) + a[i*4+2] ^= binary.LittleEndian.Uint64( + p[chunkSize*2+8*i+offset:], + ) + a[i*4+3] ^= binary.LittleEndian.Uint64( + p[chunkSize*3+8*i+offset:], + ) + } + + x4.Permute() + } + + for i := 0; i < 16; i++ { + a[i*4] ^= binary.LittleEndian.Uint64( + p[8*i+48*168:], + ) + a[i*4+1] ^= binary.LittleEndian.Uint64( + p[chunkSize+8*i+48*168:], + ) + a[i*4+2] ^= binary.LittleEndian.Uint64( + p[chunkSize*2+8*i+48*168:], + ) + a[i*4+3] ^= binary.LittleEndian.Uint64( + p[chunkSize*3+8*i+48*168:], + ) + } + + a[16*4] ^= 0x0b + a[16*4+1] ^= 0x0b + a[16*4+2] ^= 0x0b + a[16*4+3] ^= 0x0b + a[20*4] ^= 0x80 << 56 + a[20*4+1] ^= 0x80 << 56 + a[20*4+2] ^= 0x80 << 56 + a[20*4+3] ^= 0x80 << 56 + + x4.Permute() + + var buf [32 * 4]byte + for i := 0; i < 4; i++ { + binary.LittleEndian.PutUint64(buf[8*i:], a[4*i]) + binary.LittleEndian.PutUint64(buf[32+8*i:], a[4*i+1]) + binary.LittleEndian.PutUint64(buf[32*2+8*i:], a[4*i+2]) + binary.LittleEndian.PutUint64(buf[32*3+8*i:], a[4*i+3]) + } + + _, _ = s.stalk.Write(buf[:]) + p = p[chunkSize*4:] + s.chunk += 4 + } + + return p +} + +func (s *State) writeX2(p []byte) []byte { + // TODO On M2 Pro, 1/3 of the time is spent on this function + // and LittleEndian.Uint64 excluding the actual permutation. + // Rewriting in assembler might be worthwhile. 
+ for len(p) >= 2*chunkSize { + var x2 keccakf1600.StateX2 + a := x2.Initialize(true) + + for offset := 0; offset < 48*168; offset += 168 { + for i := 0; i < 21; i++ { + a[i*2] ^= binary.LittleEndian.Uint64( + p[8*i+offset:], + ) + a[i*2+1] ^= binary.LittleEndian.Uint64( + p[chunkSize+8*i+offset:], + ) + } + + x2.Permute() + } + + for i := 0; i < 16; i++ { + a[i*2] ^= binary.LittleEndian.Uint64( + p[8*i+48*168:], + ) + a[i*2+1] ^= binary.LittleEndian.Uint64( + p[chunkSize+8*i+48*168:], + ) + } + + a[16*2] ^= 0x0b + a[16*2+1] ^= 0x0b + a[20*2] ^= 0x80 << 56 + a[20*2+1] ^= 0x80 << 56 + + x2.Permute() + + var buf [32 * 2]byte + for i := 0; i < 4; i++ { + binary.LittleEndian.PutUint64(buf[8*i:], a[2*i]) + binary.LittleEndian.PutUint64(buf[32+8*i:], a[2*i+1]) + } + + _, _ = s.stalk.Write(buf[:]) + p = p[chunkSize*2:] + s.chunk += 2 + } + + return p +} + +func (s *State) Read(p []byte) (int, error) { + if s.stalk.IsAbsorbing() { + // Write context string C + _, _ = s.Write(s.context) + + // Write length_encode( |C| ) + var buf [9]byte + binary.BigEndian.PutUint64(buf[:8], uint64(len(s.context))) + + // Find first non-zero digit in big endian encoding of context length + i := 0 + for buf[i] == 0 && i < 8 { + i++ + } + + buf[8] = byte(8 - i) // number of bytes to represent |C| + _, _ = s.Write(buf[i:]) + + // We need to write the chunk number if we're past the first chunk. 
+ if s.buf != nil { + // Write last remaining chunk(s) + var cv [32]byte + if s.lanes == 1 { + if s.offset != 0 { + _, _ = s.leaf.Read(cv[:]) + _, _ = s.stalk.Write(cv[:]) + s.chunk++ + } + } else { + remainingBuf := s.buf[:s.offset] + for len(remainingBuf) > 0 { + h := sha3.NewTurboShake128(0x0B) + to := chunkSize + if len(remainingBuf) < to { + to = len(remainingBuf) + } + _, _ = h.Write(remainingBuf[:to]) + _, _ = h.Read(cv[:]) + _, _ = s.stalk.Write(cv[:]) + s.chunk++ + remainingBuf = remainingBuf[to:] + } + } + + // Write length_encode( chunk ) + binary.BigEndian.PutUint64(buf[:8], uint64(s.chunk)) + + // Find first non-zero digit in big endian encoding of number of chunks + i = 0 + for buf[i] == 0 && i < 8 { + i++ + } + + buf[8] = byte(8 - i) // number of bytes to represent number of chunks. + _, _ = s.stalk.Write(buf[i:]) + _, _ = s.stalk.Write([]byte{0xff, 0xff}) + } + } + + return s.stalk.Read(p) +} diff --git a/xof/k12/k12_test.go b/xof/k12/k12_test.go new file mode 100644 index 000000000..a5be5b05e --- /dev/null +++ b/xof/k12/k12_test.go @@ -0,0 +1,96 @@ +package k12 + +import ( + "encoding/hex" + "testing" +) + +// See draft-irtf-cfrg-kangarootwelve-10 §4. 
+// https://datatracker.ietf.org/doc/draft-irtf-cfrg-kangarootwelve/10/ +func ptn(n int) []byte { + buf := make([]byte, n) + for i := 0; i < n; i++ { + buf[i] = byte(i % 0xfb) + } + return buf +} + +func testK12(t *testing.T, msg []byte, c []byte, l int, want string) { + do := func(lanes byte, writeSize int) { + h := newDraft10(c, lanes) + msg2 := msg + for len(msg2) > 0 { + to := writeSize + if len(msg2) < to { + to = len(msg2) + } + _, _ = h.Write(msg2[:to]) + msg2 = msg2[to:] + } + buf := make([]byte, l) + _, _ = h.Read(buf) + got := hex.EncodeToString(buf) + if want != got { + t.Fatalf("%s != %s (lanes=%d, writeSize=%d )", want, got, lanes, writeSize) + } + } + + for _, lanes := range []byte{1, 2, 4} { + for _, writeSize := range []int{7919, 1024, 8 * 1024} { + do(lanes, writeSize) + } + } +} + +func TestK12(t *testing.T) { + // I-D test vectors + testK12(t, []byte{}, []byte{}, 32, "1ac2d450fc3b4205d19da7bfca1b37513c0803577ac7167f06fe2ce1f0ef39e5") + i := 17 + testK12(t, ptn(i), []byte{}, 32, "6bf75fa2239198db4772e36478f8e19b0f371205f6a9a93a273f51df37122888") + i *= 17 + testK12(t, ptn(i), []byte{}, 32, "0c315ebcdedbf61426de7dcf8fb725d1e74675d7f5327a5067f367b108ecb67c") + i *= 17 + testK12(t, ptn(i), []byte{}, 32, "cb552e2ec77d9910701d578b457ddf772c12e322e4ee7fe417f92c758f0d59d0") + i *= 17 + testK12(t, ptn(i), []byte{}, 32, "8701045e22205345ff4dda05555cbb5c3af1a771c2b89baef37db43d9998b9fe") + i *= 17 + testK12(t, ptn(i), []byte{}, 32, "844d610933b1b9963cbdeb5ae3b6b05cc7cbd67ceedf883eb678a0a8e0371682") + i *= 17 + testK12(t, ptn(i), []byte{}, 32, "3c390782a8a4e89fa6367f72feaaf13255c8d95878481d3cd8ce85f58e880af8") + testK12(t, []byte{}, ptn(1), 32, "fab658db63e94a246188bf7af69a133045f46ee984c56e3c3328caaf1aa1a583") + testK12(t, []byte{0xff}, ptn(41), 32, "d848c5068ced736f4462159b9867fd4c20b808acc3d5bc48e0b06ba0a3762ec4") + testK12(t, []byte{0xff, 0xff, 0xff}, ptn(41*41), 32, "c389e5009ae57120854c2e8c64670ac01358cf4c1baf89447a724234dc7ced74") + testK12(t, 
[]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, ptn(41*41*41), 32, "75d2f86a2e644566726b4fbcfc5657b9dbcf070c7b0dca06450ab291d7443bcf") + + // Cornercases + testK12(t, ptn(chunkSize), []byte{}, 16, "48f256f6772f9edfb6a8b661ec92dc93") + testK12(t, ptn(chunkSize+1), []byte{}, 16, "bb66fe72eaea5179418d5295ee134485") + testK12(t, ptn(2*chunkSize), []byte{}, 16, "82778f7f7234c83352e76837b721fbdb") + testK12(t, ptn(2*chunkSize+1), []byte{}, 16, "5f8d2b943922b451842b4e82740d0236") + testK12(t, ptn(3*chunkSize), []byte{}, 16, "f4082a8fe7d1635aa042cd1da63bf235") + testK12(t, ptn(3*chunkSize+1), []byte{}, 16, "38cb940999aca742d69dd79298c6051c") +} + +func BenchmarkK12_100B(b *testing.B) { benchmarkK12(b, 100, 1) } +func BenchmarkK12_10K(b *testing.B) { benchmarkK12(b, 10000, 1) } +func BenchmarkK12_100K(b *testing.B) { benchmarkK12(b, 10000, 10) } +func BenchmarkK12_1M(b *testing.B) { benchmarkK12(b, 10000, 100) } +func BenchmarkK12_10M(b *testing.B) { benchmarkK12(b, 10000, 1000) } + +func benchmarkK12(b *testing.B, size, num int) { + b.StopTimer() + h := NewDraft10([]byte{}) + data := make([]byte, size) + d := make([]byte, 32) + + b.SetBytes(int64(size * num)) + b.StartTimer() + + for i := 0; i < b.N; i++ { + h.Reset() + for j := 0; j < num; j++ { + _, _ = h.Write(data) + } + _, _ = h.Read(d) + } +}