From 7a4318c939ec887692242864083b9d9b3b8441e1 Mon Sep 17 00:00:00 2001 From: Bas Westerbaan Date: Wed, 19 Apr 2023 00:23:19 +0200 Subject: [PATCH] Add KangarooTwelve draft -10 --- internal/sha3/sha3.go | 4 + internal/sha3/shake.go | 4 + xof/k12/k12.go | 377 +++++++++++++++++++++++++++++++++++++++++ xof/k12/k12_test.go | 96 +++++++++++ 4 files changed, 481 insertions(+) create mode 100644 xof/k12/k12.go create mode 100644 xof/k12/k12_test.go diff --git a/internal/sha3/sha3.go b/internal/sha3/sha3.go index 01806d7d1..a0df5aa6c 100644 --- a/internal/sha3/sha3.go +++ b/internal/sha3/sha3.go @@ -194,3 +194,7 @@ func (d *State) Sum(in []byte) []byte { _, _ = dup.Read(hash) return append(in, hash...) } + +func (d *State) IsAbsorbing() bool { + return d.state == spongeAbsorbing +} diff --git a/internal/sha3/shake.go b/internal/sha3/shake.go index 2a14d78a3..77817f758 100644 --- a/internal/sha3/shake.go +++ b/internal/sha3/shake.go @@ -113,3 +113,7 @@ func TurboShakeSum256(hash, data []byte, D byte) { _, _ = h.Write(data) _, _ = h.Read(hash) } + +func (d *State) SwitchDS(D byte) { + d.dsbyte = D +} diff --git a/xof/k12/k12.go b/xof/k12/k12.go new file mode 100644 index 000000000..ba8567744 --- /dev/null +++ b/xof/k12/k12.go @@ -0,0 +1,377 @@ +// k12 implements the KangarooTwelve XOF. +// +// KangarooTwelve is being standardised at the CFRG working group +// of the IRTF. This package implements draft 10. +// +// https://datatracker.ietf.org/doc/draft-irtf-cfrg-kangarootwelve/10/ +package k12 + +import ( + "encoding/binary" + + "github.com/cloudflare/circl/internal/sha3" + "github.com/cloudflare/circl/simd/keccakf1600" +) + +const chunkSize = 8192 // aka B + +// KangarooTwelve splits the message into chunks of 8192 bytes each. +// The first chunk is absorbed directly in a TurboSHAKE128 instance, which +// we call the stalk. The subsequent chunks aren't absorbed directly, but +// instead their hash is absorbed: they're like leaves on a stalk. 
+// If we have a fast TurboSHAKE128 available, we buffer chunks until we have +// enough to do the parallel TurboSHAKE128. If not, we absorb directly into +// a separate TurboSHAKE128 state. + +type State struct { + initialTodo int // Bytes left to absorb for the first chunk. + + stalk sha3.State + + context []byte // context string "C" provided by the user + + // buffer of incoming data so we can do parallel TurboSHAKE128: + // nil when we haven't absorbed the first chunk yet; + // empty if we have, but we do not have a fast parallel TurboSHAKE128; + // and chunkSize*lanes in length if we have. + buf []byte + + offset int // offset in buf or bytes written to leaf + + // Number of chunk hashes ("CV_i") absorbed into the stalk. + chunk uint + + // TurboSHAKE128 instance to compute the leaf in case we don't have + // a fast parallel TurboSHAKE128, viz when lanes == 1. + leaf *sha3.State + + lanes uint8 // number of TurboSHAKE128s to compute in parallel +} + +// NewDraft10 creates a new instance of KangarooTwelve draft version -10. +func NewDraft10(c []byte) State { + var lanes byte = 1 + + if keccakf1600.IsEnabledX4() { + lanes = 4 + } else if keccakf1600.IsEnabledX2() { + lanes = 2 + } + + return newDraft10(c, lanes) +} + +func newDraft10(c []byte, lanes byte) State { + return State{ + initialTodo: chunkSize, + stalk: sha3.NewTurboShake128(0x07), + context: c, + lanes: lanes, + } +} + +func (s *State) Reset() { + s.initialTodo = chunkSize + s.stalk.Reset() + s.stalk.SwitchDS(0x07) + s.buf = nil + s.offset = 0 + s.chunk = 0 +} + +func Draft10Sum(hash []byte, msg []byte, c []byte) { + // TODO Tweak number of lanes depending on the length of the message + s := NewDraft10(c) + _, _ = s.Write(msg) + _, _ = s.Read(hash) +} + +func (s *State) Write(p []byte) (int, error) { + written := len(p) + + // The first chunk is written directly to the stalk. 
+ if s.initialTodo > 0 { + taken := s.initialTodo + if len(p) < taken { + taken = len(p) + } + headP := p[:taken] + _, _ = s.stalk.Write(headP) + s.initialTodo -= taken + p = p[taken:] + } + + if len(p) == 0 { + return written, nil + } + + // If this is the first bit of data written after the initial chunk, + // we're out of the fast-path and allocate some buffers. + if s.buf == nil { + if s.lanes != 1 { + s.buf = make([]byte, int(s.lanes)*chunkSize) + } else { + // We create the buffer to signal we're past the first chunk, + // but do not use it. + s.buf = make([]byte, 0) + h := sha3.NewTurboShake128(0x0B) + s.leaf = &h + } + _, _ = s.stalk.Write([]byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}) + s.stalk.SwitchDS(0x06) + } + + // If we're just using one lane, we don't need to cache in a buffer + // for parallel hashing. Instead, we feed directly to TurboSHAKE. + if s.lanes == 1 { + for len(p) > 0 { + // Write to current leaf. + to := chunkSize - s.offset + if len(p) < to { + to = len(p) + } + _, _ = s.leaf.Write(p[:to]) + p = p[to:] + s.offset += to + + // Did we fill the chunk? + if s.offset == chunkSize { + var cv [32]byte + _, _ = s.leaf.Read(cv[:]) + _, _ = s.stalk.Write(cv[:]) + s.leaf.Reset() + s.offset = 0 + s.chunk++ + } + } + + return written, nil + } + + // If we can't fill all our lanes or the buffer isn't empty, we write the + // data to the buffer. + if s.offset != 0 || len(p) < len(s.buf) { + to := len(s.buf) - s.offset + if len(p) < to { + to = len(p) + } + p2 := p[:to] + p = p[to:] + copy(s.buf[s.offset:], p2) + s.offset += to + } + + // Absorb the buffer if we filled it + if s.offset == len(s.buf) { + s.writeX(s.buf) + s.offset = 0 + } + + // Note that at this point we may assume that s.offset = 0 if len(p) != 0 + if len(p) != 0 && s.offset != 0 { + panic("shouldn't happen") + } + + // Absorb a bunch of chunks at the same time. + if len(p) >= int(s.lanes)*chunkSize { + p = s.writeX(p) + } + + // Put the remainder in the buffer. 
+ if len(p) > 0 { + copy(s.buf, p) + s.offset = len(p) + } + + return written, nil +} + +// Absorb a multiple of a multiple of lanes * chunkSize. +// Returns the remainder. +func (s *State) writeX(p []byte) []byte { + switch s.lanes { + case 4: + return s.writeX4(p) + default: + return s.writeX2(p) + } +} + +func (s *State) writeX4(p []byte) []byte { + for len(p) >= 4*chunkSize { + var x4 keccakf1600.StateX4 + a := x4.Initialize(true) + + for offset := 0; offset < 48*168; offset += 168 { + for i := 0; i < 21; i++ { + a[i*4] ^= binary.LittleEndian.Uint64( + p[8*i+offset:], + ) + a[i*4+1] ^= binary.LittleEndian.Uint64( + p[chunkSize+8*i+offset:], + ) + a[i*4+2] ^= binary.LittleEndian.Uint64( + p[chunkSize*2+8*i+offset:], + ) + a[i*4+3] ^= binary.LittleEndian.Uint64( + p[chunkSize*3+8*i+offset:], + ) + } + + x4.Permute() + } + + for i := 0; i < 16; i++ { + a[i*4] ^= binary.LittleEndian.Uint64( + p[8*i+48*168:], + ) + a[i*4+1] ^= binary.LittleEndian.Uint64( + p[chunkSize+8*i+48*168:], + ) + a[i*4+2] ^= binary.LittleEndian.Uint64( + p[chunkSize*2+8*i+48*168:], + ) + a[i*4+3] ^= binary.LittleEndian.Uint64( + p[chunkSize*3+8*i+48*168:], + ) + } + + a[16*4] ^= 0x0b + a[16*4+1] ^= 0x0b + a[16*4+2] ^= 0x0b + a[16*4+3] ^= 0x0b + a[20*4] ^= 0x80 << 56 + a[20*4+1] ^= 0x80 << 56 + a[20*4+2] ^= 0x80 << 56 + a[20*4+3] ^= 0x80 << 56 + + x4.Permute() + + var buf [32 * 4]byte + for i := 0; i < 4; i++ { + binary.LittleEndian.PutUint64(buf[8*i:], a[4*i]) + binary.LittleEndian.PutUint64(buf[32+8*i:], a[4*i+1]) + binary.LittleEndian.PutUint64(buf[32*2+8*i:], a[4*i+2]) + binary.LittleEndian.PutUint64(buf[32*3+8*i:], a[4*i+3]) + } + + _, _ = s.stalk.Write(buf[:]) + p = p[chunkSize*4:] + s.chunk += 4 + } + + return p +} + +func (s *State) writeX2(p []byte) []byte { + // TODO On M2 Pro, 1/3 of the time is spent on this function + // and LittleEndian.Uint64 excluding the actual permutation. + // Rewriting in assembler might be worthwhile. 
+ for len(p) >= 2*chunkSize { + var x2 keccakf1600.StateX2 + a := x2.Initialize(true) + + for offset := 0; offset < 48*168; offset += 168 { + for i := 0; i < 21; i++ { + a[i*2] ^= binary.LittleEndian.Uint64( + p[8*i+offset:], + ) + a[i*2+1] ^= binary.LittleEndian.Uint64( + p[chunkSize+8*i+offset:], + ) + } + + x2.Permute() + } + + for i := 0; i < 16; i++ { + a[i*2] ^= binary.LittleEndian.Uint64( + p[8*i+48*168:], + ) + a[i*2+1] ^= binary.LittleEndian.Uint64( + p[chunkSize+8*i+48*168:], + ) + } + + a[16*2] ^= 0x0b + a[16*2+1] ^= 0x0b + a[20*2] ^= 0x80 << 56 + a[20*2+1] ^= 0x80 << 56 + + x2.Permute() + + var buf [32 * 2]byte + for i := 0; i < 4; i++ { + binary.LittleEndian.PutUint64(buf[8*i:], a[2*i]) + binary.LittleEndian.PutUint64(buf[32+8*i:], a[2*i+1]) + } + + _, _ = s.stalk.Write(buf[:]) + p = p[chunkSize*2:] + s.chunk += 2 + } + + return p +} + +func (s *State) Read(p []byte) (int, error) { + if s.stalk.IsAbsorbing() { + // Write context string C + _, _ = s.Write(s.context) + + // Write length_encode( |C| ) + var buf [9]byte + binary.BigEndian.PutUint64(buf[:8], uint64(len(s.context))) + + // Find first non-zero digit in big endian encoding of context length + i := 0 + for buf[i] == 0 && i < 8 { + i++ + } + + buf[8] = byte(8 - i) // number of bytes to represent |C| + _, _ = s.Write(buf[i:]) + + // We need to write the chunk number if we're past the first chunk. 
+ if s.buf != nil { + // Write last remaining chunk(s) + var cv [32]byte + if s.lanes == 1 { + if s.offset != 0 { + _, _ = s.leaf.Read(cv[:]) + _, _ = s.stalk.Write(cv[:]) + s.chunk++ + } + } else { + remainingBuf := s.buf[:s.offset] + for len(remainingBuf) > 0 { + h := sha3.NewTurboShake128(0x0B) + to := chunkSize + if len(remainingBuf) < to { + to = len(remainingBuf) + } + _, _ = h.Write(remainingBuf[:to]) + _, _ = h.Read(cv[:]) + _, _ = s.stalk.Write(cv[:]) + s.chunk++ + remainingBuf = remainingBuf[to:] + } + } + + // Write length_encode( chunk ) + binary.BigEndian.PutUint64(buf[:8], uint64(s.chunk)) + + // Find first non-zero digit in big endian encoding of number of chunks + i = 0 + for buf[i] == 0 && i < 8 { + i++ + } + + buf[8] = byte(8 - i) // number of bytes to represent number of chunks. + _, _ = s.stalk.Write(buf[i:]) + _, _ = s.stalk.Write([]byte{0xff, 0xff}) + } + } + + return s.stalk.Read(p) +} diff --git a/xof/k12/k12_test.go b/xof/k12/k12_test.go new file mode 100644 index 000000000..a5be5b05e --- /dev/null +++ b/xof/k12/k12_test.go @@ -0,0 +1,96 @@ +package k12 + +import ( + "encoding/hex" + "testing" +) + +// See draft-irtf-cfrg-kangarootwelve-10 §4. 
+// https://datatracker.ietf.org/doc/draft-irtf-cfrg-kangarootwelve/10/ +func ptn(n int) []byte { + buf := make([]byte, n) + for i := 0; i < n; i++ { + buf[i] = byte(i % 0xfb) + } + return buf +} + +func testK12(t *testing.T, msg []byte, c []byte, l int, want string) { + do := func(lanes byte, writeSize int) { + h := newDraft10(c, lanes) + msg2 := msg + for len(msg2) > 0 { + to := writeSize + if len(msg2) < to { + to = len(msg2) + } + _, _ = h.Write(msg2[:to]) + msg2 = msg2[to:] + } + buf := make([]byte, l) + _, _ = h.Read(buf) + got := hex.EncodeToString(buf) + if want != got { + t.Fatalf("%s != %s (lanes=%d, writeSize=%d )", want, got, lanes, writeSize) + } + } + + for _, lanes := range []byte{1, 2, 4} { + for _, writeSize := range []int{7919, 1024, 8 * 1024} { + do(lanes, writeSize) + } + } +} + +func TestK12(t *testing.T) { + // I-D test vectors + testK12(t, []byte{}, []byte{}, 32, "1ac2d450fc3b4205d19da7bfca1b37513c0803577ac7167f06fe2ce1f0ef39e5") + i := 17 + testK12(t, ptn(i), []byte{}, 32, "6bf75fa2239198db4772e36478f8e19b0f371205f6a9a93a273f51df37122888") + i *= 17 + testK12(t, ptn(i), []byte{}, 32, "0c315ebcdedbf61426de7dcf8fb725d1e74675d7f5327a5067f367b108ecb67c") + i *= 17 + testK12(t, ptn(i), []byte{}, 32, "cb552e2ec77d9910701d578b457ddf772c12e322e4ee7fe417f92c758f0d59d0") + i *= 17 + testK12(t, ptn(i), []byte{}, 32, "8701045e22205345ff4dda05555cbb5c3af1a771c2b89baef37db43d9998b9fe") + i *= 17 + testK12(t, ptn(i), []byte{}, 32, "844d610933b1b9963cbdeb5ae3b6b05cc7cbd67ceedf883eb678a0a8e0371682") + i *= 17 + testK12(t, ptn(i), []byte{}, 32, "3c390782a8a4e89fa6367f72feaaf13255c8d95878481d3cd8ce85f58e880af8") + testK12(t, []byte{}, ptn(1), 32, "fab658db63e94a246188bf7af69a133045f46ee984c56e3c3328caaf1aa1a583") + testK12(t, []byte{0xff}, ptn(41), 32, "d848c5068ced736f4462159b9867fd4c20b808acc3d5bc48e0b06ba0a3762ec4") + testK12(t, []byte{0xff, 0xff, 0xff}, ptn(41*41), 32, "c389e5009ae57120854c2e8c64670ac01358cf4c1baf89447a724234dc7ced74") + testK12(t, 
[]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, ptn(41*41*41), 32, "75d2f86a2e644566726b4fbcfc5657b9dbcf070c7b0dca06450ab291d7443bcf") + + // Cornercases + testK12(t, ptn(chunkSize), []byte{}, 16, "48f256f6772f9edfb6a8b661ec92dc93") + testK12(t, ptn(chunkSize+1), []byte{}, 16, "bb66fe72eaea5179418d5295ee134485") + testK12(t, ptn(2*chunkSize), []byte{}, 16, "82778f7f7234c83352e76837b721fbdb") + testK12(t, ptn(2*chunkSize+1), []byte{}, 16, "5f8d2b943922b451842b4e82740d0236") + testK12(t, ptn(3*chunkSize), []byte{}, 16, "f4082a8fe7d1635aa042cd1da63bf235") + testK12(t, ptn(3*chunkSize+1), []byte{}, 16, "38cb940999aca742d69dd79298c6051c") +} + +func BenchmarkK12_100B(b *testing.B) { benchmarkK12(b, 100, 1) } +func BenchmarkK12_10K(b *testing.B) { benchmarkK12(b, 10000, 1) } +func BenchmarkK12_100K(b *testing.B) { benchmarkK12(b, 10000, 10) } +func BenchmarkK12_1M(b *testing.B) { benchmarkK12(b, 10000, 100) } +func BenchmarkK12_10M(b *testing.B) { benchmarkK12(b, 10000, 1000) } + +func benchmarkK12(b *testing.B, size, num int) { + b.StopTimer() + h := NewDraft10([]byte{}) + data := make([]byte, size) + d := make([]byte, 32) + + b.SetBytes(int64(size * num)) + b.StartTimer() + + for i := 0; i < b.N; i++ { + h.Reset() + for j := 0; j < num; j++ { + _, _ = h.Write(data) + } + _, _ = h.Read(d) + } +}