Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kangaroo12 draft -10 #431

Merged
merged 1 commit into from
May 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions internal/sha3/sha3.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,3 +194,7 @@ func (d *State) Sum(in []byte) []byte {
_, _ = dup.Read(hash)
return append(in, hash...)
}

// IsAbsorbing reports whether the state's sponge phase is currently
// spongeAbsorbing.
func (d *State) IsAbsorbing() bool {
	phase := d.state
	return phase == spongeAbsorbing
}
4 changes: 4 additions & 0 deletions internal/sha3/shake.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,7 @@ func TurboShakeSum256(hash, data []byte, D byte) {
_, _ = h.Write(data)
_, _ = h.Read(hash)
}

// SwitchDS replaces the domain-separation byte stored in the state with D.
// Nothing else in the state is modified.
func (d *State) SwitchDS(D byte) {
d.dsbyte = D
}
377 changes: 377 additions & 0 deletions xof/k12/k12.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,377 @@
// Package k12 implements the KangarooTwelve XOF.
//
// KangarooTwelve is being standardised at the Crypto Forum Research Group
// (CFRG) of the IRTF. This package implements draft 10.
//
// https://datatracker.ietf.org/doc/draft-irtf-cfrg-kangarootwelve/10/
package k12

import (
"encoding/binary"

"github.com/cloudflare/circl/internal/sha3"
"github.com/cloudflare/circl/simd/keccakf1600"
)

const chunkSize = 8192 // aka B

// KangarooTwelve splits the message into chunks of 8192 bytes each.
// The first chunk is absorbed directly in a TurboSHAKE128 instance, which
// we call the stalk. The subsequent chunks aren't absorbed directly, but
// instead their hash is absorbed: they're like leaves on a stalk.
// If we have a fast TurboSHAKE128 available, we buffer chunks until we have
// enough to do the parallel TurboSHAKE128. If not, we absorb directly into
// a separate TurboSHAKE128 state.

// State holds an in-progress KangarooTwelve computation. Data is absorbed
// with Write and output is produced with Read.
type State struct {
initialTodo int // Bytes left to absorb for the first chunk.

// TurboSHAKE128 instance that absorbs the first chunk directly and,
// after that, the 32-byte hashes of the subsequent chunks.
stalk sha3.State
armfazh marked this conversation as resolved.
Show resolved Hide resolved

context []byte // context string "C" provided by the user

// buffer of incoming data so we can do parallel TurboSHAKE128:
// nil when we haven't absorbed the first chunk yet;
// empty if we have, but we do not have a fast parallel TurboSHAKE128;
// and chunkSize*lanes in length if we have.
buf []byte

offset int // offset in buf or bytes written to leaf

// Number of chunk hashes ("CV_i") absorbed into the stalk.
chunk uint

// TurboSHAKE128 instance to compute the leaf in case we don't have
// a fast parallel TurboSHAKE128, viz when lanes == 1.
leaf *sha3.State

lanes uint8 // number of TurboSHAKE128s to compute in parallel
}

// NewDraft10 returns a fresh Kangaroo12 (draft version -10) state that uses
// c as its context string.
//
// The number of parallel lanes is chosen from what keccakf1600 reports as
// enabled: four if the X4 permutation is available, otherwise two if the X2
// permutation is available, otherwise one.
func NewDraft10(c []byte) State {
	switch {
	case keccakf1600.IsEnabledX4():
		return newDraft10(c, 4)
	case keccakf1600.IsEnabledX2():
		return newDraft10(c, 2)
	default:
		return newDraft10(c, 1)
	}
}

// newDraft10 builds a State with an explicit lane count. The stalk is a
// TurboSHAKE128 instance created with domain-separation byte 0x07 and a full
// chunk is still expected for it. The slice c is stored as-is, not copied.
func newDraft10(c []byte, lanes byte) State {
	var st State

	st.initialTodo = chunkSize
	st.stalk = sha3.NewTurboShake128(0x07)
	st.context = c
	st.lanes = lanes

	return st
}

// Reset prepares the state for hashing a new message: the stalk is reset and
// switched back to domain-separation byte 0x07, the first chunk is expected
// again, and the buffer, offset and chunk counter are cleared.
// The context, lane count and leaf pointer are left untouched.
func (s *State) Reset() {
	s.stalk.Reset()
	s.stalk.SwitchDS(0x07)

	s.initialTodo, s.offset, s.chunk = chunkSize, 0, 0
	s.buf = nil
}

// Draft10Sum computes Kangaroo12 (draft -10) over msg with context string c
// in one shot and fills hash with the output; len(hash) determines how many
// output bytes are requested.
func Draft10Sum(hash, msg, c []byte) {
	// TODO Tweak number of lanes depending on the length of the message
	k := NewDraft10(c)
	_, _ = k.Write(msg)
	_, _ = k.Read(hash)
}

// Write absorbs p into the state.
//
// The first chunkSize bytes ever written go straight to the stalk. Later
// data is split into chunks of chunkSize bytes: with one lane each chunk is
// hashed by s.leaf and its 32-byte result written to the stalk; with more
// lanes the data is buffered and handed to writeX in groups of
// lanes*chunkSize bytes.
//
// It always returns len(p) and a nil error; the results of the underlying
// writes are discarded. It panics with "shouldn't happen" if the internal
// buffering invariant checked below is violated.
func (s *State) Write(p []byte) (int, error) {
// p is re-sliced as it is consumed, so remember its original length.
written := len(p)

// The first chunk is written directly to the stalk.
if s.initialTodo > 0 {
taken := s.initialTodo
if len(p) < taken {
taken = len(p)
}
headP := p[:taken]
_, _ = s.stalk.Write(headP)
s.initialTodo -= taken
p = p[taken:]
}

if len(p) == 0 {
return written, nil
}

// If this is the first bit of data written after the initial chunk,
// we're out of the fast-path and allocate some buffers.
if s.buf == nil {
if s.lanes != 1 {
s.buf = make([]byte, int(s.lanes)*chunkSize)
} else {
// We create the buffer to signal we're past the first chunk,
// but do not use it.
s.buf = make([]byte, 0)
h := sha3.NewTurboShake128(0x0B)
s.leaf = &h
}
// Write the 8-byte marker 03 00 00 00 00 00 00 00 to the stalk after the
// first chunk and switch the stalk's domain-separation byte to 0x06.
_, _ = s.stalk.Write([]byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00})
s.stalk.SwitchDS(0x06)
}

// If we're just using one lane, we don't need to cache in a buffer
// for parallel hashing. Instead, we feed directly to TurboSHAKE.
if s.lanes == 1 {
for len(p) > 0 {
// Write to current leaf.
// Take at most what is still missing from the current chunk.
to := chunkSize - s.offset
if len(p) < to {
to = len(p)
}
_, _ = s.leaf.Write(p[:to])
p = p[to:]
s.offset += to

// Did we fill the chunk?
if s.offset == chunkSize {
// Read the leaf's 32-byte output, write it to the stalk and start
// a new leaf.
var cv [32]byte
_, _ = s.leaf.Read(cv[:])
_, _ = s.stalk.Write(cv[:])
armfazh marked this conversation as resolved.
Show resolved Hide resolved
s.leaf.Reset()
s.offset = 0
s.chunk++
}
}

return written, nil
}

// If we can't fill all our lanes or the buffer isn't empty, we write the
// data to the buffer.
if s.offset != 0 || len(p) < len(s.buf) {
to := len(s.buf) - s.offset
if len(p) < to {
to = len(p)
}
p2 := p[:to]
p = p[to:]
copy(s.buf[s.offset:], p2)
s.offset += to
}

// Absorb the buffer if we filled it
if s.offset == len(s.buf) {
s.writeX(s.buf)
armfazh marked this conversation as resolved.
Show resolved Hide resolved
s.offset = 0
}

// Note that at this point we may assume that s.offset = 0 if len(p) != 0
if len(p) != 0 && s.offset != 0 {
panic("shouldn't happen")
}

// Absorb a bunch of chunks at the same time.
// writeX returns the part of p it did not consume.
if len(p) >= int(s.lanes)*chunkSize {
p = s.writeX(p)
}

// Put the remainder in the buffer.
if len(p) > 0 {
copy(s.buf, p)
s.offset = len(p)
}

return written, nil
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

written is always len(p), and is never modified based on actual Write operations.
Also, let's propagate errors from Read/Write functions above.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

written is always len(p)

True, but p is modified, so we need to store len(p).

Also, let's propagate errors from Read/Write functions above.

Read/Write don't error.

}

// writeX absorbs as many whole groups of lanes*chunkSize bytes from p as
// possible and returns the remainder.
//
// It dispatches on the lane count: four lanes use writeX4, every other
// value is handled by writeX2.
func (s *State) writeX(p []byte) []byte {
	if s.lanes == 4 {
		return s.writeX4(p)
	}
	return s.writeX2(p)
}

// writeX4 consumes p in groups of four consecutive chunks. The four chunks
// of a group are hashed side by side with the four-way Keccak permutation,
// and their four 32-byte results are written to the stalk in chunk order.
//
// It returns the unconsumed tail of p, which is shorter than 4*chunkSize.
func (s *State) writeX4(p []byte) []byte {
	const (
		rate      = 168       // bytes XORed into each lane's state between permutations
		words     = rate / 8  // 64-bit words per full block (21)
		fullBytes = 48 * rate // part of a chunk covered by full blocks
		tailWords = 16        // 64-bit words in the final, partial block
	)

	for len(p) >= 4*chunkSize {
		// Declared inside the loop so every group starts from a zero state.
		var perm keccakf1600.StateX4
		st := perm.Initialize(true)

		// One view per chunk of the group; word w of chunk j lives at
		// st[4*w+j].
		c0, c1, c2, c3 := p, p[chunkSize:], p[2*chunkSize:], p[3*chunkSize:]

		// Full blocks.
		for off := 0; off < fullBytes; off += rate {
			for w := 0; w < words; w++ {
				at := off + 8*w
				st[4*w] ^= binary.LittleEndian.Uint64(c0[at:])
				st[4*w+1] ^= binary.LittleEndian.Uint64(c1[at:])
				st[4*w+2] ^= binary.LittleEndian.Uint64(c2[at:])
				st[4*w+3] ^= binary.LittleEndian.Uint64(c3[at:])
			}

			perm.Permute()
		}

		// Final, partial block of each chunk.
		for w := 0; w < tailWords; w++ {
			at := fullBytes + 8*w
			st[4*w] ^= binary.LittleEndian.Uint64(c0[at:])
			st[4*w+1] ^= binary.LittleEndian.Uint64(c1[at:])
			st[4*w+2] ^= binary.LittleEndian.Uint64(c2[at:])
			st[4*w+3] ^= binary.LittleEndian.Uint64(c3[at:])
		}

		// Padding for all four lanes: 0x0b directly after the data and
		// 0x80 in the top byte of the last word of the block.
		for lane := 0; lane < 4; lane++ {
			st[4*tailWords+lane] ^= 0x0b
			st[4*(words-1)+lane] ^= 0x80 << 56
		}

		perm.Permute()

		// The first four words of each lane form that chunk's 32-byte hash.
		var cvs [4 * 32]byte
		for w := 0; w < 4; w++ {
			binary.LittleEndian.PutUint64(cvs[8*w:], st[4*w])
			binary.LittleEndian.PutUint64(cvs[32+8*w:], st[4*w+1])
			binary.LittleEndian.PutUint64(cvs[2*32+8*w:], st[4*w+2])
			binary.LittleEndian.PutUint64(cvs[3*32+8*w:], st[4*w+3])
		}

		_, _ = s.stalk.Write(cvs[:])
		s.chunk += 4
		p = p[4*chunkSize:]
	}

	return p
}

// writeX2 consumes p in groups of two consecutive chunks. The two chunks of
// a group are hashed side by side with the two-way Keccak permutation, and
// their two 32-byte results are written to the stalk in chunk order.
//
// It returns the unconsumed tail of p, which is shorter than 2*chunkSize.
func (s *State) writeX2(p []byte) []byte {
// TODO On M2 Pro, 1/3 of the time is spent on this function
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure whether the compiler converts the LittleEndian encoding function into a single assembler instruction, like it does for AMD64.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably not. Also it'll probably do too many bounds checks.

// and LittleEndian.Uint64 excluding the actual permutation.
// Rewriting in assembler might be worthwhile.
for len(p) >= 2*chunkSize {
// Declared inside the loop so every group starts from a zero state.
var x2 keccakf1600.StateX2
Comment on lines +270 to +271
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for len(p) >= 2*chunkSize {
var x2 keccakf1600.StateX2
var x2 keccakf1600.StateX2
for len(p) >= 2*chunkSize {

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not correct: Initialize doesn't zero the buffer.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then Initialize may take care of the zero-ing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then in the common case we're zeroing twice, which has a non-trivial cost.

a := x2.Initialize(true)

// Absorb the 48 full 168-byte blocks of both chunks. Word i of the first
// chunk is XORed into a[i*2] and word i of the second chunk into a[i*2+1].
for offset := 0; offset < 48*168; offset += 168 {
for i := 0; i < 21; i++ {
a[i*2] ^= binary.LittleEndian.Uint64(
p[8*i+offset:],
)
a[i*2+1] ^= binary.LittleEndian.Uint64(
p[chunkSize+8*i+offset:],
)
}

x2.Permute()
}

// Absorb the remaining 16 words (128 bytes) of each chunk.
for i := 0; i < 16; i++ {
a[i*2] ^= binary.LittleEndian.Uint64(
p[8*i+48*168:],
)
a[i*2+1] ^= binary.LittleEndian.Uint64(
p[chunkSize+8*i+48*168:],
)
}

// Padding for both lanes: 0x0b directly after the data (word 16) and 0x80
// in the top byte of word 20.
a[16*2] ^= 0x0b
a[16*2+1] ^= 0x0b
a[20*2] ^= 0x80 << 56
a[20*2+1] ^= 0x80 << 56

x2.Permute()

// The first four words of each lane form that chunk's 32-byte hash.
var buf [32 * 2]byte
for i := 0; i < 4; i++ {
binary.LittleEndian.PutUint64(buf[8*i:], a[2*i])
binary.LittleEndian.PutUint64(buf[32+8*i:], a[2*i+1])
}

_, _ = s.stalk.Write(buf[:])
p = p[chunkSize*2:]
s.chunk += 2
}

return p
}

// Read fills p with output and returns whatever the stalk's Read returns.
//
// On the first call, while the stalk is still absorbing, the input is
// finalised first: the context string and its length encoding are written
// through Write, and, if more than the first chunk was written, any pending
// partial chunk(s) are hashed into the stalk, followed by the encoded number
// of chunks and the bytes 0xff 0xff. Later calls skip this step.
func (s *State) Read(p []byte) (int, error) {
if s.stalk.IsAbsorbing() {
// Write context string C
_, _ = s.Write(s.context)

// Write length_encode( |C| )
// buf[:8] holds the big-endian value; buf[8] will hold its byte count.
var buf [9]byte
binary.BigEndian.PutUint64(buf[:8], uint64(len(s.context)))

// Find first non-zero digit in big endian encoding of context length
// NOTE: buf[i] is evaluated before the i < 8 check; indexing buf[8] is
// in range only because buf has 9 elements.
i := 0
for buf[i] == 0 && i < 8 {
i++
}

buf[8] = byte(8 - i) // number of bytes to represent |C|
_, _ = s.Write(buf[i:])

// We need to write the chunk number if we're past the first chunk.
if s.buf != nil {
// Write last remaining chunk(s)
var cv [32]byte
if s.lanes == 1 {
// A non-zero offset means the current leaf holds a partial chunk.
if s.offset != 0 {
_, _ = s.leaf.Read(cv[:])
_, _ = s.stalk.Write(cv[:])
s.chunk++
}
} else {
// Hash whatever is left in the buffer one chunk at a time; the last
// piece may be shorter than chunkSize.
remainingBuf := s.buf[:s.offset]
for len(remainingBuf) > 0 {
h := sha3.NewTurboShake128(0x0B)
Comment on lines +347 to +348
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we reuse the hashfn?

Suggested change
for len(remainingBuf) > 0 {
h := sha3.NewTurboShake128(0x0B)
h := sha3.NewTurboShake128(0x0B)
for len(remainingBuf) > 0 {
h.Reset()

to := chunkSize
if len(remainingBuf) < to {
to = len(remainingBuf)
}
_, _ = h.Write(remainingBuf[:to])
_, _ = h.Read(cv[:])
_, _ = s.stalk.Write(cv[:])
s.chunk++
remainingBuf = remainingBuf[to:]
}
}

// Write length_encode( chunk )
binary.BigEndian.PutUint64(buf[:8], uint64(s.chunk))

// Find first non-zero digit in big endian encoding of number of chunks
i = 0
for buf[i] == 0 && i < 8 {
i++
}

buf[8] = byte(8 - i) // number of bytes to represent number of chunks.
_, _ = s.stalk.Write(buf[i:])
_, _ = s.stalk.Write([]byte{0xff, 0xff})
}
}

return s.stalk.Read(p)
}
Loading