Skip to content

Commit

Permalink
argon2: Avo port of blamka_amd64.s
Browse files Browse the repository at this point in the history
This implementation utilizes the same registers found in the reference
implementation, aiming to produce a minimal semantic diff between the
Avo-generated output and the original hand-written assembly.

To verify the Avo implementation, the reference and Avo-generated
assembly files are fed to `go tool asm`, capturing the debug output into
corresponding temp files. The debug output contains supplementary
metadata (line numbers, instruction offsets, and source file references)
that must be removed in order to obtain a semantic diff of the two
files. This is accomplished via a small utility script written in awk.

Commands used to verify Avo output:

GOROOT=$(go env GOROOT)
ASM_PATH="argon2/blamka_amd64.s"
REFERENCE="b2d3a6a4b4d36521cd7f653879cf6981e7c5c340"

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  <(git cat-file -p "$REFERENCE:$ASM_PATH") \
  > /tmp/reference.s

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  "$ASM_PATH" \
  > /tmp/avo.s

normalize(){
  awk '{
    $1=$2=$3="";
    print substr($0,4)
  }'
}

diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s)

Change-Id: I3567eb80ef80dff248225f17470122c0a4e6951e
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/600315
Reviewed-by: Filippo Valsorda <filippo@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
  • Loading branch information
Garrett-Bodley authored and rolandshoemaker committed Sep 4, 2024
1 parent bf5f14f commit 38a0b5d
Show file tree
Hide file tree
Showing 4 changed files with 3,074 additions and 212 deletions.
287 changes: 287 additions & 0 deletions argon2/_asm/blamka_amd64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
. "github.com/mmcloughlin/avo/reg"
_ "golang.org/x/crypto/argon2"
)

//go:generate go run . -out ../blamka_amd64.s -pkg argon2

// main drives the Avo generator: it declares the target package and build
// constraint, emits the three assembly routines, and hands off to Generate.
func main() {
	Package("golang.org/x/crypto/argon2")
	ConstraintExpr("amd64,gc,!purego")

	// The routines are emitted in this fixed order.
	emitters := []func(){blamkaSSE4, mixBlocksSSE2, xorBlocksSSE2}
	for _, emit := range emitters {
		emit()
	}

	Generate()
}

// blamkaSSE4 emits the blamkaSSE4 routine.
//
// The generated code loads the b argument into AX, keeps the two PSHUFB masks
// (c40 and c48) in X10 and X11, and then runs sixteen rounds over the memory
// AX points to, using X8 and X9 as scratch registers:
//
//   - eight BLAMKA_ROUND_0 passes at 64-bit word offsets 0, 16, ..., 112,
//     each covering sixteen consecutive words;
//   - eight BLAMKA_ROUND_1 passes at word offsets 0, 2, ..., 14, each covering
//     eight two-word groups spaced sixteen words apart.
func blamkaSSE4() {
	Implement("blamkaSSE4")
	Attributes(NOSPLIT)
	AllocLocal(0)

	// AX holds the block pointer for the whole routine.
	Load(Param("b"), RAX)

	// Obtain both mask symbols first (c40 before c48), then load them.
	maskC40 := c40_DATA()
	maskC48 := c48_DATA()
	MOVOU(maskC40, X10)
	MOVOU(maskC48, X11)

	for off := 0; off < 128; off += 16 {
		BLAMKA_ROUND_0(AX, off, X8, X9, X10, X11)
	}

	for off := 0; off < 16; off += 2 {
		BLAMKA_ROUND_1(AX, off, X8, X9, X10, X11)
	}

	RET()
}

// mixBlocksSSE2 emits the mixBlocksSSE2 routine, which writes a XOR b XOR c
// to out.
//
// The generated loop handles 16 bytes per iteration. The counter in DI starts
// at 128 and drops by 2 each time round, so the loop body runs 64 times and
// covers 1024 bytes of each operand.
func mixBlocksSSE2() {
	const (
		counterStart = 128 // initial value of the DI loop counter
		counterStep  = 2   // DI decrement per iteration
		chunkBytes   = 16  // pointer advance per iteration (one XMM register)
	)

	Implement("mixBlocksSSE2")
	Attributes(NOSPLIT)
	AllocLocal(0)

	Load(Param("out"), RDX)
	Load(Param("a"), RAX)
	Load(Param("b"), RBX)
	Load(Param("c"), RCX)
	MOVQ(U32(counterStart), RDI)

	// Memory operands addressing the current 16-byte chunk of each argument.
	srcA := Mem{Base: AX}.Offset(0)
	srcB := Mem{Base: BX}.Offset(0)
	srcC := Mem{Base: CX}.Offset(0)
	dst := Mem{Base: DX}.Offset(0)

	Label("loop")

	// X0 = a ^ b ^ c, stored to out.
	MOVOU(srcA, X0)
	MOVOU(srcB, X1)
	MOVOU(srcC, X2)
	PXOR(X1, X0)
	PXOR(X2, X0)
	MOVOU(X0, dst)

	// Step every pointer to the next chunk.
	ADDQ(Imm(chunkBytes), RAX)
	ADDQ(Imm(chunkBytes), RBX)
	ADDQ(Imm(chunkBytes), RCX)
	ADDQ(Imm(chunkBytes), RDX)

	// Loop while the counter is still above zero after the decrement.
	SUBQ(Imm(counterStep), RDI)
	JA(LabelRef("loop"))
	RET()
}

// xorBlocksSSE2 emits the xorBlocksSSE2 routine, which XORs a, b and c into
// the existing contents of out (out ^= a ^ b ^ c).
//
// The generated loop handles 16 bytes per iteration. The counter in DI starts
// at 128 and drops by 2 each time round, so the loop body runs 64 times and
// covers 1024 bytes of each operand.
func xorBlocksSSE2() {
	const (
		counterStart = 128 // initial value of the DI loop counter
		counterStep  = 2   // DI decrement per iteration
		chunkBytes   = 16  // pointer advance per iteration (one XMM register)
	)

	Implement("xorBlocksSSE2")
	Attributes(NOSPLIT)
	AllocLocal(0)

	Load(Param("out"), RDX)
	Load(Param("a"), RAX)
	Load(Param("b"), RBX)
	Load(Param("c"), RCX)
	MOVQ(U32(counterStart), RDI)

	// Memory operands addressing the current 16-byte chunk of each argument.
	srcA := Mem{Base: AX}.Offset(0)
	srcB := Mem{Base: BX}.Offset(0)
	srcC := Mem{Base: CX}.Offset(0)
	dst := Mem{Base: DX}.Offset(0)

	Label("loop")

	// X0 = a ^ b ^ c ^ out, stored back to out.
	MOVOU(srcA, X0)
	MOVOU(srcB, X1)
	MOVOU(srcC, X2)
	MOVOU(dst, X3)
	PXOR(X1, X0)
	PXOR(X2, X0)
	PXOR(X3, X0)
	MOVOU(X0, dst)

	// Step every pointer to the next chunk.
	ADDQ(Imm(chunkBytes), RAX)
	ADDQ(Imm(chunkBytes), RBX)
	ADDQ(Imm(chunkBytes), RCX)
	ADDQ(Imm(chunkBytes), RDX)

	// Loop while the counter is still above zero after the decrement.
	SUBQ(Imm(counterStep), RDI)
	JA(LabelRef("loop"))
	RET()
}

// SHUFFLE emits the instruction sequence that permutes the 64-bit lanes of
// three register pairs, treating each pair as one four-lane row. Writing a
// register as (low lane, high lane):
//
//   - (v2, v3): (b0,b1 | b2,b3) becomes (b1,b2 | b3,b0)
//   - (v4, v5): (c0,c1 | c2,c3) becomes (c2,c3 | c0,c1)
//   - (v6, v7): (d0,d1 | d2,d3) becomes (d3,d0 | d1,d2)
//
// t1 and t2 are scratch registers and are clobbered. Only the high lane of t2
// is ever consumed, and always after a PUNPCKLQDQ has written it, so t2 needs
// no particular value on entry. SHUFFLE_INV applies the opposite permutation.
//
// NOTE: this generator is meant to reproduce the earlier hand-written
// blamka_amd64.s instruction for instruction, so the order below should not
// be changed.
func SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) {
// Swap v4 and v5 through t1.
MOVO(v4, t1)
MOVO(v5, v4)
MOVO(t1, v5)
// Keep a copy of v6 = (d0, d1); it is needed to rebuild v7 below.
MOVO(v6, t1)
// t2.hi = d0
PUNPCKLQDQ(v6, t2)
// v6 = (d1, d3)
PUNPCKHQDQ(v7, v6)
// v6 = (d3, d0)
PUNPCKHQDQ(t2, v6)
// t2.hi = d2
PUNPCKLQDQ(v7, t2)
// v7 = (d0, d1)
MOVO(t1, v7)
// Keep a copy of v2 = (b0, b1) for the final step.
MOVO(v2, t1)
// v7 = (d1, d2)
PUNPCKHQDQ(t2, v7)
// t2.hi = b2
PUNPCKLQDQ(v3, t2)
// v2 = (b1, b2)
PUNPCKHQDQ(t2, v2)
// t2.hi = b0
PUNPCKLQDQ(t1, t2)
// v3 = (b3, b0)
PUNPCKHQDQ(t2, v3)
}

// SHUFFLE_INV emits the instruction sequence that reverses the lane
// permutation produced by SHUFFLE. Writing a register as (low lane, high
// lane), and treating each register pair as one four-lane row:
//
//   - (v2, v3): (b0,b1 | b2,b3) becomes (b3,b0 | b1,b2)
//   - (v4, v5): (c0,c1 | c2,c3) becomes (c2,c3 | c0,c1)
//   - (v6, v7): (d0,d1 | d2,d3) becomes (d1,d2 | d3,d0)
//
// t1 and t2 are scratch registers and are clobbered. Only the high lane of t2
// is ever consumed, and always after a PUNPCKLQDQ has written it, so t2 needs
// no particular value on entry.
//
// NOTE: this generator is meant to reproduce the earlier hand-written
// blamka_amd64.s instruction for instruction, so the order below should not
// be changed.
func SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2 VecPhysical) {
// Swap v4 and v5 through t1.
MOVO(v4, t1)
MOVO(v5, v4)
MOVO(t1, v5)
// Keep a copy of v2 = (b0, b1); it is needed to rebuild v3 below.
MOVO(v2, t1)
// t2.hi = b0
PUNPCKLQDQ(v2, t2)
// v2 = (b1, b3)
PUNPCKHQDQ(v3, v2)
// v2 = (b3, b0)
PUNPCKHQDQ(t2, v2)
// t2.hi = b2
PUNPCKLQDQ(v3, t2)
// v3 = (b0, b1)
MOVO(t1, v3)
// Keep a copy of v6 = (d0, d1) for the final step.
MOVO(v6, t1)
// v3 = (b1, b2)
PUNPCKHQDQ(t2, v3)
// t2.hi = d2
PUNPCKLQDQ(v7, t2)
// v6 = (d1, d2)
PUNPCKHQDQ(t2, v6)
// t2.hi = d0
PUNPCKLQDQ(t1, t2)
// v7 = (d3, d0)
PUNPCKHQDQ(t2, v7)
}

// HALF_ROUND emits one half round over eight vector registers.
//
// The same 31-instruction sequence is emitted twice: first for the registers
// (v0, v2, v4, v6), then for (v1, v3, v5, v7). t0 is a scratch register and is
// clobbered. c40 and c48 are the registers holding the PSHUFB masks used for
// the second and third shuffle of each sequence.
func HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48 VecPhysical) {
	// mulAdd emits, per 64-bit lane: acc += x + 2*(lo32(acc)*lo32(x)),
	// with the product held in t0.
	mulAdd := func(acc, x VecPhysical) {
		MOVO(acc, t0)
		PMULULQ(x, t0)
		PADDQ(x, acc)
		PADDQ(t0, acc)
		PADDQ(t0, acc)
	}

	// quarter emits the four mix steps for one (a, b, c, d) register group.
	quarter := func(a, b, c, d VecPhysical) {
		mulAdd(a, b)
		PXOR(a, d)
		PSHUFD(Imm(0xB1), d, d) // swap the 32-bit halves of each lane

		mulAdd(c, d)
		PXOR(c, b)
		PSHUFB(c40, b)

		mulAdd(a, b)
		PXOR(a, d)
		PSHUFB(c48, d)

		mulAdd(c, d)
		PXOR(c, b)

		// b = (b + b) ^ (b >> 63), i.e. rotate each lane left by one bit.
		MOVO(b, t0)
		PADDQ(b, t0)
		PSRLQ(Imm(63), b)
		PXOR(t0, b)
	}

	quarter(v0, v2, v4, v6)
	quarter(v1, v3, v5, v7)
}

// LOAD_MSG_0 emits eight unaligned 16-byte loads that fill X0..X7 from
// sixteen consecutive 64-bit words starting at word index off, relative to
// the address held in block.
func LOAD_MSG_0(block GPPhysical, off int) {
	dsts := [...]VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7}
	for i := 0; i < len(dsts); i++ {
		word := off + 2*i // each register takes two 8-byte words
		MOVOU(Mem{Base: block}.Offset(8*word), dsts[i])
	}
}

// STORE_MSG_0 emits eight unaligned 16-byte stores that write X0..X7 back to
// sixteen consecutive 64-bit words starting at word index off, relative to
// the address held in block. It uses the same addresses as LOAD_MSG_0.
func STORE_MSG_0(block GPPhysical, off int) {
	srcs := [...]VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7}
	for i := 0; i < len(srcs); i++ {
		word := off + 2*i // each register covers two 8-byte words
		MOVOU(srcs[i], Mem{Base: block}.Offset(8*word))
	}
}

// LOAD_MSG_1 emits eight unaligned 16-byte loads that fill X0..X7 from
// strided locations: register Xi is loaded from word index off + 16*i
// (two 64-bit words each), relative to the address held in block.
func LOAD_MSG_1(block GPPhysical, off int) {
	dsts := [...]VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7}
	for i := 0; i < len(dsts); i++ {
		word := off + 16*i // consecutive registers are 16 words (128 bytes) apart
		MOVOU(Mem{Base: block}.Offset(8*word), dsts[i])
	}
}

// STORE_MSG_1 emits eight unaligned 16-byte stores that write X0..X7 back to
// strided locations: register Xi is stored at word index off + 16*i
// (two 64-bit words each), relative to the address held in block. It uses the
// same addresses as LOAD_MSG_1.
func STORE_MSG_1(block GPPhysical, off int) {
	srcs := [...]VecPhysical{X0, X1, X2, X3, X4, X5, X6, X7}
	for i := 0; i < len(srcs); i++ {
		word := off + 16*i // consecutive registers are 16 words (128 bytes) apart
		MOVOU(srcs[i], Mem{Base: block}.Offset(8*word))
	}
}

// BLAMKA_ROUND_0 emits one full round over sixteen consecutive 64-bit words
// starting at word index off: load into X0..X7, half round, SHUFFLE, half
// round, SHUFFLE_INV, store back.
//
// t0 and t1 are scratch registers (t0 doubles as the first scratch register
// of the shuffles); c40 and c48 hold the PSHUFB masks used by HALF_ROUND.
func BLAMKA_ROUND_0(block GPPhysical, off int, t0, t1, c40, c48 VecPhysical) {
	halfRound := func() {
		HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48)
	}

	LOAD_MSG_0(block, off)

	halfRound()
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1)

	halfRound()
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1)

	STORE_MSG_0(block, off)
}

// BLAMKA_ROUND_1 emits one full round over the strided words addressed by
// LOAD_MSG_1/STORE_MSG_1 for word index off: load into X0..X7, half round,
// SHUFFLE, half round, SHUFFLE_INV, store back.
//
// t0 and t1 are scratch registers (t0 doubles as the first scratch register
// of the shuffles); c40 and c48 hold the PSHUFB masks used by HALF_ROUND.
func BLAMKA_ROUND_1(block GPPhysical, off int, t0, t1, c40, c48 VecPhysical) {
	halfRound := func() {
		HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48)
	}

	LOAD_MSG_1(block, off)

	halfRound()
	SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1)

	halfRound()
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1)

	STORE_MSG_1(block, off)
}

// ##------------------DATA SECTION-------------------##

// c40_DATA_ptr and c48_DATA_ptr cache the memory operands returned by GLOBL
// for the ·c40 and ·c48 symbols. Each stays nil until the matching c40_DATA or
// c48_DATA call first declares its symbol, so a symbol is declared at most
// once per generator run.
var c40_DATA_ptr, c48_DATA_ptr *Mem

// c40_DATA returns the memory operand for the read-only ·c40 symbol,
// declaring the symbol and its 16 bytes of data on first use and returning
// the cached operand on every later call.
//
// Read as a PSHUFB mask, the data selects source bytes 3,4,5,6,7,0,1,2 within
// each 8-byte lane, i.e. it rotates each 64-bit lane left by 40 bits.
func c40_DATA() Mem {
	if c40_DATA_ptr == nil {
		sym := GLOBL("·c40", NOPTR|RODATA)
		c40_DATA_ptr = &sym
		DATA(0x00, U64(0x0201000706050403))
		DATA(0x08, U64(0x0a09080f0e0d0c0b))
	}
	return *c40_DATA_ptr
}
// c48_DATA returns the memory operand for the read-only ·c48 symbol,
// declaring the symbol and its 16 bytes of data on first use and returning
// the cached operand on every later call.
//
// Read as a PSHUFB mask, the data selects source bytes 2,3,4,5,6,7,0,1 within
// each 8-byte lane, i.e. it rotates each 64-bit lane left by 48 bits.
func c48_DATA() Mem {
	if c48_DATA_ptr == nil {
		sym := GLOBL("·c48", NOPTR|RODATA)
		c48_DATA_ptr = &sym
		DATA(0x00, U64(0x0100070605040302))
		DATA(0x08, U64(0x09080f0e0d0c0b0a))
	}
	return *c48_DATA_ptr
}
15 changes: 15 additions & 0 deletions argon2/_asm/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
module argon2/_asm

go 1.23

require (
github.com/mmcloughlin/avo v0.6.0
golang.org/x/crypto v0.26.0
)

require (
golang.org/x/mod v0.20.0 // indirect
golang.org/x/sync v0.8.0 // indirect
golang.org/x/sys v0.24.0 // indirect
golang.org/x/tools v0.24.0 // indirect
)
12 changes: 12 additions & 0 deletions argon2/_asm/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY=
github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8=
golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=
Loading

0 comments on commit 38a0b5d

Please sign in to comment.