Skip to content

Commit

Permalink
crypto/sha256: add sha-ni implementation
Browse files Browse the repository at this point in the history
goos: linux
goarch: amd64
pkg: crypto/sha256
cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz
                    │  bench.old   │              bench.new              │
                    │    sec/op    │   sec/op     vs base                │
Hash8Bytes/New-4      169.20n ± 7%   65.40n ± 5%  -61.35% (p=0.000 n=10)
Hash8Bytes/Sum224-4   166.10n ± 3%   65.20n ± 8%  -60.74% (p=0.000 n=10)
Hash8Bytes/Sum256-4   168.50n ± 6%   63.58n ± 7%  -62.27% (p=0.000 n=10)
Hash1K/New-4          2275.5n ± 5%   618.5n ± 2%  -72.82% (p=0.000 n=10)
Hash1K/Sum224-4       2364.5n ± 1%   618.1n ± 1%  -73.86% (p=0.000 n=10)
Hash1K/Sum256-4       2338.5n ± 2%   613.0n ± 2%  -73.79% (p=0.000 n=10)
Hash8K/New-4          17.530µ ± 2%   4.501µ ± 1%  -74.33% (p=0.000 n=10)
Hash8K/Sum224-4       17.456µ ± 2%   4.505µ ± 1%  -74.19% (p=0.000 n=10)
Hash8K/Sum256-4       17.417µ ± 2%   4.504µ ± 1%  -74.14% (p=0.000 n=10)
geomean                1.897µ        564.3n       -70.25%

                    │  bench.old   │               bench.new                │
                    │     B/s      │      B/s       vs base                 │
Hash8Bytes/New-4      45.11Mi ± 6%   116.66Mi ± 5%  +158.62% (p=0.000 n=10)
Hash8Bytes/Sum224-4   45.92Mi ± 3%   117.04Mi ± 8%  +154.89% (p=0.000 n=10)
Hash8Bytes/Sum256-4   45.29Mi ± 6%   120.00Mi ± 7%  +164.99% (p=0.000 n=10)
Hash1K/New-4          429.2Mi ± 5%   1578.9Mi ± 2%  +267.92% (p=0.000 n=10)
Hash1K/Sum224-4       413.0Mi ± 1%   1579.8Mi ± 1%  +282.49% (p=0.000 n=10)
Hash1K/Sum256-4       417.6Mi ± 1%   1593.1Mi ± 2%  +281.53% (p=0.000 n=10)
Hash8K/New-4          445.7Mi ± 1%   1735.9Mi ± 1%  +289.50% (p=0.000 n=10)
Hash8K/Sum224-4       447.6Mi ± 2%   1734.5Mi ± 1%  +287.54% (p=0.000 n=10)
Hash8K/Sum256-4       448.6Mi ± 2%   1734.8Mi ± 1%  +286.75% (p=0.000 n=10)
geomean               204.3Mi         686.8Mi       +236.11%

                    │  bench.old   │              bench.new              │
                    │     B/op     │    B/op     vs base                 │
Hash8Bytes/New-4      0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum224-4   0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum256-4   0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash1K/New-4          0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash1K/Sum224-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash1K/Sum256-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8K/New-4          0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8K/Sum224-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8K/Sum256-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
geomean                          ²               +0.00%                ²
¹ all samples are equal
² summaries must be >0 to compute geomean

                    │  bench.old   │              bench.new              │
                    │  allocs/op   │ allocs/op   vs base                 │
Hash8Bytes/New-4      0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum224-4   0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum256-4   0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash1K/New-4          0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash1K/Sum224-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash1K/Sum256-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8K/New-4          0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8K/Sum224-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8K/Sum256-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
geomean                          ²               +0.00%                ²
¹ all samples are equal
² summaries must be >0 to compute geomean

Fixes #50543.

Change-Id: Ie9783647fe82f40fcbd91989a96a24f2d3d5b9a0
Reviewed-on: https://go-review.googlesource.com/c/go/+/408795
Reviewed-by: Paulo Gomes <paulo.gomes.uk@gmail.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Russ Cox <rsc@golang.org>
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
  • Loading branch information
tpaint authored and gopherbot committed Mar 31, 2023
1 parent e29dd78 commit 1a64574
Show file tree
Hide file tree
Showing 2 changed files with 152 additions and 9 deletions.
1 change: 1 addition & 0 deletions src/crypto/sha256/sha256block_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ package sha256
import "internal/cpu"

// useAVX2 is true when the CPU reports both the AVX2 and BMI2 features.
// The assembly block function tests this flag to select its AVX2 code path.
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
// useSHA is true when useAVX2 holds and the CPU additionally reports the SHA
// feature. The assembly block function tests this flag to select its sha-ni
// code path.
// NOTE(review): presumably gated on useAVX2 because the sha-ni path uses
// VEX-encoded moves and shares tables with the AVX2 code — confirm.
var useSHA = useAVX2 && cpu.X86.HasSHA
160 changes: 151 additions & 9 deletions src/crypto/sha256/sha256block_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@

#define XFER Y9

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13

#define NUM_BYTES DX
Expand Down Expand Up @@ -232,14 +232,14 @@
RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
; \
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
XORL g, y2; \ // y2 = f^g // CH
XORL g, y2; \ // y2 = f^g // CH
VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16] // y1 = (e >> 6) // S1
RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
; \
ANDL e, y2; \ // y2 = (f^g)&e // CH
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
ADDL h, d; \ // d = k + w + h + d // --
ADDL h, d; \ // d = k + w + h + d // --
; \
ANDL b, y3; \ // y3 = (a|c)&b // MAJA
VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
Expand Down Expand Up @@ -270,7 +270,7 @@
MOVL a, y3; \ // y3 = a // MAJA
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ORL c, y3; \ // y3 = a|c // MAJA
; \
VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
Expand Down Expand Up @@ -316,7 +316,7 @@
; \
MOVL a, y3; \ // y3 = a // MAJA
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
; \
VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
Expand Down Expand Up @@ -495,7 +495,7 @@
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ORL c, y3; \ // y3 = a|c // MAJA
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
Expand Down Expand Up @@ -531,7 +531,7 @@
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ORL c, y3; \ // y3 = a|c // MAJA
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
Expand All @@ -550,9 +550,80 @@
; \
ADDL y3, h // h = t1 + S0 + MAJ // --

// Definitions for sha-ni version
//
// The sha-ni implementation uses the Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1 and SHA256MSG2.
// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version.
//
// Reference
// S. Gulley, et al, "New Instructions Supporting the Secure Hash
// Algorithm on Intel® Architecture Processors", July 2013
// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
//

// Register assignments for the sha-ni code path.
// General-purpose registers hold pointers and the byte count; X registers hold
// the message, the working state and the per-block saved digest.
#define digestPtr DI // input/output, base pointer to digest hash vector H0, H1, ..., H7
#define dataPtr SI // input, base pointer to first input data block
#define numBytes DX // input, number of input bytes to be processed
#define sha256Constants AX // round constants from K256 table, indexed by round number x 32
#define msg X0 // input data
#define state0 X1 // round intermediates and outputs
#define state1 X2
#define m0 X3 // m0, m1, ..., m4 -- round message temps
#define m1 X4
#define m2 X5
#define m3 X6
#define m4 X7 // also overwritten as scratch by rounds12to59
#define shufMask X8 // input data endian conversion control mask
#define abefSave X9 // digest hash vector inter-block buffer abef
#define cdghSave X10 // digest hash vector inter-block buffer cdgh

// The four helper macros below are not used directly; their names are passed as
// the sha256Msg1 / movop arguments of rounds0to11 and rounds12to59, which then
// invoke them with (m,a) or (m,msg).

// nop expands to nothing and ignores both arguments.
#define nop(m,a) // nop instead of final SHA256MSG1 for first and last few rounds

// sha256msg1 emits a single SHA256MSG1 with m as the first and a as the second operand.
#define sha256msg1(m,a) \ // final SHA256MSG1 for middle rounds that require it
SHA256MSG1 m, a

// vmov copies a into b (Go assembler operand order: source first, destination last).
#define vmov(a,b) \ // msg copy for all but rounds 12-15
VMOVDQA a, b

// vmovrev copies in the opposite direction of vmov: b into a.
#define vmovrev(a,b) \ // reverse copy for rounds 12-15
VMOVDQA b, a

// sha rounds 0 to 11
// identical with the exception of the final msg op
// which is replaced with a nop for rounds where it is not needed
// refer to Gulley, et al for more information
//
// Parameters:
//   m          - message temp register that receives a copy of the shuffled input
//   a          - second operand handed to the final msg op
//   c          - chunk index; selects the 16 input bytes at c*16(dataPtr) and
//                the constants at (c*32)(sha256Constants)
//   sha256Msg1 - name of the macro invoked last as sha256Msg1(m,a),
//                e.g. nop or sha256msg1
//
// Expansion, in order: load 16 input bytes into msg, shuffle them with
// shufMask, save the result in m, add the round constants to msg, issue
// SHA256RNDS2 with (state0, state1), shuffle msg with PSHUFD $0x0e, issue
// SHA256RNDS2 again with the state operands swapped, then run the final msg op.
// Clobbers msg and m, and updates state0 and state1.
#define rounds0to11(m,a,c,sha256Msg1) \
VMOVDQU c*16(dataPtr), msg \
PSHUFB shufMask, msg \
VMOVDQA msg, m \
PADDD (c*32)(sha256Constants), msg \
SHA256RNDS2 msg, state0, state1 \
PSHUFD $0x0e, msg, msg \
SHA256RNDS2 msg, state1, state0 \
sha256Msg1 (m,a)

// sha rounds 12 to 59
// identical with the exception of the final msg op
// and the reverse copy(m,msg) in round 12 which is required
// after the last data load
// refer to Gulley, et al for more information
//
// Parameters:
//   m          - message temp register for this group of rounds
//   c          - index selecting the constants at (c*32)(sha256Constants)
//   a          - message temp combined with m via PALIGNR; also the second
//                operand handed to the final msg op
//   t          - message temp that accumulates m4 and is then updated by
//                SHA256MSG2 m, t
//   sha256Msg1 - name of the macro invoked last as sha256Msg1(m,a),
//                e.g. nop or sha256msg1
//   movop      - name of the macro invoked first as movop(m,msg): vmov copies
//                m into msg, vmovrev copies msg into m
//
// Expansion, in order: run movop, add the round constants to msg, issue
// SHA256RNDS2 with (state0, state1), copy m into m4, PALIGNR $4 with a, add m4
// into t, SHA256MSG2 m into t, shuffle msg with PSHUFD $0x0e, issue SHA256RNDS2
// again with the state operands swapped, then run the final msg op.
// Clobbers msg and m4, and updates t, state0 and state1.
#define rounds12to59(m,c,a,t,sha256Msg1,movop) \
movop (m,msg) \
PADDD (c*32)(sha256Constants), msg \
SHA256RNDS2 msg, state0, state1 \
VMOVDQA m, m4 \
PALIGNR $4, a, m4 \
PADDD m4, t \
SHA256MSG2 m, t \
PSHUFD $0x0e, msg, msg \
SHA256RNDS2 msg, state1, state0 \
sha256Msg1 (m,a)

TEXT ·block(SB), 0, $536-32
CMPB ·useAVX2(SB), $1
JE avx2
CMPB ·useSHA(SB), $1
JE sha_ni
CMPB ·useAVX2(SB), $1
JE avx2

MOVQ p_base+8(FP), SI
MOVQ p_len+16(FP), DX
Expand Down Expand Up @@ -862,6 +933,77 @@ done_hash:
VZEROUPPER
RET

sha_ni:
MOVQ dig+0(FP), digestPtr // init digest hash vector H0, H1,..., H7 pointer
MOVQ p_base+8(FP), dataPtr // init input data base pointer
MOVQ p_len+16(FP), numBytes // get number of input bytes to hash
SHRQ $6, numBytes // force modulo 64 input buffer length
SHLQ $6, numBytes
CMPQ numBytes, $0 // exit early for zero-length input buffer
JEQ done
ADDQ dataPtr, numBytes // point numBytes to end of input buffer
VMOVDQU (0*16)(digestPtr), state0 // load initial hash values and reorder
VMOVDQU (1*16)(digestPtr), state1 // DCBA, HGFE -> ABEF, CDGH
PSHUFD $0xb1, state0, state0 // CDAB
PSHUFD $0x1b, state1, state1 // EFGH
VMOVDQA state0, m4