Skip to content

Commit

Permalink
sha2: wasm32 simd128 backends (#562)
Browse files Browse the repository at this point in the history
This PR ports the AVX implementation of SHA-512 to simd128. It also
implements the related version of SHA-256 from
https://github.com/aws-samples/sha2-with-c-intrinsic/blob/master/src/sha256_compress_x86_64_avx.c
in simd128.
This PR also adds wasm32 testing in CI using wasmtime. Since wasm has no
runtime feature detection, this backend is only used when the crate is
compiled with the `-C target-feature=+simd128` flag.


Benchmarks on AMD Ryzen 9 7950X3D, running with wasmtime 26.0.0
(c92317bcc 2024-10-22) on rustc 1.84.0-nightly (b3f75cc87 2024-11-02):

```
+ RUSTFLAGS='-C target-feature=+simd128'
+ cargo +nightly bench -q --bench mod --target wasm32-wasip1

running 8 tests
test sha256_10    ... bench:          18.71 ns/iter (+/- 1.62) = 555 MB/s
test sha256_100   ... bench:         167.94 ns/iter (+/- 0.62) = 598 MB/s
test sha256_1000  ... bench:       1,656.93 ns/iter (+/- 142.75) = 603 MB/s
test sha256_10000 ... bench:      15,601.30 ns/iter (+/- 1,268.65) = 640 MB/s
test sha512_10    ... bench:          14.35 ns/iter (+/- 0.09) = 714 MB/s
test sha512_100   ... bench:         137.37 ns/iter (+/- 0.87) = 729 MB/s
test sha512_1000  ... bench:       1,261.63 ns/iter (+/- 105.65) = 793 MB/s
test sha512_10000 ... bench:      12,434.24 ns/iter (+/- 24.46) = 804 MB/s

test result: ok. 0 passed; 0 failed; 0 ignored; 8 measured; 0 filtered out; finished in 4.40s

+ RUSTFLAGS='-C target-feature=-simd128'
+ cargo +nightly bench -q --bench mod --target wasm32-wasip1

running 8 tests
test sha256_10    ... bench:         155.59 ns/iter (+/- 1.08) = 64 MB/s
test sha256_100   ... bench:       1,539.48 ns/iter (+/- 9.18) = 64 MB/s
test sha256_1000  ... bench:      15,207.34 ns/iter (+/- 81.67) = 65 MB/s
test sha256_10000 ... bench:     151,547.98 ns/iter (+/- 1,170.30) = 65 MB/s
test sha512_10    ... bench:          98.59 ns/iter (+/- 0.45) = 102 MB/s
test sha512_100   ... bench:         980.99 ns/iter (+/- 3.43) = 102 MB/s
test sha512_1000  ... bench:       9,622.94 ns/iter (+/- 29.97) = 103 MB/s
test sha512_10000 ... bench:      95,977.25 ns/iter (+/- 310.30) = 104 MB/s

test result: ok. 0 passed; 0 failed; 0 ignored; 8 measured; 0 filtered out; finished in 6.55s

+ RUSTFLAGS='--cfg sha2_backend="soft" -C target-feature=+simd128'
+ cargo +nightly bench -q --bench mod --target wasm32-wasip1

running 8 tests
test sha256_10    ... bench:         142.07 ns/iter (+/- 13.71) = 70 MB/s
test sha256_100   ... bench:       1,404.58 ns/iter (+/- 10.83) = 71 MB/s
test sha256_1000  ... bench:      14,823.81 ns/iter (+/- 1,346.05) = 67 MB/s
test sha256_10000 ... bench:     139,001.94 ns/iter (+/- 978.58) = 71 MB/s
test sha512_10    ... bench:          90.39 ns/iter (+/- 7.82) = 111 MB/s
test sha512_100   ... bench:         893.20 ns/iter (+/- 72.22) = 111 MB/s
test sha512_1000  ... bench:       8,812.46 ns/iter (+/- 878.60) = 113 MB/s
test sha512_10000 ... bench:      87,887.02 ns/iter (+/- 394.70) = 113 MB/s

test result: ok. 0 passed; 0 failed; 0 ignored; 8 measured; 0 filtered out; finished in 8.62s

```
  • Loading branch information
max-te authored Nov 3, 2024
1 parent a467ac6 commit a68c77e
Show file tree
Hide file tree
Showing 6 changed files with 403 additions and 2 deletions.
34 changes: 33 additions & 1 deletion .github/workflows/sha2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
set-msrv:
uses: RustCrypto/actions/.github/workflows/set-msrv.yml@master
with:
msrv: 1.79.0
msrv: 1.81.0

# Builds for no_std platforms
build:
Expand Down Expand Up @@ -196,6 +196,38 @@ jobs:
env:
RUSTFLAGS: -Dwarnings --cfg sha2_backend="riscv-zknh-compact" -C target-feature=+zknh,+zbkb

# wasmtime tests
wasm:
needs: set-msrv
strategy:
matrix:
include:
# without simd
- rust: ${{needs.set-msrv.outputs.msrv}}
flags: "-C target-feature=-simd128"
- rust: stable
flags: "-C target-feature=-simd128"

# with simd
- rust: ${{needs.set-msrv.outputs.msrv}}
flags: "-C target-feature=+simd128"
- rust: stable
flags: "-C target-feature=+simd128"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: RustCrypto/actions/cargo-cache@master
- uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ matrix.rust }}
targets: wasm32-wasip1
- uses: RustCrypto/actions/cargo-hack-install@master
- uses: jcbhmr/setup-wasmtime@v2
- run: cargo hack test --feature-powerset --target wasm32-wasip1
env:
RUSTFLAGS: ${{ matrix.flags }}
CARGO_TARGET_WASM32_WASIP1_RUNNER: wasmtime

minimal-versions:
uses: RustCrypto/actions/.github/workflows/minimal-versions.yml@master
with:
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Additionally all crates do not require the standard library (i.e. `no_std` capab
| [RIPEMD] | [`ripemd`] | [![crates.io](https://img.shields.io/crates/v/ripemd.svg)](https://crates.io/crates/ripemd) | [![Documentation](https://docs.rs/ripemd/badge.svg)](https://docs.rs/ripemd) | ![MSRV 1.71][msrv-1.71] | :green_heart: |
| [SHA-1] | [`sha1`] | [![crates.io](https://img.shields.io/crates/v/sha1.svg)](https://crates.io/crates/sha1) | [![Documentation](https://docs.rs/sha1/badge.svg)](https://docs.rs/sha1) | ![MSRV 1.72][msrv-1.72] | :broken_heart: |
| [SHA-1 Checked] | [`sha1-checked`] | [![crates.io](https://img.shields.io/crates/v/sha1-checked.svg)](https://crates.io/crates/sha1-checked) | [![Documentation](https://docs.rs/sha1-checked/badge.svg)](https://docs.rs/sha1-checked) | ![MSRV 1.72][msrv-1.72] | :yellow_heart: |
| [SHA-2] | [`sha2`] | [![crates.io](https://img.shields.io/crates/v/sha2.svg)](https://crates.io/crates/sha2) | [![Documentation](https://docs.rs/sha2/badge.svg)](https://docs.rs/sha2) | ![MSRV 1.72][msrv-1.72] | :green_heart: |
| [SHA-2] | [`sha2`] | [![crates.io](https://img.shields.io/crates/v/sha2.svg)](https://crates.io/crates/sha2) | [![Documentation](https://docs.rs/sha2/badge.svg)](https://docs.rs/sha2) | ![MSRV 1.81][msrv-1.81] | :green_heart: |
| [SHA-3] (Keccak) | [`sha3`] | [![crates.io](https://img.shields.io/crates/v/sha3.svg)](https://crates.io/crates/sha3) | [![Documentation](https://docs.rs/sha3/badge.svg)](https://docs.rs/sha3) | ![MSRV 1.71][msrv-1.71] | :green_heart: |
| [SHABAL] | [`shabal`] | [![crates.io](https://img.shields.io/crates/v/shabal.svg)](https://crates.io/crates/shabal) | [![Documentation](https://docs.rs/shabal/badge.svg)](https://docs.rs/shabal) | ![MSRV 1.71][msrv-1.71] | :green_heart: |
| [Skein] | [`skein`] | [![crates.io](https://img.shields.io/crates/v/skein.svg)](https://crates.io/crates/skein) | [![Documentation](https://docs.rs/skein/badge.svg)](https://docs.rs/skein) | ![MSRV 1.71][msrv-1.71] | :green_heart: |
Expand Down Expand Up @@ -237,6 +237,7 @@ Unless you explicitly state otherwise, any contribution intentionally submitted
[msrv-1.71]: https://img.shields.io/badge/rustc-1.71.0+-blue.svg
[msrv-1.72]: https://img.shields.io/badge/rustc-1.72.0+-blue.svg
[msrv-1.74]: https://img.shields.io/badge/rustc-1.74.0+-blue.svg
[msrv-1.81]: https://img.shields.io/badge/rustc-1.81.0+-blue.svg

[//]: # (crates)

Expand Down
3 changes: 3 additions & 0 deletions sha2/src/sha256.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ cfg_if::cfg_if! {
} else if #[cfg(target_arch = "loongarch64")] {
mod loongarch64_asm;
use loongarch64_asm::compress;
} else if #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] {
mod wasm32;
use wasm32::compress;
} else {
mod soft;
use soft::compress;
Expand Down
190 changes: 190 additions & 0 deletions sha2/src/sha256/wasm32.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
#![allow(clippy::many_single_char_names)]
use core::arch::wasm32::*;
use core::mem::size_of;

use crate::consts::K32;

/// Compresses every 64-byte block of `blocks`, in order, into `state` using
/// the wasm32 `simd128` backend.
///
/// `state` holds the eight 32-bit chaining words and is updated in place after
/// each block. An empty `blocks` slice leaves `state` untouched.
pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
    // The code below is hard-wired to 8 state words and 16 message words per
    // block. Verify that once at compile time rather than on every call.
    const _: () = assert!(SHA256_HASH_WORDS_NUM == 8);
    const _: () = assert!(SHA256_BLOCK_WORDS_NUM == 16);

    // Scratch buffers reused across blocks; `load_data` overwrites all four
    // vectors of each before they are read.
    let mut ms = [u64x2(0, 0); 4];
    let mut x = [u64x2(0, 0); 4];

    for block in blocks {
        // Work on a copy so the original chaining value is still available for
        // the final feed-forward addition.
        let mut current_state = *state;

        // SAFETY: `block` is a `[u8; 64]`, so the four consecutive 16-byte
        // `v128_load`s that `load_data` issues from this pointer stay inside
        // the block. The helpers also read `K32` at word offsets 0..64 in
        // steps of four, which assumes `K32` holds the 64 round constants
        // (defined in `crate::consts`, not visible here - confirm).
        unsafe {
            load_data(&mut x, &mut ms, block.as_ptr().cast());
            rounds_0_47(&mut current_state, &mut x, &mut ms);
        }

        rounds_48_63(&mut current_state, &ms);
        accumulate_state(state, &current_state);
    }
}

/// Loads one 64-byte block from `data` into `x` and primes `ms`.
///
/// Each 16-byte chunk has the bytes of every 32-bit lane reversed before it is
/// stored in `x[i]`. `ms[i]` receives that vector plus the four words of `K32`
/// starting at index `4 * i`, added lane-wise.
///
/// # Safety
///
/// `data` must be valid for reading four consecutive `v128` values (64 bytes).
#[inline(always)]
unsafe fn load_data(x: &mut [v128; 4], ms: &mut MsgSchedule, data: *const v128) {
    for (i, (word, slot)) in x.iter_mut().zip(ms.iter_mut()).enumerate() {
        let raw = v128_load(data.add(i).cast());
        // Reverse the byte order inside each 4-byte group.
        let swapped =
            i8x16_shuffle::<3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12>(raw, raw);
        let round_consts = v128_load(K32.as_ptr().add(4 * i).cast());

        *word = swapped;
        *slot = i32x4_add(swapped, round_consts);
    }
}

/// Runs rounds 0 through 47 while extending the message schedule.
///
/// Three passes are made over the four schedule slots. For every slot the
/// next four schedule words (with their `K32` constants already added) are
/// computed first, then the four words currently stored in the slot are fed
/// through `sha_round`, and finally the slot is overwritten with the new
/// words for a later pass.
///
/// # Safety
///
/// Reads `K32` through a raw pointer at word offsets 16, 20, ..., 60; `K32`
/// must be at least 64 words long.
#[inline(always)]
unsafe fn rounds_0_47(current_state: &mut State, x: &mut [v128; 4], ms: &mut MsgSchedule) {
    // The first SHA256_BLOCK_WORDS_NUM constants were consumed by `load_data`.
    let mut k_offset = SHA256_BLOCK_WORDS_NUM;

    for _pass in 0..3 {
        for slot in ms.iter_mut() {
            let round_consts = v128_load(K32.as_ptr().add(k_offset).cast());
            let upcoming = sha256_update_x(x, round_consts);

            let pending = *slot;
            sha_round(current_state, u32x4_extract_lane::<0>(pending));
            sha_round(current_state, u32x4_extract_lane::<1>(pending));
            sha_round(current_state, u32x4_extract_lane::<2>(pending));
            sha_round(current_state, u32x4_extract_lane::<3>(pending));

            *slot = upcoming;
            k_offset += 4;
        }
    }
}

/// Runs the last sixteen rounds, consuming the four schedule vectors in `ms`
/// lane by lane without producing any further schedule words.
#[inline(always)]
fn rounds_48_63(current_state: &mut State, ms: &MsgSchedule) {
    for &quad in ms.iter() {
        let words = [
            u32x4_extract_lane::<0>(quad),
            u32x4_extract_lane::<1>(quad),
            u32x4_extract_lane::<2>(quad),
            u32x4_extract_lane::<3>(quad),
        ];
        for word in words {
            sha_round(current_state, word);
        }
    }
}

/// Applies a single SHA-256 round to the working state `s`.
///
/// `x` is added as-is into the first temporary; the callers in this file pass
/// a schedule word that already has its round constant added. All additions
/// wrap modulo 2^32.
#[inline(always)]
fn sha_round(s: &mut State, x: u32) {
    let [a, b, c, d, e, f, g, h] = *s;

    // Big sigma 1 and the "choose" function (bitwise `e ? f : g`).
    let big_sigma1 = e.rotate_right(6) ^ e.rotate_right(11) ^ e.rotate_right(25);
    let choose = g ^ (e & (f ^ g));
    let t1 = x
        .wrapping_add(h)
        .wrapping_add(big_sigma1)
        .wrapping_add(choose);

    // Big sigma 0 and the "majority" function.
    let big_sigma0 = a.rotate_right(2) ^ a.rotate_right(13) ^ a.rotate_right(22);
    let majority = (a & b) ^ (a & c) ^ (b & c);

    // Every word moves one position towards the end; the freshly computed
    // value enters at the front and `d + t1` takes the fifth position.
    *s = [
        t1.wrapping_add(big_sigma0).wrapping_add(majority),
        a,
        b,
        c,
        d.wrapping_add(t1),
        e,
        f,
        g,
    ];
}

#[inline(always)]
fn accumulate_state(dst: &mut State, src: &State) {
for i in 0..SHA256_HASH_WORDS_NUM {
dst[i] = dst[i].wrapping_add(src[i]);
}
}

/// Produces the next four message-schedule words from the sixteen words held
/// in `x`.
///
/// The four vectors of `x` are read as sixteen consecutive 32-bit words,
/// lane 0 of `x[0]` being the first. The new words are accumulated into
/// `x[0]`, the array is then rotated by one vector so that they end up in
/// `x[3]`, and the return value is `x[3] + k32` (lane-wise).
///
/// NOTE(review): no raw-pointer access is visible in this body; the `unsafe`
/// marker presumably mirrors the sibling helpers - confirm before relaxing it.
#[inline(always)]
unsafe fn sha256_update_x(x: &mut [v128; 4], k32: v128) -> v128 {
// Rotate/shift amounts of the small sigma 0 function: rotr 7, rotr 18, shr 3.
const SIGMA0_0: u32 = 7;
const SIGMA0_1: u32 = 18;
const SIGMA0_2: u32 = 3;
// Rotate/shift amounts of the small sigma 1 function: rotr 17, rotr 19, shr 10.
const SIGMA1_0: u32 = 17;
const SIGMA1_1: u32 = 19;
const SIGMA1_2: u32 = 10;
const SHA256_WORD_BIT_LEN: u32 = 8 * size_of::<u32>() as u32;
const ZERO: v128 = u64x2(0, 0);

// t0 = words 1..=4 (lanes 1-3 of x[0] followed by lane 0 of x[1]).
let mut t0 = u32x4_shuffle::<1, 2, 3, 4>(x[0], x[1]);
// t3 = words 9..=12 (lanes 1-3 of x[2] followed by lane 0 of x[3]).
let mut t3 = u32x4_shuffle::<1, 2, 3, 4>(x[2], x[3]);
// Sigma 0 of t0 is assembled from plain shifts: each rotation is split into
// a right-shift part and a left-shift part that are XORed together.
let mut t2 = u32x4_shr(t0, SIGMA0_0);
// x[0] = words 0..=3 + words 9..=12.
x[0] = u32x4_add(x[0], t3);
t3 = u32x4_shr(t0, SIGMA0_2);
let mut t1 = u32x4_shl(t0, SHA256_WORD_BIT_LEN - SIGMA0_1);
t0 = v128_xor(t3, t2);
// Duplicate words 14 and 15 so every 64-bit lane holds the same word twice;
// a 64-bit right shift then leaves a 32-bit rotation in the low half.
t3 = u32x4_shuffle::<2, 2, 3, 3>(x[3], x[3]);
// t2 was already shifted by 7; shifting by a further 11 gives a shift by 18.
t2 = u32x4_shr(t2, SIGMA0_1 - SIGMA0_0);
t0 = v128_xor(t0, t1);
t0 = v128_xor(t0, t2);
// t1 was shifted left by 14; a further 11 gives 25, the left part of rotr 7.
t1 = u32x4_shl(t1, SIGMA0_1 - SIGMA0_0);
// Start sigma 1 of words 14 and 15: the plain shift by 10 ...
t2 = u32x4_shr(t3, SIGMA1_2);
// ... and the rotation by 17 (valid in the low 32 bits of each 64-bit lane).
t3 = u64x2_shr(t3, SIGMA1_0);
// t1 = completed sigma 0 of words 1..=4.
t1 = v128_xor(t0, t1);
x[0] = u32x4_add(x[0], t1);
t2 = v128_xor(t2, t3);
// A further shift by 2 turns the rotation by 17 into a rotation by 19.
t3 = u64x2_shr(t3, SIGMA1_1 - SIGMA1_0);
t2 = v128_xor(t2, t3);
// Keep the low half of each 64-bit lane in lanes 0 and 1; zero lanes 2 and 3.
t2 = u32x4_shuffle::<0, 2, 7, 7>(t2, ZERO);
// Lanes 0 and 1 of x[0] are now final.
x[0] = u32x4_add(x[0], t2);
// The last two new words depend on the two just finished: repeat the same
// sigma 1 sequence on lanes 0 and 1 of x[0].
t3 = u32x4_shuffle::<0, 0, 1, 1>(x[0], x[0]);
t2 = u32x4_shr(t3, SIGMA1_2);
t3 = u64x2_shr(t3, SIGMA1_0);
t2 = v128_xor(t2, t3);
t3 = u64x2_shr(t3, SIGMA1_1 - SIGMA1_0);
t2 = v128_xor(t2, t3);
// This time place the results in lanes 2 and 3 and zero lanes 0 and 1.
t2 = u32x4_shuffle::<7, 7, 0, 2>(t2, ZERO);
x[0] = u32x4_add(x[0], t2);

// Rotate the window by one vector: the new words move to x[3].
let tmp = x[0];
x[0] = x[1];
x[1] = x[2];
x[2] = x[3];
x[3] = tmp;

// Return the new words with their round constants added.
u32x4_add(x[3], k32)
}

/// Working state: `SHA256_HASH_WORDS_NUM` 32-bit words.
type State = [u32; SHA256_HASH_WORDS_NUM];
/// Schedule buffer: one `v128` (four 32-bit lanes) per four words of a block.
/// The functions in this file store schedule words with their `K32` constants
/// already added.
type MsgSchedule = [v128; SHA256_BLOCK_WORDS_NUM / 4];

/// Size of one input block in bytes.
const SHA256_BLOCK_BYTE_LEN: usize = 64;
/// Size of the hash state in bytes.
const SHA256_HASH_BYTE_LEN: usize = 32;
/// Number of 32-bit words in the hash state.
const SHA256_HASH_WORDS_NUM: usize = SHA256_HASH_BYTE_LEN / size_of::<u32>();
/// Number of 32-bit words in one input block.
const SHA256_BLOCK_WORDS_NUM: usize = SHA256_BLOCK_BYTE_LEN / size_of::<u32>();
3 changes: 3 additions & 0 deletions sha2/src/sha512.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ cfg_if::cfg_if! {
} else if #[cfg(target_arch = "loongarch64")] {
mod loongarch64_asm;
use loongarch64_asm::compress;
} else if #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] {
mod wasm32;
use wasm32::compress;
} else {
mod soft;
use soft::compress;
Expand Down
Loading

0 comments on commit a68c77e

Please sign in to comment.