From 36b34efddc5da2340c4e08e09f590f2200c715ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=97=D0=B8=D0=BD=D0=BE=D0=B2=D0=BA=D0=B8=D0=BD=20=D0=90?= =?UTF-8?q?=D0=BB=D0=B5=D0=BA=D1=81=D0=B5=D0=B9?= <168339548+alexzin04@users.noreply.github.com> Date: Fri, 9 Aug 2024 20:18:02 +0300 Subject: [PATCH] kuznyechik: add Neon backend (#447) --- .github/workflows/kuznyechik.yml | 35 +++- kuznyechik/src/lib.rs | 7 + kuznyechik/src/neon/backends.rs | 322 +++++++++++++++++++++++++++++++ kuznyechik/src/neon/mod.rs | 63 ++++++ 4 files changed, 423 insertions(+), 4 deletions(-) create mode 100644 kuznyechik/src/neon/backends.rs create mode 100644 kuznyechik/src/neon/mod.rs diff --git a/.github/workflows/kuznyechik.yml b/.github/workflows/kuznyechik.yml index d3e7ecf7..e45f0bab 100644 --- a/.github/workflows/kuznyechik.yml +++ b/.github/workflows/kuznyechik.yml @@ -37,12 +37,12 @@ jobs: toolchain: ${{ matrix.rust }} targets: ${{ matrix.target }} - run: cargo build --target ${{ matrix.target }} - - run: cargo build --target ${{ matrix.target }} - env: + - env: RUSTFLAGS: "-Dwarnings --cfg kuznyechik_force_soft" - - run: cargo build --target ${{ matrix.target }} - env: + run: cargo build --target ${{ matrix.target }} + - env: RUSTFLAGS: "-Dwarnings --cfg kuznyechik_force_soft --cfg kuznyechik_compact_soft" + run: cargo build --target ${{ matrix.target }} minimal-versions: uses: RustCrypto/actions/.github/workflows/minimal-versions.yml@master @@ -75,3 +75,30 @@ jobs: run: | cargo test cargo test --all-features + + macos: + runs-on: macos-latest + strategy: + matrix: + rust: + - 1.65.0 + - stable + steps: + - uses: actions/checkout@v4 + - uses: RustCrypto/actions/cargo-cache@master + - uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ matrix.rust }} + - run: | + cargo test + cargo test --all-features + - env: + RUSTFLAGS: "-Dwarnings --cfg kuznyechik_force_soft" + run: | + cargo test + cargo test --all-features + - env: + RUSTFLAGS: "-Dwarnings --cfg kuznyechik_force_soft --cfg kuznyechik_compact_soft" + run: | + cargo test + cargo test --all-features diff --git a/kuznyechik/src/lib.rs b/kuznyechik/src/lib.rs index f725071c..b1b1b941 100644 --- a/kuznyechik/src/lib.rs +++ b/kuznyechik/src/lib.rs @@ -51,6 +51,13 @@ cfg_if::cfg_if!( ))] { mod sse2; use sse2 as imp; + } else if #[cfg(all( + target_arch = "aarch64", + target_feature = "neon", + not(kuznyechik_force_soft), + ))] { + mod neon; + use neon as imp; } else if #[cfg(kuznyechik_compact_soft)] { mod compact_soft; use compact_soft as imp; diff --git a/kuznyechik/src/neon/backends.rs b/kuznyechik/src/neon/backends.rs new file mode 100644 index 00000000..9c437352 --- /dev/null +++ b/kuznyechik/src/neon/backends.rs @@ -0,0 +1,322 @@ +use super::consts::{Table, DEC_TABLE, ENC_TABLE, RKEY_GEN}; +use crate::{ + consts::{P, P_INV}, + Block, Key, +}; +use cipher::{ + consts::{U16, U8}, + inout::InOut, + typenum::Unsigned, + BlockBackend, BlockSizeUser, ParBlocks, ParBlocksSizeUser, +}; + +use core::arch::aarch64::*; + +pub(super) type RoundKeys = [uint8x16_t; 10]; + +type ParBlocksSize = U8; + +#[rustfmt::skip] +macro_rules! 
unroll_par { + ($var:ident, $body:block) => { + { let $var: usize = 0; $body; } + { let $var: usize = 1; $body; } + { let $var: usize = 2; $body; } + { let $var: usize = 3; $body; } + { let $var: usize = 4; $body; } + { let $var: usize = 5; $body; } + { let $var: usize = 6; $body; } + { let $var: usize = 7; $body; } + + }; +} + +#[inline(always)] +unsafe fn sub_bytes(block: uint8x16_t, sbox: &[u8; 256]) -> uint8x16_t { + let value_vector = vdupq_n_u8(64); + + //Split the sbox table into four parts + let sbox_part1 = uint8x16x4_t( + vld1q_u8(&sbox[0] as *const u8), + vld1q_u8(&sbox[16] as *const u8), + vld1q_u8(&sbox[32] as *const u8), + vld1q_u8(&sbox[48] as *const u8), + ); + + let sbox_part2 = uint8x16x4_t( + vld1q_u8(&sbox[64] as *const u8), + vld1q_u8(&sbox[80] as *const u8), + vld1q_u8(&sbox[96] as *const u8), + vld1q_u8(&sbox[112] as *const u8), + ); + + let sbox_part3 = uint8x16x4_t( + vld1q_u8(&sbox[128] as *const u8), + vld1q_u8(&sbox[144] as *const u8), + vld1q_u8(&sbox[160] as *const u8), + vld1q_u8(&sbox[176] as *const u8), + ); + + let sbox_part4 = uint8x16x4_t( + vld1q_u8(&sbox[192] as *const u8), + vld1q_u8(&sbox[208] as *const u8), + vld1q_u8(&sbox[224] as *const u8), + vld1q_u8(&sbox[240] as *const u8), + ); + + // Indexing each part of the sbox table + let result1 = vqtbl4q_u8(sbox_part1, block); + let block_1 = vsubq_u8(block, value_vector); + let result2 = vqtbl4q_u8(sbox_part2, block_1); + let block_2 = vsubq_u8(block_1, value_vector); + let result3 = vqtbl4q_u8(sbox_part3, block_2); + let block_3 = vsubq_u8(block_2, value_vector); + let result4 = vqtbl4q_u8(sbox_part4, block_3); + // Merging results + let result = vorrq_u8(vorrq_u8(result1, result2), vorrq_u8(result3, result4)); + + result +} + +#[inline(always)] +unsafe fn transform(block: uint8x16_t, table: &Table) -> uint8x16_t { + macro_rules! get { + ($table:expr, $ind:expr, $i:expr) => {{ + let idx = vgetq_lane_u16($ind, $i) as usize; + let p = &($table.0[idx]) as *const u8 as *const uint8x16_t; + // correct alignment of `p` is guaranteed since offset values + // are shifted by 4 bits left and the table is aligned to 16 bytes + debug_assert_eq!(p as usize % 16, 0); + vld1q_u8(p as *const u8) + }}; + } + + macro_rules! xor_get { + ($val:expr, $table:expr, $ind:expr, $i:expr) => { + $val = veorq_u8($val, get!($table, $ind, $i)); + }; + } + + let ind = vcombine_u8( + vcreate_u8(0x0706050403020100), + vcreate_u8(0x0f0e0d0c0b0a0908), + ); + let test = vzip1q_u8(block, ind); + + let lind = vshlq_n_u16(vreinterpretq_u16_u8(test), 4); + + let mut lt = get!(table, lind, 0); + + xor_get!(lt, table, lind, 1); + xor_get!(lt, table, lind, 2); + xor_get!(lt, table, lind, 3); + xor_get!(lt, table, lind, 4); + xor_get!(lt, table, lind, 5); + xor_get!(lt, table, lind, 6); + xor_get!(lt, table, lind, 7); + + let rind = vshlq_n_u16(vreinterpretq_u16_u8(vzip2q_u8(block, ind)), 4); + + let mut rt = get!(table, rind, 0); + xor_get!(rt, table, rind, 1); + xor_get!(rt, table, rind, 2); + xor_get!(rt, table, rind, 3); + xor_get!(rt, table, rind, 4); + xor_get!(rt, table, rind, 5); + xor_get!(rt, table, rind, 6); + xor_get!(rt, table, rind, 7); + + veorq_u8(lt, rt) +} + +pub fn expand_enc_keys(key: &Key) -> RoundKeys { + macro_rules! 
next_const {
+        ($i:expr) => {{
+            let p = RKEY_GEN.0.as_ptr() as *const uint8x16_t;
+            // correct alignment of `p` is guaranteed since the table
+            // is aligned to 16 bytes
+            let p = p.add($i);
+            debug_assert_eq!(p as usize % 16, 0);
+            $i += 1;
+            vld1q_u8(p as *const u8)
+        }};
+    }
+
+    unsafe {
+        let mut enc_keys = [vdupq_n_u8(0); 10];
+
+        let pk: *const uint8x16_t = key.as_ptr() as *const uint8x16_t;
+        let mut k1 = vld1q_u8(pk as *const u8);
+        let mut k2 = vld1q_u8(pk.add(1) as *const u8);
+        enc_keys[0] = k1;
+        enc_keys[1] = k2;
+
+        let mut cidx = 0;
+        for i in 1..5 {
+            for _ in 0..4 {
+                let mut t = veorq_u8(k1, next_const!(cidx));
+                t = transform(t, &ENC_TABLE);
+                k2 = veorq_u8(k2, t);
+
+                let mut t = veorq_u8(k2, next_const!(cidx));
+                t = transform(t, &ENC_TABLE);
+                k1 = veorq_u8(k1, t);
+            }
+
+            enc_keys[2 * i] = k1;
+            enc_keys[2 * i + 1] = k2;
+        }
+
+        enc_keys
+    }
+}
+
+pub fn inv_enc_keys(enc_keys: &RoundKeys) -> RoundKeys {
+    unsafe {
+        let mut dec_keys = [vdupq_n_u8(0); 10];
+
+        dec_keys[0] = enc_keys[9];
+        for i in 1..9 {
+            let k = sub_bytes(enc_keys[i], &P);
+            dec_keys[9 - i] = transform(k, &DEC_TABLE);
+        }
+        dec_keys[9] = enc_keys[0];
+
+        dec_keys
+    }
+}
+
+pub(crate) struct EncBackend<'a>(pub(crate) &'a RoundKeys);
+
+impl<'a> BlockSizeUser for EncBackend<'a> {
+    type BlockSize = U16;
+}
+
+impl<'a> ParBlocksSizeUser for EncBackend<'a> {
+    type ParBlocksSize = ParBlocksSize;
+}
+
+impl<'a> BlockBackend for EncBackend<'a> {
+    #[inline]
+    fn proc_block(&mut self, block: InOut<'_, '_, Block>) {
+        let k = self.0;
+        unsafe {
+            let (in_ptr, out_ptr) = block.into_raw();
+            let mut b = vld1q_u8(in_ptr as *const u8);
+
+            for i in 0..9 {
+                b = veorq_u8(b, k[i]);
+                b = transform(b, &ENC_TABLE);
+            }
+            b = veorq_u8(b, k[9]);
+            vst1q_u8(out_ptr as *mut u8, b);
+        }
+    }
+
+    #[inline]
+    fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks<Self>>) {
+        let k = self.0;
+        unsafe {
+            let (in_ptr, out_ptr) = blocks.into_raw();
+            let in_ptr = in_ptr as *mut uint8x16_t;
+            let out_ptr = out_ptr as *mut uint8x16_t;
+
+            let mut blocks = [vdupq_n_u8(0); ParBlocksSize::USIZE];
+            unroll_par! {
+                i, {
+                    blocks[i] = vld1q_u8(in_ptr.add(i) as *const u8);
+                }
+            };
+
+            for i in 0..9 {
+                unroll_par!(j, {
+                    let t = veorq_u8(blocks[j], k[i]);
+                    blocks[j] = transform(t, &ENC_TABLE);
+                });
+            }
+
+            unroll_par! {
+                i, {
+                    let t = veorq_u8(blocks[i], k[9]);
+                    vst1q_u8(out_ptr.add(i) as *mut u8, t);
+                }
+            };
+        }
+    }
+}
+
+pub(crate) struct DecBackend<'a>(pub(crate) &'a RoundKeys);
+
+impl<'a> BlockSizeUser for DecBackend<'a> {
+    type BlockSize = U16;
+}
+
+impl<'a> ParBlocksSizeUser for DecBackend<'a> {
+    type ParBlocksSize = ParBlocksSize;
+}
+
+impl<'a> BlockBackend for DecBackend<'a> {
+    #[inline]
+    fn proc_block(&mut self, block: InOut<'_, '_, Block>) {
+        let k = self.0;
+        unsafe {
+            let (in_ptr, out_ptr) = block.into_raw();
+            let mut b = vld1q_u8(in_ptr as *const u8);
+
+            b = veorq_u8(b, k[0]);
+
+            b = sub_bytes(b, &P);
+            b = transform(b, &DEC_TABLE);
+
+            for i in 1..9 {
+                b = transform(b, &DEC_TABLE);
+                b = veorq_u8(b, k[i]);
+            }
+            b = sub_bytes(b, &P_INV);
+            b = veorq_u8(b, k[9]);
+
+            vst1q_u8(out_ptr as *mut u8, b);
+        }
+    }
+    #[inline]
+    fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks<Self>>) {
+        let k = self.0;
+        unsafe {
+            let (in_ptr, out_ptr) = blocks.into_raw();
+            let in_ptr = in_ptr as *mut uint8x16_t;
+            let out_ptr = out_ptr as *mut uint8x16_t;
+
+            let mut blocks = [vdupq_n_u8(0); ParBlocksSize::USIZE];
+            unroll_par! {
+                i, {
+                    blocks[i] = vld1q_u8(in_ptr.add(i) as *const u8);
+                }
+            };
+
+            unroll_par! 
{
+                i, {
+                    let t = veorq_u8(blocks[i], k[0]);
+                    let t = sub_bytes(t, &P);
+                    blocks[i] = transform(t, &DEC_TABLE);
+                }
+            }
+
+            for i in 1..9 {
+                unroll_par! {
+                    j, {
+                        let t = transform(blocks[j], &DEC_TABLE);
+                        blocks[j] = veorq_u8(t, k[i]);
+                    }
+                }
+            }
+
+            unroll_par! {
+                i, {
+                    let t = sub_bytes(blocks[i], &P_INV);
+                    let t2 = veorq_u8(t, k[9]);
+                    vst1q_u8(out_ptr.add(i) as *mut u8, t2);
+                }
+            }
+        }
+    }
+}
diff --git a/kuznyechik/src/neon/mod.rs b/kuznyechik/src/neon/mod.rs
new file mode 100644
index 00000000..b96f7f5d
--- /dev/null
+++ b/kuznyechik/src/neon/mod.rs
@@ -0,0 +1,63 @@
+use crate::{BlockSize, Key};
+use cipher::{BlockCipherDecrypt, BlockCipherEncrypt, BlockClosure};
+
+mod backends;
+#[path = "../fused_tables/consts.rs"]
+mod consts;
+
+use backends::{expand_enc_keys, inv_enc_keys, DecBackend, EncBackend, RoundKeys};
+
+#[derive(Clone)]
+pub(crate) struct EncDecKeys {
+    enc: RoundKeys,
+    dec: RoundKeys,
+}
+#[derive(Clone)]
+pub(crate) struct EncKeys(RoundKeys);
+#[derive(Clone)]
+pub(crate) struct DecKeys(RoundKeys);
+
+impl EncKeys {
+    pub fn new(key: &Key) -> Self {
+        Self(expand_enc_keys(key))
+    }
+}
+
+impl From<EncKeys> for EncDecKeys {
+    fn from(enc: EncKeys) -> Self {
+        Self {
+            dec: inv_enc_keys(&enc.0),
+            enc: enc.0,
+        }
+    }
+}
+
+impl From<EncKeys> for DecKeys {
+    fn from(enc: EncKeys) -> Self {
+        Self(inv_enc_keys(&enc.0))
+    }
+}
+
+impl BlockCipherEncrypt for crate::Kuznyechik {
+    fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = BlockSize>) {
+        f.call(&mut EncBackend(&self.keys.enc));
+    }
+}
+
+impl BlockCipherDecrypt for crate::Kuznyechik {
+    fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = BlockSize>) {
+        f.call(&mut DecBackend(&self.keys.dec));
+    }
+}
+
+impl BlockCipherEncrypt for crate::KuznyechikEnc {
+    fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = BlockSize>) {
+        f.call(&mut EncBackend(&self.keys.0));
+    }
+}
+
+impl BlockCipherDecrypt for crate::KuznyechikDec {
+    fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = BlockSize>) {
+        f.call(&mut DecBackend(&self.keys.0));
+    }
+}
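
Note on sub_bytes above: vqtbl4q_u8 returns 0x00 for any index outside its 64-byte table, so
after each 64-step rebase exactly one of the four partial lookups is non-zero per byte and the
results can simply be ORed together.

Usage sketch (illustrative only, not applied by this patch): backend selection stays transparent
to callers, so the new NEON path is exercised through the ordinary `cipher` traits. The sketch
assumes the cipher 0.5.0-pre API this branch builds against (`KeyInit::new_from_slice` and the
`encrypt_block`/`decrypt_block` provided methods of `BlockCipherEncrypt`/`BlockCipherDecrypt`);
the key and block values are arbitrary. On aarch64 with NEON enabled and without
`--cfg kuznyechik_force_soft`, these calls dispatch to the EncBackend/DecBackend added above.

    use kuznyechik::cipher::{Block, BlockCipherDecrypt, BlockCipherEncrypt, KeyInit};
    use kuznyechik::Kuznyechik;

    fn neon_roundtrip_smoke() {
        // Arbitrary 32-byte key; the value is for illustration only.
        let cipher = Kuznyechik::new_from_slice(&[0x42; 32]).unwrap();

        // Start from an all-zero 16-byte block and check that decrypt undoes encrypt.
        let mut block = Block::<Kuznyechik>::default();
        let plaintext = block;

        cipher.encrypt_block(&mut block);
        assert_ne!(block, plaintext);
        cipher.decrypt_block(&mut block);
        assert_eq!(block, plaintext);
    }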