Skip to content

Commit

Permalink
kuznyechik: add Neon backend (#447)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexzin04 committed Aug 9, 2024
1 parent daac7ea commit 36b34ef
Show file tree
Hide file tree
Showing 4 changed files with 423 additions and 4 deletions.
35 changes: 31 additions & 4 deletions .github/workflows/kuznyechik.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ jobs:
toolchain: ${{ matrix.rust }}
targets: ${{ matrix.target }}
      - run: cargo build --target ${{ matrix.target }}
      - env:
          RUSTFLAGS: "-Dwarnings --cfg kuznyechik_force_soft"
        run: cargo build --target ${{ matrix.target }}
      - env:
          RUSTFLAGS: "-Dwarnings --cfg kuznyechik_force_soft --cfg kuznyechik_compact_soft"
        run: cargo build --target ${{ matrix.target }}

minimal-versions:
uses: RustCrypto/actions/.github/workflows/minimal-versions.yml@master
Expand Down Expand Up @@ -75,3 +75,30 @@ jobs:
run: |
cargo test
cargo test --all-features
  macos:
    runs-on: macos-latest
    strategy:
      matrix:
        rust:
          - 1.65.0
          - stable
    steps:
      - uses: actions/checkout@v4
      - uses: RustCrypto/actions/cargo-cache@master
      - uses: dtolnay/rust-toolchain@master
        with:
          toolchain: ${{ matrix.rust }}
      - run: |
          cargo test
          cargo test --all-features
      - env:
          RUSTFLAGS: "-Dwarnings --cfg kuznyechik_force_soft"
        run: |
          cargo test
          cargo test --all-features
      - env:
          RUSTFLAGS: "-Dwarnings --cfg kuznyechik_force_soft --cfg kuznyechik_compact_soft"
        run: |
          cargo test
          cargo test --all-features
7 changes: 7 additions & 0 deletions kuznyechik/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ cfg_if::cfg_if!(
))] {
mod sse2;
use sse2 as imp;
} else if #[cfg(all(
target_arch = "aarch64",
target_feature = "neon",
not(kuznyechik_force_soft),
))] {
mod neon;
use neon as imp;
} else if #[cfg(kuznyechik_compact_soft)] {
mod compact_soft;
use compact_soft as imp;
Expand Down
322 changes: 322 additions & 0 deletions kuznyechik/src/neon/backends.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,322 @@
use super::consts::{Table, DEC_TABLE, ENC_TABLE, RKEY_GEN};
use crate::{
consts::{P, P_INV},
Block, Key,
};
use cipher::{
consts::{U16, U8},
inout::InOut,
typenum::Unsigned,
BlockBackend, BlockSizeUser, ParBlocks, ParBlocksSizeUser,
};

use core::arch::aarch64::*;

/// The ten 128-bit Kuznyechik round keys, kept as NEON vector registers.
pub(super) type RoundKeys = [uint8x16_t; 10];

/// Number of blocks handled per `proc_par_blocks` call; must stay in
/// sync with the eight repetitions inside `unroll_par!`.
type ParBlocksSize = U8;

// Manually unrolls `$body` eight times — once per parallel block —
// binding `$var` to the constant index 0..=7 in each repetition, each
// in its own scope. The repetition count must match `ParBlocksSize`
// (`U8`) above.
#[rustfmt::skip]
macro_rules! unroll_par {
    ($var:ident, $body:block) => {
        { let $var: usize = 0; $body; }
        { let $var: usize = 1; $body; }
        { let $var: usize = 2; $body; }
        { let $var: usize = 3; $body; }
        { let $var: usize = 4; $body; }
        { let $var: usize = 5; $body; }
        { let $var: usize = 6; $body; }
        { let $var: usize = 7; $body; }
    };
}

/// Applies the 256-entry S-box `sbox` to every byte of `block` in parallel.
///
/// `vqtbl4q_u8` can only index a 64-entry (4 x 16 byte) table and
/// produces 0 for out-of-range indices, so the S-box is split into four
/// 64-byte quarters. Before each successive lookup the indices are
/// lowered by 64; for any given input byte exactly one of the four
/// lookups yields a non-zero result, and the partial results are merged
/// with bitwise OR.
#[inline(always)]
unsafe fn sub_bytes(block: uint8x16_t, sbox: &[u8; 256]) -> uint8x16_t {
    // Amount subtracted from the indices between successive quarters
    // (one uint8x16x4_t table covers 64 S-box entries).
    let value_vector = vdupq_n_u8(64);

    // Split the sbox table into four 64-byte parts.
    let sbox_part1 = uint8x16x4_t(
        vld1q_u8(&sbox[0] as *const u8),
        vld1q_u8(&sbox[16] as *const u8),
        vld1q_u8(&sbox[32] as *const u8),
        vld1q_u8(&sbox[48] as *const u8),
    );

    let sbox_part2 = uint8x16x4_t(
        vld1q_u8(&sbox[64] as *const u8),
        vld1q_u8(&sbox[80] as *const u8),
        vld1q_u8(&sbox[96] as *const u8),
        vld1q_u8(&sbox[112] as *const u8),
    );

    let sbox_part3 = uint8x16x4_t(
        vld1q_u8(&sbox[128] as *const u8),
        vld1q_u8(&sbox[144] as *const u8),
        vld1q_u8(&sbox[160] as *const u8),
        vld1q_u8(&sbox[176] as *const u8),
    );

    let sbox_part4 = uint8x16x4_t(
        vld1q_u8(&sbox[192] as *const u8),
        vld1q_u8(&sbox[208] as *const u8),
        vld1q_u8(&sbox[224] as *const u8),
        vld1q_u8(&sbox[240] as *const u8),
    );

    // Index each quarter, shifting the indices down by 64 each time so
    // that bytes belonging to later quarters fall into range.
    let result1 = vqtbl4q_u8(sbox_part1, block);
    let block_1 = vsubq_u8(block, value_vector);
    let result2 = vqtbl4q_u8(sbox_part2, block_1);
    let block_2 = vsubq_u8(block_1, value_vector);
    let result3 = vqtbl4q_u8(sbox_part3, block_2);
    let block_3 = vsubq_u8(block_2, value_vector);
    let result4 = vqtbl4q_u8(sbox_part4, block_3);
    // Merge: out-of-range lookups are zero, so OR combines the quarters.
    let result = vorrq_u8(vorrq_u8(result1, result2), vorrq_u8(result3, result4));

    result
}

/// Applies the precomputed linear-transform lookup `table` to `block`.
///
/// Each of the 16 input bytes is interleaved with its lane index
/// (`vzip1q_u8`/`vzip2q_u8` against a constant 0..=15 vector), forming
/// a 16-bit value `(lane << 8) | byte` per u16 lane. Shifting that left
/// by 4 turns it into the byte offset of a 16-byte row of `table`; the
/// 16 selected rows are XORed together to produce the result.
#[inline(always)]
unsafe fn transform(block: uint8x16_t, table: &Table) -> uint8x16_t {
    // Loads the 16-byte table row selected by u16 lane `$i` of `$ind`.
    macro_rules! get {
        ($table:expr, $ind:expr, $i:expr) => {{
            let idx = vgetq_lane_u16($ind, $i) as usize;
            let p = &($table.0[idx]) as *const u8 as *const uint8x16_t;
            // correct alignment of `p` is guaranteed since offset values
            // are shifted by 4 bits left and the table is aligned to 16 bytes
            debug_assert_eq!(p as usize % 16, 0);
            vld1q_u8(p as *const u8)
        }};
    }

    // XORs the selected table row into the accumulator `$val`.
    macro_rules! xor_get {
        ($val:expr, $table:expr, $ind:expr, $i:expr) => {
            $val = veorq_u8($val, get!($table, $ind, $i));
        };
    }

    // Constant vector of lane indices 0, 1, ..., 15.
    let ind = vcombine_u8(
        vcreate_u8(0x0706050403020100),
        vcreate_u8(0x0f0e0d0c0b0a0908),
    );
    // Interleave the low 8 bytes of `block` with their lane indices.
    let test = vzip1q_u8(block, ind);

    // Offsets for the low half: ((lane << 8) | byte) << 4.
    let lind = vshlq_n_u16(vreinterpretq_u16_u8(test), 4);

    let mut lt = get!(table, lind, 0);

    xor_get!(lt, table, lind, 1);
    xor_get!(lt, table, lind, 2);
    xor_get!(lt, table, lind, 3);
    xor_get!(lt, table, lind, 4);
    xor_get!(lt, table, lind, 5);
    xor_get!(lt, table, lind, 6);
    xor_get!(lt, table, lind, 7);

    // Same for the high 8 bytes of `block`.
    let rind = vshlq_n_u16(vreinterpretq_u16_u8(vzip2q_u8(block, ind)), 4);

    let mut rt = get!(table, rind, 0);
    xor_get!(rt, table, rind, 1);
    xor_get!(rt, table, rind, 2);
    xor_get!(rt, table, rind, 3);
    xor_get!(rt, table, rind, 4);
    xor_get!(rt, table, rind, 5);
    xor_get!(rt, table, rind, 6);
    xor_get!(rt, table, rind, 7);

    veorq_u8(lt, rt)
}

/// Expands the 256-bit cipher `key` into the ten encryption round keys.
///
/// Round keys 0 and 1 are the two 128-bit halves of the master key;
/// each subsequent pair is derived from the previous one by eight
/// Feistel-style rounds keyed by consecutive `RKEY_GEN` constants.
pub fn expand_enc_keys(key: &Key) -> RoundKeys {
    // Loads the next 16-byte round constant and advances counter `$i`.
    macro_rules! next_const {
        ($i:expr) => {{
            let p = RKEY_GEN.0.as_ptr() as *const uint8x16_t;
            // correct alignment of `p` is guaranteed since the table
            // is aligned to 16 bytes
            let p = p.add($i);
            debug_assert_eq!(p as usize % 16, 0);
            $i += 1;
            vld1q_u8(p as *const u8)
        }};
    }

    unsafe {
        let mut enc_keys = [vdupq_n_u8(0); 10];

        // The first pair of round keys is the raw master key.
        let pk: *const uint8x16_t = key.as_ptr() as *const uint8x16_t;
        let mut k1 = vld1q_u8(pk as *const u8);
        let mut k2 = vld1q_u8(pk.add(1) as *const u8);
        enc_keys[0] = k1;
        enc_keys[1] = k2;

        // Each outer iteration runs 8 rounds (two per inner step) and
        // emits the next pair of round keys.
        let mut cidx = 0;
        for i in 1..5 {
            for _ in 0..4 {
                let mut t = veorq_u8(k1, next_const!(cidx));
                t = transform(t, &ENC_TABLE);
                k2 = veorq_u8(k2, t);

                let mut t = veorq_u8(k2, next_const!(cidx));
                t = transform(t, &ENC_TABLE);
                k1 = veorq_u8(k1, t);
            }

            enc_keys[2 * i] = k1;
            enc_keys[2 * i + 1] = k2;
        }

        enc_keys
    }
}

/// Derives the decryption round keys from the encryption round keys.
///
/// The outermost keys (0 and 9) swap places unchanged. The eight inner
/// keys are passed through `sub_bytes` with `P` followed by `transform`
/// with `DEC_TABLE`, and stored mirrored (encryption index `i` maps to
/// decryption index `9 - i`) — the layout `DecBackend` expects.
pub fn inv_enc_keys(enc_keys: &RoundKeys) -> RoundKeys {
    unsafe {
        let mut dec_keys = [vdupq_n_u8(0); 10];

        // Boundary keys are reused verbatim, in swapped positions.
        dec_keys[0] = enc_keys[9];
        dec_keys[9] = enc_keys[0];

        // Inner keys: substitute, transform, and mirror the index.
        for (i, &round_key) in enc_keys.iter().enumerate().take(9).skip(1) {
            let substituted = sub_bytes(round_key, &P);
            dec_keys[9 - i] = transform(substituted, &DEC_TABLE);
        }

        dec_keys
    }
}

/// Encryption backend borrowing a precomputed set of round keys.
pub(crate) struct EncBackend<'a>(pub(crate) &'a RoundKeys);

impl<'a> BlockSizeUser for EncBackend<'a> {
    // Kuznyechik operates on 128-bit (16-byte) blocks.
    type BlockSize = U16;
}

impl<'a> ParBlocksSizeUser for EncBackend<'a> {
    // Eight blocks per parallel call; see `unroll_par!`.
    type ParBlocksSize = ParBlocksSize;
}

impl<'a> BlockBackend for EncBackend<'a> {
    /// Encrypts one 16-byte block in place: nine rounds of round-key
    /// XOR followed by the linear/S-box transform, then a final
    /// whitening XOR with the last round key.
    #[inline]
    fn proc_block(&mut self, block: InOut<'_, '_, Block>) {
        let k = self.0;
        unsafe {
            let (in_ptr, out_ptr) = block.into_raw();
            let mut b = vld1q_u8(in_ptr as *const u8);

            for i in 0..9 {
                b = veorq_u8(b, k[i]);
                b = transform(b, &ENC_TABLE);
            }
            // Final round: key XOR only, no transform.
            b = veorq_u8(b, k[9]);
            vst1q_u8(out_ptr as *mut u8, b);
        }
    }

    /// Encrypts eight blocks at once with the same round structure as
    /// `proc_block`, manually unrolled across the blocks.
    #[inline]
    fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks<Self>>) {
        let k = self.0;
        unsafe {
            let (in_ptr, out_ptr) = blocks.into_raw();
            let in_ptr = in_ptr as *mut uint8x16_t;
            let out_ptr = out_ptr as *mut uint8x16_t;

            // Load all eight input blocks into registers.
            let mut blocks = [vdupq_n_u8(0); ParBlocksSize::USIZE];
            unroll_par! {
                i, {
                    blocks[i] = vld1q_u8(in_ptr.add(i) as *const u8);
                }
            };

            // Nine full rounds over every block.
            for i in 0..9 {
                unroll_par!(j, {
                    let t = veorq_u8(blocks[j], k[i]);
                    blocks[j] = transform(t, &ENC_TABLE);
                });
            }

            // Final key XOR and store.
            unroll_par! {
                i, {
                    let t = veorq_u8(blocks[i], k[9]);
                    vst1q_u8(out_ptr.add(i) as *mut u8, t);
                }
            };
        }
    }
}

/// Decryption backend borrowing round keys produced by `inv_enc_keys`.
pub(crate) struct DecBackend<'a>(pub(crate) &'a RoundKeys);

impl<'a> BlockSizeUser for DecBackend<'a> {
    // Kuznyechik operates on 128-bit (16-byte) blocks.
    type BlockSize = U16;
}

impl<'a> ParBlocksSizeUser for DecBackend<'a> {
    // Eight blocks per parallel call; see `unroll_par!`.
    type ParBlocksSize = ParBlocksSize;
}

impl<'a> BlockBackend for DecBackend<'a> {
    /// Decrypts one 16-byte block in place.
    ///
    /// The inner decryption round keys are stored pre-processed by
    /// `transform` (see `inv_enc_keys`), which is why each round here
    /// applies `transform` *before* the key XOR. The rounds are
    /// bracketed by the forward (`P`) and inverse (`P_INV`) S-box
    /// applications.
    #[inline]
    fn proc_block(&mut self, block: InOut<'_, '_, Block>) {
        let k = self.0;
        unsafe {
            let (in_ptr, out_ptr) = block.into_raw();
            let mut b = vld1q_u8(in_ptr as *const u8);

            // Undo the final whitening of encryption.
            b = veorq_u8(b, k[0]);

            b = sub_bytes(b, &P);
            b = transform(b, &DEC_TABLE);

            for i in 1..9 {
                b = transform(b, &DEC_TABLE);
                b = veorq_u8(b, k[i]);
            }
            b = sub_bytes(b, &P_INV);
            b = veorq_u8(b, k[9]);

            vst1q_u8(out_ptr as *mut u8, b);
        }
    }
    /// Decrypts eight blocks at once with the same round structure as
    /// `proc_block`, manually unrolled across the blocks.
    #[inline]
    fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks<Self>>) {
        let k = self.0;
        unsafe {
            let (in_ptr, out_ptr) = blocks.into_raw();
            let in_ptr = in_ptr as *mut uint8x16_t;
            let out_ptr = out_ptr as *mut uint8x16_t;

            // Load all eight input blocks into registers.
            let mut blocks = [vdupq_n_u8(0); ParBlocksSize::USIZE];
            unroll_par! {
                i, {
                    blocks[i] = vld1q_u8(in_ptr.add(i) as *const u8);
                }
            };

            // Initial whitening, S-box, and first transform.
            unroll_par! {
                i, {
                    let t = veorq_u8(blocks[i], k[0]);
                    let t = sub_bytes(t, &P);
                    blocks[i] = transform(t, &DEC_TABLE);
                }
            }

            // Inner rounds: transform first (keys are pre-transformed),
            // then XOR the round key.
            for i in 1..9 {
                unroll_par! {
                    j, {
                        let t = transform(blocks[j], &DEC_TABLE);
                        blocks[j] = veorq_u8(t, k[i]);
                    }
                }
            }

            // Inverse S-box, final key XOR, and store.
            unroll_par! {
                i, {
                    let t = sub_bytes(blocks[i], &P_INV);
                    let t2 = veorq_u8(t, k[9]);
                    vst1q_u8(out_ptr.add(i) as *mut u8, t2);
                }
            }
        }
    }
}
Loading

0 comments on commit 36b34ef

Please sign in to comment.