-
Notifications
You must be signed in to change notification settings - Fork 256
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
sha2: improve RISC-V Zknh backends (#617)
Annoyingly, RISC-V is really inconvenient when we have to deal with misaligned loads/stores. By default, LLVM generates very [inefficient code](https://rust.godbolt.org/z/3Yaj4fq5o) which loads every byte separately and combines them into a 32/64-bit integer. The `ld` instruction "may" support misaligned loads, and for Linux user-space it's even [guaranteed](https://www.kernel.org/doc/html/v6.10/arch/riscv/uabi.html#misaligned-accesses), but it can be (and, if I understand correctly, in practice often is) "extremely slow", so we should not rely on it while writing performant code. After asking around, it looks like this mess is here to stay, so we have no choice but to work around it. To do that, this PR introduces two separate paths for loading block data: aligned and misaligned. The aligned path should be the most common one. In the misaligned path we have to rely on inline assembly, since we have to load some bits outside of the block. Additionally, this PR makes inlining in the `riscv-zknh` backend less aggressive, which makes the generated binary code 3-4 times smaller at the cost of one additional branch. Generated assembly for RV64:
- SHA-256, unrolled: https://rust.godbolt.org/z/GxPM8PE3P (2278 bytes)
- SHA-256, compact: https://rust.godbolt.org/z/4KWrcve9E (538 bytes)
- SHA-512, unrolled: https://rust.godbolt.org/z/Th8ro8Tbo (2278 bytes)
- SHA-512, compact: https://rust.godbolt.org/z/dqrv48ax3 (530 bytes)
- Loading branch information
Showing
10 changed files
with
405 additions
and
197 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
use core::{arch::asm, ptr}; | ||
|
||
#[inline(always)] | ||
pub(super) fn load_block(block: &[u8; 64]) -> [u32; 16] { | ||
if block.as_ptr().cast::<u32>().is_aligned() { | ||
load_aligned_block(block) | ||
} else { | ||
load_unaligned_block(block) | ||
} | ||
} | ||
|
||
/// Reads a `u32`-aligned 64-byte block as sixteen big-endian words.
///
/// The caller must pass a block whose address is aligned for `u32`; this is
/// only checked with a `debug_assert!`.
#[inline(always)]
fn load_aligned_block(block: &[u8; 64]) -> [u32; 16] {
    let words: *const u32 = block.as_ptr().cast();
    debug_assert!(words.is_aligned());

    let mut out = [0u32; 16];
    for (idx, slot) in out.iter_mut().enumerate() {
        // SAFETY: `idx < 16`, so the four bytes read lie inside the 64-byte
        // block, and `words` is aligned for `u32` (caller contract, asserted
        // above in debug builds).
        let native = unsafe { ptr::read(words.add(idx)) };
        // The bytes in memory are big-endian; convert to the native order.
        *slot = u32::from_be(native);
    }
    out
}
|
||
#[inline(always)] | ||
fn load_unaligned_block(block: &[u8; 64]) -> [u32; 16] { | ||
let offset = (block.as_ptr() as usize) % align_of::<u32>(); | ||
debug_assert_ne!(offset, 0); | ||
let off1 = (8 * offset) % 32; | ||
let off2 = (32 - off1) % 32; | ||
let bp: *const u32 = block.as_ptr().wrapping_sub(offset).cast(); | ||
|
||
let mut left: u32; | ||
let mut res = [0u32; 16]; | ||
|
||
/// Use LW instruction on RV32 and LWU on RV64 | ||
#[cfg(target_arch = "riscv32")] | ||
macro_rules! lw { | ||
($r:literal) => { | ||
concat!("lw ", $r) | ||
}; | ||
} | ||
#[cfg(target_arch = "riscv64")] | ||
macro_rules! lw { | ||
($r:literal) => { | ||
concat!("lwu ", $r) | ||
}; | ||
} | ||
|
||
unsafe { | ||
asm!( | ||
lw!("{left}, 0({bp})"), // left = unsafe { ptr::read(bp) }; | ||
"srl {left}, {left}, {off1}", // left >>= off1; | ||
bp = in(reg) bp, | ||
off1 = in(reg) off1, | ||
left = out(reg) left, | ||
options(pure, nostack, readonly, preserves_flags), | ||
); | ||
} | ||
|
||
for i in 0..15 { | ||
let right = unsafe { ptr::read(bp.add(1 + i)) }; | ||
res[i] = (left | (right << off2)).to_be(); | ||
left = right >> off1; | ||
} | ||
|
||
let right: u32; | ||
unsafe { | ||
asm!( | ||
lw!("{right}, 16 * 4({bp})"), // right = ptr::read(bp.add(16)); | ||
"sll {right}, {right}, {off2}", // right <<= off2; | ||
bp = in(reg) bp, | ||
off2 = in(reg) off2, | ||
right = out(reg) right, | ||
options(pure, nostack, readonly, preserves_flags), | ||
); | ||
} | ||
res[15] = (left | right).to_be(); | ||
|
||
res | ||
} |
Oops, something went wrong.