diff --git a/chacha20/Cargo.toml b/chacha20/Cargo.toml index 6dcf4c0d..61cbc9cb 100644 --- a/chacha20/Cargo.toml +++ b/chacha20/Cargo.toml @@ -36,7 +36,11 @@ xchacha20 = ["stream-cipher"] rng = ["rand_core"] [[bench]] -name = "chacha20" +name = "stream_cipher" +harness = false + +[[bench]] +name = "rng" harness = false [package.metadata.docs.rs] diff --git a/chacha20/README.md b/chacha20/README.md index f48bf842..8acca59c 100644 --- a/chacha20/README.md +++ b/chacha20/README.md @@ -24,10 +24,12 @@ This crate contains the following implementations of ChaCha20, all of which work on stable Rust with the following `RUSTFLAGS`: - `x86` / `x86_64` - - `sse2`: `-Ctarget-feature=+sse2` (on by default on x86 CPUs) - - `avx2`: `-Ctarget-cpu=haswell -Ctarget-feature=+avx2` + - `avx2`: (~1.4cpb) `-Ctarget-cpu=haswell -Ctarget-feature=+avx2` + - `sse2`: (~2.5cpb) `-Ctarget-feature=+sse2` (on by default on x86 CPUs) - Portable - - `soft` + - `soft`: (~5 cpb on x86/x86_64) + +NOTE: cpb = cycles per byte (smaller is better) ## Security Warning diff --git a/chacha20/benches/rng.rs b/chacha20/benches/rng.rs new file mode 100644 index 00000000..0cb054aa --- /dev/null +++ b/chacha20/benches/rng.rs @@ -0,0 +1,36 @@ +//! `ChaCha20Rng` benchmark + +#[cfg(not(feature = "rng"))] +compile_error!("run benchmarks with `cargo bench --all-features`"); + +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use criterion_cycles_per_byte::CyclesPerByte; + +use chacha20::ChaCha20Rng; +use rand_core::{RngCore, SeedableRng}; + +const KB: usize = 1024; + +fn bench(c: &mut Criterion) { + let mut group = c.benchmark_group("rng"); + + for size in &[KB, 2 * KB, 4 * KB, 8 * KB, 16 * KB] { + let mut buf = vec![0u8; *size]; + + group.throughput(Throughput::Bytes(*size as u64)); + + group.bench_function(BenchmarkId::new("apply_keystream", size), |b| { + let mut rng = ChaCha20Rng::from_seed(Default::default()); + b.iter(|| rng.fill_bytes(&mut buf)); + }); + } + + group.finish(); +} + +criterion_group!( + name = benches; + config = Criterion::default().with_measurement(CyclesPerByte); + targets = bench +); +criterion_main!(benches); diff --git a/chacha20/benches/chacha20.rs b/chacha20/benches/stream_cipher.rs similarity index 81% rename from chacha20/benches/chacha20.rs rename to chacha20/benches/stream_cipher.rs index fd574459..0f924e93 100644 --- a/chacha20/benches/chacha20.rs +++ b/chacha20/benches/stream_cipher.rs @@ -1,3 +1,8 @@ +//! 
ChaCha20 `stream-cipher` benchmark + +#[cfg(not(feature = "stream-cipher"))] +compile_error!("run benchmarks with `cargo bench --all-features`"); + use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use criterion_cycles_per_byte::CyclesPerByte; @@ -9,7 +14,7 @@ use chacha20::{ const KB: usize = 1024; fn bench(c: &mut Criterion) { - let mut group = c.benchmark_group("chacha20"); + let mut group = c.benchmark_group("stream-cipher"); for size in &[KB, 2 * KB, 4 * KB, 8 * KB, 16 * KB] { let mut buf = vec![0u8; *size]; diff --git a/chacha20/src/block.rs b/chacha20/src/block.rs index 203d978c..ce8965c0 100644 --- a/chacha20/src/block.rs +++ b/chacha20/src/block.rs @@ -21,20 +21,20 @@ mod avx2; any(target_arch = "x86", target_arch = "x86_64"), any(target_feature = "sse2", target_feature = "avx2") )))] -pub(crate) use self::soft::Block; +pub(crate) use self::soft::{Block, BUFFER_SIZE}; #[cfg(all( any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2", not(target_feature = "avx2") ))] -pub(crate) use self::sse2::Block; +pub(crate) use self::sse2::{Block, BUFFER_SIZE}; #[cfg(all( any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx2" ))] -pub(crate) use self::avx2::Block; +pub(crate) use self::avx2::{Block, BUFFER_SIZE}; use core::fmt::{self, Debug}; diff --git a/chacha20/src/block/avx2.rs b/chacha20/src/block/avx2.rs index 58204e4c..50fdb2e3 100644 --- a/chacha20/src/block/avx2.rs +++ b/chacha20/src/block/avx2.rs @@ -8,7 +8,7 @@ //! Goll, M., and Gueron,S.: Vectorization of ChaCha Stream Cipher. Cryptology ePrint Archive, //! Report 2013/759, November, 2013, -use crate::{CONSTANTS, IV_SIZE, KEY_SIZE}; +use crate::{BLOCK_SIZE, CONSTANTS, IV_SIZE, KEY_SIZE}; use core::convert::TryInto; #[cfg(target_arch = "x86")] @@ -16,6 +16,12 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; +/// Size of buffers passed to `generate` and `apply_keystream` for this +/// backend, which operates on two blocks in parallel for optimal performance. +pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE * 2; + +/// The ChaCha20 block function (AVX2 accelerated implementation for x86/x86_64) +// TODO(tarcieri): zeroize? 
#[derive(Clone)] pub(crate) struct Block { v0: __m256i, @@ -62,16 +68,24 @@ impl Block { #[inline] #[allow(clippy::cast_ptr_alignment)] // loadu/storeu support unaligned loads/stores pub(crate) fn apply_keystream(&self, counter: u64, output: &mut [u8]) { + debug_assert_eq!(output.len(), BUFFER_SIZE); + unsafe { let (mut v0, mut v1, mut v2) = (self.v0, self.v1, self.v2); let mut v3 = iv_setup(self.iv, counter); self.rounds(&mut v0, &mut v1, &mut v2, &mut v3); - for (chunk, a) in output.chunks_mut(0x10).zip(&[v0, v1, v2, v3]) { + for (chunk, a) in output[..BLOCK_SIZE].chunks_mut(0x10).zip(&[v0, v1, v2, v3]) { let b = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); let out = _mm_xor_si128(_mm256_castsi256_si128(*a), b); _mm_storeu_si128(chunk.as_mut_ptr() as *mut __m128i, out); } + + for (chunk, a) in output[BLOCK_SIZE..].chunks_mut(0x10).zip(&[v0, v1, v2, v3]) { + let b = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + let out = _mm_xor_si128(_mm256_extractf128_si256(*a, 1), b); + _mm_storeu_si128(chunk.as_mut_ptr() as *mut __m128i, out); + } } } @@ -132,22 +146,21 @@ unsafe fn iv_setup(iv: [i32; 2], counter: u64) -> __m256i { #[target_feature(enable = "avx2")] #[allow(clippy::cast_ptr_alignment)] // storeu supports unaligned stores unsafe fn store(v0: __m256i, v1: __m256i, v2: __m256i, v3: __m256i, output: &mut [u8]) { - _mm_storeu_si128( - output.as_mut_ptr().offset(0x00) as *mut __m128i, - _mm256_castsi256_si128(v0), - ); - _mm_storeu_si128( - output.as_mut_ptr().offset(0x10) as *mut __m128i, - _mm256_castsi256_si128(v1), - ); - _mm_storeu_si128( - output.as_mut_ptr().offset(0x20) as *mut __m128i, - _mm256_castsi256_si128(v2), - ); - _mm_storeu_si128( - output.as_mut_ptr().offset(0x30) as *mut __m128i, - _mm256_castsi256_si128(v3), - ); + debug_assert_eq!(output.len(), BUFFER_SIZE); + + for (chunk, v) in output[..BLOCK_SIZE].chunks_mut(0x10).zip(&[v0, v1, v2, v3]) { + _mm_storeu_si128( + chunk.as_mut_ptr() as *mut __m128i, + _mm256_castsi256_si128(*v), + ); + } + + for (chunk, v) in output[BLOCK_SIZE..].chunks_mut(0x10).zip(&[v0, v1, v2, v3]) { + _mm_storeu_si128( + chunk.as_mut_ptr() as *mut __m128i, + _mm256_extractf128_si256(*v, 1), + ); + } } #[inline] diff --git a/chacha20/src/block/soft.rs b/chacha20/src/block/soft.rs index b9c3bf5a..bd91defe 100644 --- a/chacha20/src/block/soft.rs +++ b/chacha20/src/block/soft.rs @@ -8,11 +8,11 @@ use crate::{BLOCK_SIZE, CONSTANTS, IV_SIZE, KEY_SIZE, STATE_WORDS}; use core::{convert::TryInto, mem}; -/// The ChaCha20 block function -/// -/// While ChaCha20 is a stream cipher, not a block cipher, its core -/// primitive is a function which acts on a 512-bit block -// TODO(tarcieri): zeroize? need to make sure we're actually copying first +/// Size of buffers passed to `generate` and `apply_keystream` for this backend +pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE; + +/// The ChaCha20 block function (portable software implementation) +// TODO(tarcieri): zeroize? 
#[allow(dead_code)] #[derive(Clone)] pub(crate) struct Block { @@ -49,7 +49,7 @@ impl Block { /// Generate output, overwriting data already in the buffer pub(crate) fn generate(&mut self, counter: u64, output: &mut [u8]) { - debug_assert_eq!(output.len(), BLOCK_SIZE); + debug_assert_eq!(output.len(), BUFFER_SIZE); self.counter_setup(counter); let mut state = self.state; @@ -62,7 +62,7 @@ impl Block { /// Apply generated keystream to the output buffer pub(crate) fn apply_keystream(&mut self, counter: u64, output: &mut [u8]) { - debug_assert_eq!(output.len(), BLOCK_SIZE); + debug_assert_eq!(output.len(), BUFFER_SIZE); self.counter_setup(counter); let mut state = self.state; diff --git a/chacha20/src/block/sse2.rs b/chacha20/src/block/sse2.rs index d0016a76..d3bde4a5 100644 --- a/chacha20/src/block/sse2.rs +++ b/chacha20/src/block/sse2.rs @@ -4,7 +4,7 @@ //! //! SSE2-optimized implementation for x86/x86-64 CPUs. -use crate::{CONSTANTS, IV_SIZE, KEY_SIZE}; +use crate::{BLOCK_SIZE, CONSTANTS, IV_SIZE, KEY_SIZE}; use core::convert::TryInto; #[cfg(target_arch = "x86")] @@ -12,6 +12,11 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; +/// Size of buffers passed to `generate` and `apply_keystream` for this backend +pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE; + +/// The ChaCha20 block function (SSE2 accelerated implementation for x86/x86_64) +// TODO(tarcieri): zeroize? #[derive(Clone)] pub(crate) struct Block { v0: __m128i, @@ -47,6 +52,8 @@ impl Block { #[inline] pub(crate) fn generate(&self, counter: u64, output: &mut [u8]) { + debug_assert_eq!(output.len(), BUFFER_SIZE); + unsafe { let (mut v0, mut v1, mut v2) = (self.v0, self.v1, self.v2); let mut v3 = iv_setup(self.iv, counter); @@ -58,6 +65,8 @@ impl Block { #[inline] #[allow(clippy::cast_ptr_alignment)] // loadu/storeu support unaligned loads/stores pub(crate) fn apply_keystream(&self, counter: u64, output: &mut [u8]) { + debug_assert_eq!(output.len(), BUFFER_SIZE); + unsafe { let (mut v0, mut v1, mut v2) = (self.v0, self.v1, self.v2); let mut v3 = iv_setup(self.iv, counter); diff --git a/chacha20/src/cipher.rs b/chacha20/src/cipher.rs index 0cbdc92d..1d4536eb 100644 --- a/chacha20/src/cipher.rs +++ b/chacha20/src/cipher.rs @@ -4,7 +4,10 @@ // TODO(tarcieri): figure out how to unify this with the `ctr` crate -use crate::{block::Block, BLOCK_SIZE}; +use crate::{ + block::{Block, BUFFER_SIZE}, + BLOCK_SIZE, +}; use core::{ cmp, fmt::{self, Debug}, @@ -12,7 +15,13 @@ use core::{ use stream_cipher::{LoopError, SyncStreamCipher, SyncStreamCipherSeek}; /// Internal buffer -type Buffer = [u8; BLOCK_SIZE]; +type Buffer = [u8; BUFFER_SIZE]; + +/// How much to increment the counter by for each buffer we generate. +/// Normally this is 1 but the AVX2 backend uses double-wide buffers. 
+// TODO(tarcieri): support a parallel blocks count like the `ctr` crate +// See: +const COUNTER_INCR: u64 = (BUFFER_SIZE as u64) / (BLOCK_SIZE as u64); /// ChaCha20 as a counter mode stream cipher pub(crate) struct Cipher { @@ -39,7 +48,7 @@ impl Cipher { pub fn new(block: Block, counter_offset: u64) -> Self { Self { block, - buffer: [0u8; BLOCK_SIZE], + buffer: [0u8; BUFFER_SIZE], buffer_pos: None, counter: 0, counter_offset, @@ -63,7 +72,7 @@ impl SyncStreamCipher for Cipher { if let Some(pos) = self.buffer_pos { let pos = pos as usize; - if data.len() >= BLOCK_SIZE - pos { + if data.len() >= BUFFER_SIZE - pos { let buf = &self.buffer[pos..]; let (r, l) = data.split_at_mut(buf.len()); data = l; @@ -79,20 +88,20 @@ impl SyncStreamCipher for Cipher { let mut counter = self.counter; - while data.len() >= BLOCK_SIZE { - let (l, r) = { data }.split_at_mut(BLOCK_SIZE); + while data.len() >= BUFFER_SIZE { + let (l, r) = { data }.split_at_mut(BUFFER_SIZE); data = r; // TODO(tarcieri): double check this should be checked and not wrapping let counter_with_offset = self.counter_offset.checked_add(counter).unwrap(); self.block.apply_keystream(counter_with_offset, l); - counter = counter.checked_add(1).unwrap(); + counter = counter.checked_add(COUNTER_INCR).unwrap(); } if !data.is_empty() { self.generate_block(counter); - counter = counter.checked_add(1).unwrap(); + counter = counter.checked_add(COUNTER_INCR).unwrap(); let n = data.len(); xor(data, &self.buffer[..n]); self.buffer_pos = Some(n as u8); @@ -126,7 +135,7 @@ impl SyncStreamCipherSeek for Cipher { self.buffer_pos = None; } else { self.generate_block(self.counter); - self.counter = self.counter.checked_add(1).unwrap(); + self.counter = self.counter.checked_add(COUNTER_INCR).unwrap(); self.buffer_pos = Some(rem as u8); } } @@ -137,12 +146,12 @@ impl Cipher { let dlen = data.len() - self .buffer_pos - .map(|pos| cmp::min(BLOCK_SIZE - pos as usize, data.len())) + .map(|pos| cmp::min(BUFFER_SIZE - pos as usize, data.len())) .unwrap_or_default(); - let data_buffers = dlen / BLOCK_SIZE + if data.len() % BLOCK_SIZE != 0 { 1 } else { 0 }; + let data_blocks = dlen / BLOCK_SIZE + if data.len() % BLOCK_SIZE != 0 { 1 } else { 0 }; - if self.counter.checked_add(data_buffers as u64).is_some() { + if self.counter.checked_add(data_blocks as u64).is_some() { Ok(()) } else { Err(LoopError) diff --git a/chacha20/src/rng.rs b/chacha20/src/rng.rs index e6b59fff..f73df5dd 100644 --- a/chacha20/src/rng.rs +++ b/chacha20/src/rng.rs @@ -4,7 +4,10 @@ use core::slice; use rand_core::block::{BlockRng, BlockRngCore}; use rand_core::{Error, RngCore, SeedableRng}; -use crate::{block::Block, BLOCK_SIZE, KEY_SIZE, STATE_WORDS}; +use crate::{ + block::{Block, BUFFER_SIZE}, + KEY_SIZE, +}; macro_rules! impl_chacha_rng { ($name:ident, $core:ident, $rounds:expr, $doc:expr) => { @@ -63,12 +66,12 @@ macro_rules! impl_chacha_rng { impl BlockRngCore for $core { type Item = u32; - type Results = [u32; STATE_WORDS]; + type Results = [u32; BUFFER_SIZE / 4]; fn generate(&mut self, results: &mut Self::Results) { // TODO(tarcieri): eliminate unsafety (replace w\ [u8; BLOCK_SIZE) self.block.generate(self.counter, unsafe { - slice::from_raw_parts_mut(results.as_mut_ptr() as *mut u8, BLOCK_SIZE) + slice::from_raw_parts_mut(results.as_mut_ptr() as *mut u8, BUFFER_SIZE) }); self.counter += 1; }
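For context on the `cipher.rs` hunk above: a minimal, self-contained sketch of the counter bookkeeping that `COUNTER_INCR` introduces. The `BLOCK_SIZE`/`BUFFER_SIZE` values below are assumptions mirroring the AVX2 backend (two 64-byte blocks per buffer); on the soft/SSE2 backends `BUFFER_SIZE` stays at one block and `COUNTER_INCR` is 1.

```rust
// Sketch only: constants mirror the diff's AVX2 values, not imported from the crate.
const BLOCK_SIZE: usize = 64; // one ChaCha20 block
const BUFFER_SIZE: usize = BLOCK_SIZE * 2; // AVX2 backend: two blocks per call
const COUNTER_INCR: u64 = (BUFFER_SIZE as u64) / (BLOCK_SIZE as u64);

fn main() {
    // Walk a 256-byte message the way `apply_keystream` does: one buffer at a
    // time, advancing the block counter by COUNTER_INCR per buffer processed.
    let msg_len = 256usize;
    let (mut counter, mut offset) = (0u64, 0usize);

    while msg_len - offset >= BUFFER_SIZE {
        // block.apply_keystream(counter_offset + counter, &mut data[offset..offset + BUFFER_SIZE]);
        offset += BUFFER_SIZE;
        counter += COUNTER_INCR;
    }

    // 256 bytes = 2 double-wide buffers = 4 ChaCha20 blocks of keystream.
    assert_eq!(counter, 4);
    assert_eq!(offset, msg_len);
}
```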
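Similarly, a hedged sketch of why the `rng.rs` hunk sizes `BlockRngCore::Results` as `[u32; BUFFER_SIZE / 4]`: the results array holds `u32` words, so dividing the byte-oriented buffer size by `size_of::<u32>()` keeps one RNG refill exactly as large as one backend `generate` call. The concrete buffer sizes below are assumptions matching the backends described above.

```rust
use core::mem::size_of;

// Assumed values mirroring the diff: 128-byte AVX2 buffers, 64-byte soft/SSE2 buffers.
const BLOCK_SIZE: usize = 64;
const BUFFER_SIZE_AVX2: usize = BLOCK_SIZE * 2;
const BUFFER_SIZE_SOFT: usize = BLOCK_SIZE;

fn main() {
    // `Results = [u32; BUFFER_SIZE / 4]` holds exactly one `generate` output:
    assert_eq!(BUFFER_SIZE_AVX2 / size_of::<u32>(), 32); // AVX2: 32 u32 words
    assert_eq!(BUFFER_SIZE_SOFT / size_of::<u32>(), 16); // soft/SSE2: 16 words
}
```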