From 4d9881993684ec0dd2eefcb0c89f5ea74d1d74fd Mon Sep 17 00:00:00 2001
From: Tony Arcieri
Date: Thu, 16 Jan 2020 08:04:21 -0800
Subject: [PATCH] chacha20: Parallelize AVX2 backend

The AVX2 backend was previously computing two ChaCha blocks in parallel,
then throwing one away. This updates the implementation to always compute
two blocks in parallel when the AVX2 backend is enabled, resulting in a
~2X speedup.

Unfortunately for `cipher.rs`, which was originally adapted from the
`ctr` crate, I deleted the original parallel computation code, and as a
result the implementation now diverges from what was originally in
`ctr`. See here for a reference:

https://github.com/RustCrypto/stream-ciphers/blob/907e94b/ctr/src/lib.rs#L73

Ideally we can come up with some generic counter management and buffering
abstraction in the `ctr` crate which works in all cases.
---
 chacha20/Cargo.toml                           |  6 ++-
 chacha20/README.md                            |  8 +--
 chacha20/benches/rng.rs                       | 36 ++++++++++++++
 .../benches/{chacha20.rs => stream_cipher.rs} |  7 ++-
 chacha20/src/block.rs                         |  6 +--
 chacha20/src/block/avx2.rs                    | 49 ++++++++++++-------
 chacha20/src/block/soft.rs                    | 14 +++---
 chacha20/src/block/sse2.rs                    | 11 ++++-
 chacha20/src/cipher.rs                        | 33 ++++++++-----
 chacha20/src/rng.rs                           |  9 ++--
 10 files changed, 130 insertions(+), 49 deletions(-)
 create mode 100644 chacha20/benches/rng.rs
 rename chacha20/benches/{chacha20.rs => stream_cipher.rs} (81%)

diff --git a/chacha20/Cargo.toml b/chacha20/Cargo.toml
index 6dcf4c0d..61cbc9cb 100644
--- a/chacha20/Cargo.toml
+++ b/chacha20/Cargo.toml
@@ -36,7 +36,11 @@ xchacha20 = ["stream-cipher"]
 rng = ["rand_core"]
 
 [[bench]]
-name = "chacha20"
+name = "stream_cipher"
+harness = false
+
+[[bench]]
+name = "rng"
 harness = false
 
 [package.metadata.docs.rs]
diff --git a/chacha20/README.md b/chacha20/README.md
index f48bf842..8acca59c 100644
--- a/chacha20/README.md
+++ b/chacha20/README.md
@@ -24,10 +24,12 @@ This crate contains the following implementations of ChaCha20, all of which
 work on stable Rust with the following `RUSTFLAGS`:
 
 - `x86` / `x86_64`
-  - `sse2`: `-Ctarget-feature=+sse2` (on by default on x86 CPUs)
-  - `avx2`: `-Ctarget-cpu=haswell -Ctarget-feature=+avx2`
+  - `avx2`: (~1.4 cpb) `-Ctarget-cpu=haswell -Ctarget-feature=+avx2`
+  - `sse2`: (~2.5 cpb) `-Ctarget-feature=+sse2` (on by default on x86 CPUs)
 - Portable
-  - `soft`
+  - `soft`: (~5 cpb on x86/x86_64)
+
+NOTE: cpb = cycles per byte (smaller is better)
 
 ## Security Warning
diff --git a/chacha20/benches/rng.rs b/chacha20/benches/rng.rs
new file mode 100644
index 00000000..0cb054aa
--- /dev/null
+++ b/chacha20/benches/rng.rs
@@ -0,0 +1,36 @@
+//! `ChaCha20Rng` benchmark
+
+#[cfg(not(feature = "rng"))]
+compile_error!("run benchmarks with `cargo bench --all-features`");
+
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use criterion_cycles_per_byte::CyclesPerByte;
+
+use chacha20::ChaCha20Rng;
+use rand_core::{RngCore, SeedableRng};
+
+const KB: usize = 1024;
+
+fn bench(c: &mut Criterion) {
+    let mut group = c.benchmark_group("rng");
+
+    for size in &[KB, 2 * KB, 4 * KB, 8 * KB, 16 * KB] {
+        let mut buf = vec![0u8; *size];
+
+        group.throughput(Throughput::Bytes(*size as u64));
+
+        group.bench_function(BenchmarkId::new("fill_bytes", size), |b| {
+            let mut rng = ChaCha20Rng::from_seed(Default::default());
+            b.iter(|| rng.fill_bytes(&mut buf));
+        });
+    }
+
+    group.finish();
+}
+
+criterion_group!(
+    name = benches;
+    config = Criterion::default().with_measurement(CyclesPerByte);
+    targets = bench
+);
+criterion_main!(benches);
diff --git a/chacha20/benches/chacha20.rs b/chacha20/benches/stream_cipher.rs
similarity index 81%
rename from chacha20/benches/chacha20.rs
rename to chacha20/benches/stream_cipher.rs
index fd574459..0f924e93 100644
--- a/chacha20/benches/chacha20.rs
+++ b/chacha20/benches/stream_cipher.rs
@@ -1,3 +1,8 @@
+//! ChaCha20 `stream-cipher` benchmark
+
+#[cfg(not(feature = "stream-cipher"))]
+compile_error!("run benchmarks with `cargo bench --all-features`");
+
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
 use criterion_cycles_per_byte::CyclesPerByte;
 
@@ -9,7 +14,7 @@ use chacha20::{
 const KB: usize = 1024;
 
 fn bench(c: &mut Criterion) {
-    let mut group = c.benchmark_group("chacha20");
+    let mut group = c.benchmark_group("stream-cipher");
 
     for size in &[KB, 2 * KB, 4 * KB, 8 * KB, 16 * KB] {
         let mut buf = vec![0u8; *size];
diff --git a/chacha20/src/block.rs b/chacha20/src/block.rs
index 203d978c..ce8965c0 100644
--- a/chacha20/src/block.rs
+++ b/chacha20/src/block.rs
@@ -21,20 +21,20 @@ mod avx2;
     any(target_arch = "x86", target_arch = "x86_64"),
     any(target_feature = "sse2", target_feature = "avx2")
 )))]
-pub(crate) use self::soft::Block;
+pub(crate) use self::soft::{Block, BUFFER_SIZE};
 
 #[cfg(all(
     any(target_arch = "x86", target_arch = "x86_64"),
     target_feature = "sse2",
     not(target_feature = "avx2")
 ))]
-pub(crate) use self::sse2::Block;
+pub(crate) use self::sse2::{Block, BUFFER_SIZE};
 
 #[cfg(all(
     any(target_arch = "x86", target_arch = "x86_64"),
     target_feature = "avx2"
 ))]
-pub(crate) use self::avx2::Block;
+pub(crate) use self::avx2::{Block, BUFFER_SIZE};
 
 use core::fmt::{self, Debug};
 
diff --git a/chacha20/src/block/avx2.rs b/chacha20/src/block/avx2.rs
index 58204e4c..50fdb2e3 100644
--- a/chacha20/src/block/avx2.rs
+++ b/chacha20/src/block/avx2.rs
@@ -8,7 +8,7 @@
 //! Goll, M., and Gueron,S.: Vectorization of ChaCha Stream Cipher. Cryptology ePrint Archive,
 //! Report 2013/759, November, 2013,
 
-use crate::{CONSTANTS, IV_SIZE, KEY_SIZE};
+use crate::{BLOCK_SIZE, CONSTANTS, IV_SIZE, KEY_SIZE};
 use core::convert::TryInto;
 
 #[cfg(target_arch = "x86")]
@@ -16,6 +16,12 @@ use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::*;
 
+/// Size of buffers passed to `generate` and `apply_keystream` for this
+/// backend, which operates on two blocks in parallel for optimal performance.
+pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE * 2;
+
+/// The ChaCha20 block function (AVX2 accelerated implementation for x86/x86_64)
+// TODO(tarcieri): zeroize?
#[derive(Clone)] pub(crate) struct Block { v0: __m256i, @@ -62,16 +68,24 @@ impl Block { #[inline] #[allow(clippy::cast_ptr_alignment)] // loadu/storeu support unaligned loads/stores pub(crate) fn apply_keystream(&self, counter: u64, output: &mut [u8]) { + debug_assert_eq!(output.len(), BUFFER_SIZE); + unsafe { let (mut v0, mut v1, mut v2) = (self.v0, self.v1, self.v2); let mut v3 = iv_setup(self.iv, counter); self.rounds(&mut v0, &mut v1, &mut v2, &mut v3); - for (chunk, a) in output.chunks_mut(0x10).zip(&[v0, v1, v2, v3]) { + for (chunk, a) in output[..BLOCK_SIZE].chunks_mut(0x10).zip(&[v0, v1, v2, v3]) { let b = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); let out = _mm_xor_si128(_mm256_castsi256_si128(*a), b); _mm_storeu_si128(chunk.as_mut_ptr() as *mut __m128i, out); } + + for (chunk, a) in output[BLOCK_SIZE..].chunks_mut(0x10).zip(&[v0, v1, v2, v3]) { + let b = _mm_loadu_si128(chunk.as_ptr() as *const __m128i); + let out = _mm_xor_si128(_mm256_extractf128_si256(*a, 1), b); + _mm_storeu_si128(chunk.as_mut_ptr() as *mut __m128i, out); + } } } @@ -132,22 +146,21 @@ unsafe fn iv_setup(iv: [i32; 2], counter: u64) -> __m256i { #[target_feature(enable = "avx2")] #[allow(clippy::cast_ptr_alignment)] // storeu supports unaligned stores unsafe fn store(v0: __m256i, v1: __m256i, v2: __m256i, v3: __m256i, output: &mut [u8]) { - _mm_storeu_si128( - output.as_mut_ptr().offset(0x00) as *mut __m128i, - _mm256_castsi256_si128(v0), - ); - _mm_storeu_si128( - output.as_mut_ptr().offset(0x10) as *mut __m128i, - _mm256_castsi256_si128(v1), - ); - _mm_storeu_si128( - output.as_mut_ptr().offset(0x20) as *mut __m128i, - _mm256_castsi256_si128(v2), - ); - _mm_storeu_si128( - output.as_mut_ptr().offset(0x30) as *mut __m128i, - _mm256_castsi256_si128(v3), - ); + debug_assert_eq!(output.len(), BUFFER_SIZE); + + for (chunk, v) in output[..BLOCK_SIZE].chunks_mut(0x10).zip(&[v0, v1, v2, v3]) { + _mm_storeu_si128( + chunk.as_mut_ptr() as *mut __m128i, + _mm256_castsi256_si128(*v), + ); + } + + for (chunk, v) in output[BLOCK_SIZE..].chunks_mut(0x10).zip(&[v0, v1, v2, v3]) { + _mm_storeu_si128( + chunk.as_mut_ptr() as *mut __m128i, + _mm256_extractf128_si256(*v, 1), + ); + } } #[inline] diff --git a/chacha20/src/block/soft.rs b/chacha20/src/block/soft.rs index b9c3bf5a..bd91defe 100644 --- a/chacha20/src/block/soft.rs +++ b/chacha20/src/block/soft.rs @@ -8,11 +8,11 @@ use crate::{BLOCK_SIZE, CONSTANTS, IV_SIZE, KEY_SIZE, STATE_WORDS}; use core::{convert::TryInto, mem}; -/// The ChaCha20 block function -/// -/// While ChaCha20 is a stream cipher, not a block cipher, its core -/// primitive is a function which acts on a 512-bit block -// TODO(tarcieri): zeroize? need to make sure we're actually copying first +/// Size of buffers passed to `generate` and `apply_keystream` for this backend +pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE; + +/// The ChaCha20 block function (portable software implementation) +// TODO(tarcieri): zeroize? 
#[allow(dead_code)] #[derive(Clone)] pub(crate) struct Block { @@ -49,7 +49,7 @@ impl Block { /// Generate output, overwriting data already in the buffer pub(crate) fn generate(&mut self, counter: u64, output: &mut [u8]) { - debug_assert_eq!(output.len(), BLOCK_SIZE); + debug_assert_eq!(output.len(), BUFFER_SIZE); self.counter_setup(counter); let mut state = self.state; @@ -62,7 +62,7 @@ impl Block { /// Apply generated keystream to the output buffer pub(crate) fn apply_keystream(&mut self, counter: u64, output: &mut [u8]) { - debug_assert_eq!(output.len(), BLOCK_SIZE); + debug_assert_eq!(output.len(), BUFFER_SIZE); self.counter_setup(counter); let mut state = self.state; diff --git a/chacha20/src/block/sse2.rs b/chacha20/src/block/sse2.rs index d0016a76..d3bde4a5 100644 --- a/chacha20/src/block/sse2.rs +++ b/chacha20/src/block/sse2.rs @@ -4,7 +4,7 @@ //! //! SSE2-optimized implementation for x86/x86-64 CPUs. -use crate::{CONSTANTS, IV_SIZE, KEY_SIZE}; +use crate::{BLOCK_SIZE, CONSTANTS, IV_SIZE, KEY_SIZE}; use core::convert::TryInto; #[cfg(target_arch = "x86")] @@ -12,6 +12,11 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; +/// Size of buffers passed to `generate` and `apply_keystream` for this backend +pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE; + +/// The ChaCha20 block function (SSE2 accelerated implementation for x86/x86_64) +// TODO(tarcieri): zeroize? #[derive(Clone)] pub(crate) struct Block { v0: __m128i, @@ -47,6 +52,8 @@ impl Block { #[inline] pub(crate) fn generate(&self, counter: u64, output: &mut [u8]) { + debug_assert_eq!(output.len(), BUFFER_SIZE); + unsafe { let (mut v0, mut v1, mut v2) = (self.v0, self.v1, self.v2); let mut v3 = iv_setup(self.iv, counter); @@ -58,6 +65,8 @@ impl Block { #[inline] #[allow(clippy::cast_ptr_alignment)] // loadu/storeu support unaligned loads/stores pub(crate) fn apply_keystream(&self, counter: u64, output: &mut [u8]) { + debug_assert_eq!(output.len(), BUFFER_SIZE); + unsafe { let (mut v0, mut v1, mut v2) = (self.v0, self.v1, self.v2); let mut v3 = iv_setup(self.iv, counter); diff --git a/chacha20/src/cipher.rs b/chacha20/src/cipher.rs index 0cbdc92d..1d4536eb 100644 --- a/chacha20/src/cipher.rs +++ b/chacha20/src/cipher.rs @@ -4,7 +4,10 @@ // TODO(tarcieri): figure out how to unify this with the `ctr` crate -use crate::{block::Block, BLOCK_SIZE}; +use crate::{ + block::{Block, BUFFER_SIZE}, + BLOCK_SIZE, +}; use core::{ cmp, fmt::{self, Debug}, @@ -12,7 +15,13 @@ use core::{ use stream_cipher::{LoopError, SyncStreamCipher, SyncStreamCipherSeek}; /// Internal buffer -type Buffer = [u8; BLOCK_SIZE]; +type Buffer = [u8; BUFFER_SIZE]; + +/// How much to increment the counter by for each buffer we generate. +/// Normally this is 1 but the AVX2 backend uses double-wide buffers. 
+// TODO(tarcieri): support a parallel blocks count like the `ctr` crate +// See: +const COUNTER_INCR: u64 = (BUFFER_SIZE as u64) / (BLOCK_SIZE as u64); /// ChaCha20 as a counter mode stream cipher pub(crate) struct Cipher { @@ -39,7 +48,7 @@ impl Cipher { pub fn new(block: Block, counter_offset: u64) -> Self { Self { block, - buffer: [0u8; BLOCK_SIZE], + buffer: [0u8; BUFFER_SIZE], buffer_pos: None, counter: 0, counter_offset, @@ -63,7 +72,7 @@ impl SyncStreamCipher for Cipher { if let Some(pos) = self.buffer_pos { let pos = pos as usize; - if data.len() >= BLOCK_SIZE - pos { + if data.len() >= BUFFER_SIZE - pos { let buf = &self.buffer[pos..]; let (r, l) = data.split_at_mut(buf.len()); data = l; @@ -79,20 +88,20 @@ impl SyncStreamCipher for Cipher { let mut counter = self.counter; - while data.len() >= BLOCK_SIZE { - let (l, r) = { data }.split_at_mut(BLOCK_SIZE); + while data.len() >= BUFFER_SIZE { + let (l, r) = { data }.split_at_mut(BUFFER_SIZE); data = r; // TODO(tarcieri): double check this should be checked and not wrapping let counter_with_offset = self.counter_offset.checked_add(counter).unwrap(); self.block.apply_keystream(counter_with_offset, l); - counter = counter.checked_add(1).unwrap(); + counter = counter.checked_add(COUNTER_INCR).unwrap(); } if !data.is_empty() { self.generate_block(counter); - counter = counter.checked_add(1).unwrap(); + counter = counter.checked_add(COUNTER_INCR).unwrap(); let n = data.len(); xor(data, &self.buffer[..n]); self.buffer_pos = Some(n as u8); @@ -126,7 +135,7 @@ impl SyncStreamCipherSeek for Cipher { self.buffer_pos = None; } else { self.generate_block(self.counter); - self.counter = self.counter.checked_add(1).unwrap(); + self.counter = self.counter.checked_add(COUNTER_INCR).unwrap(); self.buffer_pos = Some(rem as u8); } } @@ -137,12 +146,12 @@ impl Cipher { let dlen = data.len() - self .buffer_pos - .map(|pos| cmp::min(BLOCK_SIZE - pos as usize, data.len())) + .map(|pos| cmp::min(BUFFER_SIZE - pos as usize, data.len())) .unwrap_or_default(); - let data_buffers = dlen / BLOCK_SIZE + if data.len() % BLOCK_SIZE != 0 { 1 } else { 0 }; + let data_blocks = dlen / BLOCK_SIZE + if data.len() % BLOCK_SIZE != 0 { 1 } else { 0 }; - if self.counter.checked_add(data_buffers as u64).is_some() { + if self.counter.checked_add(data_blocks as u64).is_some() { Ok(()) } else { Err(LoopError) diff --git a/chacha20/src/rng.rs b/chacha20/src/rng.rs index e6b59fff..f73df5dd 100644 --- a/chacha20/src/rng.rs +++ b/chacha20/src/rng.rs @@ -4,7 +4,10 @@ use core::slice; use rand_core::block::{BlockRng, BlockRngCore}; use rand_core::{Error, RngCore, SeedableRng}; -use crate::{block::Block, BLOCK_SIZE, KEY_SIZE, STATE_WORDS}; +use crate::{ + block::{Block, BUFFER_SIZE}, + KEY_SIZE, +}; macro_rules! impl_chacha_rng { ($name:ident, $core:ident, $rounds:expr, $doc:expr) => { @@ -63,12 +66,12 @@ macro_rules! impl_chacha_rng { impl BlockRngCore for $core { type Item = u32; - type Results = [u32; STATE_WORDS]; + type Results = [u32; BUFFER_SIZE / 4]; fn generate(&mut self, results: &mut Self::Results) { // TODO(tarcieri): eliminate unsafety (replace w\ [u8; BLOCK_SIZE) self.block.generate(self.counter, unsafe { - slice::from_raw_parts_mut(results.as_mut_ptr() as *mut u8, BLOCK_SIZE) + slice::from_raw_parts_mut(results.as_mut_ptr() as *mut u8, BUFFER_SIZE) }); self.counter += 1; }
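A note on the counter management above: the counter is still kept in ChaCha
block units, but each call to `apply_keystream`/`generate` now consumes a
whole buffer of `COUNTER_INCR` blocks, which is why the loops step the
counter by `COUNTER_INCR` rather than 1. Below is a minimal standalone model
of that arithmetic; `offset_to_counter` is an illustrative helper written
for this note, not part of the patch or the crate's API.

const BLOCK_SIZE: usize = 64; // ChaCha20 block size in bytes
const BUFFER_SIZE: usize = BLOCK_SIZE * 2; // double-wide, as in the AVX2 backend
const COUNTER_INCR: u64 = (BUFFER_SIZE as u64) / (BLOCK_SIZE as u64); // = 2

/// Map a keystream byte offset to the block counter of the buffer that
/// contains it, plus the byte position within that buffer.
fn offset_to_counter(pos: u64) -> (u64, usize) {
    let buffer_index = pos / BUFFER_SIZE as u64;
    let rem = (pos % BUFFER_SIZE as u64) as usize;
    (buffer_index * COUNTER_INCR, rem)
}

fn main() {
    // Offset 200 lands in buffer 1 (i.e. blocks 2 and 3), 72 bytes in.
    assert_eq!(offset_to_counter(200), (2, 72));

    // Offset 64 is the start of block 1, which is the second half of
    // buffer 0: the block counter stays 0 and the buffer position is 64.
    assert_eq!(offset_to_counter(64), (0, 64));
}

This is the same invariant the `seek` logic maintains when it records
`buffer_pos` alongside the counter.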
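For readers unfamiliar with the register layout the AVX2 backend relies on:
each `__m256i` state row holds the corresponding 16-byte row of both blocks
at once, block 0 in the low 128-bit lane and block 1 in the high lane. That
is why the first block is written with `_mm256_castsi256_si128` (low lane)
and the second with `_mm256_extractf128_si256` (high lane). The sketch below
illustrates a single row store under those assumptions; `store_row` is a
hypothetical helper written for this note, not the patch's code.

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

/// Hypothetical example: store one 256-bit state row into two output blocks.
/// The low 128-bit lane belongs to block 0, the high lane to block 1.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[allow(clippy::cast_ptr_alignment)] // storeu supports unaligned stores
unsafe fn store_row(row: __m256i, block0_chunk: &mut [u8], block1_chunk: &mut [u8]) {
    debug_assert!(block0_chunk.len() >= 16 && block1_chunk.len() >= 16);

    // Low lane -> 16-byte chunk of the first block
    _mm_storeu_si128(
        block0_chunk.as_mut_ptr() as *mut __m128i,
        _mm256_castsi256_si128(row),
    );

    // High lane -> 16-byte chunk of the second block
    _mm_storeu_si128(
        block1_chunk.as_mut_ptr() as *mut __m128i,
        _mm256_extractf128_si256(row, 1),
    );
}

#[cfg(target_arch = "x86_64")]
fn main() {
    if is_x86_feature_detected!("avx2") {
        let (mut b0, mut b1) = ([0u8; 16], [0u8; 16]);
        // A zeroed row writes zeros to both lanes' destinations.
        unsafe { store_row(_mm256_setzero_si256(), &mut b0, &mut b1) };
        assert_eq!(b0, b1);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}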