From 50b9a447410860af8d6db9a208c3576886955874 Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Wed, 7 Dec 2022 09:47:45 +0000
Subject: [PATCH] fill_via_chunks: mutate src on BE (small optimisation) (#1182)

* fill_via_chunks: mutate src on BE (small optimisation)

* Add doc to fill_via_chunks
---
 rand_core/src/block.rs |  4 ++--
 rand_core/src/impls.rs | 77 ++++++++++++++++++++++++------------------
 2 files changed, 46 insertions(+), 35 deletions(-)

diff --git a/rand_core/src/block.rs b/rand_core/src/block.rs
index 5a986895484..f813784ff2f 100644
--- a/rand_core/src/block.rs
+++ b/rand_core/src/block.rs
@@ -223,7 +223,7 @@ impl<R: BlockRngCore<Item = u32>> RngCore for BlockRng<R> {
                 self.generate_and_set(0);
             }
             let (consumed_u32, filled_u8) =
-                fill_via_u32_chunks(&self.results.as_ref()[self.index..], &mut dest[read_len..]);
+                fill_via_u32_chunks(&mut self.results.as_mut()[self.index..], &mut dest[read_len..]);
 
             self.index += consumed_u32;
             read_len += filled_u8;
@@ -387,7 +387,7 @@ impl<R: BlockRngCore<Item = u64>> RngCore for BlockRng64<R> {
             }
 
             let (consumed_u64, filled_u8) = fill_via_u64_chunks(
-                &self.results.as_ref()[self.index..],
+                &mut self.results.as_mut()[self.index..],
                 &mut dest[read_len..],
             );
 
diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index 4b7688c5c80..8f99ef813a5 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -53,16 +53,14 @@ pub fn fill_bytes_via_next<R: RngCore + ?Sized>(rng: &mut R, dest: &mut [u8]) {
 }
 
 trait Observable: Copy {
-    type Bytes: AsRef<[u8]>;
-    fn to_le_bytes(self) -> Self::Bytes;
+    fn to_le(self) -> Self;
 
     // Contract: observing self is memory-safe (implies no uninitialised padding)
     fn as_byte_slice(x: &[Self]) -> &[u8];
 }
 impl Observable for u32 {
-    type Bytes = [u8; 4];
-    fn to_le_bytes(self) -> Self::Bytes {
-        self.to_le_bytes()
+    fn to_le(self) -> Self {
+        self.to_le()
     }
     fn as_byte_slice(x: &[Self]) -> &[u8] {
         let ptr = x.as_ptr() as *const u8;
@@ -71,9 +69,8 @@ impl Observable for u32 {
     }
 }
 impl Observable for u64 {
-    type Bytes = [u8; 8];
-    fn to_le_bytes(self) -> Self::Bytes {
-        self.to_le_bytes()
+    fn to_le(self) -> Self {
+        self.to_le()
     }
     fn as_byte_slice(x: &[Self]) -> &[u8] {
         let ptr = x.as_ptr() as *const u8;
@@ -82,28 +79,27 @@
     }
 }
 
-fn fill_via_chunks<T: Observable>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
+/// Fill dest from src
+///
+/// Returns `(n, byte_len)`. `src[..n]` is consumed (and possibly mutated),
+/// `dest[..byte_len]` is filled. `src[n..]` and `dest[byte_len..]` are left
+/// unaltered.
+fn fill_via_chunks<T: Observable>(src: &mut [T], dest: &mut [u8]) -> (usize, usize) {
     let size = core::mem::size_of::<T>();
     let byte_len = min(src.len() * size, dest.len());
     let num_chunks = (byte_len + size - 1) / size;
 
-    if cfg!(target_endian = "little") {
-        // On LE we can do a simple copy, which is 25-50% faster:
-        dest[..byte_len].copy_from_slice(&T::as_byte_slice(&src[..num_chunks])[..byte_len]);
-    } else {
-        // This code is valid on all arches, but slower than the above:
-        let mut i = 0;
-        let mut iter = dest[..byte_len].chunks_exact_mut(size);
-        for chunk in &mut iter {
-            chunk.copy_from_slice(src[i].to_le_bytes().as_ref());
-            i += 1;
-        }
-        let chunk = iter.into_remainder();
-        if !chunk.is_empty() {
-            chunk.copy_from_slice(&src[i].to_le_bytes().as_ref()[..chunk.len()]);
+    // Byte-swap for portability of results. This must happen before copying
+    // since the size of dest is not guaranteed to be a multiple of T or to be
+    // sufficiently aligned.
+    if cfg!(target_endian = "big") {
+        for x in &mut src[..num_chunks] {
+            *x = x.to_le();
         }
     }
 
+    dest[..byte_len].copy_from_slice(&T::as_byte_slice(&src[..num_chunks])[..byte_len]);
+
     (num_chunks, byte_len)
 }
 
@@ -112,6 +108,9 @@ fn fill_via_chunks<T: Observable>(src: &[T], dest: &mut [u8]) -> (usize, usize)
 ///
 /// The return values are `(consumed_u32, filled_u8)`.
 ///
+/// On big-endian systems, endianness of `src[..consumed_u32]` values is
+/// swapped. No other adjustments to `src` are made.
+///
 /// `filled_u8` is the number of filled bytes in `dest`, which may be less than
 /// the length of `dest`.
 /// `consumed_u32` is the number of words consumed from `src`, which is the same
@@ -137,7 +136,7 @@ fn fill_via_chunks<T: Observable>(src: &[T], dest: &mut [u8]) -> (usize, usize)
 /// }
 /// }
 /// ```
-pub fn fill_via_u32_chunks(src: &[u32], dest: &mut [u8]) -> (usize, usize) {
+pub fn fill_via_u32_chunks(src: &mut [u32], dest: &mut [u8]) -> (usize, usize) {
     fill_via_chunks(src, dest)
 }
 
@@ -145,13 +144,17 @@ pub fn fill_via_u32_chunks(src: &[u32], dest: &mut [u8]) -> (usize, usize) {
 /// based RNG.
 ///
 /// The return values are `(consumed_u64, filled_u8)`.
+///
+/// On big-endian systems, endianness of `src[..consumed_u64]` values is
+/// swapped. No other adjustments to `src` are made.
+///
 /// `filled_u8` is the number of filled bytes in `dest`, which may be less than
 /// the length of `dest`.
 /// `consumed_u64` is the number of words consumed from `src`, which is the same
 /// as `filled_u8 / 8` rounded up.
 ///
 /// See `fill_via_u32_chunks` for an example.
-pub fn fill_via_u64_chunks(src: &[u64], dest: &mut [u8]) -> (usize, usize) {
+pub fn fill_via_u64_chunks(src: &mut [u64], dest: &mut [u8]) -> (usize, usize) {
     fill_via_chunks(src, dest)
 }
 
@@ -175,33 +178,41 @@ mod test {
 
     #[test]
     fn test_fill_via_u32_chunks() {
-        let src = [1, 2, 3];
+        let src_orig = [1, 2, 3];
+
+        let mut src = src_orig;
         let mut dst = [0u8; 11];
-        assert_eq!(fill_via_u32_chunks(&src, &mut dst), (3, 11));
+        assert_eq!(fill_via_u32_chunks(&mut src, &mut dst), (3, 11));
         assert_eq!(dst, [1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0]);
 
+        let mut src = src_orig;
         let mut dst = [0u8; 13];
-        assert_eq!(fill_via_u32_chunks(&src, &mut dst), (3, 12));
+        assert_eq!(fill_via_u32_chunks(&mut src, &mut dst), (3, 12));
         assert_eq!(dst, [1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0]);
 
+        let mut src = src_orig;
         let mut dst = [0u8; 5];
-        assert_eq!(fill_via_u32_chunks(&src, &mut dst), (2, 5));
+        assert_eq!(fill_via_u32_chunks(&mut src, &mut dst), (2, 5));
         assert_eq!(dst, [1, 0, 0, 0, 2]);
     }
 
     #[test]
     fn test_fill_via_u64_chunks() {
-        let src = [1, 2];
+        let src_orig = [1, 2];
+
+        let mut src = src_orig;
         let mut dst = [0u8; 11];
-        assert_eq!(fill_via_u64_chunks(&src, &mut dst), (2, 11));
+        assert_eq!(fill_via_u64_chunks(&mut src, &mut dst), (2, 11));
         assert_eq!(dst, [1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0]);
 
+        let mut src = src_orig;
         let mut dst = [0u8; 17];
-        assert_eq!(fill_via_u64_chunks(&src, &mut dst), (2, 16));
+        assert_eq!(fill_via_u64_chunks(&mut src, &mut dst), (2, 16));
        assert_eq!(dst, [1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0]);
 
+        let mut src = src_orig;
         let mut dst = [0u8; 5];
-        assert_eq!(fill_via_u64_chunks(&src, &mut dst), (1, 5));
+        assert_eq!(fill_via_u64_chunks(&mut src, &mut dst), (1, 5));
         assert_eq!(dst, [1, 0, 0, 0, 0]);
     }
 }
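
For reference, below is a minimal standalone sketch of the byte-swap-then-copy approach the patch adopts, specialised to `u32`. It is not part of the patch: the helper name `fill_u32_chunks` and the `main` harness are illustrative only, and it uses a safe per-word copy instead of the `unsafe` `Observable::as_byte_slice` memcpy. On little-endian targets `to_le()` is a no-op, so the function degenerates to a plain copy; on big-endian targets the consumed words are swapped in place first, which is why the new `fill_via_chunks` needs `&mut` access to `src`.

```
// Standalone sketch, not part of the patch: byte-swap consumed words in
// place, then copy little-endian bytes into `dest`.
fn fill_u32_chunks(src: &mut [u32], dest: &mut [u8]) -> (usize, usize) {
    let size = core::mem::size_of::<u32>();
    let byte_len = core::cmp::min(src.len() * size, dest.len());
    let num_chunks = (byte_len + size - 1) / size;

    // On BE targets, swap so that each consumed word's native byte order
    // equals its little-endian representation; no-op on LE targets.
    if cfg!(target_endian = "big") {
        for x in &mut src[..num_chunks] {
            *x = x.to_le();
        }
    }

    // Copy word by word; the last chunk of `dest` may be shorter than 4 bytes.
    for (chunk, &word) in dest[..byte_len].chunks_mut(size).zip(src[..num_chunks].iter()) {
        let bytes = word.to_ne_bytes(); // native order == LE order after the swap above
        chunk.copy_from_slice(&bytes[..chunk.len()]);
    }

    (num_chunks, byte_len)
}

fn main() {
    let mut src = [1u32, 2, 3];
    let mut dest = [0u8; 10];
    assert_eq!(fill_u32_chunks(&mut src, &mut dest), (3, 10));
    assert_eq!(dest, [1, 0, 0, 0, 2, 0, 0, 0, 3, 0]);
}
```

In the real `impls.rs` the copy is a single `copy_from_slice` over `as_byte_slice`, which has no per-word hook to reorder bytes; that is why the swap must happen before the copy, as the comment added by the patch explains.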