From cd7fade0a9c1c8762d2fba7c65c1b82e8d369711 Mon Sep 17 00:00:00 2001
From: Ruud van Asseldonk
Date: Wed, 2 Nov 2016 22:49:27 +0100
Subject: [PATCH 1/3] Add small-copy optimization for io::Cursor

During benchmarking, I found that one of my programs spent between 5
and 10 percent of the time doing memmoves. Ultimately I tracked these
down to single-byte slices being copied with a memcopy in
io::Cursor::read(). Doing a manual copy if only one byte is requested
can speed things up significantly. For my program, this reduced the
running time by 20%.

Why special-case only a single byte, and not a "small" slice in
general? I tried doing this for slices of at most 64 bytes and of at
most 8 bytes. In both cases my test program was significantly slower.
---
 src/libstd/io/cursor.rs | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/libstd/io/cursor.rs b/src/libstd/io/cursor.rs
index 1b5023380a783..9b50168a954b7 100644
--- a/src/libstd/io/cursor.rs
+++ b/src/libstd/io/cursor.rs
@@ -219,9 +219,21 @@ impl<T> io::Seek for Cursor<T> where T: AsRef<[u8]> {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl<T> Read for Cursor<T> where T: AsRef<[u8]> {
     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        let n = Read::read(&mut self.fill_buf()?, buf)?;
-        self.pos += n as u64;
-        Ok(n)
+        // First check if the amount of bytes we want to read is small: the read
+        // in the else branch will end up calling `<&[u8] as Read>::read()`,
+        // which will copy the buffer using a memcopy. If we only want to read a
+        // single byte, then the overhead of the function call is significant.
+        let num_read = {
+            let mut inner_buf = self.fill_buf()?;
+            if buf.len() == 1 && inner_buf.len() > 0 {
+                buf[0] = inner_buf[0];
+                1
+            } else {
+                Read::read(&mut inner_buf, buf)?
+            }
+        };
+        self.pos += num_read as u64;
+        Ok(num_read)
     }
 }
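Aside, for context (not part of the patch series): the calling pattern that
motivated patch 1 is byte-at-a-time reading through the Read trait. A minimal,
self-contained sketch of such a caller follows; the function name sum_bytes is
hypothetical, not taken from the program mentioned above. Before the patch,
every iteration of the loop paid for a length-1 memcpy inside read.

    use std::io::{Cursor, Read};

    fn sum_bytes(data: &[u8]) -> u64 {
        let mut cursor = Cursor::new(data);
        let mut byte = [0u8; 1];
        let mut sum: u64 = 0;
        // `read` returns Ok(1) while bytes remain and Ok(0) once the
        // cursor is exhausted, which ends the loop.
        while cursor.read(&mut byte).unwrap() == 1 {
            sum += byte[0] as u64;
        }
        sum
    }
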
From 341805288e8a055162bef64055a7962ecffbf103 Mon Sep 17 00:00:00 2001
From: Ruud van Asseldonk
Date: Fri, 4 Nov 2016 00:20:11 +0100
Subject: [PATCH 2/3] Move small-copy optimization into copy_from_slice

Ultimately it is copy_from_slice that is the bottleneck, not
io::Cursor::read. It might be worthwhile to move the check here, so
more places can benefit from it.
---
 src/libcore/slice.rs    | 16 +++++++++++++---
 src/libstd/io/cursor.rs | 18 +++---------------
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/src/libcore/slice.rs b/src/libcore/slice.rs
index a4a90e7a9da7a..b238623eabaa7 100644
--- a/src/libcore/slice.rs
+++ b/src/libcore/slice.rs
@@ -515,9 +515,19 @@ impl<T> SliceExt for [T] {
     fn copy_from_slice(&mut self, src: &[T]) where T: Copy {
         assert!(self.len() == src.len(),
                 "destination and source slices have different lengths");
-        unsafe {
-            ptr::copy_nonoverlapping(
-                src.as_ptr(), self.as_mut_ptr(), self.len());
+        // First check if the amount of elements we want to copy is small:
+        // `copy_nonoverlapping` will do a memcopy, which involves an indirect
+        // function call when `memcpy` is in the dynamically-linked libc. For
+        // small elements (such as a single byte or pointer), the overhead is
+        // significant. If the element is big then the assignment is a memcopy
+        // anyway.
+        if self.len() == 1 {
+            self[0] = src[0];
+        } else {
+            unsafe {
+                ptr::copy_nonoverlapping(
+                    src.as_ptr(), self.as_mut_ptr(), self.len());
+            }
         }
     }

diff --git a/src/libstd/io/cursor.rs b/src/libstd/io/cursor.rs
index 9b50168a954b7..1b5023380a783 100644
--- a/src/libstd/io/cursor.rs
+++ b/src/libstd/io/cursor.rs
@@ -219,21 +219,9 @@ impl<T> io::Seek for Cursor<T> where T: AsRef<[u8]> {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl<T> Read for Cursor<T> where T: AsRef<[u8]> {
     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        // First check if the amount of bytes we want to read is small: the read
-        // in the else branch will end up calling `<&[u8] as Read>::read()`,
-        // which will copy the buffer using a memcopy. If we only want to read a
-        // single byte, then the overhead of the function call is significant.
-        let num_read = {
-            let mut inner_buf = self.fill_buf()?;
-            if buf.len() == 1 && inner_buf.len() > 0 {
-                buf[0] = inner_buf[0];
-                1
-            } else {
-                Read::read(&mut inner_buf, buf)?
-            }
-        };
-        self.pos += num_read as u64;
-        Ok(num_read)
+        let n = Read::read(&mut self.fill_buf()?, buf)?;
+        self.pos += n as u64;
+        Ok(n)
     }
 }
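Aside (not part of the patch series): the dispatch that patch 2 adds to
copy_from_slice can also be written as a free function, which makes the shape
of the optimization easier to see in isolation. A minimal sketch under that
assumption; the name small_copy is hypothetical.

    use std::ptr;

    fn small_copy<T: Copy>(dst: &mut [T], src: &[T]) {
        assert!(dst.len() == src.len(),
                "destination and source slices have different lengths");
        if dst.len() == 1 {
            // A single assignment compiles to a plain load and store,
            // avoiding the (possibly indirect) call into libc's memcpy.
            dst[0] = src[0];
        } else {
            unsafe {
                // Lengths were asserted equal above, and the distinct
                // &mut/& borrows guarantee the ranges do not overlap.
                ptr::copy_nonoverlapping(
                    src.as_ptr(), dst.as_mut_ptr(), dst.len());
            }
        }
    }
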
From 3be2c3b3092e934bdc2db67d5bdcabd611deca9c Mon Sep 17 00:00:00 2001
From: Ruud van Asseldonk
Date: Sat, 12 Nov 2016 15:58:58 +0100
Subject: [PATCH 3/3] Move small-copy optimization into <&[u8] as Read>

Based on the discussion in https://github.com/rust-lang/rust/pull/37573,
it is likely better to keep this limited to std::io, instead of
modifying a function which users expect to be a memcpy.
---
 src/libcore/slice.rs   | 16 +++-------------
 src/libstd/io/impls.rs | 22 ++++++++++++++++++++--
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/src/libcore/slice.rs b/src/libcore/slice.rs
index b238623eabaa7..a4a90e7a9da7a 100644
--- a/src/libcore/slice.rs
+++ b/src/libcore/slice.rs
@@ -515,19 +515,9 @@ impl<T> SliceExt for [T] {
     fn copy_from_slice(&mut self, src: &[T]) where T: Copy {
         assert!(self.len() == src.len(),
                 "destination and source slices have different lengths");
-        // First check if the amount of elements we want to copy is small:
-        // `copy_nonoverlapping` will do a memcopy, which involves an indirect
-        // function call when `memcpy` is in the dynamically-linked libc. For
-        // small elements (such as a single byte or pointer), the overhead is
-        // significant. If the element is big then the assignment is a memcopy
-        // anyway.
-        if self.len() == 1 {
-            self[0] = src[0];
-        } else {
-            unsafe {
-                ptr::copy_nonoverlapping(
-                    src.as_ptr(), self.as_mut_ptr(), self.len());
-            }
+        unsafe {
+            ptr::copy_nonoverlapping(
+                src.as_ptr(), self.as_mut_ptr(), self.len());
         }
     }

diff --git a/src/libstd/io/impls.rs b/src/libstd/io/impls.rs
index 6b26c016638a7..f691289811bc6 100644
--- a/src/libstd/io/impls.rs
+++ b/src/libstd/io/impls.rs
@@ -157,7 +157,16 @@ impl<'a> Read for &'a [u8] {
     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
         let amt = cmp::min(buf.len(), self.len());
         let (a, b) = self.split_at(amt);
-        buf[..amt].copy_from_slice(a);
+
+        // First check if the amount of bytes we want to read is small:
+        // `copy_from_slice` will generally expand to a call to `memcpy`, and
+        // for a single byte the overhead is significant.
+        if amt == 1 {
+            buf[0] = a[0];
+        } else {
+            buf[..amt].copy_from_slice(a);
+        }
+
         *self = b;
         Ok(amt)
     }
@@ -169,7 +178,16 @@ impl<'a> Read for &'a [u8] {
         if buf.len() > self.len() {
             return Err(Error::new(ErrorKind::UnexpectedEof,
                                   "failed to fill whole buffer"));
         }
         let (a, b) = self.split_at(buf.len());
-        buf.copy_from_slice(a);
+
+        // First check if the amount of bytes we want to read is small:
+        // `copy_from_slice` will generally expand to a call to `memcpy`, and
+        // for a single byte the overhead is significant.
+        if buf.len() == 1 {
+            buf[0] = a[0];
+        } else {
+            buf.copy_from_slice(a);
+        }
+
         *self = b;
         Ok(())
     }
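
Aside (not part of the patch series): after patch 3, single-byte reads from a
&[u8] take the fast path in both read and read_exact. A minimal usage sketch
that decodes a little-endian u32 one byte at a time; the name read_u32_le is
hypothetical, chosen only to illustrate the calling pattern.

    use std::io::Read;

    fn read_u32_le(reader: &mut &[u8]) -> std::io::Result<u32> {
        let mut byte = [0u8; 1];
        let mut value = 0u32;
        for i in 0..4 {
            // Each call copies exactly one byte, hitting the `buf[0] = a[0]`
            // branch added in patch 3 instead of calling memcpy.
            reader.read_exact(&mut byte)?;
            value |= (byte[0] as u32) << (8 * i);
        }
        Ok(value)
    }

    // For example, read_u32_le(&mut &[0x78, 0x56, 0x34, 0x12][..])
    // returns Ok(0x12345678).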