From cd7fade0a9c1c8762d2fba7c65c1b82e8d369711 Mon Sep 17 00:00:00 2001
From: Ruud van Asseldonk
Date: Wed, 2 Nov 2016 22:49:27 +0100
Subject: [PATCH 1/3] Add small-copy optimization for io::Cursor

During benchmarking, I found that one of my programs spent between 5
and 10 percent of the time doing memmoves. Ultimately I tracked these
down to single-byte slices being copied with a memcopy in
io::Cursor::read(). Doing a manual copy if only one byte is requested
can speed things up significantly. For my program, this reduced the
running time by 20%.

Why special-case only a single byte, and not a "small" slice in
general? I tried doing this for slices of at most 64 bytes and of at
most 8 bytes. In both cases my test program was significantly slower.
---
 src/libstd/io/cursor.rs | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/libstd/io/cursor.rs b/src/libstd/io/cursor.rs
index 1b5023380a783..9b50168a954b7 100644
--- a/src/libstd/io/cursor.rs
+++ b/src/libstd/io/cursor.rs
@@ -219,9 +219,21 @@ impl<T> io::Seek for Cursor<T> where T: AsRef<[u8]> {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl<T> Read for Cursor<T> where T: AsRef<[u8]> {
     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        let n = Read::read(&mut self.fill_buf()?, buf)?;
-        self.pos += n as u64;
-        Ok(n)
+        // First check if the amount of bytes we want to read is small: the read
+        // in the else branch will end up calling `<&[u8] as Read>::read()`,
+        // which will copy the buffer using a memcopy. If we only want to read a
+        // single byte, then the overhead of the function call is significant.
+        let num_read = {
+            let mut inner_buf = self.fill_buf()?;
+            if buf.len() == 1 && inner_buf.len() > 0 {
+                buf[0] = inner_buf[0];
+                1
+            } else {
+                Read::read(&mut inner_buf, buf)?
+            }
+        };
+        self.pos += num_read as u64;
+        Ok(num_read)
     }
 }
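Aside, for context (not part of the patch series): the calling pattern that
motivated patch 1 is byte-at-a-time reading through the Read trait. A minimal,
self-contained sketch of such a caller follows; the function name sum_bytes is
hypothetical, not taken from the program mentioned above. Before the patch,
every iteration of the loop paid for a length-1 memcpy inside read.

    use std::io::{Cursor, Read};

    fn sum_bytes(data: &[u8]) -> u64 {
        let mut cursor = Cursor::new(data);
        let mut byte = [0u8; 1];
        let mut sum: u64 = 0;
        // `read` returns Ok(1) while bytes remain and Ok(0) once the
        // cursor is exhausted, which ends the loop.
        while cursor.read(&mut byte).unwrap() == 1 {
            sum += byte[0] as u64;
        }
        sum
    }
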
From 341805288e8a055162bef64055a7962ecffbf103 Mon Sep 17 00:00:00 2001
From: Ruud van Asseldonk
Date: Fri, 4 Nov 2016 00:20:11 +0100
Subject: [PATCH 2/3] Move small-copy optimization into copy_from_slice

Ultimately it is copy_from_slice that is the bottleneck, not
io::Cursor::read. It might be worthwhile to move the check here, so
more places can benefit from it.
---
 src/libcore/slice.rs    | 16 +++++++++++++---
 src/libstd/io/cursor.rs | 18 +++---------------
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/src/libcore/slice.rs b/src/libcore/slice.rs
index a4a90e7a9da7a..b238623eabaa7 100644
--- a/src/libcore/slice.rs
+++ b/src/libcore/slice.rs
@@ -515,9 +515,19 @@ impl<T> SliceExt for [T] {
     fn copy_from_slice(&mut self, src: &[T]) where T: Copy {
         assert!(self.len() == src.len(),
                 "destination and source slices have different lengths");
-        unsafe {
-            ptr::copy_nonoverlapping(
-                src.as_ptr(), self.as_mut_ptr(), self.len());
+        // First check if the amount of elements we want to copy is small:
+        // `copy_nonoverlapping` will do a memcopy, which involves an indirect
+        // function call when `memcpy` is in the dynamically-linked libc. For
+        // small elements (such as a single byte or pointer), the overhead is
+        // significant. If the element is big then the assignment is a memcopy
+        // anyway.
+        if self.len() == 1 {
+            self[0] = src[0];
+        } else {
+            unsafe {
+                ptr::copy_nonoverlapping(
+                    src.as_ptr(), self.as_mut_ptr(), self.len());
+            }
         }
     }

diff --git a/src/libstd/io/cursor.rs b/src/libstd/io/cursor.rs
index 9b50168a954b7..1b5023380a783 100644
--- a/src/libstd/io/cursor.rs
+++ b/src/libstd/io/cursor.rs
@@ -219,21 +219,9 @@ impl<T> io::Seek for Cursor<T> where T: AsRef<[u8]> {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl<T> Read for Cursor<T> where T: AsRef<[u8]> {
     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        // First check if the amount of bytes we want to read is small: the read
-        // in the else branch will end up calling `<&[u8] as Read>::read()`,
-        // which will copy the buffer using a memcopy. If we only want to read a
-        // single byte, then the overhead of the function call is significant.
-        let num_read = {
-            let mut inner_buf = self.fill_buf()?;
-            if buf.len() == 1 && inner_buf.len() > 0 {
-                buf[0] = inner_buf[0];
-                1
-            } else {
-                Read::read(&mut inner_buf, buf)?
-            }
-        };
-        self.pos += num_read as u64;
-        Ok(num_read)
+        let n = Read::read(&mut self.fill_buf()?, buf)?;
+        self.pos += n as u64;
+        Ok(n)
     }
 }
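Aside (not part of the patch series): the dispatch that patch 2 adds to
copy_from_slice can also be written as a free function, which makes the shape
of the optimization easier to see in isolation. A minimal sketch under that
assumption; the name small_copy is hypothetical.

    use std::ptr;

    fn small_copy<T: Copy>(dst: &mut [T], src: &[T]) {
        assert!(dst.len() == src.len(),
                "destination and source slices have different lengths");
        if dst.len() == 1 {
            // A single assignment compiles to a plain load and store,
            // avoiding the (possibly indirect) call into libc's memcpy.
            dst[0] = src[0];
        } else {
            unsafe {
                // Lengths were asserted equal above, and the distinct
                // &mut/& borrows guarantee the ranges do not overlap.
                ptr::copy_nonoverlapping(
                    src.as_ptr(), dst.as_mut_ptr(), dst.len());
            }
        }
    }
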
From 3be2c3b3092e934bdc2db67d5bdcabd611deca9c Mon Sep 17 00:00:00 2001
From: Ruud van Asseldonk
Date: Sat, 12 Nov 2016 15:58:58 +0100
Subject: [PATCH 3/3] Move small-copy optimization into <&[u8] as Read>

Based on the discussion in https://github.com/rust-lang/rust/pull/37573,
it is likely better to keep this limited to std::io, instead of
modifying a function which users expect to be a memcpy.
---
 src/libcore/slice.rs   | 16 +++-------------
 src/libstd/io/impls.rs | 22 ++++++++++++++++++++--
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/src/libcore/slice.rs b/src/libcore/slice.rs
index b238623eabaa7..a4a90e7a9da7a 100644
--- a/src/libcore/slice.rs
+++ b/src/libcore/slice.rs
@@ -515,19 +515,9 @@ impl<T> SliceExt for [T] {
     fn copy_from_slice(&mut self, src: &[T]) where T: Copy {
         assert!(self.len() == src.len(),
                 "destination and source slices have different lengths");
-        // First check if the amount of elements we want to copy is small:
-        // `copy_nonoverlapping` will do a memcopy, which involves an indirect
-        // function call when `memcpy` is in the dynamically-linked libc. For
-        // small elements (such as a single byte or pointer), the overhead is
-        // significant. If the element is big then the assignment is a memcopy
-        // anyway.
-        if self.len() == 1 {
-            self[0] = src[0];
-        } else {
-            unsafe {
-                ptr::copy_nonoverlapping(
-                    src.as_ptr(), self.as_mut_ptr(), self.len());
-            }
+        unsafe {
+            ptr::copy_nonoverlapping(
+                src.as_ptr(), self.as_mut_ptr(), self.len());
         }
     }

diff --git a/src/libstd/io/impls.rs b/src/libstd/io/impls.rs
index 6b26c016638a7..f691289811bc6 100644
--- a/src/libstd/io/impls.rs
+++ b/src/libstd/io/impls.rs
@@ -157,7 +157,16 @@ impl<'a> Read for &'a [u8] {
     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
         let amt = cmp::min(buf.len(), self.len());
         let (a, b) = self.split_at(amt);
-        buf[..amt].copy_from_slice(a);
+
+        // First check if the amount of bytes we want to read is small:
+        // `copy_from_slice` will generally expand to a call to `memcpy`, and
+        // for a single byte the overhead is significant.
+        if amt == 1 {
+            buf[0] = a[0];
+        } else {
+            buf[..amt].copy_from_slice(a);
+        }
+
         *self = b;
         Ok(amt)
     }
@@ -169,7 +178,16 @@ impl<'a> Read for &'a [u8] {
         if buf.len() > self.len() {
             return Err(Error::new(ErrorKind::UnexpectedEof,
                                   "failed to fill whole buffer"));
         }
         let (a, b) = self.split_at(buf.len());
-        buf.copy_from_slice(a);
+
+        // First check if the amount of bytes we want to read is small:
+        // `copy_from_slice` will generally expand to a call to `memcpy`, and
+        // for a single byte the overhead is significant.
+        if buf.len() == 1 {
+            buf[0] = a[0];
+        } else {
+            buf.copy_from_slice(a);
+        }
+
         *self = b;
         Ok(())
     }
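
Aside (not part of the patch series): after patch 3, single-byte reads from a
&[u8] take the fast path in both read and read_exact. A minimal usage sketch
that decodes a little-endian u32 one byte at a time; the name read_u32_le is
hypothetical, chosen only to illustrate the calling pattern.

    use std::io::Read;

    fn read_u32_le(reader: &mut &[u8]) -> std::io::Result<u32> {
        let mut byte = [0u8; 1];
        let mut value = 0u32;
        for i in 0..4 {
            // Each call copies exactly one byte, hitting the `buf[0] = a[0]`
            // branch added in patch 3 instead of calling memcpy.
            reader.read_exact(&mut byte)?;
            value |= (byte[0] as u32) << (8 * i);
        }
        Ok(value)
    }

    // For example, read_u32_le(&mut &[0x78, 0x56, 0x34, 0x12][..])
    // returns Ok(0x12345678).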