Auto merge of #40454 - djzin:fast-swap, r=sfackler
speed up mem::swap

I would have thought that the `mem::swap` code didn't need an intermediate variable of type `T`, precisely because the two `&mut` pointers are guaranteed never to alias. And indeed it doesn't! LLVM will even auto-vectorize this case for large structs, but alas it doesn't seem to have all the aliasing information it needs, so it adds redundant checks (and doesn't bother auto-vectorizing at all for small types). It looks like a lot of performance could still be gained here, so this might be a good test case for future optimizer improvements.
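To illustrate the idea, here is a standalone sketch (not the code from this patch; `swap_blockwise` is a made-up name) of a swap that exchanges bytes through a small fixed buffer instead of a full `T`-sized temporary, relying on the guarantee that two `&mut T` never alias:

```rust
// Standalone sketch (not the patch itself): swap two values through a small
// fixed buffer rather than a full `T`-sized temporary. This is sound because
// two `&mut T` are guaranteed not to alias.
use std::ptr;

fn swap_blockwise<T>(x: &mut T, y: &mut T) {
    const BUF: usize = 32; // illustrative block size, not tuned
    let len = std::mem::size_of::<T>();
    let x = x as *mut T as *mut u8;
    let y = y as *mut T as *mut u8;
    let mut buf = [0u8; BUF];
    let mut i = 0;
    while i < len {
        let n = std::cmp::min(BUF, len - i);
        unsafe {
            // Exchange the next `n` bytes via the scratch buffer.
            ptr::copy_nonoverlapping(x.add(i), buf.as_mut_ptr(), n);
            ptr::copy_nonoverlapping(y.add(i), x.add(i), n);
            ptr::copy_nonoverlapping(buf.as_ptr(), y.add(i), n);
        }
        i += n;
    }
}

fn main() {
    let mut a = [1u8; 100];
    let mut b = [2u8; 100];
    swap_blockwise(&mut a, &mut b);
    assert!(a.iter().all(|&v| v == 2) && b.iter().all(|&v| v == 1));
}
```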

Here are the current benchmarks for the SIMD version of `mem::swap`; the timings are in cycles, measured with 10 iterations (benchmark code below). For sizes > 32 that are not a multiple of 8, the timings tend to be ever so slightly faster with the old code, but not always. For large struct sizes (> 1024) the new code shows a marked improvement.

\* = latest commit
† = subtracted from other measurements

| arr_length | noop<sup>†</sup> | rust_stdlib | simd_u64x4\* | simd_u64x8 |
|---|---|---|---|---|
| 8 | 80 | 90 | 90 | 90 |
| 16 | 72 | 177 | 177 | 177 |
| 24 | 32 | 76 | 76 | 76 |
| 32 | 68 | 188 | 112 | 188 |
| 40 | 32 | 80 | 60 | 80 |
| 48 | 32 | 84 | 56 | 84 |
| 56 | 32 | 108 | 72 | 108 |
| 64 | 32 | 108 | 72 | 76 |
| 72 | 80 | 350 | 220 | 230 |
| 80 | 80 | 350 | 220 | 230 |
| 88 | 80 | 420 | 270 | 270 |
| 96 | 80 | 420 | 270 | 270 |
| 104 | 80 | 500 | 320 | 320 |
| 112 | 80 | 490 | 320 | 320 |
| 120 | 72 | 528 | 342 | 342 |
| 128 | 48 | 360 | 234 | 234 |
| 136 | 72 | 987 | 387 | 387 |
| 144 | 80 | 1070 | 420 | 420 |
| 152 | 64 | 856 | 376 | 376 |
| 160 | 68 | 804 | 400 | 400 |
| 168 | 80 | 1060 | 520 | 520 |
| 176 | 80 | 1070 | 520 | 520 |
| 184 | 32 | 464 | 228 | 228 |
| 192 | 32 | 504 | 228 | 228 |
| 200 | 32 | 440 | 248 | 248 |
| 208 | 72 | 987 | 573 | 573 |
| 216 | 80 | 1464 | 220 | 220 |
| 224 | 48 | 852 | 450 | 450 |
| 232 | 72 | 1182 | 666 | 666 |
| 240 | 32 | 428 | 288 | 288 |
| 248 | 32 | 428 | 308 | 308 |
| 256 | 80 | 860 | 770 | 770 |
| 264 | 80 | 1130 | 820 | 820 |
| 272 | 80 | 1340 | 820 | 820 |
| 280 | 80 | 1220 | 870 | 870 |
| 288 | 72 | 1227 | 804 | 804 |
| 296 | 72 | 1356 | 849 | 849 |
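
The benchmark code itself is not reproduced in this excerpt. Purely as an illustration of the setup described above (cycle counts over 10 iterations), a minimal harness might look roughly like this; the structure and names are assumptions, not the code used for the table, and `arr_length` is assumed to be a size in bytes:

```rust
// Hypothetical harness (the benchmark code from the PR is not shown here).
// Assumes an x86_64 target so cycles can be read with `_rdtsc`; on other
// targets this sketch just returns 0.
#[cfg(target_arch = "x86_64")]
fn cycles() -> u64 {
    unsafe { std::arch::x86_64::_rdtsc() }
}
#[cfg(not(target_arch = "x86_64"))]
fn cycles() -> u64 {
    0
}

// Measure 10 swaps of two N-byte arrays (no noop subtraction here).
fn bench_swap<const N: usize>() -> u64 {
    let mut a = [0u8; N];
    let mut b = [1u8; N];
    let start = cycles();
    for _ in 0..10 {
        std::mem::swap(std::hint::black_box(&mut a), std::hint::black_box(&mut b));
    }
    cycles() - start
}

fn main() {
    println!("96 bytes: {} cycles", bench_swap::<96>());
}
```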
bors committed Jun 11, 2017
2 parents 07a2dd4 + 83f1f11 commit 27650ee
Showing 1 changed file with 54 additions and 13 deletions.
67 changes: 54 additions & 13 deletions src/libcore/mem.rs
@@ -109,7 +109,7 @@ pub use intrinsics::transmute;
 /// [`Clone`][clone]. You need the value's destructor to run only once,
 /// because a double `free` is undefined behavior.
 ///
-/// An example is the definition of [`mem::swap`][swap] in this module:
+/// An example is a possible implementation of [`mem::swap`][swap]:
 ///
 /// ```
 /// use std::mem;
@@ -499,18 +499,59 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
-        // Give ourselves some scratch space to work with
-        let mut t: T = uninitialized();
-
-        // Perform the swap, `&mut` pointers never alias
-        ptr::copy_nonoverlapping(&*x, &mut t, 1);
-        ptr::copy_nonoverlapping(&*y, x, 1);
-        ptr::copy_nonoverlapping(&t, y, 1);
-
-        // y and t now point to the same thing, but we need to completely
-        // forget `t` because we do not want to run the destructor for `T`
-        // on its value, which is still owned somewhere outside this function.
-        forget(t);
+        // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
+        // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
+        // Haswell E processors. LLVM is more able to optimize if we give a struct a
+        // #[repr(simd)], even if we don't actually use this struct directly.
+        //
+        // FIXME repr(simd) broken on emscripten
+        #[cfg_attr(not(target_os = "emscripten"), repr(simd))]
+        struct Block(u64, u64, u64, u64);
+        struct UnalignedBlock(u64, u64, u64, u64);
+
+        let block_size = size_of::<Block>();
+
+        // Get raw pointers to the bytes of x & y for easier manipulation
+        let x = x as *mut T as *mut u8;
+        let y = y as *mut T as *mut u8;
+
+        // Loop through x & y, copying them `Block` at a time
+        // The optimizer should unroll the loop fully for most types
+        // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
+        let len = size_of::<T>();
+        let mut i = 0;
+        while i + block_size <= len {
+            // Create some uninitialized memory as scratch space
+            // Declaring `t` here avoids aligning the stack when this loop is unused
+            let mut t: Block = uninitialized();
+            let t = &mut t as *mut _ as *mut u8;
+            let x = x.offset(i as isize);
+            let y = y.offset(i as isize);
+
+            // Swap a block of bytes of x & y, using t as a temporary buffer
+            // This should be optimized into efficient SIMD operations where available
+            ptr::copy_nonoverlapping(x, t, block_size);
+            ptr::copy_nonoverlapping(y, x, block_size);
+            ptr::copy_nonoverlapping(t, y, block_size);
+            i += block_size;
+        }
+
+
+        if i < len {
+            // Swap any remaining bytes, using aligned types to copy
+            // where appropriate (this information is lost by conversion
+            // to *mut u8, so restore it manually here)
+            let mut t: UnalignedBlock = uninitialized();
+            let rem = len - i;
+
+            let t = &mut t as *mut _ as *mut u8;
+            let x = x.offset(i as isize);
+            let y = y.offset(i as isize);
+
+            ptr::copy_nonoverlapping(x, t, rem);
+            ptr::copy_nonoverlapping(y, x, rem);
+            ptr::copy_nonoverlapping(t, y, rem);
+        }
     }
 }

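As a quick sanity check (not part of the patch), the observable behaviour of `mem::swap` is unchanged; picking a size that is not a multiple of the 32-byte block exercises both the block loop and the remainder path:

```rust
// Not part of the patch: behaviour of `mem::swap` is unchanged, only the
// generated code differs. 41 * 8 = 328 bytes hits both the block loop and
// the remainder path.
fn main() {
    let mut a = [1u64; 41];
    let mut b = [2u64; 41];
    std::mem::swap(&mut a, &mut b);
    assert!(a.iter().all(|&v| v == 2));
    assert!(b.iter().all(|&v| v == 1));
}
```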
