From 4bcfbc36c672f0f30176a31c5a6e529bebd6fbcc Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 12 Mar 2017 13:14:47 +0000
Subject: [PATCH 01/15] speed up mem::swap

---
 src/libcore/mem.rs | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index f4ce4697d7cf4..a7c5d29c5a516 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -109,7 +109,7 @@ pub use intrinsics::transmute;
 /// [`Clone`][clone]. You need the value's destructor to run only once,
 /// because a double `free` is undefined behavior.
 ///
-/// An example is the definition of [`mem::swap`][swap] in this module:
+/// An example is the (old) definition of [`mem::swap`][swap] in this module:
 ///
 /// ```
 /// use std::mem;
@@ -447,18 +447,15 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
-        // Give ourselves some scratch space to work with
-        let mut t: T = uninitialized();
+        let x = x as *mut T as *mut u8;
+        let y = y as *mut T as *mut u8;
 
-        // Perform the swap, `&mut` pointers never alias
-        ptr::copy_nonoverlapping(&*x, &mut t, 1);
-        ptr::copy_nonoverlapping(&*y, x, 1);
-        ptr::copy_nonoverlapping(&t, y, 1);
-
-        // y and t now point to the same thing, but we need to completely
-        // forget `t` because we do not want to run the destructor for `T`
-        // on its value, which is still owned somewhere outside this function.
-        forget(t);
+        // use an xor-swap as x & y are guaranteed to never alias
+        for i in 0..size_of::<T>() as isize {
+            *x.offset(i) ^= *y.offset(i);
+            *y.offset(i) ^= *x.offset(i);
+            *x.offset(i) ^= *y.offset(i);
+        }
     }
 }

From 85049e508ba6b1502e4d074d051a188c398fabc6 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 12 Mar 2017 14:10:07 +0000
Subject: [PATCH 02/15] avoid recursion

---
 src/libcore/mem.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index a7c5d29c5a516..748d63362463a 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -450,11 +450,14 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         let x = x as *mut T as *mut u8;
         let y = y as *mut T as *mut u8;
 
-        // use an xor-swap as x & y are guaranteed to never alias
-        for i in 0..size_of::<T>() as isize {
+        // can't use a for loop as the `range` impl calls `mem::swap` recursively
+        let mut i = 0;
+        while i < size_of::<T>() as isize {
+            // use an xor-swap as x & y are guaranteed to never alias
             *x.offset(i) ^= *y.offset(i);
             *y.offset(i) ^= *x.offset(i);
             *x.offset(i) ^= *y.offset(i);
+            i += 1;
         }
     }
 }

From 5702f436aa6258119a32cbff31cc442d73b0d2c0 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 12 Mar 2017 18:32:20 +0000
Subject: [PATCH 03/15] a new approach; ditch xor cuteness and maximize cache locality

---
 src/libcore/mem.rs | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 748d63362463a..e1b9991ccfdf5 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -447,17 +447,29 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
+        // Give ourselves some scratch space to work with
+        let mut t: [u8; 16] = mem::uninitialized();
+
         let x = x as *mut T as *mut u8;
         let y = y as *mut T as *mut u8;
+        let t = &mut t as *mut _ as *mut u8;
 
         // can't use a for loop as the `range` impl calls `mem::swap` recursively
+        let len = size_of::<T>() as isize;
         let mut i = 0;
-        while i < size_of::<T>() as isize {
-            // use an xor-swap as x & y are guaranteed to never alias
-            *x.offset(i) ^= *y.offset(i);
-            *y.offset(i) ^= *x.offset(i);
-            *x.offset(i) ^= *y.offset(i);
-            i += 1;
+        while i + 16 <= len {
+            // Perform the swap 16 bytes at a time, `&mut` pointers never alias
+            ptr::copy_nonoverlapping(x.offset(i), t, 16);
+            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), 16);
+            ptr::copy_nonoverlapping(t, y.offset(i), 16);
+            i += 16;
+        }
+        if i < len {
+            // Swap any remaining bytes
+            let rem = (len - i) as usize;
+            ptr::copy_nonoverlapping(x.offset(i), t, rem);
+            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), rem);
+            ptr::copy_nonoverlapping(t, y.offset(i), rem);
         }
     }
 }

From d1fec0d87a95310fcc1c59d72953ad6be89c78a5 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 12 Mar 2017 18:41:58 +0000
Subject: [PATCH 04/15] fix typo

---
 src/libcore/mem.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index e1b9991ccfdf5..865ff3c6ee4b3 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -448,7 +448,7 @@ pub unsafe fn uninitialized<T>() -> T {
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
         // Give ourselves some scratch space to work with
-        let mut t: [u8; 16] = mem::uninitialized();
+        let mut t: [u8; 16] = uninitialized();
 
         let x = x as *mut T as *mut u8;
         let y = y as *mut T as *mut u8;

From 1daf58964216dfe1f805cdaff76a91ca90d7523e Mon Sep 17 00:00:00 2001
From: Djzin
Date: Mon, 13 Mar 2017 20:03:10 +0000
Subject: [PATCH 05/15] add SWAP_BLOCK_SIZE constant

---
 src/libcore/mem.rs | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 865ff3c6ee4b3..9a116e9041f78 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -447,8 +447,10 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
+        const SWAP_BLOCK_SIZE: usize = 16;
+
         // Give ourselves some scratch space to work with
-        let mut t: [u8; 16] = uninitialized();
+        let mut t: [u8; SWAP_BLOCK_SIZE] = uninitialized();
 
         let x = x as *mut T as *mut u8;
         let y = y as *mut T as *mut u8;
@@ -457,12 +459,12 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         // can't use a for loop as the `range` impl calls `mem::swap` recursively
         let len = size_of::<T>() as isize;
         let mut i = 0;
-        while i + 16 <= len {
-            // Perform the swap 16 bytes at a time, `&mut` pointers never alias
-            ptr::copy_nonoverlapping(x.offset(i), t, 16);
-            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), 16);
-            ptr::copy_nonoverlapping(t, y.offset(i), 16);
-            i += 16;
+        while i + SWAP_BLOCK_SIZE as isize <= len {
+            // Perform the swap SWAP_BLOCK_SIZE bytes at a time, `&mut` pointers never alias
+            ptr::copy_nonoverlapping(x.offset(i), t, SWAP_BLOCK_SIZE);
+            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), SWAP_BLOCK_SIZE);
+            ptr::copy_nonoverlapping(t, y.offset(i), SWAP_BLOCK_SIZE);
+            i += SWAP_BLOCK_SIZE as isize;
         }
         if i < len {
             // Swap any remaining bytes

From 2816998d1e702a8597c07100856e2483590f3e2a Mon Sep 17 00:00:00 2001
From: Djzin
Date: Wed, 15 Mar 2017 06:45:43 +0000
Subject: [PATCH 06/15] use simd blocks

---
 src/libcore/mem.rs | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 9a116e9041f78..b6838f103c469 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -447,24 +447,34 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
-        const SWAP_BLOCK_SIZE: usize = 16;
+        // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
+        // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
+        // Haswell E processors. LLVM is more able to optimize if we give a struct a
+        // #[repr(simd)], even if we don't actually use this struct directly.
+        #[repr(simd)]
+        struct Block(u64, u64, u64, u64);
+        let block_size = size_of::<Block>();
 
-        // Give ourselves some scratch space to work with
-        let mut t: [u8; SWAP_BLOCK_SIZE] = uninitialized();
+        // Create some uninitialized memory as scratch space
+        let mut t: Block = uninitialized();
 
+        // Get raw pointers to the bytes of x, y & t for easier manipulation
         let x = x as *mut T as *mut u8;
         let y = y as *mut T as *mut u8;
         let t = &mut t as *mut _ as *mut u8;
 
-        // can't use a for loop as the `range` impl calls `mem::swap` recursively
+        // Loop through x & y, copying them `Block` at a time
+        // The optimizer should unroll the loop fully for most types
+        // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
        let len = size_of::<T>() as isize;
         let mut i = 0;
-        while i + SWAP_BLOCK_SIZE as isize <= len {
-            // Perform the swap SWAP_BLOCK_SIZE bytes at a time, `&mut` pointers never alias
-            ptr::copy_nonoverlapping(x.offset(i), t, SWAP_BLOCK_SIZE);
-            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), SWAP_BLOCK_SIZE);
-            ptr::copy_nonoverlapping(t, y.offset(i), SWAP_BLOCK_SIZE);
-            i += SWAP_BLOCK_SIZE as isize;
+        while i + block_size as isize <= len {
+            // Swap a block of bytes of x & y, using t as a temporary buffer
+            // This should be optimized into efficient SIMD operations where available
+            ptr::copy_nonoverlapping(x.offset(i), t, block_size);
+            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), block_size);
+            ptr::copy_nonoverlapping(t, y.offset(i), block_size);
+            i += block_size as isize;
         }
         if i < len {
             // Swap any remaining bytes

From c6ca81aa921415a33e1c8f32e0c53a2c5eb6b485 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Mon, 24 Apr 2017 07:40:11 +0100
Subject: [PATCH 07/15] change wording

---
 src/libcore/mem.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index b6838f103c469..e51976c4845ec 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -109,7 +109,7 @@ pub use intrinsics::transmute;
 /// [`Clone`][clone]. You need the value's destructor to run only once,
 /// because a double `free` is undefined behavior.
 ///
-/// An example is the (old) definition of [`mem::swap`][swap] in this module:
+/// An example is a possible implementation of [`mem::swap`][swap]:
 ///
 /// ```
 /// use std::mem;

From 165f3668d602c4eaa02125cf86fb8d12719cb441 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 7 May 2017 20:26:19 +0100
Subject: [PATCH 08/15] optimize out stack alignment for sizes < 32

---
 src/libcore/mem.rs | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index e51976c4845ec..2dc1f0e04bcc8 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -453,15 +453,13 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         // #[repr(simd)], even if we don't actually use this struct directly.
         #[repr(simd)]
         struct Block(u64, u64, u64, u64);
-        let block_size = size_of::<Block>();
+        struct UnalignedBlock(u64, u64, u64, u64);
 
-        // Create some uninitialized memory as scratch space
-        let mut t: Block = uninitialized();
+        let block_size = size_of::<Block>();
 
-        // Get raw pointers to the bytes of x, y & t for easier manipulation
+        // Get raw pointers to the bytes of x & y for easier manipulation
         let x = x as *mut T as *mut u8;
         let y = y as *mut T as *mut u8;
-        let t = &mut t as *mut _ as *mut u8;
 
         // Loop through x & y, copying them `Block` at a time
         // The optimizer should unroll the loop fully for most types
         // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
         let len = size_of::<T>() as isize;
         let mut i = 0;
         while i + block_size as isize <= len {
+            // Create some uninitialized memory as scratch space
+            // Moving the declaration of `t` here avoids aligning the stack when
+            // this loop is unused
+            let mut t: Block = uninitialized();
+            let t = &mut t as *mut _ as *mut u8;
+
             // Swap a block of bytes of x & y, using t as a temporary buffer
             // This should be optimized into efficient SIMD operations where available
             ptr::copy_nonoverlapping(x.offset(i), t, block_size);
             ptr::copy_nonoverlapping(y.offset(i), x.offset(i), block_size);
             ptr::copy_nonoverlapping(t, y.offset(i), block_size);
             i += block_size as isize;
         }
         if i < len {
             // Swap any remaining bytes
+            let mut t: UnalignedBlock = uninitialized();
+            let t = &mut t as *mut _ as *mut u8;
+
             let rem = (len - i) as usize;
             ptr::copy_nonoverlapping(x.offset(i), t, rem);
             ptr::copy_nonoverlapping(y.offset(i), x.offset(i), rem);
             ptr::copy_nonoverlapping(t, y.offset(i), rem);
         }
     }
 }

From ca2fa97b6ca858a1c7748d117f5ed096b185380a Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 7 May 2017 20:29:39 +0100
Subject: [PATCH 09/15] improve wording

---
 src/libcore/mem.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 2dc1f0e04bcc8..5fb30a9abfecc 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -468,8 +468,7 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         let mut i = 0;
         while i + block_size as isize <= len {
             // Create some uninitialized memory as scratch space
-            // Moving the declaration of `t` here avoids aligning the stack when
-            // this loop is unused
+            // Decaring `t` here avoids aligning the stack when this loop is unused
             let mut t: Block = uninitialized();
             let t = &mut t as *mut _ as *mut u8;

From fcc970aca5d3e169c6a837c34f62b3e6e0ed74d1 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 7 May 2017 21:56:14 +0100
Subject: [PATCH 10/15] fix nit

---
 src/libcore/mem.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 5fb30a9abfecc..87206152a0d21 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -468,7 +468,7 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         let mut i = 0;
         while i + block_size as isize <= len {
             // Create some uninitialized memory as scratch space
-            // Decaring `t` here avoids aligning the stack when this loop is unused
+            // Declaring `t` here avoids aligning the stack when this loop is unused
             let mut t: Block = uninitialized();
             let t = &mut t as *mut _ as *mut u8;

From c6307a2fa55c3d62c06b85b349257a8194093442 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sat, 27 May 2017 14:29:41 +0100
Subject: [PATCH 11/15] copy tail bytes better for aligned types

---
 src/libcore/mem.rs | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 87206152a0d21..2975d2e28dd38 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -479,6 +479,27 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
             ptr::copy_nonoverlapping(t, y.offset(i), block_size);
             i += block_size as isize;
         }
+
+        // Swap remaining bytes 8 at a time if x & y are properly aligned
+        if align_of::<T>() % 8 == 0 {
+            while i + 8 <= len as isize {
+                let t = *(x.offset(i) as *mut u64);
+                *(x.offset(i) as *mut u64) = *(y.offset(i) as *mut u64);
+                *(y.offset(i) as *mut u64) = t;
+                i += 8;
+            }
+        }
+
+        // Swap remaining bytes 4 at a time if x & y are properly aligned
+        if align_of::<T>() % 4 == 0 {
+            while i + 4 <= len as isize {
+                let t = *(x.offset(i) as *mut u32);
+                *(x.offset(i) as *mut u32) = *(y.offset(i) as *mut u32);
+                *(y.offset(i) as *mut u32) = t;
+                i += 4;
+            }
+        }
+
         if i < len {
             // Swap any remaining bytes
             let mut t: UnalignedBlock = uninitialized();

From d4d3f53468cb392cb9b80278a232857ad8d68992 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sat, 27 May 2017 16:39:51 +0100
Subject: [PATCH 12/15] better respect alignment for copying tail

---
 src/libcore/mem.rs | 62 +++++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 64034b05fde99..a2154e7fc69e5 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -532,35 +532,47 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
             i += block_size as isize;
         }
 
-        // Swap remaining bytes 8 at a time if x & y are properly aligned
-        if align_of::<T>() % 8 == 0 {
-            while i + 8 <= len as isize {
-                let t = *(x.offset(i) as *mut u64);
-                *(x.offset(i) as *mut u64) = *(y.offset(i) as *mut u64);
-                *(y.offset(i) as *mut u64) = t;
-                i += 8;
-            }
-        }
-
-        // Swap remaining bytes 4 at a time if x & y are properly aligned
-        if align_of::<T>() % 4 == 0 {
-            while i + 4 <= len as isize {
-                let t = *(x.offset(i) as *mut u32);
-                *(x.offset(i) as *mut u32) = *(y.offset(i) as *mut u32);
-                *(y.offset(i) as *mut u32) = t;
-                i += 4;
-            }
-        }
-
         if i < len {
-            // Swap any remaining bytes
+            // Swap any remaining bytes, using aligned types to copy
+            // where appropriate (this information is lost by conversion
+            // to *mut u8, so restore it manually here)
             let mut t: UnalignedBlock = uninitialized();
-            let t = &mut t as *mut _ as *mut u8;
             let rem = (len - i) as usize;
-            ptr::copy_nonoverlapping(x.offset(i), t, rem);
-            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), rem);
-            ptr::copy_nonoverlapping(t, y.offset(i), rem);
+
+            if align_of::<T>() % 8 == 0 && len % 8 == 0 {
+                let t = &mut t as *mut _ as *mut u64;
+                let x = x.offset(i) as *mut u64;
+                let y = y.offset(i) as *mut u64;
+
+                ptr::copy_nonoverlapping(x, t, rem / 8);
+                ptr::copy_nonoverlapping(y, x, rem / 8);
+                ptr::copy_nonoverlapping(t, y, rem / 8);
+            } else if align_of::<T>() % 4 == 0 && len % 4 == 0 {
+                let t = &mut t as *mut _ as *mut u32;
+                let x = x.offset(i) as *mut u32;
+                let y = y.offset(i) as *mut u32;
+
+                ptr::copy_nonoverlapping(x, t, rem / 4);
+                ptr::copy_nonoverlapping(y, x, rem / 4);
+                ptr::copy_nonoverlapping(t, y, rem / 4);
+            } else if align_of::<T>() % 2 == 0 && len % 2 == 0 {
+                let t = &mut t as *mut _ as *mut u16;
+                let x = x.offset(i) as *mut u16;
+                let y = y.offset(i) as *mut u16;
+
+                ptr::copy_nonoverlapping(x, t, rem / 2);
+                ptr::copy_nonoverlapping(y, x, rem / 2);
+                ptr::copy_nonoverlapping(t, y, rem / 2);
+            } else {
+                let t = &mut t as *mut _ as *mut u8;
+                let x = x.offset(i);
+                let y = y.offset(i);
+
+                ptr::copy_nonoverlapping(x, t, rem);
+                ptr::copy_nonoverlapping(y, x, rem);
+                ptr::copy_nonoverlapping(t, y, rem);
+            }
         }
     }
 }

From 8a973dfa24b676e6e50b6c6bbb1cdce17499a2f7 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 28 May 2017 15:39:47 +0100
Subject: [PATCH 13/15] restore old behaviour for sizes < 128
---
 src/libcore/mem.rs | 75 ++++++++++++++++++++++++++---------------------------------
 1 file changed, 34 insertions(+), 41 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index a2154e7fc69e5..93ec54b17390f 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -499,6 +499,24 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
+        let len = size_of::<T>();
+
+        if len < 128 {
+            // Give ourselves some scratch space to work with
+            let mut t: T = uninitialized();
+
+            // Perform the swap, `&mut` pointers never alias
+            ptr::copy_nonoverlapping(&*x, &mut t, 1);
+            ptr::copy_nonoverlapping(&*y, x, 1);
+            ptr::copy_nonoverlapping(&t, y, 1);
+
+            // y and t now point to the same thing, but we need to completely
+            // forget `t` because we do not want to run the destructor for `T`
+            // on its value, which is still owned somewhere outside this function.
+            forget(t);
+            return;
+        }
+
         // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
         // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
         // Haswell E processors. LLVM is more able to optimize if we give a struct a
@@ -516,20 +534,21 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         // Loop through x & y, copying them `Block` at a time
         // The optimizer should unroll the loop fully for most types
         // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
-        let len = size_of::<T>() as isize;
         let mut i = 0;
-        while i + block_size as isize <= len {
+        while i + block_size <= len {
             // Create some uninitialized memory as scratch space
             // Declaring `t` here avoids aligning the stack when this loop is unused
             let mut t: Block = uninitialized();
             let t = &mut t as *mut _ as *mut u8;
+            let x = x.offset(i as isize);
+            let y = y.offset(i as isize);
 
             // Swap a block of bytes of x & y, using t as a temporary buffer
             // This should be optimized into efficient SIMD operations where available
-            ptr::copy_nonoverlapping(x.offset(i), t, block_size);
-            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), block_size);
-            ptr::copy_nonoverlapping(t, y.offset(i), block_size);
-            i += block_size as isize;
+            ptr::copy_nonoverlapping(x, t, block_size);
+            ptr::copy_nonoverlapping(y, x, block_size);
+            ptr::copy_nonoverlapping(t, y, block_size);
+            i += block_size;
         }
 
@@ -538,41 +557,15 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
             // Swap any remaining bytes, using aligned types to copy
             // where appropriate (this information is lost by conversion
             // to *mut u8, so restore it manually here)
             let mut t: UnalignedBlock = uninitialized();
-            let rem = (len - i) as usize;
-
-            if align_of::<T>() % 8 == 0 && len % 8 == 0 {
-                let t = &mut t as *mut _ as *mut u64;
-                let x = x.offset(i) as *mut u64;
-                let y = y.offset(i) as *mut u64;
-
-                ptr::copy_nonoverlapping(x, t, rem / 8);
-                ptr::copy_nonoverlapping(y, x, rem / 8);
-                ptr::copy_nonoverlapping(t, y, rem / 8);
-            } else if align_of::<T>() % 4 == 0 && len % 4 == 0 {
-                let t = &mut t as *mut _ as *mut u32;
-                let x = x.offset(i) as *mut u32;
-                let y = y.offset(i) as *mut u32;
-
-                ptr::copy_nonoverlapping(x, t, rem / 4);
-                ptr::copy_nonoverlapping(y, x, rem / 4);
-                ptr::copy_nonoverlapping(t, y, rem / 4);
-            } else if align_of::<T>() % 2 == 0 && len % 2 == 0 {
-                let t = &mut t as *mut _ as *mut u16;
-                let x = x.offset(i) as *mut u16;
-                let y = y.offset(i) as *mut u16;
-
-                ptr::copy_nonoverlapping(x, t, rem / 2);
-                ptr::copy_nonoverlapping(y, x, rem / 2);
-                ptr::copy_nonoverlapping(t, y, rem / 2);
-            } else {
-                let t = &mut t as *mut _ as *mut u8;
-                let x = x.offset(i);
-                let y = y.offset(i);
-
-                ptr::copy_nonoverlapping(x, t, rem);
-                ptr::copy_nonoverlapping(y, x, rem);
-                ptr::copy_nonoverlapping(t, y, rem);
-            }
+            let rem = len - i;
+
+            let t = &mut t as *mut _ as *mut u8;
+            let x = x.offset(i as isize);
+            let y = y.offset(i as isize);
+
+            ptr::copy_nonoverlapping(x, t, rem);
+            ptr::copy_nonoverlapping(y, x, rem);
+            ptr::copy_nonoverlapping(t, y, rem);
         }
     }
 }

From b795b7b43b5812a07b8fa7ed022b2f0f012a4950 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 28 May 2017 18:10:12 +0100
Subject: [PATCH 14/15] restore old behaviour

---
 src/libcore/mem.rs | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 93ec54b17390f..91b348fb5d677 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -499,24 +499,6 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
-        let len = size_of::<T>();
-
-        if len < 128 {
-            // Give ourselves some scratch space to work with
-            let mut t: T = uninitialized();
-
-            // Perform the swap, `&mut` pointers never alias
-            ptr::copy_nonoverlapping(&*x, &mut t, 1);
-            ptr::copy_nonoverlapping(&*y, x, 1);
-            ptr::copy_nonoverlapping(&t, y, 1);
-
-            // y and t now point to the same thing, but we need to completely
-            // forget `t` because we do not want to run the destructor for `T`
-            // on its value, which is still owned somewhere outside this function.
-            forget(t);
-            return;
-        }
-
         // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
         // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
         // Haswell E processors. LLVM is more able to optimize if we give a struct a

From 83f1f118e56320667c04a522e05f09a9f4abb6ff Mon Sep 17 00:00:00 2001
From: Djzin
Date: Fri, 9 Jun 2017 07:07:58 +0100
Subject: [PATCH 15/15] hack around bug in emscripten

---
 src/libcore/mem.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 91b348fb5d677..7c63ecd203e2a 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -503,7 +503,9 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
         // Haswell E processors. LLVM is more able to optimize if we give a struct a
         // #[repr(simd)], even if we don't actually use this struct directly.
-        #[repr(simd)]
+        //
+        // FIXME repr(simd) broken on emscripten
+        #[cfg_attr(not(target_os = "emscripten"), repr(simd))]
         struct Block(u64, u64, u64, u64);
         struct UnalignedBlock(u64, u64, u64, u64);
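
For contrast with the later patches, here is a small standalone sketch of the byte-wise xor-swap tried in patches 01 and 02. It only works because the two `&mut` references can never alias (xor-swapping a byte with itself would zero it), and it swaps one byte at a time, which is why it was dropped in favour of block copies. Outside libcore an ordinary `for` loop is fine; patch 02 only avoids it because, inside libcore, the `range` implementation itself calls `mem::swap`. The name `xor_swap_bytes` is illustrative only; it is not part of the patches or of libcore.

use std::mem::size_of;

fn xor_swap_bytes<T>(x: &mut T, y: &mut T) {
    let len = size_of::<T>();
    let x = x as *mut T as *mut u8;
    let y = y as *mut T as *mut u8;
    unsafe {
        // Relies on x and y never aliasing, which distinct &mut references guarantee;
        // xor-swapping a byte with itself would leave it zeroed instead of swapped.
        for i in 0..len {
            *x.add(i) ^= *y.add(i);
            *y.add(i) ^= *x.add(i);
            *x.add(i) ^= *y.add(i);
        }
    }
}

fn main() {
    let (mut a, mut b) = (3u32, 7u32);
    xor_swap_bytes(&mut a, &mut b);
    assert_eq!((a, b), (7, 3));
}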
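
And here is a minimal standalone sketch of the strategy the series converges on: treat both values as raw bytes and swap them through a small scratch buffer, a fixed-size block at a time, with a single tail copy for whatever is left. It is written against current stable Rust, so it uses `MaybeUninit` instead of the now-deprecated `uninitialized()` used in the patches, a plain `[u8; 32]` scratch buffer instead of the `#[repr(simd)] Block` type, and an illustrative function name; it is not the libcore implementation itself and skips the aligned-tail refinements of patches 11 and 12.

use std::mem::{size_of, MaybeUninit};
use std::ptr;

fn swap_bytes_blockwise<T>(x: &mut T, y: &mut T) {
    // 32-byte blocks, mirroring the size of the Block(u64, u64, u64, u64) scratch type.
    const BLOCK: usize = 32;

    let len = size_of::<T>();
    let x = x as *mut T as *mut u8;
    let y = y as *mut T as *mut u8;

    unsafe {
        let mut i = 0;
        // Swap whole blocks through a scratch buffer; the two &mut references
        // never alias, so the three copies never overlap.
        while i + BLOCK <= len {
            let mut t = MaybeUninit::<[u8; BLOCK]>::uninit();
            let t = t.as_mut_ptr() as *mut u8;
            ptr::copy_nonoverlapping(x.add(i), t, BLOCK);
            ptr::copy_nonoverlapping(y.add(i), x.add(i), BLOCK);
            ptr::copy_nonoverlapping(t, y.add(i), BLOCK);
            i += BLOCK;
        }
        // Swap the remaining tail (fewer than BLOCK bytes) the same way.
        if i < len {
            let rem = len - i;
            let mut t = MaybeUninit::<[u8; BLOCK]>::uninit();
            let t = t.as_mut_ptr() as *mut u8;
            ptr::copy_nonoverlapping(x.add(i), t, rem);
            ptr::copy_nonoverlapping(y.add(i), x.add(i), rem);
            ptr::copy_nonoverlapping(t, y.add(i), rem);
        }
    }
}

fn main() {
    // 72-byte arrays exercise both the 32-byte block loop and the 8-byte tail path.
    let mut a = [1u64; 9];
    let mut b = [2u64; 9];
    swap_bytes_blockwise(&mut a, &mut b);
    assert_eq!(a, [2u64; 9]);
    assert_eq!(b, [1u64; 9]);
}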