From 4bcfbc36c672f0f30176a31c5a6e529bebd6fbcc Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 12 Mar 2017 13:14:47 +0000
Subject: [PATCH 01/15] speed up mem::swap

---
 src/libcore/mem.rs | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index f4ce4697d7cf4..a7c5d29c5a516 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -109,7 +109,7 @@ pub use intrinsics::transmute;
 /// [`Clone`][clone]. You need the value's destructor to run only once,
 /// because a double `free` is undefined behavior.
 ///
-/// An example is the definition of [`mem::swap`][swap] in this module:
+/// An example is the (old) definition of [`mem::swap`][swap] in this module:
 ///
 /// ```
 /// use std::mem;
@@ -447,18 +447,15 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
-        // Give ourselves some scratch space to work with
-        let mut t: T = uninitialized();
+        let x = x as *mut T as *mut u8;
+        let y = y as *mut T as *mut u8;
 
-        // Perform the swap, `&mut` pointers never alias
-        ptr::copy_nonoverlapping(&*x, &mut t, 1);
-        ptr::copy_nonoverlapping(&*y, x, 1);
-        ptr::copy_nonoverlapping(&t, y, 1);
-
-        // y and t now point to the same thing, but we need to completely
-        // forget `t` because we do not want to run the destructor for `T`
-        // on its value, which is still owned somewhere outside this function.
-        forget(t);
+        // use an xor-swap as x & y are guaranteed to never alias
+        for i in 0..size_of::<T>() as isize {
+            *x.offset(i) ^= *y.offset(i);
+            *y.offset(i) ^= *x.offset(i);
+            *x.offset(i) ^= *y.offset(i);
+        }
     }
 }

From 85049e508ba6b1502e4d074d051a188c398fabc6 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 12 Mar 2017 14:10:07 +0000
Subject: [PATCH 02/15] avoid recursion

---
 src/libcore/mem.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index a7c5d29c5a516..748d63362463a 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -450,11 +450,14 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         let x = x as *mut T as *mut u8;
         let y = y as *mut T as *mut u8;
 
-        // use an xor-swap as x & y are guaranteed to never alias
-        for i in 0..size_of::<T>() as isize {
+        // can't use a for loop as the `range` impl calls `mem::swap` recursively
+        let mut i = 0;
+        while i < size_of::<T>() as isize {
+            // use an xor-swap as x & y are guaranteed to never alias
             *x.offset(i) ^= *y.offset(i);
             *y.offset(i) ^= *x.offset(i);
             *x.offset(i) ^= *y.offset(i);
+            i += 1;
         }
     }
 }

From 5702f436aa6258119a32cbff31cc442d73b0d2c0 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 12 Mar 2017 18:32:20 +0000
Subject: [PATCH 03/15] a new approach; ditch xor cuteness and maximize cache locality

---
 src/libcore/mem.rs | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 748d63362463a..e1b9991ccfdf5 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -447,17 +447,29 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
+        // Give ourselves some scratch space to work with
+        let mut t: [u8; 16] = mem::uninitialized();
+
         let x = x as *mut T as *mut u8;
         let y = y as *mut T as *mut u8;
+        let t = &mut t as *mut _ as *mut u8;
 
         // can't use a for loop as the `range` impl calls `mem::swap` recursively
+        let len = size_of::<T>() as isize;
         let mut i = 0;
-        while i < size_of::<T>() as isize {
-            // use an xor-swap as x & y are guaranteed to never alias
-            *x.offset(i) ^= *y.offset(i);
-            *y.offset(i) ^= *x.offset(i);
-            *x.offset(i) ^= *y.offset(i);
-            i += 1;
+        while i + 16 <= len {
+            // Perform the swap 16 bytes at a time, `&mut` pointers never alias
+            ptr::copy_nonoverlapping(x.offset(i), t, 16);
+            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), 16);
+            ptr::copy_nonoverlapping(t, y.offset(i), 16);
+            i += 16;
+        }
+        if i < len {
+            // Swap any remaining bytes
+            let rem = (len - i) as usize;
+            ptr::copy_nonoverlapping(x.offset(i), t, rem);
+            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), rem);
+            ptr::copy_nonoverlapping(t, y.offset(i), rem);
         }
     }
 }

From d1fec0d87a95310fcc1c59d72953ad6be89c78a5 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 12 Mar 2017 18:41:58 +0000
Subject: [PATCH 04/15] fix typo

---
 src/libcore/mem.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index e1b9991ccfdf5..865ff3c6ee4b3 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -448,7 +448,7 @@ pub unsafe fn uninitialized<T>() -> T {
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
         // Give ourselves some scratch space to work with
-        let mut t: [u8; 16] = mem::uninitialized();
+        let mut t: [u8; 16] = uninitialized();
 
         let x = x as *mut T as *mut u8;
         let y = y as *mut T as *mut u8;

From 1daf58964216dfe1f805cdaff76a91ca90d7523e Mon Sep 17 00:00:00 2001
From: Djzin
Date: Mon, 13 Mar 2017 20:03:10 +0000
Subject: [PATCH 05/15] add SWAP_BLOCK_SIZE constant

---
 src/libcore/mem.rs | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 865ff3c6ee4b3..9a116e9041f78 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -447,8 +447,10 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
+        const SWAP_BLOCK_SIZE: usize = 16;
+
         // Give ourselves some scratch space to work with
-        let mut t: [u8; 16] = uninitialized();
+        let mut t: [u8; SWAP_BLOCK_SIZE] = uninitialized();
 
         let x = x as *mut T as *mut u8;
         let y = y as *mut T as *mut u8;
@@ -457,12 +459,12 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         // can't use a for loop as the `range` impl calls `mem::swap` recursively
         let len = size_of::<T>() as isize;
         let mut i = 0;
-        while i + 16 <= len {
-            // Perform the swap 16 bytes at a time, `&mut` pointers never alias
-            ptr::copy_nonoverlapping(x.offset(i), t, 16);
-            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), 16);
-            ptr::copy_nonoverlapping(t, y.offset(i), 16);
-            i += 16;
+        while i + SWAP_BLOCK_SIZE as isize <= len {
+            // Perform the swap SWAP_BLOCK_SIZE bytes at a time, `&mut` pointers never alias
+            ptr::copy_nonoverlapping(x.offset(i), t, SWAP_BLOCK_SIZE);
+            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), SWAP_BLOCK_SIZE);
+            ptr::copy_nonoverlapping(t, y.offset(i), SWAP_BLOCK_SIZE);
+            i += SWAP_BLOCK_SIZE as isize;
         }
         if i < len {
             // Swap any remaining bytes

From 2816998d1e702a8597c07100856e2483590f3e2a Mon Sep 17 00:00:00 2001
From: Djzin
Date: Wed, 15 Mar 2017 06:45:43 +0000
Subject: [PATCH 06/15] use simd blocks

---
 src/libcore/mem.rs | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 9a116e9041f78..b6838f103c469 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -447,24 +447,34 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
-        const SWAP_BLOCK_SIZE: usize = 16;
+        // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
+        // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
+        // Haswell E processors. LLVM is more able to optimize if we give a struct a
+        // #[repr(simd)], even if we don't actually use this struct directly.
+        #[repr(simd)]
+        struct Block(u64, u64, u64, u64);
+        let block_size = size_of::<Block>();
 
-        // Give ourselves some scratch space to work with
-        let mut t: [u8; SWAP_BLOCK_SIZE] = uninitialized();
+        // Create some uninitialized memory as scratch space
+        let mut t: Block = uninitialized();
 
+        // Get raw pointers to the bytes of x, y & t for easier manipulation
         let x = x as *mut T as *mut u8;
         let y = y as *mut T as *mut u8;
         let t = &mut t as *mut _ as *mut u8;
 
-        // can't use a for loop as the `range` impl calls `mem::swap` recursively
+        // Loop through x & y, copying them `Block` at a time
+        // The optimizer should unroll the loop fully for most types
+        // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
        let len = size_of::<T>() as isize;
         let mut i = 0;
-        while i + SWAP_BLOCK_SIZE as isize <= len {
-            // Perform the swap SWAP_BLOCK_SIZE bytes at a time, `&mut` pointers never alias
-            ptr::copy_nonoverlapping(x.offset(i), t, SWAP_BLOCK_SIZE);
-            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), SWAP_BLOCK_SIZE);
-            ptr::copy_nonoverlapping(t, y.offset(i), SWAP_BLOCK_SIZE);
-            i += SWAP_BLOCK_SIZE as isize;
+        while i + block_size as isize <= len {
+            // Swap a block of bytes of x & y, using t as a temporary buffer
+            // This should be optimized into efficient SIMD operations where available
+            ptr::copy_nonoverlapping(x.offset(i), t, block_size);
+            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), block_size);
+            ptr::copy_nonoverlapping(t, y.offset(i), block_size);
+            i += block_size as isize;
         }
         if i < len {
             // Swap any remaining bytes

From c6ca81aa921415a33e1c8f32e0c53a2c5eb6b485 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Mon, 24 Apr 2017 07:40:11 +0100
Subject: [PATCH 07/15] change wording

---
 src/libcore/mem.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index b6838f103c469..e51976c4845ec 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -109,7 +109,7 @@ pub use intrinsics::transmute;
 /// [`Clone`][clone]. You need the value's destructor to run only once,
 /// because a double `free` is undefined behavior.
 ///
-/// An example is the (old) definition of [`mem::swap`][swap] in this module:
+/// An example is a possible implementation of [`mem::swap`][swap]:
 ///
 /// ```
 /// use std::mem;

From 165f3668d602c4eaa02125cf86fb8d12719cb441 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 7 May 2017 20:26:19 +0100
Subject: [PATCH 08/15] optimize out stack alignment for sizes < 32

---
 src/libcore/mem.rs | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index e51976c4845ec..2dc1f0e04bcc8 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -453,15 +453,13 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         // #[repr(simd)], even if we don't actually use this struct directly.
         #[repr(simd)]
         struct Block(u64, u64, u64, u64);
-        let block_size = size_of::<Block>();
+        struct UnalignedBlock(u64, u64, u64, u64);
 
-        // Create some uninitialized memory as scratch space
-        let mut t: Block = uninitialized();
+        let block_size = size_of::<Block>();
 
-        // Get raw pointers to the bytes of x, y & t for easier manipulation
+        // Get raw pointers to the bytes of x & y for easier manipulation
         let x = x as *mut T as *mut u8;
         let y = y as *mut T as *mut u8;
-        let t = &mut t as *mut _ as *mut u8;
 
         // Loop through x & y, copying them `Block` at a time
         // The optimizer should unroll the loop fully for most types
         // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
         let len = size_of::<T>() as isize;
         let mut i = 0;
         while i + block_size as isize <= len {
+            // Create some uninitialized memory as scratch space
+            // Moving the declaration of `t` here avoids aligning the stack when
+            // this loop is unused
+            let mut t: Block = uninitialized();
+            let t = &mut t as *mut _ as *mut u8;
+
             // Swap a block of bytes of x & y, using t as a temporary buffer
             // This should be optimized into efficient SIMD operations where available
             ptr::copy_nonoverlapping(x.offset(i), t, block_size);
             ptr::copy_nonoverlapping(y.offset(i), x.offset(i), block_size);
             ptr::copy_nonoverlapping(t, y.offset(i), block_size);
             i += block_size as isize;
         }
         if i < len {
             // Swap any remaining bytes
+            let mut t: UnalignedBlock = uninitialized();
+            let t = &mut t as *mut _ as *mut u8;
+
             let rem = (len - i) as usize;
             ptr::copy_nonoverlapping(x.offset(i), t, rem);
             ptr::copy_nonoverlapping(y.offset(i), x.offset(i), rem);
             ptr::copy_nonoverlapping(t, y.offset(i), rem);
         }
     }
 }

From ca2fa97b6ca858a1c7748d117f5ed096b185380a Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 7 May 2017 20:29:39 +0100
Subject: [PATCH 09/15] improve wording

---
 src/libcore/mem.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 2dc1f0e04bcc8..5fb30a9abfecc 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -468,8 +468,7 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         let mut i = 0;
         while i + block_size as isize <= len {
             // Create some uninitialized memory as scratch space
-            // Moving the declaration of `t` here avoids aligning the stack when
-            // this loop is unused
+            // Decaring `t` here avoids aligning the stack when this loop is unused
             let mut t: Block = uninitialized();
             let t = &mut t as *mut _ as *mut u8;

From fcc970aca5d3e169c6a837c34f62b3e6e0ed74d1 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 7 May 2017 21:56:14 +0100
Subject: [PATCH 10/15] fix nit

---
 src/libcore/mem.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 5fb30a9abfecc..87206152a0d21 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -468,7 +468,7 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         let mut i = 0;
         while i + block_size as isize <= len {
             // Create some uninitialized memory as scratch space
-            // Decaring `t` here avoids aligning the stack when this loop is unused
+            // Declaring `t` here avoids aligning the stack when this loop is unused
             let mut t: Block = uninitialized();
             let t = &mut t as *mut _ as *mut u8;

From c6307a2fa55c3d62c06b85b349257a8194093442 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sat, 27 May 2017 14:29:41 +0100
Subject: [PATCH 11/15] copy tail bytes better for aligned types

---
 src/libcore/mem.rs | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 87206152a0d21..2975d2e28dd38 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -479,6 +479,27 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
             ptr::copy_nonoverlapping(t, y.offset(i), block_size);
             i += block_size as isize;
         }
+
+        // Swap remaining bytes 8 at a time if x & y are properly aligned
+        if align_of::<T>() % 8 == 0 {
+            while i + 8 <= len as isize {
+                let t = *(x.offset(i) as *mut u64);
+                *(x.offset(i) as *mut u64) = *(y.offset(i) as *mut u64);
+                *(y.offset(i) as *mut u64) = t;
+                i += 8;
+            }
+        }
+
+        // Swap remaining bytes 4 at a time if x & y are properly aligned
+        if align_of::<T>() % 4 == 0 {
+            while i + 4 <= len as isize {
+                let t = *(x.offset(i) as *mut u32);
+                *(x.offset(i) as *mut u32) = *(y.offset(i) as *mut u32);
+                *(y.offset(i) as *mut u32) = t;
+                i += 4;
+            }
+        }
+
         if i < len {
             // Swap any remaining bytes
             let mut t: UnalignedBlock = uninitialized();

From d4d3f53468cb392cb9b80278a232857ad8d68992 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sat, 27 May 2017 16:39:51 +0100
Subject: [PATCH 12/15] better respect alignment for copying tail

---
 src/libcore/mem.rs | 62 +++++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 64034b05fde99..a2154e7fc69e5 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -532,35 +532,47 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
             i += block_size as isize;
         }
 
-        // Swap remaining bytes 8 at a time if x & y are properly aligned
-        if align_of::<T>() % 8 == 0 {
-            while i + 8 <= len as isize {
-                let t = *(x.offset(i) as *mut u64);
-                *(x.offset(i) as *mut u64) = *(y.offset(i) as *mut u64);
-                *(y.offset(i) as *mut u64) = t;
-                i += 8;
-            }
-        }
-
-        // Swap remaining bytes 4 at a time if x & y are properly aligned
-        if align_of::<T>() % 4 == 0 {
-            while i + 4 <= len as isize {
-                let t = *(x.offset(i) as *mut u32);
-                *(x.offset(i) as *mut u32) = *(y.offset(i) as *mut u32);
-                *(y.offset(i) as *mut u32) = t;
-                i += 4;
-            }
-        }
-
         if i < len {
-            // Swap any remaining bytes
+            // Swap any remaining bytes, using aligned types to copy
+            // where appropriate (this information is lost by conversion
+            // to *mut u8, so restore it manually here)
             let mut t: UnalignedBlock = uninitialized();
-            let t = &mut t as *mut _ as *mut u8;
             let rem = (len - i) as usize;
-            ptr::copy_nonoverlapping(x.offset(i), t, rem);
-            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), rem);
-            ptr::copy_nonoverlapping(t, y.offset(i), rem);
+
+            if align_of::<T>() % 8 == 0 && len % 8 == 0 {
+                let t = &mut t as *mut _ as *mut u64;
+                let x = x.offset(i) as *mut u64;
+                let y = y.offset(i) as *mut u64;
+
+                ptr::copy_nonoverlapping(x, t, rem / 8);
+                ptr::copy_nonoverlapping(y, x, rem / 8);
+                ptr::copy_nonoverlapping(t, y, rem / 8);
+            } else if align_of::<T>() % 4 == 0 && len % 4 == 0 {
+                let t = &mut t as *mut _ as *mut u32;
+                let x = x.offset(i) as *mut u32;
+                let y = y.offset(i) as *mut u32;
+
+                ptr::copy_nonoverlapping(x, t, rem / 4);
+                ptr::copy_nonoverlapping(y, x, rem / 4);
+                ptr::copy_nonoverlapping(t, y, rem / 4);
+            } else if align_of::<T>() % 2 == 0 && len % 2 == 0 {
+                let t = &mut t as *mut _ as *mut u16;
+                let x = x.offset(i) as *mut u16;
+                let y = y.offset(i) as *mut u16;
+
+                ptr::copy_nonoverlapping(x, t, rem / 2);
+                ptr::copy_nonoverlapping(y, x, rem / 2);
+                ptr::copy_nonoverlapping(t, y, rem / 2);
+            } else {
+                let t = &mut t as *mut _ as *mut u8;
+                let x = x.offset(i);
+                let y = y.offset(i);
+
+                ptr::copy_nonoverlapping(x, t, rem);
+                ptr::copy_nonoverlapping(y, x, rem);
+                ptr::copy_nonoverlapping(t, y, rem);
+            }
         }
     }
 }

From 8a973dfa24b676e6e50b6c6bbb1cdce17499a2f7 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 28 May 2017 15:39:47 +0100
Subject: [PATCH 13/15] restore old behaviour for sizes < 128
---
 src/libcore/mem.rs | 75 ++++++++++++++++++++++++++---------------------------------
 1 file changed, 34 insertions(+), 41 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index a2154e7fc69e5..93ec54b17390f 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -499,6 +499,24 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
+        let len = size_of::<T>();
+
+        if len < 128 {
+            // Give ourselves some scratch space to work with
+            let mut t: T = uninitialized();
+
+            // Perform the swap, `&mut` pointers never alias
+            ptr::copy_nonoverlapping(&*x, &mut t, 1);
+            ptr::copy_nonoverlapping(&*y, x, 1);
+            ptr::copy_nonoverlapping(&t, y, 1);
+
+            // y and t now point to the same thing, but we need to completely
+            // forget `t` because we do not want to run the destructor for `T`
+            // on its value, which is still owned somewhere outside this function.
+            forget(t);
+            return;
+        }
+
         // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
         // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
         // Haswell E processors. LLVM is more able to optimize if we give a struct a
@@ -516,20 +534,21 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         // Loop through x & y, copying them `Block` at a time
         // The optimizer should unroll the loop fully for most types
         // N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively
-        let len = size_of::<T>() as isize;
         let mut i = 0;
-        while i + block_size as isize <= len {
+        while i + block_size <= len {
             // Create some uninitialized memory as scratch space
             // Declaring `t` here avoids aligning the stack when this loop is unused
             let mut t: Block = uninitialized();
             let t = &mut t as *mut _ as *mut u8;
+            let x = x.offset(i as isize);
+            let y = y.offset(i as isize);
 
             // Swap a block of bytes of x & y, using t as a temporary buffer
             // This should be optimized into efficient SIMD operations where available
-            ptr::copy_nonoverlapping(x.offset(i), t, block_size);
-            ptr::copy_nonoverlapping(y.offset(i), x.offset(i), block_size);
-            ptr::copy_nonoverlapping(t, y.offset(i), block_size);
-            i += block_size as isize;
+            ptr::copy_nonoverlapping(x, t, block_size);
+            ptr::copy_nonoverlapping(y, x, block_size);
+            ptr::copy_nonoverlapping(t, y, block_size);
+            i += block_size;
         }
 
@@ -538,41 +557,15 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
             // Swap any remaining bytes, using aligned types to copy
             // where appropriate (this information is lost by conversion
             // to *mut u8, so restore it manually here)
             let mut t: UnalignedBlock = uninitialized();
-            let rem = (len - i) as usize;
-
-            if align_of::<T>() % 8 == 0 && len % 8 == 0 {
-                let t = &mut t as *mut _ as *mut u64;
-                let x = x.offset(i) as *mut u64;
-                let y = y.offset(i) as *mut u64;
-
-                ptr::copy_nonoverlapping(x, t, rem / 8);
-                ptr::copy_nonoverlapping(y, x, rem / 8);
-                ptr::copy_nonoverlapping(t, y, rem / 8);
-            } else if align_of::<T>() % 4 == 0 && len % 4 == 0 {
-                let t = &mut t as *mut _ as *mut u32;
-                let x = x.offset(i) as *mut u32;
-                let y = y.offset(i) as *mut u32;
-
-                ptr::copy_nonoverlapping(x, t, rem / 4);
-                ptr::copy_nonoverlapping(y, x, rem / 4);
-                ptr::copy_nonoverlapping(t, y, rem / 4);
-            } else if align_of::<T>() % 2 == 0 && len % 2 == 0 {
-                let t = &mut t as *mut _ as *mut u16;
-                let x = x.offset(i) as *mut u16;
-                let y = y.offset(i) as *mut u16;
-
-                ptr::copy_nonoverlapping(x, t, rem / 2);
-                ptr::copy_nonoverlapping(y, x, rem / 2);
-                ptr::copy_nonoverlapping(t, y, rem / 2);
-            } else {
-                let t = &mut t as *mut _ as *mut u8;
-                let x = x.offset(i);
-                let y = y.offset(i);
-
-                ptr::copy_nonoverlapping(x, t, rem);
-                ptr::copy_nonoverlapping(y, x, rem);
-                ptr::copy_nonoverlapping(t, y, rem);
-            }
+            let rem = len - i;
+
+            let t = &mut t as *mut _ as *mut u8;
+            let x = x.offset(i as isize);
+            let y = y.offset(i as isize);
+
+            ptr::copy_nonoverlapping(x, t, rem);
+            ptr::copy_nonoverlapping(y, x, rem);
+            ptr::copy_nonoverlapping(t, y, rem);
         }
     }
 }

From b795b7b43b5812a07b8fa7ed022b2f0f012a4950 Mon Sep 17 00:00:00 2001
From: Djzin
Date: Sun, 28 May 2017 18:10:12 +0100
Subject: [PATCH 14/15] restore old behaviour

---
 src/libcore/mem.rs | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 93ec54b17390f..91b348fb5d677 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -499,24 +499,6 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn swap<T>(x: &mut T, y: &mut T) {
     unsafe {
-        let len = size_of::<T>();
-
-        if len < 128 {
-            // Give ourselves some scratch space to work with
-            let mut t: T = uninitialized();
-
-            // Perform the swap, `&mut` pointers never alias
-            ptr::copy_nonoverlapping(&*x, &mut t, 1);
-            ptr::copy_nonoverlapping(&*y, x, 1);
-            ptr::copy_nonoverlapping(&t, y, 1);
-
-            // y and t now point to the same thing, but we need to completely
-            // forget `t` because we do not want to run the destructor for `T`
-            // on its value, which is still owned somewhere outside this function.
-            forget(t);
-            return;
-        }
-
         // The approach here is to utilize simd to swap x & y efficiently. Testing reveals
         // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
         // Haswell E processors. LLVM is more able to optimize if we give a struct a

From 83f1f118e56320667c04a522e05f09a9f4abb6ff Mon Sep 17 00:00:00 2001
From: Djzin
Date: Fri, 9 Jun 2017 07:07:58 +0100
Subject: [PATCH 15/15] hack around bug in emscripten

---
 src/libcore/mem.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/libcore/mem.rs b/src/libcore/mem.rs
index 91b348fb5d677..7c63ecd203e2a 100644
--- a/src/libcore/mem.rs
+++ b/src/libcore/mem.rs
@@ -503,7 +503,9 @@ pub fn swap<T>(x: &mut T, y: &mut T) {
         // that swapping either 32 bytes or 64 bytes at a time is most efficient for intel
         // Haswell E processors. LLVM is more able to optimize if we give a struct a
         // #[repr(simd)], even if we don't actually use this struct directly.
-        #[repr(simd)]
+        //
+        // FIXME repr(simd) broken on emscripten
+        #[cfg_attr(not(target_os = "emscripten"), repr(simd))]
         struct Block(u64, u64, u64, u64);
         struct UnalignedBlock(u64, u64, u64, u64);
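
For contrast with the later patches, here is a small standalone sketch of the byte-wise xor-swap tried in patches 01 and 02. It only works because the two `&mut` references can never alias (xor-swapping a byte with itself would zero it), and it swaps one byte at a time, which is why it was dropped in favour of block copies. Outside libcore an ordinary `for` loop is fine; patch 02 only avoids it because, inside libcore, the `range` implementation itself calls `mem::swap`. The name `xor_swap_bytes` is illustrative only; it is not part of the patches or of libcore.

use std::mem::size_of;

fn xor_swap_bytes<T>(x: &mut T, y: &mut T) {
    let len = size_of::<T>();
    let x = x as *mut T as *mut u8;
    let y = y as *mut T as *mut u8;
    unsafe {
        // Relies on x and y never aliasing, which distinct &mut references guarantee;
        // xor-swapping a byte with itself would leave it zeroed instead of swapped.
        for i in 0..len {
            *x.add(i) ^= *y.add(i);
            *y.add(i) ^= *x.add(i);
            *x.add(i) ^= *y.add(i);
        }
    }
}

fn main() {
    let (mut a, mut b) = (3u32, 7u32);
    xor_swap_bytes(&mut a, &mut b);
    assert_eq!((a, b), (7, 3));
}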
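
And here is a minimal standalone sketch of the strategy the series converges on: treat both values as raw bytes and swap them through a small scratch buffer, a fixed-size block at a time, with a single tail copy for whatever is left. It is written against current stable Rust, so it uses `MaybeUninit` instead of the now-deprecated `uninitialized()` used in the patches, a plain `[u8; 32]` scratch buffer instead of the `#[repr(simd)] Block` type, and an illustrative function name; it is not the libcore implementation itself and skips the aligned-tail refinements of patches 11 and 12.

use std::mem::{size_of, MaybeUninit};
use std::ptr;

fn swap_bytes_blockwise<T>(x: &mut T, y: &mut T) {
    // 32-byte blocks, mirroring the size of the Block(u64, u64, u64, u64) scratch type.
    const BLOCK: usize = 32;

    let len = size_of::<T>();
    let x = x as *mut T as *mut u8;
    let y = y as *mut T as *mut u8;

    unsafe {
        let mut i = 0;
        // Swap whole blocks through a scratch buffer; the two &mut references
        // never alias, so the three copies never overlap.
        while i + BLOCK <= len {
            let mut t = MaybeUninit::<[u8; BLOCK]>::uninit();
            let t = t.as_mut_ptr() as *mut u8;
            ptr::copy_nonoverlapping(x.add(i), t, BLOCK);
            ptr::copy_nonoverlapping(y.add(i), x.add(i), BLOCK);
            ptr::copy_nonoverlapping(t, y.add(i), BLOCK);
            i += BLOCK;
        }
        // Swap the remaining tail (fewer than BLOCK bytes) the same way.
        if i < len {
            let rem = len - i;
            let mut t = MaybeUninit::<[u8; BLOCK]>::uninit();
            let t = t.as_mut_ptr() as *mut u8;
            ptr::copy_nonoverlapping(x.add(i), t, rem);
            ptr::copy_nonoverlapping(y.add(i), x.add(i), rem);
            ptr::copy_nonoverlapping(t, y.add(i), rem);
        }
    }
}

fn main() {
    // 72-byte arrays exercise both the 32-byte block loop and the 8-byte tail path.
    let mut a = [1u64; 9];
    let mut b = [2u64; 9];
    swap_bytes_blockwise(&mut a, &mut b);
    assert_eq!(a, [2u64; 9]);
    assert_eq!(b, [1u64; 9]);
}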