-
Notifications
You must be signed in to change notification settings - Fork 12.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
speed up mem::swap #40454
speed up mem::swap #40454
Changes from 9 commits
4bcfbc3
85049e5
5702f43
d1fec0d
1daf589
2816998
c6ca81a
165f366
ca2fa97
fcc970a
c6307a2
7475135
d4d3f53
8a973df
b795b7b
83f1f11
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -109,7 +109,7 @@ pub use intrinsics::transmute; | |
/// [`Clone`][clone]. You need the value's destructor to run only once, | ||
/// because a double `free` is undefined behavior. | ||
/// | ||
/// An example is the definition of [`mem::swap`][swap] in this module: | ||
/// An example is a possible implementation of [`mem::swap`][swap]: | ||
/// | ||
/// ``` | ||
/// use std::mem; | ||
|
@@ -447,18 +447,48 @@ pub unsafe fn uninitialized<T>() -> T { | |
#[stable(feature = "rust1", since = "1.0.0")] | ||
pub fn swap<T>(x: &mut T, y: &mut T) { | ||
unsafe { | ||
// Give ourselves some scratch space to work with | ||
let mut t: T = uninitialized(); | ||
|
||
// Perform the swap, `&mut` pointers never alias | ||
ptr::copy_nonoverlapping(&*x, &mut t, 1); | ||
ptr::copy_nonoverlapping(&*y, x, 1); | ||
ptr::copy_nonoverlapping(&t, y, 1); | ||
|
||
// y and t now point to the same thing, but we need to completely | ||
// forget `t` because we do not want to run the destructor for `T` | ||
// on its value, which is still owned somewhere outside this function. | ||
forget(t); | ||
// The approach here is to utilize SIMD to swap x & y efficiently. Testing reveals | ||
// that swapping either 32 bytes or 64 bytes at a time is most efficient for Intel | ||
// Haswell E processors. LLVM is more able to optimize if we give a struct a | ||
// #[repr(simd)], even if we don't actually use this struct directly. | ||
#[repr(simd)] | ||
struct Block(u64, u64, u64, u64); | ||
struct UnalignedBlock(u64, u64, u64, u64); | ||
|
||
let block_size = size_of::<Block>(); | ||
|
||
// Get raw pointers to the bytes of x & y for easier manipulation | ||
let x = x as *mut T as *mut u8; | ||
let y = y as *mut T as *mut u8; | ||
|
||
// Loop through x & y, copying them `Block` at a time | ||
// The optimizer should unroll the loop fully for most types | ||
// N.B. We can't use a for loop as the `range` impl calls `mem::swap` recursively | ||
let len = size_of::<T>() as isize; | ||
let mut i = 0; | ||
while i + block_size as isize <= len { | ||
// Create some uninitialized memory as scratch space | ||
// Declaring `t` here avoids aligning the stack when this loop is unused | ||
let mut t: Block = uninitialized(); | ||
let t = &mut t as *mut _ as *mut u8; | ||
|
||
// Swap a block of bytes of x & y, using t as a temporary buffer | ||
// This should be optimized into efficient SIMD operations where available | ||
ptr::copy_nonoverlapping(x.offset(i), t, block_size); | ||
ptr::copy_nonoverlapping(y.offset(i), x.offset(i), block_size); | ||
ptr::copy_nonoverlapping(t, y.offset(i), block_size); | ||
i += block_size as isize; | ||
} | ||
if i < len { | ||
// Swap any remaining bytes | ||
let mut t: UnalignedBlock = uninitialized(); | ||
let t = &mut t as *mut _ as *mut u8; | ||
|
||
let rem = (len - i) as usize; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know if it ever matters, but it occurs to me that calculating I doubt this matters in most cases, but I could see the optimizer failing when the size is large, since the loop might not be fully unrolled. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tested it; looks like the generated assembly is the same either way. |
||
ptr::copy_nonoverlapping(x.offset(i), t, rem); | ||
ptr::copy_nonoverlapping(y.offset(i), x.offset(i), rem); | ||
ptr::copy_nonoverlapping(t, y.offset(i), rem); | ||
} | ||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
Declaring
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
gah!