Make u8x16 and u8x32 have Vector call ABI
Before this commit, u8x16 and u8x32 were repr(Rust) unions. This introduced
unspecified behavior, because the fields of a repr(Rust) union are not
guaranteed to be at offset 0, so those field accesses were potentially UB.

This commit fixes that and closes #588.

The unions were also generating a lot of unnecessary memory operations. This
commit fixes that as well.

The issue is that unions have an Aggregate call ABI, the same as the call ABI
of arrays. That is, they are passed through memory rather than in vector
registers.

That would be fine if these values were mostly operated on as arrays, but that
was not the case: most operations on these unions use SIMD instructions. With
the Aggregate ABI, the union has to be copied into a SIMD register, operated
on, and then spilled back to the stack on every single operation. That is
unnecessary, although in practice LLVM was apparently able to optimize the
redundant memory operations away and keep these values in registers.

This commit also fixes that issue, by making u8x16 and u8x32 repr(transparent)
newtypes over the architecture-specific vector types, which gives them the
Vector ABI.

The vectors are then copied to the stack only when necessary, and as little as
possible. This is done using mem::transmute, removing the need for unions
altogether (fixing #588 by not having to worry about union layout at all).
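
For illustration, here is a condensed sketch of the new representation,
adapted from the src/vector/ssse3.rs change in this commit (only the newtype
and the byte-array conversions are shown; the u8x32/AVX2 version is
analogous):

    use std::arch::x86_64::*;
    use std::mem;

    // repr(transparent) guarantees that u8x16 has the same layout and call
    // ABI as the wrapped __m128i, so it is passed in a SIMD register.
    #[derive(Clone, Copy)]
    #[allow(non_camel_case_types)]
    #[repr(transparent)]
    pub struct u8x16(__m128i);

    impl u8x16 {
        // Spill the vector to the stack as a plain byte array.
        #[inline]
        pub fn bytes(self) -> [u8; 16] {
            // __m128i and [u8; 16] have the same size, and every bit pattern
            // is a valid [u8; 16], so the transmute is sound.
            unsafe { mem::transmute(self) }
        }

        // Move a byte array back into the SIMD register.
        #[inline]
        pub fn replace_bytes(&mut self, value: [u8; 16]) {
            self.0 = unsafe { mem::transmute(value) };
        }

        // Reading a single byte goes through the byte-array view.
        #[inline]
        pub fn extract(self, i: usize) -> u8 {
            self.bytes()[i]
        }
    }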

To make it clear when the vectors are spilled to the stack, the
vector::replace(index, value) API has been removed; only vector::bytes(self)
and vector::replace_bytes(&mut self, [u8; N]) are provided instead. This
prevents spilling the vectors back and forth on every index modification:
vector::bytes spills the vector to the stack once, all the random-access
modifications are done in memory, and a single vector::replace_bytes moves the
bytes back into a SIMD register, as the sketch below shows.
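
As a usage sketch, the hypothetical helper below (not part of this commit)
mirrors what Mask::add does in the teddy diffs further down: a single spill,
any number of in-memory edits, and a single move back into a register.

    // Hypothetical helper showing the intended pattern: spill once with
    // bytes(), edit in memory, write back once with replace_bytes().
    fn set_bucket_bit(mask: &mut u8x16, byte_lo: usize, bucket: u32) {
        let lo = mask.extract(byte_lo) | ((1 << bucket) as u8);
        let mut bytes = mask.bytes();  // one spill to the stack
        bytes[byte_lo] = lo;           // random-access edits happen in memory
        mask.replace_bytes(bytes);     // one move back into a SIMD register
    }
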
gnzlbg committed Jul 1, 2019
1 parent 172898a commit 3b47633
Showing 4 changed files with 82 additions and 60 deletions.
16 changes: 12 additions & 4 deletions src/literal/teddy_avx2/imp.rs
@@ -462,11 +462,19 @@ impl Mask {
let byte_hi = (byte >> 4) as usize;

let lo = self.lo.extract(byte_lo) | ((1 << bucket) as u8);
self.lo.replace(byte_lo, lo);
self.lo.replace(byte_lo + 16, lo);
{
let mut lo_bytes = self.lo.bytes();
lo_bytes[byte_lo] = lo;
lo_bytes[byte_lo + 16] = lo;
self.lo.replace_bytes(lo_bytes);
}

let hi = self.hi.extract(byte_hi) | ((1 << bucket) as u8);
self.hi.replace(byte_hi, hi);
self.hi.replace(byte_hi + 16, hi);
{
let mut hi_bytes = self.hi.bytes();
hi_bytes[byte_hi] = hi;
hi_bytes[byte_hi + 16] = hi;
self.hi.replace_bytes(hi_bytes);
}
}
}
12 changes: 10 additions & 2 deletions src/literal/teddy_ssse3/imp.rs
@@ -772,9 +772,17 @@ impl Mask {
let byte_hi = (byte >> 4) as usize;

let lo = self.lo.extract(byte_lo);
self.lo.replace(byte_lo, ((1 << bucket) as u8) | lo);
{
let mut lo_bytes = self.lo.bytes();
lo_bytes[byte_lo] = ((1 << bucket) as u8) | lo;
self.lo.replace_bytes(lo_bytes);
}

let hi = self.hi.extract(byte_hi);
self.hi.replace(byte_hi, ((1 << bucket) as u8) | hi);
{
let mut hi_bytes = self.hi.bytes();
hi_bytes[byte_hi] = ((1 << bucket) as u8) | hi;
self.hi.replace_bytes(hi_bytes);
}
}
}
61 changes: 32 additions & 29 deletions src/vector/avx2.rs
@@ -2,6 +2,7 @@

use std::arch::x86_64::*;
use std::fmt;
use std::mem;

#[derive(Clone, Copy, Debug)]
pub struct AVX2VectorBuilder(());
@@ -56,15 +57,13 @@ impl AVX2VectorBuilder {

#[derive(Clone, Copy)]
#[allow(non_camel_case_types)]
pub union u8x32 {
vector: __m256i,
bytes: [u8; 32],
}
#[repr(transparent)]
pub struct u8x32(__m256i);

impl u8x32 {
#[inline]
unsafe fn splat(n: u8) -> u8x32 {
u8x32 { vector: _mm256_set1_epi8(n as i8) }
u8x32(_mm256_set1_epi8(n as i8))
}

#[inline]
@@ -76,7 +75,7 @@ impl u8x32 {
#[inline]
unsafe fn load_unchecked_unaligned(slice: &[u8]) -> u8x32 {
let p = slice.as_ptr() as *const u8 as *const __m256i;
u8x32 { vector: _mm256_loadu_si256(p) }
u8x32(_mm256_loadu_si256(p))
}

#[inline]
@@ -89,52 +88,45 @@ impl u8x32 {
#[inline]
unsafe fn load_unchecked(slice: &[u8]) -> u8x32 {
let p = slice.as_ptr() as *const u8 as *const __m256i;
u8x32 { vector: _mm256_load_si256(p) }
u8x32(_mm256_load_si256(p))
}

#[inline]
pub fn extract(self, i: usize) -> u8 {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] }
}

#[inline]
pub fn replace(&mut self, i: usize, byte: u8) {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] = byte; }
self.bytes()[i]
}

#[inline]
pub fn shuffle(self, indices: u8x32) -> u8x32 {
// Safe because we know AVX2 is enabled.
unsafe {
u8x32 { vector: _mm256_shuffle_epi8(self.vector, indices.vector) }
u8x32(_mm256_shuffle_epi8(self.0, indices.0))
}
}

#[inline]
pub fn ne(self, other: u8x32) -> u8x32 {
// Safe because we know AVX2 is enabled.
unsafe {
let boolv = _mm256_cmpeq_epi8(self.vector, other.vector);
let boolv = _mm256_cmpeq_epi8(self.0, other.0);
let ones = _mm256_set1_epi8(0xFF as u8 as i8);
u8x32 { vector: _mm256_andnot_si256(boolv, ones) }
u8x32(_mm256_andnot_si256(boolv, ones))
}
}

#[inline]
pub fn and(self, other: u8x32) -> u8x32 {
// Safe because we know AVX2 is enabled.
unsafe {
u8x32 { vector: _mm256_and_si256(self.vector, other.vector) }
u8x32(_mm256_and_si256(self.0, other.0))
}
}

#[inline]
pub fn movemask(self) -> u32 {
// Safe because we know AVX2 is enabled.
unsafe {
_mm256_movemask_epi8(self.vector) as u32
_mm256_movemask_epi8(self.0) as u32
}
}

@@ -148,9 +140,9 @@ impl u8x32 {
// TL;DR avx2's PALIGNR instruction is actually just two 128-bit
// PALIGNR instructions, which is not what we want, so we need to
// do some extra shuffling.
let v = _mm256_permute2x128_si256(other.vector, self.vector, 0x21);
let v = _mm256_alignr_epi8(self.vector, v, 14);
u8x32 { vector: v }
let v = _mm256_permute2x128_si256(other.0, self.0, 0x21);
let v = _mm256_alignr_epi8(self.0, v, 14);
u8x32(v)
}
}

@@ -164,24 +156,35 @@ impl u8x32 {
// TL;DR avx2's PALIGNR instruction is actually just two 128-bit
// PALIGNR instructions, which is not what we want, so we need to
// do some extra shuffling.
let v = _mm256_permute2x128_si256(other.vector, self.vector, 0x21);
let v = _mm256_alignr_epi8(self.vector, v, 15);
u8x32 { vector: v }
let v = _mm256_permute2x128_si256(other.0, self.0, 0x21);
let v = _mm256_alignr_epi8(self.0, v, 15);
u8x32(v)
}
}

#[inline]
pub fn bit_shift_right_4(self) -> u8x32 {
// Safe because we know AVX2 is enabled.
unsafe {
u8x32 { vector: _mm256_srli_epi16(self.vector, 4) }
u8x32(_mm256_srli_epi16(self.0, 4))
}
}

#[inline]
pub fn bytes(self) -> [u8; 32] {
// Safe because __m256i and [u8; 32] are layout compatible
unsafe { mem::transmute(self) }
}

#[inline]
pub fn replace_bytes(&mut self, value: [u8; 32]) {
// Safe because __m256i and [u8; 32] are layout compatible
self.0 = unsafe { mem::transmute(value) };
}
}

impl fmt::Debug for u8x32 {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// Safe because `bytes` is always accessible.
unsafe { self.bytes.fmt(f) }
self.bytes().fmt(f)
}
}
53 changes: 28 additions & 25 deletions src/vector/ssse3.rs
@@ -2,6 +2,7 @@

use std::arch::x86_64::*;
use std::fmt;
use std::mem;

/// A builder for SSSE3 empowered vectors.
///
@@ -77,15 +78,13 @@ impl SSSE3VectorBuilder {
/// inlined, otherwise you probably have a performance bug.
#[derive(Clone, Copy)]
#[allow(non_camel_case_types)]
pub union u8x16 {
vector: __m128i,
bytes: [u8; 16],
}
#[repr(transparent)]
pub struct u8x16(__m128i);

impl u8x16 {
#[inline]
unsafe fn splat(n: u8) -> u8x16 {
u8x16 { vector: _mm_set1_epi8(n as i8) }
u8x16(_mm_set1_epi8(n as i8))
}

#[inline]
@@ -97,7 +96,7 @@ impl u8x16 {
#[inline]
unsafe fn load_unchecked_unaligned(slice: &[u8]) -> u8x16 {
let v = _mm_loadu_si128(slice.as_ptr() as *const u8 as *const __m128i);
u8x16 { vector: v }
u8x16(v)
}

#[inline]
@@ -110,83 +109,87 @@ impl u8x16 {
#[inline]
unsafe fn load_unchecked(slice: &[u8]) -> u8x16 {
let v = _mm_load_si128(slice.as_ptr() as *const u8 as *const __m128i);
u8x16 { vector: v }
u8x16(v)
}

#[inline]
pub fn extract(self, i: usize) -> u8 {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] }
}

#[inline]
pub fn replace(&mut self, i: usize, byte: u8) {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] = byte; }
self.bytes()[i]
}

#[inline]
pub fn shuffle(self, indices: u8x16) -> u8x16 {
// Safe because we know SSSE3 is enabled.
unsafe {
u8x16 { vector: _mm_shuffle_epi8(self.vector, indices.vector) }
u8x16(_mm_shuffle_epi8(self.0, indices.0))
}
}

#[inline]
pub fn ne(self, other: u8x16) -> u8x16 {
// Safe because we know SSSE3 is enabled.
unsafe {
let boolv = _mm_cmpeq_epi8(self.vector, other.vector);
let boolv = _mm_cmpeq_epi8(self.0, other.0);
let ones = _mm_set1_epi8(0xFF as u8 as i8);
u8x16 { vector: _mm_andnot_si128(boolv, ones) }
u8x16(_mm_andnot_si128(boolv, ones))
}
}

#[inline]
pub fn and(self, other: u8x16) -> u8x16 {
// Safe because we know SSSE3 is enabled.
unsafe {
u8x16 { vector: _mm_and_si128(self.vector, other.vector) }
u8x16(_mm_and_si128(self.0, other.0))
}
}

#[inline]
pub fn movemask(self) -> u32 {
// Safe because we know SSSE3 is enabled.
unsafe {
_mm_movemask_epi8(self.vector) as u32
_mm_movemask_epi8(self.0) as u32
}
}

#[inline]
pub fn alignr_14(self, other: u8x16) -> u8x16 {
// Safe because we know SSSE3 is enabled.
unsafe {
u8x16 { vector: _mm_alignr_epi8(self.vector, other.vector, 14) }
u8x16(_mm_alignr_epi8(self.0, other.0, 14))
}
}

#[inline]
pub fn alignr_15(self, other: u8x16) -> u8x16 {
// Safe because we know SSSE3 is enabled.
unsafe {
u8x16 { vector: _mm_alignr_epi8(self.vector, other.vector, 15) }
u8x16(_mm_alignr_epi8(self.0, other.0, 15))
}
}

#[inline]
pub fn bit_shift_right_4(self) -> u8x16 {
// Safe because we know SSSE3 is enabled.
unsafe {
u8x16 { vector: _mm_srli_epi16(self.vector, 4) }
u8x16(_mm_srli_epi16(self.0, 4))
}
}

#[inline]
pub fn bytes(self) -> [u8; 16] {
// Safe because __m128i and [u8; 16] are layout compatible
unsafe { mem::transmute(self) }
}

#[inline]
pub fn replace_bytes(&mut self, value: [u8; 16]) {
// Safe because __m128i and [u8; 16] are layout compatible
self.0 = unsafe { mem::transmute(value) };
}
}

impl fmt::Debug for u8x16 {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// Safe because `bytes` is always accessible.
unsafe { self.bytes.fmt(f) }
self.bytes().fmt(f)
}
}
