diff --git a/src/literal/teddy_avx2/imp.rs b/src/literal/teddy_avx2/imp.rs index 3be0b1500f..12b2bf5e65 100644 --- a/src/literal/teddy_avx2/imp.rs +++ b/src/literal/teddy_avx2/imp.rs @@ -462,11 +462,19 @@ impl Mask { let byte_hi = (byte >> 4) as usize; let lo = self.lo.extract(byte_lo) | ((1 << bucket) as u8); - self.lo.replace(byte_lo, lo); - self.lo.replace(byte_lo + 16, lo); + { + let mut lo_bytes = self.lo.bytes(); + lo_bytes[byte_lo] = lo; + lo_bytes[byte_lo + 16] = lo; + self.lo.replace_bytes(lo_bytes); + } let hi = self.hi.extract(byte_hi) | ((1 << bucket) as u8); - self.hi.replace(byte_hi, hi); - self.hi.replace(byte_hi + 16, hi); + { + let mut hi_bytes = self.hi.bytes(); + hi_bytes[byte_hi] = hi; + hi_bytes[byte_hi + 16] = hi; + self.hi.replace_bytes(hi_bytes); + } } } diff --git a/src/literal/teddy_ssse3/imp.rs b/src/literal/teddy_ssse3/imp.rs index 77bb4106c8..f7ea2917b7 100644 --- a/src/literal/teddy_ssse3/imp.rs +++ b/src/literal/teddy_ssse3/imp.rs @@ -772,9 +772,17 @@ impl Mask { let byte_hi = (byte >> 4) as usize; let lo = self.lo.extract(byte_lo); - self.lo.replace(byte_lo, ((1 << bucket) as u8) | lo); + { + let mut lo_bytes = self.lo.bytes(); + lo_bytes[byte_lo] = ((1 << bucket) as u8) | lo; + self.lo.replace_bytes(lo_bytes); + } let hi = self.hi.extract(byte_hi); - self.hi.replace(byte_hi, ((1 << bucket) as u8) | hi); + { + let mut hi_bytes = self.hi.bytes(); + hi_bytes[byte_hi] = ((1 << bucket) as u8) | hi; + self.hi.replace_bytes(hi_bytes); + } } } diff --git a/src/vector/avx2.rs b/src/vector/avx2.rs index db0532c3fe..962a5cb7c5 100644 --- a/src/vector/avx2.rs +++ b/src/vector/avx2.rs @@ -2,6 +2,7 @@ use std::arch::x86_64::*; use std::fmt; +use std::mem; #[derive(Clone, Copy, Debug)] pub struct AVX2VectorBuilder(()); @@ -56,15 +57,13 @@ impl AVX2VectorBuilder { #[derive(Clone, Copy)] #[allow(non_camel_case_types)] -pub union u8x32 { - vector: __m256i, - bytes: [u8; 32], -} +#[repr(transparent)] +pub struct u8x32(__m256i); impl u8x32 { #[inline] unsafe fn splat(n: u8) -> u8x32 { - u8x32 { vector: _mm256_set1_epi8(n as i8) } + u8x32(_mm256_set1_epi8(n as i8)) } #[inline] @@ -76,7 +75,7 @@ impl u8x32 { #[inline] unsafe fn load_unchecked_unaligned(slice: &[u8]) -> u8x32 { let p = slice.as_ptr() as *const u8 as *const __m256i; - u8x32 { vector: _mm256_loadu_si256(p) } + u8x32(_mm256_loadu_si256(p)) } #[inline] @@ -89,26 +88,19 @@ impl u8x32 { #[inline] unsafe fn load_unchecked(slice: &[u8]) -> u8x32 { let p = slice.as_ptr() as *const u8 as *const __m256i; - u8x32 { vector: _mm256_load_si256(p) } + u8x32(_mm256_load_si256(p)) } #[inline] pub fn extract(self, i: usize) -> u8 { - // Safe because `bytes` is always accessible. - unsafe { self.bytes[i] } - } - - #[inline] - pub fn replace(&mut self, i: usize, byte: u8) { - // Safe because `bytes` is always accessible. - unsafe { self.bytes[i] = byte; } + self.bytes()[i] } #[inline] pub fn shuffle(self, indices: u8x32) -> u8x32 { // Safe because we know AVX2 is enabled. unsafe { - u8x32 { vector: _mm256_shuffle_epi8(self.vector, indices.vector) } + u8x32(_mm256_shuffle_epi8(self.0, indices.0)) } } @@ -116,9 +108,9 @@ impl u8x32 { pub fn ne(self, other: u8x32) -> u8x32 { // Safe because we know AVX2 is enabled. 
unsafe { - let boolv = _mm256_cmpeq_epi8(self.vector, other.vector); + let boolv = _mm256_cmpeq_epi8(self.0, other.0); let ones = _mm256_set1_epi8(0xFF as u8 as i8); - u8x32 { vector: _mm256_andnot_si256(boolv, ones) } + u8x32(_mm256_andnot_si256(boolv, ones)) } } @@ -126,7 +118,7 @@ impl u8x32 { pub fn and(self, other: u8x32) -> u8x32 { // Safe because we know AVX2 is enabled. unsafe { - u8x32 { vector: _mm256_and_si256(self.vector, other.vector) } + u8x32(_mm256_and_si256(self.0, other.0)) } } @@ -134,7 +126,7 @@ impl u8x32 { pub fn movemask(self) -> u32 { // Safe because we know AVX2 is enabled. unsafe { - _mm256_movemask_epi8(self.vector) as u32 + _mm256_movemask_epi8(self.0) as u32 } } @@ -148,9 +140,9 @@ impl u8x32 { // TL;DR avx2's PALIGNR instruction is actually just two 128-bit // PALIGNR instructions, which is not what we want, so we need to // do some extra shuffling. - let v = _mm256_permute2x128_si256(other.vector, self.vector, 0x21); - let v = _mm256_alignr_epi8(self.vector, v, 14); - u8x32 { vector: v } + let v = _mm256_permute2x128_si256(other.0, self.0, 0x21); + let v = _mm256_alignr_epi8(self.0, v, 14); + u8x32(v) } } @@ -164,9 +156,9 @@ impl u8x32 { // TL;DR avx2's PALIGNR instruction is actually just two 128-bit // PALIGNR instructions, which is not what we want, so we need to // do some extra shuffling. - let v = _mm256_permute2x128_si256(other.vector, self.vector, 0x21); - let v = _mm256_alignr_epi8(self.vector, v, 15); - u8x32 { vector: v } + let v = _mm256_permute2x128_si256(other.0, self.0, 0x21); + let v = _mm256_alignr_epi8(self.0, v, 15); + u8x32(v) } } @@ -174,14 +166,25 @@ impl u8x32 { pub fn bit_shift_right_4(self) -> u8x32 { // Safe because we know AVX2 is enabled. unsafe { - u8x32 { vector: _mm256_srli_epi16(self.vector, 4) } + u8x32(_mm256_srli_epi16(self.0, 4)) } } + + #[inline] + pub fn bytes(self) -> [u8; 32] { + // Safe because __m256i and [u8; 32] are layout compatible + unsafe { mem::transmute(self) } + } + + #[inline] + pub fn replace_bytes(&mut self, value: [u8; 32]) { + // Safe because __m256i and [u8; 32] are layout compatible + self.0 = unsafe { mem::transmute(value) }; + } } impl fmt::Debug for u8x32 { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - // Safe because `bytes` is always accessible. - unsafe { self.bytes.fmt(f) } + self.bytes().fmt(f) } } diff --git a/src/vector/ssse3.rs b/src/vector/ssse3.rs index 0485657f01..0b8c87f214 100644 --- a/src/vector/ssse3.rs +++ b/src/vector/ssse3.rs @@ -2,6 +2,7 @@ use std::arch::x86_64::*; use std::fmt; +use std::mem; /// A builder for SSSE3 empowered vectors. /// @@ -77,15 +78,13 @@ impl SSSE3VectorBuilder { /// inlined, otherwise you probably have a performance bug. #[derive(Clone, Copy)] #[allow(non_camel_case_types)] -pub union u8x16 { - vector: __m128i, - bytes: [u8; 16], -} +#[repr(transparent)] +pub struct u8x16(__m128i); impl u8x16 { #[inline] unsafe fn splat(n: u8) -> u8x16 { - u8x16 { vector: _mm_set1_epi8(n as i8) } + u8x16(_mm_set1_epi8(n as i8)) } #[inline] @@ -97,7 +96,7 @@ impl u8x16 { #[inline] unsafe fn load_unchecked_unaligned(slice: &[u8]) -> u8x16 { let v = _mm_loadu_si128(slice.as_ptr() as *const u8 as *const __m128i); - u8x16 { vector: v } + u8x16(v) } #[inline] @@ -110,26 +109,19 @@ impl u8x16 { #[inline] unsafe fn load_unchecked(slice: &[u8]) -> u8x16 { let v = _mm_load_si128(slice.as_ptr() as *const u8 as *const __m128i); - u8x16 { vector: v } + u8x16(v) } #[inline] pub fn extract(self, i: usize) -> u8 { - // Safe because `bytes` is always accessible. 
- unsafe { self.bytes[i] } - } - - #[inline] - pub fn replace(&mut self, i: usize, byte: u8) { - // Safe because `bytes` is always accessible. - unsafe { self.bytes[i] = byte; } + self.bytes()[i] } #[inline] pub fn shuffle(self, indices: u8x16) -> u8x16 { // Safe because we know SSSE3 is enabled. unsafe { - u8x16 { vector: _mm_shuffle_epi8(self.vector, indices.vector) } + u8x16(_mm_shuffle_epi8(self.0, indices.0)) } } @@ -137,9 +129,9 @@ impl u8x16 { pub fn ne(self, other: u8x16) -> u8x16 { // Safe because we know SSSE3 is enabled. unsafe { - let boolv = _mm_cmpeq_epi8(self.vector, other.vector); + let boolv = _mm_cmpeq_epi8(self.0, other.0); let ones = _mm_set1_epi8(0xFF as u8 as i8); - u8x16 { vector: _mm_andnot_si128(boolv, ones) } + u8x16(_mm_andnot_si128(boolv, ones)) } } @@ -147,7 +139,7 @@ impl u8x16 { pub fn and(self, other: u8x16) -> u8x16 { // Safe because we know SSSE3 is enabled. unsafe { - u8x16 { vector: _mm_and_si128(self.vector, other.vector) } + u8x16(_mm_and_si128(self.0, other.0)) } } @@ -155,7 +147,7 @@ impl u8x16 { pub fn movemask(self) -> u32 { // Safe because we know SSSE3 is enabled. unsafe { - _mm_movemask_epi8(self.vector) as u32 + _mm_movemask_epi8(self.0) as u32 } } @@ -163,7 +155,7 @@ impl u8x16 { pub fn alignr_14(self, other: u8x16) -> u8x16 { // Safe because we know SSSE3 is enabled. unsafe { - u8x16 { vector: _mm_alignr_epi8(self.vector, other.vector, 14) } + u8x16(_mm_alignr_epi8(self.0, other.0, 14)) } } @@ -171,7 +163,7 @@ impl u8x16 { pub fn alignr_15(self, other: u8x16) -> u8x16 { // Safe because we know SSSE3 is enabled. unsafe { - u8x16 { vector: _mm_alignr_epi8(self.vector, other.vector, 15) } + u8x16(_mm_alignr_epi8(self.0, other.0, 15)) } } @@ -179,14 +171,25 @@ impl u8x16 { pub fn bit_shift_right_4(self) -> u8x16 { // Safe because we know SSSE3 is enabled. unsafe { - u8x16 { vector: _mm_srli_epi16(self.vector, 4) } + u8x16(_mm_srli_epi16(self.0, 4)) } } + + #[inline] + pub fn bytes(self) -> [u8; 16] { + // Safe because __m128i and [u8; 16] are layout compatible + unsafe { mem::transmute(self) } + } + + #[inline] + pub fn replace_bytes(&mut self, value: [u8; 16]) { + // Safe because __m128i and [u8; 16] are layout compatible + self.0 = unsafe { mem::transmute(value) }; + } } impl fmt::Debug for u8x16 { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - // Safe because `bytes` is always accessible. - unsafe { self.bytes.fmt(f) } + self.bytes().fmt(f) } }
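
For context, both vector/*.rs hunks adopt the same pattern: a #[repr(transparent)] newtype around the raw SIMD register, with whole-vector byte access going through mem::transmute instead of a union field, and the Mask::add call sites switch from per-lane replace() to the read-all/patch/write-all bytes()/replace_bytes() pair. Below is a minimal standalone sketch of that pattern on x86_64, assuming only baseline SSE2; the ByteVec type, its zero() constructor, and the values in main are invented for the illustration and are not part of the crate.

    // Minimal sketch of the #[repr(transparent)] + transmute pattern used above.
    // x86_64 only; _mm_set1_epi8 needs nothing beyond baseline SSE2.
    use std::arch::x86_64::*;
    use std::mem;

    #[derive(Clone, Copy)]
    #[repr(transparent)]
    struct ByteVec(__m128i);

    impl ByteVec {
        fn zero() -> ByteVec {
            // SSE2 is always available on x86_64, so no runtime detection is needed.
            ByteVec(unsafe { _mm_set1_epi8(0) })
        }

        // Copy the whole register out as plain bytes. __m128i and [u8; 16] have
        // the same size, which is all a by-value transmute requires.
        fn bytes(self) -> [u8; 16] {
            unsafe { mem::transmute(self) }
        }

        // Rebuild the register from plain bytes, mirroring replace_bytes() above.
        fn replace_bytes(&mut self, value: [u8; 16]) {
            self.0 = unsafe { mem::transmute(value) };
        }
    }

    fn main() {
        let mut lo = ByteVec::zero();

        // The same shape of update Mask::add now performs: read the whole
        // vector out, patch one lane, write the whole vector back.
        let byte = 0x5au8;
        let bucket = 3u32;
        let byte_lo = (byte & 0xF) as usize;

        let mut lo_bytes = lo.bytes();
        lo_bytes[byte_lo] |= (1 << bucket) as u8;
        lo.replace_bytes(lo_bytes);

        assert_eq!(lo.bytes()[byte_lo], (1 << bucket) as u8);
    }

Mask::add appears to run only while the masks are being built, so the extra full-register copy per update stays off the search hot path while keeping the vector types free of unions.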