Make u8x16 and u8x32 have Vector call ABI
Before this commit, u8x16 and u8x32 were repr(Rust) unions. This introduced
unspecified behavior, because the fields of a repr(Rust) union are not
guaranteed to be at offset 0, so those field accesses were potentially UB.

This commit fixes that and closes #588.

The unions were also generating a lot of unnecessary memory operations. This
commit fixes that as well.

The issue is that unions have an Aggregate call ABI, the same as the call ABI
of arrays. That is, they are passed through memory rather than in vector
registers.

That would be fine if these values were mostly operated on as arrays, but that
was not the case: most operations on these unions use SIMD instructions. With
the Aggregate ABI, the union has to be copied into a SIMD register, operated
on, and then spilled back to the stack on every single operation. That is
unnecessary, although in practice LLVM was apparently able to optimize the
redundant memory operations away and keep these values in registers.

This commit also fixes that issue, by making u8x16 and u8x32 repr(transparent)
newtypes over the architecture-specific vector types, which gives them the
Vector ABI.

The vectors are then copied to the stack only when necessary, and as little as
possible. This is done using mem::transmute, removing the need for unions
altogether (fixing #588 by not having to worry about union layout at all).
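
For illustration, here is a condensed sketch of the new representation,
adapted from the src/vector/ssse3.rs change in this commit (only the newtype
and the byte-array conversions are shown; the u8x32/AVX2 version is
analogous):

    use std::arch::x86_64::*;
    use std::mem;

    // repr(transparent) guarantees that u8x16 has the same layout and call
    // ABI as the wrapped __m128i, so it is passed in a SIMD register.
    #[derive(Clone, Copy)]
    #[allow(non_camel_case_types)]
    #[repr(transparent)]
    pub struct u8x16(__m128i);

    impl u8x16 {
        // Spill the vector to the stack as a plain byte array.
        #[inline]
        pub fn bytes(self) -> [u8; 16] {
            // __m128i and [u8; 16] have the same size, and every bit pattern
            // is a valid [u8; 16], so the transmute is sound.
            unsafe { mem::transmute(self) }
        }

        // Move a byte array back into the SIMD register.
        #[inline]
        pub fn replace_bytes(&mut self, value: [u8; 16]) {
            self.0 = unsafe { mem::transmute(value) };
        }

        // Reading a single byte goes through the byte-array view.
        #[inline]
        pub fn extract(self, i: usize) -> u8 {
            self.bytes()[i]
        }
    }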

To make it clear when the vectors are spilled to the stack, the
vector::replace(index, value) API has been removed; only vector::bytes(self)
and vector::replace_bytes(&mut self, [u8; N]) are provided instead. This
prevents spilling the vectors back and forth on every index modification:
vector::bytes spills the vector to the stack once, all the random-access
modifications are done in memory, and a single vector::replace_bytes moves the
bytes back into a SIMD register, as the sketch below shows.
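
As a usage sketch, the hypothetical helper below (not part of this commit)
mirrors what Mask::add does in the teddy diffs further down: a single spill,
any number of in-memory edits, and a single move back into a register.

    // Hypothetical helper showing the intended pattern: spill once with
    // bytes(), edit in memory, write back once with replace_bytes().
    fn set_bucket_bit(mask: &mut u8x16, byte_lo: usize, bucket: u32) {
        let lo = mask.extract(byte_lo) | ((1 << bucket) as u8);
        let mut bytes = mask.bytes();  // one spill to the stack
        bytes[byte_lo] = lo;           // random-access edits happen in memory
        mask.replace_bytes(bytes);     // one move back into a SIMD register
    }
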
gnzlbg committed Jul 1, 2019
1 parent 172898a commit 3b47633
Showing 4 changed files with 82 additions and 60 deletions.
16 changes: 12 additions & 4 deletions src/literal/teddy_avx2/imp.rs
@@ -462,11 +462,19 @@ impl Mask {
let byte_hi = (byte >> 4) as usize;

let lo = self.lo.extract(byte_lo) | ((1 << bucket) as u8);
self.lo.replace(byte_lo, lo);
self.lo.replace(byte_lo + 16, lo);
{
let mut lo_bytes = self.lo.bytes();
lo_bytes[byte_lo] = lo;
lo_bytes[byte_lo + 16] = lo;
self.lo.replace_bytes(lo_bytes);
}

let hi = self.hi.extract(byte_hi) | ((1 << bucket) as u8);
self.hi.replace(byte_hi, hi);
self.hi.replace(byte_hi + 16, hi);
{
let mut hi_bytes = self.hi.bytes();
hi_bytes[byte_hi] = hi;
hi_bytes[byte_hi + 16] = hi;
self.hi.replace_bytes(hi_bytes);
}
}
}
12 changes: 10 additions & 2 deletions src/literal/teddy_ssse3/imp.rs
@@ -772,9 +772,17 @@ impl Mask {
let byte_hi = (byte >> 4) as usize;

let lo = self.lo.extract(byte_lo);
self.lo.replace(byte_lo, ((1 << bucket) as u8) | lo);
{
let mut lo_bytes = self.lo.bytes();
lo_bytes[byte_lo] = ((1 << bucket) as u8) | lo;
self.lo.replace_bytes(lo_bytes);
}

let hi = self.hi.extract(byte_hi);
self.hi.replace(byte_hi, ((1 << bucket) as u8) | hi);
{
let mut hi_bytes = self.hi.bytes();
hi_bytes[byte_hi] = ((1 << bucket) as u8) | hi;
self.hi.replace_bytes(hi_bytes);
}
}
}
61 changes: 32 additions & 29 deletions src/vector/avx2.rs
@@ -2,6 +2,7 @@

use std::arch::x86_64::*;
use std::fmt;
use std::mem;

#[derive(Clone, Copy, Debug)]
pub struct AVX2VectorBuilder(());
@@ -56,15 +57,13 @@ impl AVX2VectorBuilder {

#[derive(Clone, Copy)]
#[allow(non_camel_case_types)]
pub union u8x32 {
vector: __m256i,
bytes: [u8; 32],
}
#[repr(transparent)]
pub struct u8x32(__m256i);

impl u8x32 {
#[inline]
unsafe fn splat(n: u8) -> u8x32 {
u8x32 { vector: _mm256_set1_epi8(n as i8) }
u8x32(_mm256_set1_epi8(n as i8))
}

#[inline]
@@ -76,7 +75,7 @@ impl u8x32 {
#[inline]
unsafe fn load_unchecked_unaligned(slice: &[u8]) -> u8x32 {
let p = slice.as_ptr() as *const u8 as *const __m256i;
u8x32 { vector: _mm256_loadu_si256(p) }
u8x32(_mm256_loadu_si256(p))
}

#[inline]
@@ -89,52 +88,45 @@ impl u8x32 {
#[inline]
unsafe fn load_unchecked(slice: &[u8]) -> u8x32 {
let p = slice.as_ptr() as *const u8 as *const __m256i;
u8x32 { vector: _mm256_load_si256(p) }
u8x32(_mm256_load_si256(p))
}

#[inline]
pub fn extract(self, i: usize) -> u8 {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] }
}

#[inline]
pub fn replace(&mut self, i: usize, byte: u8) {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] = byte; }
self.bytes()[i]
}

#[inline]
pub fn shuffle(self, indices: u8x32) -> u8x32 {
// Safe because we know AVX2 is enabled.
unsafe {
u8x32 { vector: _mm256_shuffle_epi8(self.vector, indices.vector) }
u8x32(_mm256_shuffle_epi8(self.0, indices.0))
}
}

#[inline]
pub fn ne(self, other: u8x32) -> u8x32 {
// Safe because we know AVX2 is enabled.
unsafe {
let boolv = _mm256_cmpeq_epi8(self.vector, other.vector);
let boolv = _mm256_cmpeq_epi8(self.0, other.0);
let ones = _mm256_set1_epi8(0xFF as u8 as i8);
u8x32 { vector: _mm256_andnot_si256(boolv, ones) }
u8x32(_mm256_andnot_si256(boolv, ones))
}
}

#[inline]
pub fn and(self, other: u8x32) -> u8x32 {
// Safe because we know AVX2 is enabled.
unsafe {
u8x32 { vector: _mm256_and_si256(self.vector, other.vector) }
u8x32(_mm256_and_si256(self.0, other.0))
}
}

#[inline]
pub fn movemask(self) -> u32 {
// Safe because we know AVX2 is enabled.
unsafe {
_mm256_movemask_epi8(self.vector) as u32
_mm256_movemask_epi8(self.0) as u32
}
}

@@ -148,9 +140,9 @@ impl u8x32 {
// TL;DR avx2's PALIGNR instruction is actually just two 128-bit
// PALIGNR instructions, which is not what we want, so we need to
// do some extra shuffling.
let v = _mm256_permute2x128_si256(other.vector, self.vector, 0x21);
let v = _mm256_alignr_epi8(self.vector, v, 14);
u8x32 { vector: v }
let v = _mm256_permute2x128_si256(other.0, self.0, 0x21);
let v = _mm256_alignr_epi8(self.0, v, 14);
u8x32(v)
}
}

@@ -164,24 +156,35 @@ impl u8x32 {
// TL;DR avx2's PALIGNR instruction is actually just two 128-bit
// PALIGNR instructions, which is not what we want, so we need to
// do some extra shuffling.
let v = _mm256_permute2x128_si256(other.vector, self.vector, 0x21);
let v = _mm256_alignr_epi8(self.vector, v, 15);
u8x32 { vector: v }
let v = _mm256_permute2x128_si256(other.0, self.0, 0x21);
let v = _mm256_alignr_epi8(self.0, v, 15);
u8x32(v)
}
}

#[inline]
pub fn bit_shift_right_4(self) -> u8x32 {
// Safe because we know AVX2 is enabled.
unsafe {
u8x32 { vector: _mm256_srli_epi16(self.vector, 4) }
u8x32(_mm256_srli_epi16(self.0, 4))
}
}

#[inline]
pub fn bytes(self) -> [u8; 32] {
// Safe because __m256i and [u8; 32] are layout compatible
unsafe { mem::transmute(self) }
}

#[inline]
pub fn replace_bytes(&mut self, value: [u8; 32]) {
// Safe because __m256i and [u8; 32] are layout compatible
self.0 = unsafe { mem::transmute(value) };
}
}

impl fmt::Debug for u8x32 {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// Safe because `bytes` is always accessible.
unsafe { self.bytes.fmt(f) }
self.bytes().fmt(f)
}
}
53 changes: 28 additions & 25 deletions src/vector/ssse3.rs
@@ -2,6 +2,7 @@

use std::arch::x86_64::*;
use std::fmt;
use std::mem;

/// A builder for SSSE3 empowered vectors.
///
@@ -77,15 +78,13 @@ impl SSSE3VectorBuilder {
/// inlined, otherwise you probably have a performance bug.
#[derive(Clone, Copy)]
#[allow(non_camel_case_types)]
pub union u8x16 {
vector: __m128i,
bytes: [u8; 16],
}
#[repr(transparent)]
pub struct u8x16(__m128i);

impl u8x16 {
#[inline]
unsafe fn splat(n: u8) -> u8x16 {
u8x16 { vector: _mm_set1_epi8(n as i8) }
u8x16(_mm_set1_epi8(n as i8))
}

#[inline]
@@ -97,7 +96,7 @@ impl u8x16 {
#[inline]
unsafe fn load_unchecked_unaligned(slice: &[u8]) -> u8x16 {
let v = _mm_loadu_si128(slice.as_ptr() as *const u8 as *const __m128i);
u8x16 { vector: v }
u8x16(v)
}

#[inline]
@@ -110,83 +109,87 @@ impl u8x16 {
#[inline]
unsafe fn load_unchecked(slice: &[u8]) -> u8x16 {
let v = _mm_load_si128(slice.as_ptr() as *const u8 as *const __m128i);
u8x16 { vector: v }
u8x16(v)
}

#[inline]
pub fn extract(self, i: usize) -> u8 {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] }
}

#[inline]
pub fn replace(&mut self, i: usize, byte: u8) {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] = byte; }
self.bytes()[i]
}

#[inline]
pub fn shuffle(self, indices: u8x16) -> u8x16 {
// Safe because we know SSSE3 is enabled.
unsafe {
u8x16 { vector: _mm_shuffle_epi8(self.vector, indices.vector) }
u8x16(_mm_shuffle_epi8(self.0, indices.0))
}
}

#[inline]
pub fn ne(self, other: u8x16) -> u8x16 {
// Safe because we know SSSE3 is enabled.
unsafe {
let boolv = _mm_cmpeq_epi8(self.vector, other.vector);
let boolv = _mm_cmpeq_epi8(self.0, other.0);
let ones = _mm_set1_epi8(0xFF as u8 as i8);
u8x16 { vector: _mm_andnot_si128(boolv, ones) }
u8x16(_mm_andnot_si128(boolv, ones))
}
}

#[inline]
pub fn and(self, other: u8x16) -> u8x16 {
// Safe because we know SSSE3 is enabled.
unsafe {
u8x16 { vector: _mm_and_si128(self.vector, other.vector) }
u8x16(_mm_and_si128(self.0, other.0))
}
}

#[inline]
pub fn movemask(self) -> u32 {
// Safe because we know SSSE3 is enabled.
unsafe {
_mm_movemask_epi8(self.vector) as u32
_mm_movemask_epi8(self.0) as u32
}
}

#[inline]
pub fn alignr_14(self, other: u8x16) -> u8x16 {
// Safe because we know SSSE3 is enabled.
unsafe {
u8x16 { vector: _mm_alignr_epi8(self.vector, other.vector, 14) }
u8x16(_mm_alignr_epi8(self.0, other.0, 14))
}
}

#[inline]
pub fn alignr_15(self, other: u8x16) -> u8x16 {
// Safe because we know SSSE3 is enabled.
unsafe {
u8x16 { vector: _mm_alignr_epi8(self.vector, other.vector, 15) }
u8x16(_mm_alignr_epi8(self.0, other.0, 15))
}
}

#[inline]
pub fn bit_shift_right_4(self) -> u8x16 {
// Safe because we know SSSE3 is enabled.
unsafe {
u8x16 { vector: _mm_srli_epi16(self.vector, 4) }
u8x16(_mm_srli_epi16(self.0, 4))
}
}

#[inline]
pub fn bytes(self) -> [u8; 16] {
// Safe because __m128i and [u8; 16] are layout compatible
unsafe { mem::transmute(self) }
}

#[inline]
pub fn replace_bytes(&mut self, value: [u8; 16]) {
// Safe because __m128i and [u8; 16] are layout compatible
self.0 = unsafe { mem::transmute(value) };
}
}

impl fmt::Debug for u8x16 {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// Safe because `bytes` is always accessible.
unsafe { self.bytes.fmt(f) }
self.bytes().fmt(f)
}
}
