Make u8x16 and u8x32 have Vector call ABI #589

Closed
wants to merge 4 commits
Changes from 2 commits
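
The core type-level change, sketched here for context (u8x16 is analogous; the explanatory comments paraphrase the intent in the PR title rather than quoting the PR, and u8x32_old is a hypothetical name for the pre-PR definition):

```rust
use std::arch::x86_64::__m256i;

// Before: a union carrying both the vector and a byte-array view. With the
// byte-array member present, the union is not guaranteed to be passed in a
// vector register.
#[derive(Clone, Copy)]
#[allow(non_camel_case_types)]
pub union u8x32_old {
    vector: __m256i,
    bytes: [u8; 32],
}

// After: a transparent wrapper around __m256i, so u8x32 inherits the vector
// call ABI of __m256i itself; byte access goes through bytes()/replace_bytes().
#[derive(Clone, Copy)]
#[allow(non_camel_case_types)]
#[repr(transparent)]
pub struct u8x32 {
    vector: __m256i,
}
```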
23 changes: 16 additions & 7 deletions src/literal/teddy_avx2/imp.rs
@@ -285,6 +285,7 @@ impl Teddy {
res: u8x32,
mut bitfield: u32,
) -> Option<Match> {
let patterns = res.bytes();
while bitfield != 0 {
// The next offset, relative to pos, where some fingerprint
// matched.
@@ -296,7 +297,7 @@

// The bitfield telling us which patterns had fingerprints that
// match at this starting position.
let mut patterns = res.extract(byte_pos);
let mut patterns = patterns[byte_pos];
while patterns != 0 {
let bucket = patterns.trailing_zeros() as usize;
patterns &= !(1 << bucket);
@@ -461,12 +462,20 @@ impl Mask {
let byte_lo = (byte & 0xF) as usize;
let byte_hi = (byte >> 4) as usize;

let lo = self.lo.extract(byte_lo) | ((1 << bucket) as u8);
self.lo.replace(byte_lo, lo);
self.lo.replace(byte_lo + 16, lo);
{
let mut lo_bytes = self.lo.bytes();
let lo = lo_bytes[byte_lo] | ((1 << bucket) as u8);
lo_bytes[byte_lo] = lo;
lo_bytes[byte_lo + 16] = lo;
self.lo.replace_bytes(lo_bytes);
}

let hi = self.hi.extract(byte_hi) | ((1 << bucket) as u8);
self.hi.replace(byte_hi, hi);
self.hi.replace(byte_hi + 16, hi);
{
let mut hi_bytes = self.hi.bytes();
let hi = hi_bytes[byte_hi] | ((1 << bucket) as u8);
hi_bytes[byte_hi] = hi;
hi_bytes[byte_hi + 16] = hi;
self.hi.replace_bytes(hi_bytes);
}
}
}
20 changes: 14 additions & 6 deletions src/literal/teddy_ssse3/imp.rs
@@ -595,6 +595,7 @@ impl Teddy {
res: u8x16,
mut bitfield: u32,
) -> Option<Match> {
let patterns = res.bytes();
while bitfield != 0 {
// The next offset, relative to pos, where some fingerprint
// matched.
@@ -606,7 +607,7 @@

// The bitfield telling us which patterns had fingerprints that
// match at this starting position.
let mut patterns = res.extract(byte_pos);
let mut patterns = patterns[byte_pos];
while patterns != 0 {
let bucket = patterns.trailing_zeros() as usize;
patterns &= !(1 << bucket);
@@ -771,10 +772,17 @@ impl Mask {
let byte_lo = (byte & 0xF) as usize;
let byte_hi = (byte >> 4) as usize;

let lo = self.lo.extract(byte_lo);
self.lo.replace(byte_lo, ((1 << bucket) as u8) | lo);

let hi = self.hi.extract(byte_hi);
self.hi.replace(byte_hi, ((1 << bucket) as u8) | hi);
{
let mut lo_bytes = self.lo.bytes();
let lo = lo_bytes[byte_lo];
lo_bytes[byte_lo] = ((1 << bucket) as u8) | lo;
self.lo.replace_bytes(lo_bytes);
}
{
let mut hi_bytes = self.hi.bytes();
let hi = hi_bytes[byte_hi];
hi_bytes[byte_hi] = ((1 << bucket) as u8) | hi;
self.hi.replace_bytes(hi_bytes);
}
}
}
34 changes: 17 additions & 17 deletions src/vector/avx2.rs
@@ -2,6 +2,7 @@

use std::arch::x86_64::*;
use std::fmt;
use std::mem;

#[derive(Clone, Copy, Debug)]
pub struct AVX2VectorBuilder(());
@@ -56,9 +57,9 @@ impl AVX2VectorBuilder {

#[derive(Clone, Copy)]
#[allow(non_camel_case_types)]
pub union u8x32 {
vector: __m256i,
bytes: [u8; 32],
#[repr(transparent)]
pub struct u8x32 {
vector: __m256i
}

impl u8x32 {
@@ -92,18 +93,6 @@ impl u8x32 {
u8x32 { vector: _mm256_load_si256(p) }
}

#[inline]
pub fn extract(self, i: usize) -> u8 {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] }
}

#[inline]
pub fn replace(&mut self, i: usize, byte: u8) {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] = byte; }
}

#[inline]
pub fn shuffle(self, indices: u8x32) -> u8x32 {
// Safe because we know AVX2 is enabled.
@@ -177,11 +166,22 @@ impl u8x32 {
u8x32 { vector: _mm256_srli_epi16(self.vector, 4) }
}
}

#[inline]
pub fn bytes(self) -> [u8; 32] {
// Safe because __m256i and [u8; 32] are layout compatible
unsafe { mem::transmute(self) }
}

#[inline]
pub fn replace_bytes(&mut self, value: [u8; 32]) {

Reviewer (Contributor):

Perhaps better to have bytes(&self) -> &[u8; 32] and bytes_mut(&mut self) -> &mut [u8; 32]? It would be more Rust-idiomatic than providing a getter+setter, and in some cases might give a faster way to change separate bytes without replacing the whole thing.

gnzlbg (Author), Jul 1, 2019:

> Perhaps better to have bytes(&self) -> &[u8; 32]

Where does that [u8; 32] live? (Same for bytes_mut.) Or are you suggesting transmuting &__m256i into a &[u8; 32]?

Reviewer (Contributor):

The body of the function wouldn't change - you'd still use transmute, just between references and not values.
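
For illustration, the reference-returning alternative described here might look roughly like the following sketch; bytes()/bytes_mut() returning references are not part of this PR, and the struct definition just mirrors the one in the diff above:

```rust
use std::arch::x86_64::__m256i;

#[derive(Clone, Copy)]
#[allow(non_camel_case_types)]
#[repr(transparent)]
pub struct u8x32 {
    vector: __m256i,
}

impl u8x32 {
    #[inline]
    pub fn bytes(&self) -> &[u8; 32] {
        // __m256i and [u8; 32] have the same size, and the byte array has
        // no stricter alignment, so reinterpreting the reference is sound.
        unsafe { &*(&self.vector as *const __m256i as *const [u8; 32]) }
    }

    #[inline]
    pub fn bytes_mut(&mut self) -> &mut [u8; 32] {
        unsafe { &mut *(&mut self.vector as *mut __m256i as *mut [u8; 32]) }
    }
}
```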

Reviewer (Contributor):

Here's an example with both implementations, and you can see how the code for setting a single byte differs between the two: https://rust.godbolt.org/z/XF7m95

gnzlbg (Author), Jul 1, 2019:

If the __m256i is in a SIMD register, it doesn't have a memory address, so creating a pointer to it requires spilling it to the stack to give it an address. Once the modification on the stack is done, moving it back into a SIMD register requires copying the whole thing, not a single byte (at least when dynamic indices are involved; when the indices are compile-time constants, the compiler can sometimes do better for some index values).

One doesn't want this back-and-forth to happen accidentally every time one modifies a part of the vector, and returning a &[u8; 32] would encourage that.

The proposed API forces its users to explicitly move the vector contents between registers and memory. This revealed a couple of places where, e.g., instead of doing this back-and-forth on every iteration of a loop, one can load the vector into memory once before the loop, operate on memory, and move the contents back into a SIMD register after the loop has completed.
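
As a sketch of that hoisting pattern, assuming the u8x32 type and the bytes() method added in this diff (the helper function itself is hypothetical):

```rust
// Convert the vector to memory once, before the loop, instead of
// extracting a byte from the register on every iteration.
fn count_fingerprint_hits(res: u8x32, mut bitfield: u32) -> usize {
    let patterns = res.bytes(); // one register -> memory move
    let mut hits = 0;
    while bitfield != 0 {
        // Position of the lowest set bit, then clear it.
        let byte_pos = bitfield.trailing_zeros() as usize;
        bitfield &= bitfield - 1;
        // Plain array indexing on the spilled copy.
        if patterns[byte_pos] != 0 {
            hits += 1;
        }
    }
    hits
}
```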

Reviewer (Contributor):

That's a good point. I'd argue that having several .bytes_mut()[...] = ... patterns in a row would also be pretty obvious, and most people would store the reference when they could, but I see how your proposed API forces this a bit better.

However, if that's the goal, I wonder if it's worth going one step further and taking mutations as a callback like .with_bytes(|bytes| ...), where one can modify the contents in any way they want inside, but at the type-system level you wouldn't be able to do anything else outside, and the transformation back to a register would be guaranteed to happen as part of the call?
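
A rough sketch of that closure-based variant, building on the bytes()/replace_bytes() methods added in this diff; with_bytes is hypothetical and not part of the PR:

```rust
impl u8x32 {
    /// Hypothetical: spill to memory, let the caller mutate the bytes,
    /// then move the result back into the SIMD register.
    #[inline]
    pub fn with_bytes<R>(&mut self, f: impl FnOnce(&mut [u8; 32]) -> R) -> R {
        let mut bytes = self.bytes();
        let result = f(&mut bytes);
        self.replace_bytes(bytes);
        result
    }
}

// Possible use in Mask::add, mutating two positions in one round trip:
// self.lo.with_bytes(|b| {
//     b[byte_lo] |= (1 << bucket) as u8;
//     b[byte_lo + 16] |= (1 << bucket) as u8;
// });
```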

gnzlbg (Author):

> However, if that's the goal, I wonder if it's worth going one step further and taking mutations as a callback like .with_bytes(|bytes| ...) [...]

Some code only wants to read the bytes, so forcing a write would be bad for that code - I've pinged you in one example.

I also don't really see an advantage in restricting access to a scope. If you wanted to mutate two vectors, you would need to nest scopes. For an API that's only intended for internal consumption, this would feel even more like overengineering than the one I proposed - and I consider mine borderline overengineering.

Reviewer (Contributor):

Okay, fair enough. I'm not 100% comfortable with replace_bytes vs bytes_mut for idiomaticity reasons, but I can see how this is the least of the evils :)

gnzlbg (Author), Jul 1, 2019:

> Here's an example with both implementations, and you can see how the code for setting a single byte differs between the two: https://rust.godbolt.org/z/XF7m95

As mentioned, this should be properly benchmarked before being merged. Those examples do not really use the intrinsics, so the code is being generated in isolation without any objective in mind. That is not very representative of what this library actually does. If you actually try to use the APIs, you'll see that they generate the exact same code when all optimizations are turned on: https://rust.godbolt.org/z/Z8sdH9

But this is not about optimizing the implementation; it is about optimizing the amount of work that LLVM has to do to produce efficient code. If you look at the LLVM-IR produced in debug mode (https://rust.godbolt.org/z/sMfxHm), the version using replace_bytes produces 134 lines of LLVM-IR, while the version using bytes_mut produces 758 lines (if you look at the assembly at opt-level=1, you also see a much better result). There are a couple of factors at play here, but producing 6x more LLVM-IR for this isn't really worth it. LLVM can optimize it without problems, at least in this case where everything is private.
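
For illustration, the single-byte update under discussion has roughly these two shapes; this is a reconstruction assuming the bytes()/replace_bytes() API from this PR and a hypothetical bytes_mut(), not the actual code behind the godbolt links:

```rust
// Whole-vector API from this PR: one explicit spill and one explicit reload.
fn set_byte_replace(v: &mut u8x32, i: usize, b: u8) {
    let mut bytes = v.bytes();
    bytes[i] = b;
    v.replace_bytes(bytes);
}

// Hypothetical reference-based API: mutate through &mut [u8; 32] in place.
fn set_byte_in_place(v: &mut u8x32, i: usize, b: u8) {
    v.bytes_mut()[i] = b;
}
```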

// Safe because __m256i and [u8; 32] are layout compatible
self.vector = unsafe { mem::transmute(value) };
}
}

impl fmt::Debug for u8x32 {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// Safe because `bytes` is always accessible.
unsafe { self.bytes.fmt(f) }
self.bytes().fmt(f)
}
}
34 changes: 17 additions & 17 deletions src/vector/ssse3.rs
@@ -2,6 +2,7 @@

use std::arch::x86_64::*;
use std::fmt;
use std::mem;

/// A builder for SSSE3 empowered vectors.
///
@@ -77,9 +78,9 @@ impl SSSE3VectorBuilder {
/// inlined, otherwise you probably have a performance bug.
#[derive(Clone, Copy)]
#[allow(non_camel_case_types)]
pub union u8x16 {
vector: __m128i,
bytes: [u8; 16],
#[repr(transparent)]
pub struct u8x16 {
vector: __m128i
}

impl u8x16 {
@@ -113,18 +114,6 @@ impl u8x16 {
u8x16 { vector: v }
}

#[inline]
pub fn extract(self, i: usize) -> u8 {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] }
}

#[inline]
pub fn replace(&mut self, i: usize, byte: u8) {
// Safe because `bytes` is always accessible.
unsafe { self.bytes[i] = byte; }
}

#[inline]
pub fn shuffle(self, indices: u8x16) -> u8x16 {
// Safe because we know SSSE3 is enabled.
@@ -182,11 +171,22 @@ impl u8x16 {
u8x16 { vector: _mm_srli_epi16(self.vector, 4) }
}
}

#[inline]
pub fn bytes(self) -> [u8; 16] {
// Safe because __m128i and [u8; 16] are layout compatible
unsafe { mem::transmute(self) }
}

#[inline]
pub fn replace_bytes(&mut self, value: [u8; 16]) {
// Safe because __m128i and [u8; 16] are layout compatible
self.vector = unsafe { mem::transmute(value) };
}
}

impl fmt::Debug for u8x16 {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// Safe because `bytes` is always accessible.
unsafe { self.bytes.fmt(f) }
self.bytes().fmt(f)
}
}