x64: implement vselect with variable blend instructions #2905

Merged: 1 commit, May 17, 2021
6 changes: 6 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/args.rs
@@ -478,6 +478,7 @@ pub enum SseOpcode {
Andnps,
Andnpd,
Blendvpd,
Blendvps,
Comiss,
Comisd,
Cmpps,
@@ -547,6 +548,7 @@ pub enum SseOpcode {
Pandn,
Pavgb,
Pavgw,
Pblendvb,
Pcmpeqb,
Pcmpeqw,
Pcmpeqd,
@@ -769,8 +771,10 @@ impl SseOpcode {
| SseOpcode::Pshufb => SSSE3,

SseOpcode::Blendvpd
| SseOpcode::Blendvps
| SseOpcode::Insertps
| SseOpcode::Packusdw
| SseOpcode::Pblendvb
| SseOpcode::Pcmpeqq
| SseOpcode::Pextrb
| SseOpcode::Pextrd
@@ -828,6 +832,7 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Andnps => "andnps",
SseOpcode::Andnpd => "andnpd",
SseOpcode::Blendvpd => "blendvpd",
SseOpcode::Blendvps => "blendvps",
SseOpcode::Cmpps => "cmpps",
SseOpcode::Cmppd => "cmppd",
SseOpcode::Cmpss => "cmpss",
@@ -897,6 +902,7 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Pandn => "pandn",
SseOpcode::Pavgb => "pavgb",
SseOpcode::Pavgw => "pavgw",
SseOpcode::Pblendvb => "pblendvb",
SseOpcode::Pcmpeqb => "pcmpeqb",
SseOpcode::Pcmpeqw => "pcmpeqw",
SseOpcode::Pcmpeqd => "pcmpeqd",
2 changes: 2 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1441,6 +1441,7 @@ pub(crate) fn emit(
SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2),
SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2),
SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2),
SseOpcode::Blendvps => (LegacyPrefixes::_66, 0x0F3814, 3),
SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3),
SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2),
SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2),
@@ -1480,6 +1481,7 @@
SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),
SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2),
SseOpcode::Pblendvb => (LegacyPrefixes::_66, 0x0F3810, 3),
SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2),
SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2),
SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2),
12 changes: 12 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3432,6 +3432,18 @@ fn test_x64_emit() {
"blendvpd %xmm15, %xmm4",
));

insns.push((
Inst::xmm_rm_r(SseOpcode::Blendvps, RegMem::reg(xmm2), w_xmm3),
"660F3814DA",
"blendvps %xmm2, %xmm3",
));

insns.push((
Inst::xmm_rm_r(SseOpcode::Pblendvb, RegMem::reg(xmm12), w_xmm13),
"66450F3810EC",
"pblendvb %xmm12, %xmm13",
));

// ========================================================
// XMM_RM_R: Integer Packed

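The expected byte strings in these new cases follow the standard SSE4.1 encodings. As a sanity check, here is a standalone sketch (illustrative only, not Cranelift's emitter; the function and variable names are invented for this example) that reassembles the `pblendvb %xmm12, %xmm13` expectation from its parts:

```rust
// Standalone sketch: rebuild the expected encoding "66450F3810EC" for
// `pblendvb %xmm12, %xmm13` from its components. Not Cranelift's emitter.
fn main() {
    let prefix = 0x66u8; // mandatory operand-size prefix (LegacyPrefixes::_66)
    let rex = 0x45u8; // REX.R | REX.B: both registers are in xmm8..xmm15
    let opcode = [0x0Fu8, 0x38, 0x10]; // three-byte opcode, i.e. (0x0F3810, 3)
    // ModRM: mod=0b11 (register direct), reg = xmm13 & 7 = 5 (dst), rm = xmm12 & 7 = 4 (src)
    let modrm: u8 = 0b1100_0000 | (5 << 3) | 4;
    let bytes = [prefix, rex, opcode[0], opcode[1], opcode[2], modrm];
    let hex: String = bytes.iter().map(|b| format!("{:02X}", b)).collect();
    assert_eq!(hex, "66450F3810EC"); // matches the expectation in the test above
}
```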
9 changes: 8 additions & 1 deletion cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -1927,13 +1927,20 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
src.get_regs_as_uses(collector);
collector.add_def(*dst);
}
Inst::XmmRmR { src, dst, .. } => {
Inst::XmmRmR { src, dst, op, .. } => {
if inst.produces_const() {
// No need to account for src, since src == dst.
collector.add_def(*dst);
} else {
src.get_regs_as_uses(collector);
collector.add_mod(*dst);
// Some instructions have an implicit use of XMM0.
if *op == SseOpcode::Blendvpd
|| *op == SseOpcode::Blendvps
|| *op == SseOpcode::Pblendvb
{
collector.add_use(regs::xmm0());
}
}
}
Inst::XmmRmREvex {
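The added arm reports the implicit XMM0 operand of the variable blend instructions to the register allocator; the review discussion further down explains why omitting it would let XMM0 be clobbered between the mask-loading move and the blend. Below is a toy model of the idea, where the `Mention` enum and `blend_reg_mentions` function are hypothetical stand-ins, not the real RegUsageCollector API:

```rust
// Toy model of the register mentions a variable blend instruction must report.
// Assumption: a simplified stand-in for illustration, not regalloc.rs types.
#[derive(Debug, PartialEq)]
enum Mention {
    Use(&'static str),
    Mod(&'static str),
}

fn blend_reg_mentions(src: &'static str, dst: &'static str) -> Vec<Mention> {
    vec![
        Mention::Use(src),    // explicit RegMem source
        Mention::Mod(dst),    // blends both read and overwrite the destination
        Mention::Use("xmm0"), // implicit condition mask, fixed by the ISA
    ]
}

fn main() {
    // Without the final Use("xmm0"), xmm0's live range would end at the move
    // that loads the mask, and the allocator could clobber it before the blend.
    assert!(blend_reg_mentions("xmm2", "xmm3").contains(&Mention::Use("xmm0")));
}
```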
45 changes: 44 additions & 1 deletion cranelift/codegen/src/isa/x64/lower.rs
@@ -2029,7 +2029,50 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty));
ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst));
} else {
unimplemented!("scalar bitselect")
unimplemented!("no lowering for scalar bitselect instruction")
}
}

Opcode::Vselect => {
let ty = ty.unwrap();
let condition = put_input_in_reg(ctx, inputs[0]);
let condition_ty = ctx.input_ty(insn, 0);
let if_true = input_to_reg_mem(ctx, inputs[1]);
let if_false = put_input_in_reg(ctx, inputs[2]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();

if ty.is_vector() {
// `vselect` relies on the bit representation of the condition:
// vector boolean types are defined in Cranelift to be all 1s or
// all 0s. This lowering relies on that fact to use x86's
// variable blend instructions, which look at the _high_bit_ of
// the condition mask. All the bits of vector booleans will
// match (all 1s or all 0s), so we can just use the high bit.
assert!(condition_ty.lane_type().is_bool());

// Variable blend instructions expect the condition mask to be
// in XMM0.
let xmm0 = Writable::from_reg(regs::xmm0());
ctx.emit(Inst::gen_move(xmm0, condition, ty));
Member:

How does regalloc know that xmm0 is live until the xmm_rm_r instruction below?

Contributor Author:

@cfallin can correct me if I get this wrong, but every generated instruction tells regalloc how it uses each register via the RegUsageMapper trait. Well, maybe it is the RegUsageCollector. In any case, regalloc should know that a move to that virtual register (hard-coded to XMM0) is a def and that that register should be left alone until it is used.

Member (@cfallin, May 14, 2021):

Indeed, the regalloc.rs semantics are such that we compute live-ranges for both real regs and virtual regs; and a real reg is reserved for the extent of its virtual ranges. As long as we properly have xmm0 as a use below the move then everything works as expected!

Member:

Yeah, it makes sense to me that it's use/def based and this is a def of xmm0, but I wasn't able to find the use of xmm0 (I'm probably missing something, though?). This looks like the relevant block, but it doesn't seem to have anything xmm0-related there?

(I'd sort of expect a match on the opcode to add the use of xmm0 in some cases, like the ones used below)

Contributor Author:

Yup, actually you're right. In talking more to @cfallin I actually came to the same conclusion. Let me fix that.

Member:

Out of curiosity, what's the failure mode for a situation like this? The register allocator sees a "def" of xmm0 but it never sees a use. Does that mean that the live range is considered infinite? Or is the live range immediately "dead" after the def?

(this seems like a worrying thing to me and easy to overlook, so curious if we could bolster up checks one way or another to prevent this from happening again)

Contributor Author:

@cfallin? 😁

Member:

Yeah, the failure mode isn't great: the real-reg liverange ends at the last mention (the def) and so the register is liable to be reused for something else, clobbering the value.

Always looking for ways to foolproof this; it's a bit tricky as the question is: what is the ground-truth we can check against? If we forget to mention a register-use in the metadata we provide to the allocator, the only other way of seeing that would be to disassemble the resulting machine code independently and check its register-mentions. We could certainly do that, perhaps as another oracle during fuzzing; but it's a big project (correlating the disassembly to the VCode especially with multiple-instruction lowerings would take some care).

Member:

Oh nah I wouldn't want to go so all-out just yet. I would imagine something much more simple which is that if something is defined and never used, it's considered either live forever or invalid, causing a panic. In the "live forever" case we'd in theory one day ask why our code was so slow and fix this by limiting the live range, and in the latter case we'd catch the panic real fast and fix it.

I'm not sure if it's common, though, for values to be defined and never used. If that happens pretty normally then there's probably not much we can do about this other than being vigilant for now.

Member:

Interesting thoughts in any case, thanks for the ideas here!

Keeping a real-reg liverange open forever would ensure safety here but could potentially have forward-progress implications: it's essentially a "leak" of one register, and if we reserve too many real regs in this way, then we could hit a point where we don't have enough registers to satisfy constraints and finish allocation. That's less bad than clobbering but still bad, and would be somewhat hard to debug.

In theory, dead defs shouldn't happen at the CLIF level, because we lower based on demand, though it's possible at the VCode level depending on the lowering pattern. There's also the issue that we use dead defs as a way to encode clobbers (at calls, for example), so we'd need a different category for those, but it could be doable.

We can think more about this -- I'll add the disallow-dead-defs idea to my running list of "things to look at next time I focus on fuzzing infra" :-)


// Match up the source and destination registers for regalloc.
ctx.emit(Inst::gen_move(dst, if_false, ty));

// Technically PBLENDVB would work in all cases (since the bytes
// inside the mask will be all 1s or 0s we can blend
// byte-by-byte instead of word-by-word, e.g.) but
// type-specialized versions are included here for clarity when
// troubleshooting and due to slight improvements in
// latency/throughput on certain processor families.
let opcode = match condition_ty {
types::B64X2 => SseOpcode::Blendvpd,
types::B32X4 => SseOpcode::Blendvps,
types::B16X8 | types::B8X16 => SseOpcode::Pblendvb,
Member:

I was curious after seeing two of these get mapped to the same instruction, while the above two types have different instructions. Reading the description, couldn't pblendvb be used for all 4 types instead of just these two smaller ones?

Contributor Author:

This is purely a preference thing like I've done previously on moves, e.g.: it adds a negligible overhead to compilation but it matches the original intent of the instruction more closely, which I feel helps during debugging and sometimes can help with latency/throughput. In this case, the latency/throughput differences are very minor (see IA Optimization Manual, appendix D; depending on the arch family, 1 in some cases and 2 in others).

Member:

Ah ok makes sense! Presumably there's no dedicated instruction for 16-bit types?

Could you leave a comment for how pblendvb would be correct for all of these but more specialized versions are used for slight optimizations?

Contributor Author:

Yup, no 16-bit, at least in SSE-land (VPBLENDM* exists up in AVX512). Yeah, I'll add the comment.

_ => unimplemented!("unable lower vselect for type: {}", condition_ty),
};
ctx.emit(Inst::xmm_rm_r(opcode, if_true, dst));
} else {
unimplemented!("no lowering for scalar vselect instruction")
}
}

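The lowering relies on x86's variable blends selecting each lane by the top bit of the corresponding mask lane, which coincides with Cranelift's all-ones/all-zeros vector booleans. A minimal sketch of that per-lane rule for 32-bit lanes (`blendv_lanes` is an illustrative name; this is neither Cranelift nor hardware code):

```rust
// Illustrative per-lane semantics of a variable blend such as BLENDVPS: pick the
// "if true" lane when the top bit of the mask lane is set, else the "if false" lane.
// Cranelift's b32x4 lanes are all ones or all zeros, so the top bit alone suffices.
fn blendv_lanes(mask: [u32; 4], if_true: [u32; 4], if_false: [u32; 4]) -> [u32; 4] {
    let mut out = [0u32; 4];
    for i in 0..4 {
        out[i] = if mask[i] & 0x8000_0000 != 0 { if_true[i] } else { if_false[i] };
    }
    out
}

fn main() {
    // Mirrors the run test further below: mask [true true false false] takes the
    // first two lanes from `if_true` and the last two from `if_false`.
    let mask = [u32::MAX, u32::MAX, 0, 0];
    let result = blendv_lanes(mask, [1, 2, u32::MAX, u32::MAX], [u32::MAX, u32::MAX, 3, 4]);
    assert_eq!(result, [1, 2, 3, 4]);
}
```

PBLENDVB applies the same rule byte by byte, which is why it would be correct for every lane width here, as the in-code comment above notes.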
10 changes: 10 additions & 0 deletions cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
@@ -15,6 +15,16 @@ block0:
; nextln: por %xmm1, %xmm0
; not: movdqa

function %vselect_i16x8() -> i16x8 {
block0:
v0 = vconst.b16x8 [false true false true false true false true]
v1 = vconst.i16x8 [0 0 0 0 0 0 0 0]
v2 = vconst.i16x8 [0 0 0 0 0 0 0 0]
v3 = vselect v0, v1, v2
return v3
}
; check: pblendvb %xmm1, %xmm2



; 8x16 shifts: these lower to complex sequences of instructions
11 changes: 11 additions & 0 deletions cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif
@@ -10,6 +10,17 @@ block0(v0: i8x16, v1: i8x16, v2: i8x16):
; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]

function %vselect_i32x4(i32x4, i32x4) -> i32x4 {
block0(v1: i32x4, v2: i32x4):
; `make_trampoline` still does not know how to convert boolean vector types
; so we load the value directly here.
v0 = vconst.b32x4 [true true false false]
v3 = vselect v0, v1, v2
return v3
}
; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %vselect_i32x4([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4]



; shift left
Expand Down