From 2059a1f179bc5e40fd014242089d7f947669dc4d Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Thu, 13 May 2021 20:04:40 -0700 Subject: [PATCH] x64: implement vselect with variable blend instructions This change implements `vselect` using SSE4.1's `BLENDVPS`, `BLENDVPD`, and `PBLENDVB`. `vselect` is a lane-selecting instruction that is used by [simple_preopt.rs](https://github.com/bytecodealliance/wasmtime/blob/fa1faf5d224b9640eb33eed97e6a890da23afa33/cranelift/codegen/src/simple_preopt.rs#L947-L999) to lower `bitselect` to a single x86 instruction when the condition mask is known to be boolean (all 1s or 0s, e.g., from a conversion). This is better than `bitselect` in general, which lowers to 4-5 instructions. The old backend had the `vselect` lowering; this simply introduces it to the new backend. --- cranelift/codegen/src/isa/x64/inst/args.rs | 6 +++ cranelift/codegen/src/isa/x64/inst/emit.rs | 2 + .../codegen/src/isa/x64/inst/emit_tests.rs | 12 +++++ cranelift/codegen/src/isa/x64/inst/mod.rs | 9 +++- cranelift/codegen/src/isa/x64/lower.rs | 45 ++++++++++++++++++- .../isa/x64/simd-bitwise-compile.clif | 10 +++++ .../filetests/isa/x64/simd-bitwise-run.clif | 11 +++++ 7 files changed, 93 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 6e0d507ab05b..77b173b42dfd 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -478,6 +478,7 @@ pub enum SseOpcode { Andnps, Andnpd, Blendvpd, + Blendvps, Comiss, Comisd, Cmpps, @@ -547,6 +548,7 @@ pub enum SseOpcode { Pandn, Pavgb, Pavgw, + Pblendvb, Pcmpeqb, Pcmpeqw, Pcmpeqd, @@ -769,8 +771,10 @@ impl SseOpcode { | SseOpcode::Pshufb => SSSE3, SseOpcode::Blendvpd + | SseOpcode::Blendvps | SseOpcode::Insertps | SseOpcode::Packusdw + | SseOpcode::Pblendvb | SseOpcode::Pcmpeqq | SseOpcode::Pextrb | SseOpcode::Pextrd @@ -828,6 +832,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Andnps => 
"andnps", SseOpcode::Andnpd => "andnpd", SseOpcode::Blendvpd => "blendvpd", + SseOpcode::Blendvps => "blendvps", SseOpcode::Cmpps => "cmpps", SseOpcode::Cmppd => "cmppd", SseOpcode::Cmpss => "cmpss", @@ -897,6 +902,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Pandn => "pandn", SseOpcode::Pavgb => "pavgb", SseOpcode::Pavgw => "pavgw", + SseOpcode::Pblendvb => "pblendvb", SseOpcode::Pcmpeqb => "pcmpeqb", SseOpcode::Pcmpeqw => "pcmpeqw", SseOpcode::Pcmpeqd => "pcmpeqd", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 134d6eafa197..f57212a3be7a 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1441,6 +1441,7 @@ pub(crate) fn emit( SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2), SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2), SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2), + SseOpcode::Blendvps => (LegacyPrefixes::_66, 0x0F3814, 3), SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3), SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2), SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2), @@ -1480,6 +1481,7 @@ pub(crate) fn emit( SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2), SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2), SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2), + SseOpcode::Pblendvb => (LegacyPrefixes::_66, 0x0F3810, 3), SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2), SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2), SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2), diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 1d0dd4aba5df..a288327fd575 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -3432,6 +3432,18 @@ fn test_x64_emit() { "blendvpd %xmm15, %xmm4", )); + insns.push(( + Inst::xmm_rm_r(SseOpcode::Blendvps, RegMem::reg(xmm2), 
w_xmm3), + "660F3814DA", + "blendvps %xmm2, %xmm3", + )); + + insns.push(( + Inst::xmm_rm_r(SseOpcode::Pblendvb, RegMem::reg(xmm12), w_xmm13), + "66450F3810EC", + "pblendvb %xmm12, %xmm13", + )); + // ======================================================== // XMM_RM_R: Integer Packed diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 547d8413cbfe..b253e2d696f2 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -1927,13 +1927,20 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { src.get_regs_as_uses(collector); collector.add_def(*dst); } - Inst::XmmRmR { src, dst, .. } => { + Inst::XmmRmR { src, dst, op, .. } => { if inst.produces_const() { // No need to account for src, since src == dst. collector.add_def(*dst); } else { src.get_regs_as_uses(collector); collector.add_mod(*dst); + // Some instructions have an implicit use of XMM0. + if *op == SseOpcode::Blendvpd + || *op == SseOpcode::Blendvps + || *op == SseOpcode::Pblendvb + { + collector.add_use(regs::xmm0()); + } } } Inst::XmmRmREvex { diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 9c77e879f286..a7bd6ad37276 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -2029,7 +2029,50 @@ fn lower_insn_to_regs>( ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty)); ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst)); } else { - unimplemented!("scalar bitselect") + unimplemented!("no lowering for scalar bitselect instruction") + } + } + + Opcode::Vselect => { + let ty = ty.unwrap(); + let condition = put_input_in_reg(ctx, inputs[0]); + let condition_ty = ctx.input_ty(insn, 0); + let if_true = input_to_reg_mem(ctx, inputs[1]); + let if_false = put_input_in_reg(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + + if ty.is_vector() { + // `vselect` relies on the bit 
representation of the condition: + // vector boolean types are defined in Cranelift to be all 1s or + // all 0s. This lowering relies on that fact to use x86's + // variable blend instructions, which look at the _high_bit_ of + // the condition mask. All the bits of vector booleans will + // match (all 1s or all 0s), so we can just use the high bit. + assert!(condition_ty.lane_type().is_bool()); + + // Variable blend instructions expect the condition mask to be + // in XMM0. + let xmm0 = Writable::from_reg(regs::xmm0()); + ctx.emit(Inst::gen_move(xmm0, condition, ty)); + + // Match up the source and destination registers for regalloc. + ctx.emit(Inst::gen_move(dst, if_false, ty)); + + // Technically PBLENDVB would work in all cases (since the bytes + // inside the mask will be all 1s or 0s we can blend + // byte-by-byte instead of word-by-word, e.g.) but + // type-specialized versions are included here for clarity when + // troubleshooting and due to slight improvements in + // latency/throughput on certain processor families. 
+ let opcode = match condition_ty { + types::B64X2 => SseOpcode::Blendvpd, + types::B32X4 => SseOpcode::Blendvps, + types::B16X8 | types::B8X16 => SseOpcode::Pblendvb, + _ => unimplemented!("unable lower vselect for type: {}", condition_ty), + }; + ctx.emit(Inst::xmm_rm_r(opcode, if_true, dst)); + } else { + unimplemented!("no lowering for scalar vselect instruction") } } diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif index f34b61f5e374..52761b1ed01b 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif @@ -15,6 +15,16 @@ block0: ; nextln: por %xmm1, %xmm0 ; not: movdqa +function %vselect_i16x8() -> i16x8 { +block0: + v0 = vconst.b16x8 [false true false true false true false true] + v1 = vconst.i16x8 [0 0 0 0 0 0 0 0] + v2 = vconst.i16x8 [0 0 0 0 0 0 0 0] + v3 = vselect v0, v1, v2 + return v3 +} +; check: pblendvb %xmm1, %xmm2 + ; 8x16 shifts: these lower to complex sequences of instructions diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif index 3dad7ac77222..da8785ebfaff 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-run.clif @@ -10,6 +10,17 @@ block0(v0: i8x16, v1: i8x16, v2: i8x16): ; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector. ; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42] +function %vselect_i32x4(i32x4, i32x4) -> i32x4 { +block0(v1: i32x4, v2: i32x4): + ; `make_trampoline` still does not know how to convert boolean vector types + ; so we load the value directly here. 
+ v0 = vconst.b32x4 [true true false false] + v3 = vselect v0, v1, v2 + return v3 +} +; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector. +; run: %vselect_i32x4([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4] + ; shift left