From 6ed90f86c81c6937d6a2937e617ad4f93a88952a Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 15 Mar 2023 12:20:43 -0500 Subject: [PATCH] x64: Add support for the `pblendw` instruction (#6023) This commit adds another case for `shuffle` lowering to the x64 backend for the `{,v}pblendw` instruction. This instruction selects 16-bit values from either of the inputs corresponding to an immediate 8-bit-mask where each bit selects the corresponding lane from the inputs. --- cranelift/codegen/src/isa/x64/inst.isle | 10 +++++ cranelift/codegen/src/isa/x64/inst/args.rs | 8 +++- cranelift/codegen/src/isa/x64/inst/emit.rs | 2 + cranelift/codegen/src/isa/x64/lower.isle | 9 ++++ cranelift/codegen/src/isa/x64/lower/isle.rs | 35 +++++++++++++++ .../filetests/isa/x64/shuffle-avx.clif | 28 ++++++++++++ .../filetests/filetests/isa/x64/shuffle.clif | 44 ++++++++++++++----- .../filetests/runtests/simd-shuffle.clif | 10 +++++ 8 files changed, 132 insertions(+), 14 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 1a0c04912c1c..63580dc63312 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -918,6 +918,7 @@ Punpcklqdq Pshuflw Pshufhw + Pblendw )) (type CmpOpcode extern @@ -1290,6 +1291,7 @@ Vpextrw Vpextrd Vpextrq + Vpblendw )) (type Avx512Opcode extern @@ -2967,6 +2969,14 @@ (if-let $true (has_avx)) (xmm_rmr_blend_vex (AvxOpcode.Vpblendvb) src1 src2 mask)) +;; Helper for creating `pblendw` instructions. +(decl x64_pblendw (Xmm XmmMem u8) Xmm) +(rule 0 (x64_pblendw src1 src2 imm) + (xmm_rm_r_imm (SseOpcode.Pblendw) src1 src2 imm (OperandSize.Size32))) +(rule 1 (x64_pblendw src1 src2 imm) + (if-let $true (has_avx)) + (xmm_rmr_imm_vex (AvxOpcode.Vpblendw) src1 src2 imm)) + ;; Helper for creating a `movsd` instruction which creates a new vector ;; register where the upper 64-bits are from the first operand and the low ;; 64-bits are from the second operand. 
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 024313c45f3f..0c2db35cd79a 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1125,6 +1125,7 @@ pub enum SseOpcode { Punpcklqdq, Pshuflw, Pshufhw, + Pblendw, } impl SseOpcode { @@ -1318,7 +1319,8 @@ impl SseOpcode { | SseOpcode::Roundps | SseOpcode::Roundpd | SseOpcode::Roundss - | SseOpcode::Roundsd => SSE41, + | SseOpcode::Roundsd + | SseOpcode::Pblendw => SSE41, SseOpcode::Pcmpgtq => SSE42, } @@ -1521,6 +1523,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Punpckhqdq => "punpckhqdq", SseOpcode::Pshuflw => "pshuflw", SseOpcode::Pshufhw => "pshufhw", + SseOpcode::Pblendw => "pblendw", }; write!(fmt, "{}", name) } @@ -1705,7 +1708,8 @@ impl AvxOpcode { | AvxOpcode::Vpextrb | AvxOpcode::Vpextrw | AvxOpcode::Vpextrd - | AvxOpcode::Vpextrq => { + | AvxOpcode::Vpextrq + | AvxOpcode::Vpblendw => { smallvec![InstructionSet::AVX] } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index d8718ad8e537..bc3db10e307a 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -2263,6 +2263,7 @@ pub(crate) fn emit( AvxOpcode::Vpalignr => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0F), AvxOpcode::Vinsertps => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x21), AvxOpcode::Vshufps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC6), + AvxOpcode::Vpblendw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0E), _ => panic!("unexpected rmr_imm_vex opcode {op:?}"), }; @@ -2719,6 +2720,7 @@ pub(crate) fn emit( SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2), SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3), SseOpcode::Shufps => (LegacyPrefixes::None, 0x0FC6, 2), + SseOpcode::Pblendw => (LegacyPrefixes::_66, 0x0F3A0E, 3), _ => unimplemented!("Opcode {:?} not implemented", op), }; let rex = 
RexFlags::from(*size); diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 715b6079f0d7..c532459e0512 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3704,6 +3704,15 @@ ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Special case for `pblendw` which takes an 8-bit immediate where each bit +;; indicates which lane of the two operands is chosen for the output. A bit of +;; 0 chooses the corresponding 16-bit lane from `a` and a bit of 1 chooses the +;; corresponding 16-bit lane from `b`. +(rule 14 (lower (shuffle a b (pblendw_imm n))) + (x64_pblendw a b n)) +(decl pblendw_imm (u8) Immediate) +(extern extractor pblendw_imm pblendw_imm) + ;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8 ;; bytes", that's a `palignr` instruction. Note that the order of operands are ;; swapped in the instruction here. The `palignr` instruction uses the second diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 8e297e02c0cb..66b8091f7768 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -980,6 +980,41 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { None } } + + fn pblendw_imm(&mut self, imm: Immediate) -> Option<u8> { + // First make sure that the shuffle immediate is selecting 16-bit lanes. + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + + // Next build up an 8-bit mask from each of the bits of the selected + // lanes above. This instruction can only be used when each lane + // selector chooses from the corresponding lane in either of the two + // operands, meaning the Nth lane selection must satisfy `lane % 8 == + // N`. + // + // This helper closure is used to calculate the value of the + // corresponding bit.
+ let bit = |x: u8, c: u8| { + if x % 8 == c { + if x < 8 { + Some(0) + } else { + Some(1 << c) + } + } else { + None + } + }; + Some( + bit(a, 0)? + | bit(b, 1)? + | bit(c, 2)? + | bit(d, 3)? + | bit(e, 4)? + | bit(f, 5)? + | bit(g, 6)? + | bit(h, 7)?, + ) + } } impl IsleContext<'_, '_, MInst, X64Backend> { diff --git a/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif b/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif index 30cf9721e144..06aa6da5af56 100644 --- a/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif @@ -114,3 +114,31 @@ block0(v0: i64x2, v1: i64x2): ; popq %rbp ; retq +function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vpblendw $153, %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vpblendw $0x99, %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/shuffle.clif b/cranelift/filetests/filetests/isa/x64/shuffle.clif index f8f9b613e08b..cf58cd916be0 100644 --- a/cranelift/filetests/filetests/isa/x64/shuffle.clif +++ b/cranelift/filetests/filetests/isa/x64/shuffle.clif @@ -654,9 +654,7 @@ block0(v0: i8x16, v1: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm0, %xmm4 -; movdqa %xmm1, %xmm0 -; palignr $0, %xmm0, %xmm4, %xmm0 +; pblendw $0, %xmm0, %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -666,9 +664,7 @@ block0(v0: i8x16, v1: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqa %xmm0, %xmm4 -; movdqa %xmm1, %xmm0 -; palignr $0, %xmm4, %xmm0 +; pblendw $0, %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; 
retq @@ -770,9 +766,7 @@ block0(v0: i8x16, v1: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm0, %xmm4 -; movdqa %xmm1, %xmm0 -; palignr $16, %xmm0, %xmm4, %xmm0 +; pblendw $255, %xmm0, %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -782,9 +776,35 @@ block0(v0: i8x16, v1: i8x16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqa %xmm0, %xmm4 -; movdqa %xmm1, %xmm0 -; palignr $0x10, %xmm4, %xmm0 +; pblendw $0xff, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pblendw $153, %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pblendw $0x99, %xmm1, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index 2ef1671e22b1..cdf2adfc5caf 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -553,3 +553,13 @@ block0(v0: i64x2, v1: i64x2): return v5 } ; run: %aarch64_rev64_words([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0506070801020304 0x0403020108070605] + +function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %pblendw_0b10011001([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 2 3 12 13 6 7 16]