From 6ed90f86c81c6937d6a2937e617ad4f93a88952a Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Wed, 15 Mar 2023 12:20:43 -0500
Subject: [PATCH] x64: Add support for the `pblendw` instruction (#6023)

This commit adds another case for `shuffle` lowering to the x64 backend
for the `{,v}pblendw` instruction. This instruction selects 16-bit
values from either of its two inputs according to an 8-bit immediate mask,
where each bit selects which input the corresponding lane is taken from.
---
 cranelift/codegen/src/isa/x64/inst.isle       | 10 +++++
 cranelift/codegen/src/isa/x64/inst/args.rs    |  8 +++-
 cranelift/codegen/src/isa/x64/inst/emit.rs    |  2 +
 cranelift/codegen/src/isa/x64/lower.isle      |  9 ++++
 cranelift/codegen/src/isa/x64/lower/isle.rs   | 35 +++++++++++++++
 .../filetests/isa/x64/shuffle-avx.clif        | 28 ++++++++++++
 .../filetests/filetests/isa/x64/shuffle.clif  | 44 ++++++++++++++-----
 .../filetests/runtests/simd-shuffle.clif      | 10 +++++
 8 files changed, 132 insertions(+), 14 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index 1a0c04912c1c..63580dc63312 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -918,6 +918,7 @@
             Punpcklqdq
             Pshuflw
             Pshufhw
+            Pblendw
           ))
 
 (type CmpOpcode extern
@@ -1290,6 +1291,7 @@
             Vpextrw
             Vpextrd
             Vpextrq
+            Vpblendw
           ))
 
 (type Avx512Opcode extern
@@ -2967,6 +2969,14 @@
       (if-let $true (has_avx))
       (xmm_rmr_blend_vex (AvxOpcode.Vpblendvb) src1 src2 mask))
 
+;; Helper for creating `pblendw` instructions.
+(decl x64_pblendw (Xmm XmmMem u8) Xmm)
+(rule 0 (x64_pblendw src1 src2 imm)
+      (xmm_rm_r_imm (SseOpcode.Pblendw) src1 src2 imm (OperandSize.Size32)))
+(rule 1 (x64_pblendw src1 src2 imm)
+      (if-let $true (has_avx))
+      (xmm_rmr_imm_vex (AvxOpcode.Vpblendw) src1 src2 imm))
+
 ;; Helper for creating a `movsd` instruction which creates a new vector
 ;; register where the upper 64-bits are from the first operand and the low
 ;; 64-bits are from the second operand.
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 024313c45f3f..0c2db35cd79a 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1125,6 +1125,7 @@ pub enum SseOpcode {
     Punpcklqdq,
     Pshuflw,
     Pshufhw,
+    Pblendw,
 }
 
 impl SseOpcode {
@@ -1318,7 +1319,8 @@ impl SseOpcode {
             | SseOpcode::Roundps
             | SseOpcode::Roundpd
             | SseOpcode::Roundss
-            | SseOpcode::Roundsd => SSE41,
+            | SseOpcode::Roundsd
+            | SseOpcode::Pblendw => SSE41,
 
             SseOpcode::Pcmpgtq => SSE42,
         }
@@ -1521,6 +1523,7 @@ impl fmt::Debug for SseOpcode {
             SseOpcode::Punpckhqdq => "punpckhqdq",
             SseOpcode::Pshuflw => "pshuflw",
             SseOpcode::Pshufhw => "pshufhw",
+            SseOpcode::Pblendw => "pblendw",
         };
         write!(fmt, "{}", name)
     }
@@ -1705,7 +1708,8 @@ impl AvxOpcode {
             | AvxOpcode::Vpextrb
             | AvxOpcode::Vpextrw
             | AvxOpcode::Vpextrd
-            | AvxOpcode::Vpextrq => {
+            | AvxOpcode::Vpextrq
+            | AvxOpcode::Vpblendw => {
                 smallvec![InstructionSet::AVX]
             }
         }
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index d8718ad8e537..bc3db10e307a 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2263,6 +2263,7 @@ pub(crate) fn emit(
                 AvxOpcode::Vpalignr => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0F),
                 AvxOpcode::Vinsertps => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x21),
                 AvxOpcode::Vshufps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC6),
+                AvxOpcode::Vpblendw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0E),
                 _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
             };
 
@@ -2719,6 +2720,7 @@ pub(crate) fn emit(
                 SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
                 SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
                 SseOpcode::Shufps => (LegacyPrefixes::None, 0x0FC6, 2),
+                SseOpcode::Pblendw => (LegacyPrefixes::_66, 0x0F3A0E, 3),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
             let rex = RexFlags::from(*size);
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 715b6079f0d7..c532459e0512 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3704,6 +3704,15 @@
 
 ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; Special case for `pblendw` which takes an 8-bit immediate where each bit
+;; indicates which lane of the two operands is chosen for the output. A bit of
+;; 0 chooses the corresponding 16-bit lane from `a` and a bit of 1 chooses the
+;; corresponding 16-bit lane from `b`.
+(rule 14 (lower (shuffle a b (pblendw_imm n)))
+         (x64_pblendw a b n))
+(decl pblendw_imm (u8) Immediate)
+(extern extractor pblendw_imm pblendw_imm)
+
 ;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8
 ;; bytes", that's a `palignr` instruction. Note that the order of operands are
 ;; swapped in the instruction here. The `palignr` instruction uses the second
diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs
index 8e297e02c0cb..66b8091f7768 100644
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -980,6 +980,41 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
             None
         }
     }
+
+    fn pblendw_imm(&mut self, imm: Immediate) -> Option<u8> {
+        // First make sure the shuffle immediate selects 16-bit lanes (else `None`).
+        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
+
+        // Next build up the 8-bit mask, one bit per lane selector above. This
+        // instruction can only be used when each selector picks the
+        // same-numbered lane from either of the two operands, meaning the Nth
+        // selector must satisfy `lane % 8 == N`; if any selector does not, the
+        // `?`s below make this function return `None`.
+        //
+        // This helper closure computes the bit for selector `x` at lane index
+        // `c` (shadowing the tuple's `c`): 0 when `x < 8`, else `1 << c`.
+        let bit = |x: u8, c: u8| {
+            if x % 8 == c {
+                if x < 8 {
+                    Some(0)
+                } else {
+                    Some(1 << c)
+                }
+            } else {
+                None
+            }
+        };
+        Some(
+            bit(a, 0)?
+                | bit(b, 1)?
+                | bit(c, 2)?
+                | bit(d, 3)?
+                | bit(e, 4)?
+                | bit(f, 5)?
+                | bit(g, 6)?
+                | bit(h, 7)?,
+        )
+    }
 }
 
 impl IsleContext<'_, '_, MInst, X64Backend> {
diff --git a/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif b/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif
index 30cf9721e144..06aa6da5af56 100644
--- a/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif
+++ b/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif
@@ -114,3 +114,31 @@ block0(v0: i64x2, v1: i64x2):
 ;   popq %rbp
 ;   retq
 
+function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   vpblendw $153, %xmm0, %xmm1, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   vpblendw $0x99, %xmm1, %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
diff --git a/cranelift/filetests/filetests/isa/x64/shuffle.clif b/cranelift/filetests/filetests/isa/x64/shuffle.clif
index f8f9b613e08b..cf58cd916be0 100644
--- a/cranelift/filetests/filetests/isa/x64/shuffle.clif
+++ b/cranelift/filetests/filetests/isa/x64/shuffle.clif
@@ -654,9 +654,7 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movdqa  %xmm0, %xmm4
-;   movdqa  %xmm1, %xmm0
-;   palignr $0, %xmm0, %xmm4, %xmm0
+;   pblendw $0, %xmm0, %xmm1, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -666,9 +664,7 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   movdqa %xmm0, %xmm4
-;   movdqa %xmm1, %xmm0
-;   palignr $0, %xmm4, %xmm0
+;   pblendw $0, %xmm1, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
@@ -770,9 +766,7 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movdqa  %xmm0, %xmm4
-;   movdqa  %xmm1, %xmm0
-;   palignr $16, %xmm0, %xmm4, %xmm0
+;   pblendw $255, %xmm0, %xmm1, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -782,9 +776,35 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   movdqa %xmm0, %xmm4
-;   movdqa %xmm1, %xmm0
-;   palignr $0x10, %xmm4, %xmm0
+;   pblendw $0xff, %xmm1, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   pblendw $153, %xmm0, %xmm1, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   pblendw $0x99, %xmm1, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif
index 2ef1671e22b1..cdf2adfc5caf 100644
--- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif
+++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif
@@ -553,3 +553,13 @@ block0(v0: i64x2, v1: i64x2):
     return v5
 }
 ; run: %aarch64_rev64_words([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0506070801020304 0x0403020108070605]
+
+function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+; run: %pblendw_0b10011001([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 2 3 12 13 6 7 16]