cranelift: Port vselect over to ISLE on x64

bytecodealliance · Jan 6, 2022 · 056f7c2
1 parent 7fd78da
commit 056f7c2
Show file tree

Hide file tree

Showing 7 changed files with 224 additions and 166 deletions.
diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
@@ -954,6 +954,28 @@
 (rule (pandn src1 src2)
       (xmm_rm_r $F64X2 (SseOpcode.Pandn) src1 src2))
 
+(decl sse_blend_op (Type) SseOpcode)
+(rule (sse_blend_op $F32X4) (SseOpcode.Blendvps))
+(rule (sse_blend_op $F64X2) (SseOpcode.Blendvpd))
+(rule (sse_blend_op (multi_lane _bits _lanes)) (SseOpcode.Pblendvb))
+
+(decl sse_mov_op (Type) SseOpcode)
+(rule (sse_mov_op $F32X4) (SseOpcode.Movaps))
+(rule (sse_mov_op $F64X2) (SseOpcode.Movapd))
+(rule (sse_mov_op (multi_lane _bits _lanes)) (SseOpcode.Movdqa))
+
+;; Helper for creating `blendvp{d,s}` and `pblendvb` instructions.
+(decl sse_blend (Type RegMem RegMem Reg) Reg)
+(rule (sse_blend ty mask src1 src2)
+      ;; Move the mask into `xmm0`, as blend instructions implicitly operate on
+      ;; that register. (This kind of thing would normally happen inside of
+      ;; `Inst::mov_mitosis`, but has to happen here, where we still have the
+      ;; mask register, because the mask is implicit and doesn't appear in the
+      ;; `Inst` itself.)
+      (let ((mask2 WritableReg (xmm0))
+            (_ Unit (emit (MInst.XmmUnaryRmR (sse_mov_op ty) mask mask2))))
+        (xmm_rm_r ty (sse_blend_op ty) src2 src1)))
+
 ;; Helper for creating `blendvpd` instructions.
 (decl blendvpd (Reg RegMem Reg) Reg)
 (rule (blendvpd src1 src2 mask)

diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
@@ -1050,6 +1050,15 @@
             (b Reg (sse_and_not ty cond_reg (put_in_reg_mem if_false))))
         (value_reg (sse_or ty b (RegMem.Reg a)))))
 
+;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type ty @ (multi_lane _bits _lanes)
+                       (vselect condition if_true if_false)))
+      (value_reg (sse_blend ty
+                            (put_in_reg_mem condition)
+                            (put_in_reg_mem if_true)
+                            (put_in_reg if_false))))
+
 ;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (insertlane vec @ (value_type ty) val (u8_from_uimm8 idx)))

diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
@@ -1515,6 +1515,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
     match op {
         Opcode::Iconst
         | Opcode::Bconst
+        | Opcode::F32const
+        | Opcode::F64const
         | Opcode::Null
         | Opcode::Iadd
         | Opcode::IaddIfcout
@@ -1535,50 +1537,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Imin
         | Opcode::Umin
         | Opcode::Bnot
-        | Opcode::Bitselect => implemented_in_isle(ctx),
-
-        Opcode::Vselect => {
-            let ty = ty.unwrap();
-            let condition = put_input_in_reg(ctx, inputs[0]);
-            let condition_ty = ctx.input_ty(insn, 0);
-            let if_true = input_to_reg_mem(ctx, inputs[1]);
-            let if_false = put_input_in_reg(ctx, inputs[2]);
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-            if ty.is_vector() {
-                // `vselect` relies on the bit representation of the condition:
-                // vector boolean types are defined in Cranelift to be all 1s or
-                // all 0s. This lowering relies on that fact to use x86's
-                // variable blend instructions, which look at the _high_bit_ of
-                // the condition mask. All the bits of vector booleans will
-                // match (all 1s or all 0s), so we can just use the high bit.
-                assert!(condition_ty.lane_type().is_bool());
-
-                // Variable blend instructions expect the condition mask to be
-                // in XMM0.
-                let xmm0 = Writable::from_reg(regs::xmm0());
-                ctx.emit(Inst::gen_move(xmm0, condition, ty));
-
-                // Match up the source and destination registers for regalloc.
-                ctx.emit(Inst::gen_move(dst, if_false, ty));
-
-                // Technically PBLENDVB would work in all cases (since the bytes
-                // inside the mask will be all 1s or 0s we can blend
-                // byte-by-byte instead of word-by-word, e.g.) but
-                // type-specialized versions are included here for clarity when
-                // troubleshooting and due to slight improvements in
-                // latency/throughput on certain processor families.
-                let opcode = match condition_ty {
-                    types::B64X2 => SseOpcode::Blendvpd,
-                    types::B32X4 => SseOpcode::Blendvps,
-                    types::B16X8 | types::B8X16 => SseOpcode::Pblendvb,
-                    _ => unimplemented!("unable lower vselect for type: {}", condition_ty),
-                };
-                ctx.emit(Inst::xmm_rm_r(opcode, if_true, dst));
-            } else {
-                unimplemented!("no lowering for scalar vselect instruction")
-            }
-        }
+        | Opcode::Bitselect
+        | Opcode::Vselect => implemented_in_isle(ctx),
 
         Opcode::Ishl | Opcode::Ushr | Opcode::Sshr | Opcode::Rotl | Opcode::Rotr => {
             let dst_ty = ctx.output_ty(insn, 0);
@@ -3254,22 +3214,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             };
         }
 
-        Opcode::F64const => {
-            unreachable!(
-                "implemented in ISLE: inst = `{}`, type = `{:?}`",
-                ctx.dfg().display_inst(insn),
-                ty
-            );
-        }
-
-        Opcode::F32const => {
-            unreachable!(
-                "implemented in ISLE: inst = `{}`, type = `{:?}`",
-                ctx.dfg().display_inst(insn),
-                ty
-            );
-        }
-
         Opcode::WideningPairwiseDotProductS => {
             let lhs = put_input_in_reg(ctx, inputs[0]);
             let rhs = input_to_reg_mem(ctx, inputs[1]);
@@ -5927,6 +5871,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 println!("Did not match fcvt input!");
             }
         }
+
         // Unimplemented opcodes below. These are not currently used by Wasm
         // lowering or other known embeddings, but should be either supported or
         // removed eventually.

diff --git a/cranelift/codegen/src/isa/x64/lower/isle/generated_code.manifest b/cranelift/codegen/src/isa/x64/lower/isle/generated_code.manifest
@@ -1,4 +1,4 @@
 src/clif.isle f176ef3bba99365
 src/prelude.isle babc931e5dc5b4cf
-src/isa/x64/inst.isle fb5d3ac8e68c46d2
-src/isa/x64/lower.isle 5d66b88a371d4d70
+src/isa/x64/inst.isle bc5fc626492752c8
+src/isa/x64/lower.isle 33e94300f4c08455