x64: Fix false dependencies in int-to-float conversions (#7098)

* x64: Fix false dependencies in int-to-float conversions This commit is a result of the investigation on #7085. The int-to-float conversion instructions used right now on the x64 backend will implicitly source the upper bits of the result from a different register. This implicitly creates a dependency on further consumers using the conversion result on whatever previously defined the upper bits, even though they aren't used. This false dependency is the primary reason for the slowdown witnessed in #7085. The fix chosen in this commit is to model the int-to-float instructions with a new shape of instruction instead of the previous `GprToXmm{,Vex}`. This previous shape was modeled as single-input and single-output, but this does not reflect the actual nature of the `cvtsi2s{s,d}` instructions. Instead these now use `CvtIntToFloat{,Vex}` which have two source operands and one destination operand, modeling how the upper bits of a different register are used. In lowerings using this instruction the upper bits to preserver are always sourced from a zero'd out register to force breaking dependencies between instructions. Closes #7085 * Remove now dead code * Remove outdated test Golden test output covers this test case anyway nowadays * Review comments * Fix emit tests
bytecodealliance · Sep 28, 2023 · 9982992 · 9982992
1 parent b7c0eae
commit 9982992
Show file tree

Hide file tree

Showing 8 changed files with 306 additions and 118 deletions.
diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
@@ -437,6 +437,23 @@
                  (dst WritableXmm)
                  (src_size OperandSize))
 
+       ;; Conversion from signed integers to floats, the `{v,}`cvtsi2s{s,d}`
+       ;; instructions.
+       ;;
+       ;; Note that this is special in that `src1` is an xmm/float register
+       ;; while `src2` is a general purpose register as this is converting an
+       ;; integer in a gpr to an equivalent float in an xmm reg.
+       (CvtIntToFloat (op SseOpcode)
+                      (src1 Xmm)
+                      (src2 GprMem)
+                      (dst WritableXmm)
+                      (src2_size OperandSize))
+       (CvtIntToFloatVex (op AvxOpcode)
+                         (src1 Xmm)
+                         (src2 GprMem)
+                         (dst WritableXmm)
+                         (src2_size OperandSize))
+
        ;; Converts an unsigned int64 to a float32/float64.
        (CvtUint64ToFloatSeq (dst_size OperandSize) ;; 4 or 8
                             (src Gpr)
@@ -2095,6 +2112,18 @@
             (_ Unit (emit (MInst.UnaryRmRImmVex size op src dst imm))))
         dst))
 
+(decl cvt_int_to_float (SseOpcode Xmm GprMem OperandSize) Xmm)
+(rule (cvt_int_to_float op src1 src2 size)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.CvtIntToFloat op src1 src2 dst size))))
+        dst))
+
+(decl cvt_int_to_float_vex (AvxOpcode Xmm GprMem OperandSize) Xmm)
+(rule (cvt_int_to_float_vex op src1 src2 size)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.CvtIntToFloatVex op src1 src2 dst size))))
+        dst))
+
 (decl cvt_u64_to_float_seq (Type Gpr) Xmm)
 (rule (cvt_u64_to_float_seq ty src)
       (let ((size OperandSize (raw_operand_size_of_type ty))
@@ -4351,20 +4380,20 @@
         (xmm_unary_rm_r_vex (AvxOpcode.Vcvtdq2pd) x))
 
 ;; Helper for creating `cvtsi2ss` instructions.
-(decl x64_cvtsi2ss (Type GprMem) Xmm)
-(rule (x64_cvtsi2ss ty x)
-      (gpr_to_xmm (SseOpcode.Cvtsi2ss) x (raw_operand_size_of_type ty)))
-(rule 1 (x64_cvtsi2ss ty x)
+(decl x64_cvtsi2ss (Type Xmm GprMem) Xmm)
+(rule (x64_cvtsi2ss ty x y)
+      (cvt_int_to_float (SseOpcode.Cvtsi2ss) x y (raw_operand_size_of_type ty)))
+(rule 1 (x64_cvtsi2ss ty x y)
         (if-let $true (use_avx))
-        (gpr_to_xmm_vex (AvxOpcode.Vcvtsi2ss) x (raw_operand_size_of_type ty)))
+        (cvt_int_to_float_vex (AvxOpcode.Vcvtsi2ss) x y (raw_operand_size_of_type ty)))
 
 ;; Helper for creating `cvtsi2sd` instructions.
-(decl x64_cvtsi2sd (Type GprMem) Xmm)
-(rule (x64_cvtsi2sd ty x)
-      (gpr_to_xmm (SseOpcode.Cvtsi2sd) x (raw_operand_size_of_type ty)))
-(rule 1 (x64_cvtsi2sd ty x)
+(decl x64_cvtsi2sd (Type Xmm GprMem) Xmm)
+(rule (x64_cvtsi2sd ty x y)
+      (cvt_int_to_float (SseOpcode.Cvtsi2sd) x y (raw_operand_size_of_type ty)))
+(rule 1 (x64_cvtsi2sd ty x y)
         (if-let $true (use_avx))
-        (gpr_to_xmm_vex (AvxOpcode.Vcvtsi2sd) x (raw_operand_size_of_type ty)))
+        (cvt_int_to_float_vex (AvxOpcode.Vcvtsi2sd) x y (raw_operand_size_of_type ty)))
 
 ;; Helper for creating `cvttps2dq` instructions.
 (decl x64_cvttps2dq (XmmMem) Xmm)

diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -32,8 +32,14 @@ fn emit_signed_cvt(
     } else {
         SseOpcode::Cvtsi2ss
     };
-    let inst = Inst::gpr_to_xmm(op, RegMem::reg(src), OperandSize::Size64, dst);
-    inst.emit(&[], sink, info, state);
+    Inst::CvtIntToFloat {
+        op,
+        dst: Writable::from_reg(Xmm::new(dst.to_reg()).unwrap()),
+        src1: Xmm::new(dst.to_reg()).unwrap(),
+        src2: GprMem::new(RegMem::reg(src)).unwrap(),
+        src2_size: OperandSize::Size64,
+    }
+    .emit(&[], sink, info, state);
 }
 
 /// Emits a one way conditional jump if CC is set (true).
@@ -2872,30 +2878,21 @@ pub(crate) fn emit(
             let (prefix, map, opcode) = match op {
                 // vmovd/vmovq are differentiated by `w`
                 AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x6E),
-                AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
-                AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
             let w = match src_size {
                 OperandSize::Size64 => true,
                 _ => false,
             };
-            let mut insn = VexInstruction::new()
+            VexInstruction::new()
                 .length(VexVectorLength::V128)
                 .w(w)
                 .prefix(prefix)
                 .map(map)
                 .opcode(opcode)
                 .rm(src)
-                .reg(dst.to_real_reg().unwrap().hw_enc());
-            // These opcodes technically take a second operand which is the
-            // upper bits to preserve during the float conversion. We don't
-            // actually use this in this backend right now so reuse the
-            // destination register. This at least matches what LLVM does.
-            if let AvxOpcode::Vcvtsi2ss | AvxOpcode::Vcvtsi2sd = op {
-                insn = insn.vvvv(dst.to_real_reg().unwrap().hw_enc());
-            }
-            insn.encode(sink);
+                .reg(dst.to_real_reg().unwrap().hw_enc())
+                .encode(sink);
         }
 
         Inst::XmmRmREvex {
@@ -3200,8 +3197,6 @@ pub(crate) fn emit(
                 // Movd and movq use the same opcode; the presence of the REX prefix (set below)
                 // actually determines which is used.
                 SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E),
-                SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
-                SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
                 _ => panic!("unexpected opcode {:?}", op),
             };
             let rex = RexFlags::from(*src_size);
@@ -3239,6 +3234,72 @@ pub(crate) fn emit(
             }
         }
 
+        Inst::CvtIntToFloat {
+            op,
+            src1,
+            src2,
+            dst,
+            src2_size,
+        } => {
+            let src1 = allocs.next(src1.to_reg());
+            let dst = allocs.next(dst.to_reg().to_reg());
+            assert_eq!(src1, dst);
+            let src2 = src2.clone().to_reg_mem().with_allocs(allocs);
+
+            let (prefix, opcode) = match op {
+                SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
+                SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
+                _ => panic!("unexpected opcode {:?}", op),
+            };
+            let rex = RexFlags::from(*src2_size);
+            match src2 {
+                RegMem::Reg { reg: src2 } => {
+                    emit_std_reg_reg(sink, prefix, opcode, 2, dst, src2, rex);
+                }
+                RegMem::Mem { addr } => {
+                    let addr = &addr.finalize(state, sink);
+                    emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex, 0);
+                }
+            }
+        }
+
+        Inst::CvtIntToFloatVex {
+            op,
+            src1,
+            src2,
+            dst,
+            src2_size,
+        } => {
+            let dst = allocs.next(dst.to_reg().to_reg());
+            let src1 = allocs.next(src1.to_reg());
+            let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
+                RegMem::Reg { reg } => {
+                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
+                }
+                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
+            };
+
+            let (prefix, map, opcode) = match op {
+                AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
+                AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
+                _ => unimplemented!("Opcode {:?} not implemented", op),
+            };
+            let w = match src2_size {
+                OperandSize::Size64 => true,
+                _ => false,
+            };
+            VexInstruction::new()
+                .length(VexVectorLength::V128)
+                .w(w)
+                .prefix(prefix)
+                .map(map)
+                .opcode(opcode)
+                .rm(src2)
+                .reg(dst.to_real_reg().unwrap().hw_enc())
+                .vvvv(src1.to_real_reg().unwrap().hw_enc())
+                .encode(sink);
+        }
+
         Inst::CvtUint64ToFloatSeq {
             dst_size,
             src,

diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -5051,26 +5051,6 @@ fn test_x64_emit() {
         "664C0F6EFF",
         "movq    %rdi, %xmm15",
     ));
-    insns.push((
-        Inst::gpr_to_xmm(
-            SseOpcode::Cvtsi2ss,
-            RegMem::reg(rdi),
-            OperandSize::Size32,
-            w_xmm15,
-        ),
-        "F3440F2AFF",
-        "cvtsi2ss %edi, %xmm15",
-    ));
-    insns.push((
-        Inst::gpr_to_xmm(
-            SseOpcode::Cvtsi2sd,
-            RegMem::reg(rsi),
-            OperandSize::Size64,
-            w_xmm1,
-        ),
-        "F2480F2ACE",
-        "cvtsi2sd %rsi, %xmm1",
-    ));
 
     // ========================================================
     // XmmRmi

diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -178,7 +178,8 @@ impl Inst {
             | Inst::XmmToGprImm { op, .. }
             | Inst::XmmUnaryRmRImm { op, .. }
             | Inst::XmmUnaryRmRUnaligned { op, .. }
-            | Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
+            | Inst::XmmUnaryRmR { op, .. }
+            | Inst::CvtIntToFloat { op, .. } => smallvec![op.available_from()],
 
             Inst::XmmUnaryRmREvex { op, .. }
             | Inst::XmmRmREvex { op, .. }
@@ -196,7 +197,8 @@ impl Inst {
             | Inst::XmmMovRMImmVex { op, .. }
             | Inst::XmmToGprImmVex { op, .. }
             | Inst::XmmToGprVex { op, .. }
-            | Inst::GprToXmmVex { op, .. } => op.available_from(),
+            | Inst::GprToXmmVex { op, .. }
+            | Inst::CvtIntToFloatVex { op, .. } => op.available_from(),
         }
     }
 }
@@ -1296,6 +1298,34 @@ impl PrettyPrint for Inst {
                 format!("{op} {src}, {dst}")
             }
 
+            Inst::CvtIntToFloat {
+                op,
+                src1,
+                src2,
+                dst,
+                src2_size,
+            } => {
+                let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
+                let dst = pretty_print_reg(*dst.to_reg(), 8, allocs);
+                let src2 = src2.pretty_print(src2_size.to_bytes(), allocs);
+                let op = ljustify(op.to_string());
+                format!("{op} {src1}, {src2}, {dst}")
+            }
+
+            Inst::CvtIntToFloatVex {
+                op,
+                src1,
+                src2,
+                dst,
+                src2_size,
+            } => {
+                let dst = pretty_print_reg(*dst.to_reg(), 8, allocs);
+                let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
+                let src2 = src2.pretty_print(src2_size.to_bytes(), allocs);
+                let op = ljustify(op.to_string());
+                format!("{op} {src1}, {src2}, {dst}")
+            }
+
             Inst::CvtUint64ToFloatSeq {
                 src,
                 dst,
@@ -2164,6 +2194,20 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             collector.reg_def(dst.to_writable_reg());
             src.get_operands(collector);
         }
+        Inst::CvtIntToFloat {
+            src1, src2, dst, ..
+        } => {
+            collector.reg_use(src1.to_reg());
+            collector.reg_reuse_def(dst.to_writable_reg(), 0);
+            src2.get_operands(collector);
+        }
+        Inst::CvtIntToFloatVex {
+            src1, src2, dst, ..
+        } => {
+            collector.reg_def(dst.to_writable_reg());
+            collector.reg_use(src1.to_reg());
+            src2.get_operands(collector);
+        }
         Inst::CvtUint64ToFloatSeq {
             src,
             dst,

diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3336,23 +3336,40 @@
 
 ;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; Note that the `cvtsi2s{s,d}` instruction is not just an int-to-float
+;; conversion instruction in isolation, it also takes the upper 64-bits of an
+;; xmm register and places it into the destination. We don't actually want that
+;; to happen as it could accidentally create a false dependency with a
+;; previous instruction defining the register's upper 64-bits. See #7085 for
+;; an instance of this.
+;;
+;; This means that the first operand to all of the int-to-float conversions here
+;; are `(xmm_zero)` operands which is a guaranteed zero register that has no
+;; dependencies on other instructions.
+;;
+;; Ideally this would be lifted out to a higher level to get deduplicated
+;; between consecutive int-to-float operations but that's not easy
+;; to do at this time. One possibility would be a mid-end rule which rewrites
+;; `fcvt_from_sint` to an x86-specific opcode using a zero constant which would
+;; be subject to normal LICM, but that's not feasible today.
+
 (rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I8))))
-      (x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
+      (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign))))
 
 (rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I16))))
-      (x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
+      (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign))))
 
 (rule 1 (lower (has_type $F32 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty))))))
-      (x64_cvtsi2ss ty a))
+      (x64_cvtsi2ss ty (xmm_zero $F32X4) a))
 
 (rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I8))))
-      (x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
+      (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign))))
 
 (rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I16))))
-      (x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
+      (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign))))
 
 (rule 1 (lower (has_type $F64 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty))))))
-      (x64_cvtsi2sd ty a))
+      (x64_cvtsi2sd ty (xmm_zero $F64X2) a))
 
 (rule 0 (lower (fcvt_from_sint a @ (value_type $I32X4)))
       (x64_cvtdq2ps a))
@@ -3363,10 +3380,10 @@
 ;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule 1 (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
-      (x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
+      (x64_cvtsi2ss $I64 (xmm_zero $F32X4) (extend_to_gpr val $I64 (ExtendKind.Zero))))
 
 (rule 1 (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
-      (x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
+      (x64_cvtsi2sd $I64 (xmm_zero $F64X2) (extend_to_gpr val $I64 (ExtendKind.Zero))))
 
 (rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64))))
       (cvt_u64_to_float_seq ty val))