Skip to content

Commit

Permalink
x64: Fix false dependencies in int-to-float conversions (#7098)
Browse files Browse the repository at this point in the history
* x64: Fix false dependencies in int-to-float conversions

This commit is a result of the investigation on #7085. The int-to-float
conversion instructions used right now on the x64 backend will
implicitly source the upper bits of the result from a different
register. This implicitly creates a dependency on further consumers
using the conversion result on whatever previously defined the upper
bits, even though they aren't used. This false dependency is the primary
reason for the slowdown witnessed in #7085.

The fix chosen in this commit is to model the int-to-float instructions
with a new shape of instruction instead of the previous `GprToXmm{,Vex}`. This
previous shape was modeled as single-input and single-output, but this
does not reflect the actual nature of the `cvtsi2s{s,d}` instructions.
Instead these now use `CvtIntToFloat{,Vex}` which have two source
operands and one destination operand, modeling how the upper bits of a
different register are used. In lowerings using this instruction the
upper bits to preserver are always sourced from a zero'd out register to
force breaking dependencies between instructions.

Closes #7085

* Remove now dead code

* Remove outdated test

Golden test output covers this test case anyway nowadays

* Review comments

* Fix emit tests
  • Loading branch information
alexcrichton authored Sep 28, 2023
1 parent b7c0eae commit 9982992
Show file tree
Hide file tree
Showing 8 changed files with 306 additions and 118 deletions.
49 changes: 39 additions & 10 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,23 @@
(dst WritableXmm)
(src_size OperandSize))

;; Conversion from signed integers to floats, the `{v,}`cvtsi2s{s,d}`
;; instructions.
;;
;; Note that this is special in that `src1` is an xmm/float register
;; while `src2` is a general purpose register as this is converting an
;; integer in a gpr to an equivalent float in an xmm reg.
(CvtIntToFloat (op SseOpcode)
(src1 Xmm)
(src2 GprMem)
(dst WritableXmm)
(src2_size OperandSize))
(CvtIntToFloatVex (op AvxOpcode)
(src1 Xmm)
(src2 GprMem)
(dst WritableXmm)
(src2_size OperandSize))

;; Converts an unsigned int64 to a float32/float64.
(CvtUint64ToFloatSeq (dst_size OperandSize) ;; 4 or 8
(src Gpr)
Expand Down Expand Up @@ -2095,6 +2112,18 @@
(_ Unit (emit (MInst.UnaryRmRImmVex size op src dst imm))))
dst))

(decl cvt_int_to_float (SseOpcode Xmm GprMem OperandSize) Xmm)
(rule (cvt_int_to_float op src1 src2 size)
(let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (MInst.CvtIntToFloat op src1 src2 dst size))))
dst))

(decl cvt_int_to_float_vex (AvxOpcode Xmm GprMem OperandSize) Xmm)
(rule (cvt_int_to_float_vex op src1 src2 size)
(let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (MInst.CvtIntToFloatVex op src1 src2 dst size))))
dst))

(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
(rule (cvt_u64_to_float_seq ty src)
(let ((size OperandSize (raw_operand_size_of_type ty))
Expand Down Expand Up @@ -4351,20 +4380,20 @@
(xmm_unary_rm_r_vex (AvxOpcode.Vcvtdq2pd) x))

;; Helper for creating `cvtsi2ss` instructions.
(decl x64_cvtsi2ss (Type GprMem) Xmm)
(rule (x64_cvtsi2ss ty x)
(gpr_to_xmm (SseOpcode.Cvtsi2ss) x (raw_operand_size_of_type ty)))
(rule 1 (x64_cvtsi2ss ty x)
(decl x64_cvtsi2ss (Type Xmm GprMem) Xmm)
(rule (x64_cvtsi2ss ty x y)
(cvt_int_to_float (SseOpcode.Cvtsi2ss) x y (raw_operand_size_of_type ty)))
(rule 1 (x64_cvtsi2ss ty x y)
(if-let $true (use_avx))
(gpr_to_xmm_vex (AvxOpcode.Vcvtsi2ss) x (raw_operand_size_of_type ty)))
(cvt_int_to_float_vex (AvxOpcode.Vcvtsi2ss) x y (raw_operand_size_of_type ty)))

;; Helper for creating `cvtsi2sd` instructions.
(decl x64_cvtsi2sd (Type GprMem) Xmm)
(rule (x64_cvtsi2sd ty x)
(gpr_to_xmm (SseOpcode.Cvtsi2sd) x (raw_operand_size_of_type ty)))
(rule 1 (x64_cvtsi2sd ty x)
(decl x64_cvtsi2sd (Type Xmm GprMem) Xmm)
(rule (x64_cvtsi2sd ty x y)
(cvt_int_to_float (SseOpcode.Cvtsi2sd) x y (raw_operand_size_of_type ty)))
(rule 1 (x64_cvtsi2sd ty x y)
(if-let $true (use_avx))
(gpr_to_xmm_vex (AvxOpcode.Vcvtsi2sd) x (raw_operand_size_of_type ty)))
(cvt_int_to_float_vex (AvxOpcode.Vcvtsi2sd) x y (raw_operand_size_of_type ty)))

;; Helper for creating `cvttps2dq` instructions.
(decl x64_cvttps2dq (XmmMem) Xmm)
Expand Down
93 changes: 77 additions & 16 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,14 @@ fn emit_signed_cvt(
} else {
SseOpcode::Cvtsi2ss
};
let inst = Inst::gpr_to_xmm(op, RegMem::reg(src), OperandSize::Size64, dst);
inst.emit(&[], sink, info, state);
Inst::CvtIntToFloat {
op,
dst: Writable::from_reg(Xmm::new(dst.to_reg()).unwrap()),
src1: Xmm::new(dst.to_reg()).unwrap(),
src2: GprMem::new(RegMem::reg(src)).unwrap(),
src2_size: OperandSize::Size64,
}
.emit(&[], sink, info, state);
}

/// Emits a one way conditional jump if CC is set (true).
Expand Down Expand Up @@ -2872,30 +2878,21 @@ pub(crate) fn emit(
let (prefix, map, opcode) = match op {
// vmovd/vmovq are differentiated by `w`
AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x6E),
AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
let w = match src_size {
OperandSize::Size64 => true,
_ => false,
};
let mut insn = VexInstruction::new()
VexInstruction::new()
.length(VexVectorLength::V128)
.w(w)
.prefix(prefix)
.map(map)
.opcode(opcode)
.rm(src)
.reg(dst.to_real_reg().unwrap().hw_enc());
// These opcodes technically take a second operand which is the
// upper bits to preserve during the float conversion. We don't
// actually use this in this backend right now so reuse the
// destination register. This at least matches what LLVM does.
if let AvxOpcode::Vcvtsi2ss | AvxOpcode::Vcvtsi2sd = op {
insn = insn.vvvv(dst.to_real_reg().unwrap().hw_enc());
}
insn.encode(sink);
.reg(dst.to_real_reg().unwrap().hw_enc())
.encode(sink);
}

Inst::XmmRmREvex {
Expand Down Expand Up @@ -3200,8 +3197,6 @@ pub(crate) fn emit(
// Movd and movq use the same opcode; the presence of the REX prefix (set below)
// actually determines which is used.
SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E),
SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
_ => panic!("unexpected opcode {:?}", op),
};
let rex = RexFlags::from(*src_size);
Expand Down Expand Up @@ -3239,6 +3234,72 @@ pub(crate) fn emit(
}
}

Inst::CvtIntToFloat {
op,
src1,
src2,
dst,
src2_size,
} => {
let src1 = allocs.next(src1.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
assert_eq!(src1, dst);
let src2 = src2.clone().to_reg_mem().with_allocs(allocs);

let (prefix, opcode) = match op {
SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
_ => panic!("unexpected opcode {:?}", op),
};
let rex = RexFlags::from(*src2_size);
match src2 {
RegMem::Reg { reg: src2 } => {
emit_std_reg_reg(sink, prefix, opcode, 2, dst, src2, rex);
}
RegMem::Mem { addr } => {
let addr = &addr.finalize(state, sink);
emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex, 0);
}
}
}

Inst::CvtIntToFloatVex {
op,
src1,
src2,
dst,
src2_size,
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let src1 = allocs.next(src1.to_reg());
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};

let (prefix, map, opcode) = match op {
AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
let w = match src2_size {
OperandSize::Size64 => true,
_ => false,
};
VexInstruction::new()
.length(VexVectorLength::V128)
.w(w)
.prefix(prefix)
.map(map)
.opcode(opcode)
.rm(src2)
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvv(src1.to_real_reg().unwrap().hw_enc())
.encode(sink);
}

Inst::CvtUint64ToFloatSeq {
dst_size,
src,
Expand Down
20 changes: 0 additions & 20 deletions cranelift/codegen/src/isa/x64/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5051,26 +5051,6 @@ fn test_x64_emit() {
"664C0F6EFF",
"movq %rdi, %xmm15",
));
insns.push((
Inst::gpr_to_xmm(
SseOpcode::Cvtsi2ss,
RegMem::reg(rdi),
OperandSize::Size32,
w_xmm15,
),
"F3440F2AFF",
"cvtsi2ss %edi, %xmm15",
));
insns.push((
Inst::gpr_to_xmm(
SseOpcode::Cvtsi2sd,
RegMem::reg(rsi),
OperandSize::Size64,
w_xmm1,
),
"F2480F2ACE",
"cvtsi2sd %rsi, %xmm1",
));

// ========================================================
// XmmRmi
Expand Down
48 changes: 46 additions & 2 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ impl Inst {
| Inst::XmmToGprImm { op, .. }
| Inst::XmmUnaryRmRImm { op, .. }
| Inst::XmmUnaryRmRUnaligned { op, .. }
| Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
| Inst::XmmUnaryRmR { op, .. }
| Inst::CvtIntToFloat { op, .. } => smallvec![op.available_from()],

Inst::XmmUnaryRmREvex { op, .. }
| Inst::XmmRmREvex { op, .. }
Expand All @@ -196,7 +197,8 @@ impl Inst {
| Inst::XmmMovRMImmVex { op, .. }
| Inst::XmmToGprImmVex { op, .. }
| Inst::XmmToGprVex { op, .. }
| Inst::GprToXmmVex { op, .. } => op.available_from(),
| Inst::GprToXmmVex { op, .. }
| Inst::CvtIntToFloatVex { op, .. } => op.available_from(),
}
}
}
Expand Down Expand Up @@ -1296,6 +1298,34 @@ impl PrettyPrint for Inst {
format!("{op} {src}, {dst}")
}

Inst::CvtIntToFloat {
op,
src1,
src2,
dst,
src2_size,
} => {
let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
let dst = pretty_print_reg(*dst.to_reg(), 8, allocs);
let src2 = src2.pretty_print(src2_size.to_bytes(), allocs);
let op = ljustify(op.to_string());
format!("{op} {src1}, {src2}, {dst}")
}

Inst::CvtIntToFloatVex {
op,
src1,
src2,
dst,
src2_size,
} => {
let dst = pretty_print_reg(*dst.to_reg(), 8, allocs);
let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
let src2 = src2.pretty_print(src2_size.to_bytes(), allocs);
let op = ljustify(op.to_string());
format!("{op} {src1}, {src2}, {dst}")
}

Inst::CvtUint64ToFloatSeq {
src,
dst,
Expand Down Expand Up @@ -2164,6 +2194,20 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
collector.reg_def(dst.to_writable_reg());
src.get_operands(collector);
}
Inst::CvtIntToFloat {
src1, src2, dst, ..
} => {
collector.reg_use(src1.to_reg());
collector.reg_reuse_def(dst.to_writable_reg(), 0);
src2.get_operands(collector);
}
Inst::CvtIntToFloatVex {
src1, src2, dst, ..
} => {
collector.reg_def(dst.to_writable_reg());
collector.reg_use(src1.to_reg());
src2.get_operands(collector);
}
Inst::CvtUint64ToFloatSeq {
src,
dst,
Expand Down
33 changes: 25 additions & 8 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -3336,23 +3336,40 @@

;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Note that the `cvtsi2s{s,d}` instruction is not just an int-to-float
;; conversion instruction in isolation, it also takes the upper 64-bits of an
;; xmm register and places it into the destination. We don't actually want that
;; to happen as it could accidentally create a false dependency with a
;; previous instruction defining the register's upper 64-bits. See #7085 for
;; an instance of this.
;;
;; This means that the first operand to all of the int-to-float conversions here
;; are `(xmm_zero)` operands which is a guaranteed zero register that has no
;; dependencies on other instructions.
;;
;; Ideally this would be lifted out to a higher level to get deduplicated
;; between consecutive int-to-float operations but that's not easy
;; to do at this time. One possibility would be a mid-end rule which rewrites
;; `fcvt_from_sint` to an x86-specific opcode using a zero constant which would
;; be subject to normal LICM, but that's not feasible today.

(rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I8))))
(x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
(x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign))))

(rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I16))))
(x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
(x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign))))

(rule 1 (lower (has_type $F32 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty))))))
(x64_cvtsi2ss ty a))
(x64_cvtsi2ss ty (xmm_zero $F32X4) a))

(rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I8))))
(x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
(x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign))))

(rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I16))))
(x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
(x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign))))

(rule 1 (lower (has_type $F64 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty))))))
(x64_cvtsi2sd ty a))
(x64_cvtsi2sd ty (xmm_zero $F64X2) a))

(rule 0 (lower (fcvt_from_sint a @ (value_type $I32X4)))
(x64_cvtdq2ps a))
Expand All @@ -3363,10 +3380,10 @@
;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 1 (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
(x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
(x64_cvtsi2ss $I64 (xmm_zero $F32X4) (extend_to_gpr val $I64 (ExtendKind.Zero))))

(rule 1 (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
(x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
(x64_cvtsi2sd $I64 (xmm_zero $F64X2) (extend_to_gpr val $I64 (ExtendKind.Zero))))

(rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64))))
(cvt_u64_to_float_seq ty val))
Expand Down
Loading

0 comments on commit 9982992

Please sign in to comment.