From 998299267f22eae612a846604daae86f17451c1f Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 28 Sep 2023 09:53:01 -0500 Subject: [PATCH] x64: Fix false dependencies in int-to-float conversions (#7098) * x64: Fix false dependencies in int-to-float conversions This commit is a result of the investigation on #7085. The int-to-float conversion instructions used right now on the x64 backend will implicitly source the upper bits of the result from a different register. This implicitly creates a dependency on further consumers using the conversion result on whatever previously defined the upper bits, even though they aren't used. This false dependency is the primary reason for the slowdown witnessed in #7085. The fix chosen in this commit is to model the int-to-float instructions with a new shape of instruction instead of the previous `GprToXmm{,Vex}`. This previous shape was modeled as single-input and single-output, but this does not reflect the actual nature of the `cvtsi2s{s,d}` instructions. Instead these now use `CvtIntToFloat{,Vex}` which have two source operands and one destination operand, modeling how the upper bits of a different register are used. In lowerings using this instruction the upper bits to preserve are always sourced from a zero'd out register to force breaking dependencies between instructions. Closes #7085 * Remove now dead code * Remove outdated test Golden test output covers this test case anyway nowadays * Review comments * Fix emit tests --- cranelift/codegen/src/isa/x64/inst.isle | 49 +++++-- cranelift/codegen/src/isa/x64/inst/emit.rs | 93 ++++++++++--- .../codegen/src/isa/x64/inst/emit_tests.rs | 20 --- cranelift/codegen/src/isa/x64/inst/mod.rs | 48 ++++++- cranelift/codegen/src/isa/x64/lower.isle | 33 +++-- .../filetests/filetests/isa/x64/fastcall.clif | 26 +++- .../filetests/filetests/isa/x64/fcvt-avx.clif | 28 ++-- .../filetests/filetests/isa/x64/fcvt.clif | 127 +++++++++++------- 8 files changed, 306 insertions(+), 118 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index e9fb7779cadf..a3c7bb654c1c 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -437,6 +437,23 @@ (dst WritableXmm) (src_size OperandSize)) + ;; Conversion from signed integers to floats, the `{v,}cvtsi2s{s,d}` + ;; instructions. + ;; + ;; Note that this is special in that `src1` is an xmm/float register + ;; while `src2` is a general purpose register as this is converting an + ;; integer in a gpr to an equivalent float in an xmm reg. + (CvtIntToFloat (op SseOpcode) + (src1 Xmm) + (src2 GprMem) + (dst WritableXmm) + (src2_size OperandSize)) + (CvtIntToFloatVex (op AvxOpcode) + (src1 Xmm) + (src2 GprMem) + (dst WritableXmm) + (src2_size OperandSize)) + ;; Converts an unsigned int64 to a float32/float64.
(CvtUint64ToFloatSeq (dst_size OperandSize) ;; 4 or 8 (src Gpr) @@ -2095,6 +2112,18 @@ (_ Unit (emit (MInst.UnaryRmRImmVex size op src dst imm)))) dst)) +(decl cvt_int_to_float (SseOpcode Xmm GprMem OperandSize) Xmm) +(rule (cvt_int_to_float op src1 src2 size) + (let ((dst WritableXmm (temp_writable_xmm)) + (_ Unit (emit (MInst.CvtIntToFloat op src1 src2 dst size)))) + dst)) + +(decl cvt_int_to_float_vex (AvxOpcode Xmm GprMem OperandSize) Xmm) +(rule (cvt_int_to_float_vex op src1 src2 size) + (let ((dst WritableXmm (temp_writable_xmm)) + (_ Unit (emit (MInst.CvtIntToFloatVex op src1 src2 dst size)))) + dst)) + (decl cvt_u64_to_float_seq (Type Gpr) Xmm) (rule (cvt_u64_to_float_seq ty src) (let ((size OperandSize (raw_operand_size_of_type ty)) @@ -4351,20 +4380,20 @@ (xmm_unary_rm_r_vex (AvxOpcode.Vcvtdq2pd) x)) ;; Helper for creating `cvtsi2ss` instructions. -(decl x64_cvtsi2ss (Type GprMem) Xmm) -(rule (x64_cvtsi2ss ty x) - (gpr_to_xmm (SseOpcode.Cvtsi2ss) x (raw_operand_size_of_type ty))) -(rule 1 (x64_cvtsi2ss ty x) +(decl x64_cvtsi2ss (Type Xmm GprMem) Xmm) +(rule (x64_cvtsi2ss ty x y) + (cvt_int_to_float (SseOpcode.Cvtsi2ss) x y (raw_operand_size_of_type ty))) +(rule 1 (x64_cvtsi2ss ty x y) (if-let $true (use_avx)) - (gpr_to_xmm_vex (AvxOpcode.Vcvtsi2ss) x (raw_operand_size_of_type ty))) + (cvt_int_to_float_vex (AvxOpcode.Vcvtsi2ss) x y (raw_operand_size_of_type ty))) ;; Helper for creating `cvtsi2sd` instructions. -(decl x64_cvtsi2sd (Type GprMem) Xmm) -(rule (x64_cvtsi2sd ty x) - (gpr_to_xmm (SseOpcode.Cvtsi2sd) x (raw_operand_size_of_type ty))) -(rule 1 (x64_cvtsi2sd ty x) +(decl x64_cvtsi2sd (Type Xmm GprMem) Xmm) +(rule (x64_cvtsi2sd ty x y) + (cvt_int_to_float (SseOpcode.Cvtsi2sd) x y (raw_operand_size_of_type ty))) +(rule 1 (x64_cvtsi2sd ty x y) (if-let $true (use_avx)) - (gpr_to_xmm_vex (AvxOpcode.Vcvtsi2sd) x (raw_operand_size_of_type ty))) + (cvt_int_to_float_vex (AvxOpcode.Vcvtsi2sd) x y (raw_operand_size_of_type ty))) ;; Helper for creating `cvttps2dq` instructions. (decl x64_cvttps2dq (XmmMem) Xmm) diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 0be0db116c2e..6bd65154eb23 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -32,8 +32,14 @@ fn emit_signed_cvt( } else { SseOpcode::Cvtsi2ss }; - let inst = Inst::gpr_to_xmm(op, RegMem::reg(src), OperandSize::Size64, dst); - inst.emit(&[], sink, info, state); + Inst::CvtIntToFloat { + op, + dst: Writable::from_reg(Xmm::new(dst.to_reg()).unwrap()), + src1: Xmm::new(dst.to_reg()).unwrap(), + src2: GprMem::new(RegMem::reg(src)).unwrap(), + src2_size: OperandSize::Size64, + } + .emit(&[], sink, info, state); } /// Emits a one way conditional jump if CC is set (true). 
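The `emit_signed_cvt` change above is the template for every use of the new shape: the register supplying the preserved upper bits is now a named operand (`src1`) rather than an implicit one. A minimal sketch of the construction, using only types and constructors that appear elsewhere in this patch — here `src` is the integer-valued `Reg` and `dst` the destination `Writable<Reg>`, exactly as in `emit_signed_cvt`, where `dst` doubles as `src1` because the surrounding uint64-to-float sequence has already defined it; the ISLE lowerings further below pass a freshly zeroed xmm register instead:

    // Sketch: the conversion now names the register that supplies the
    // preserved upper bits of the result (`src1`) instead of leaving it
    // implicit, so regalloc and consumers can see the dependency.
    Inst::CvtIntToFloat {
        op: SseOpcode::Cvtsi2sd,
        // Upper 64 bits of the destination are sourced from this register.
        src1: Xmm::new(dst.to_reg()).unwrap(),
        // The integer being converted, in a GPR (or memory).
        src2: GprMem::new(RegMem::reg(src)).unwrap(),
        dst: Writable::from_reg(Xmm::new(dst.to_reg()).unwrap()),
        src2_size: OperandSize::Size64,
    }
    .emit(&[], sink, info, state);

Note that the SSE emit path below asserts `src1 == dst` after register allocation, matching the destructive two-operand encoding and the `reg_reuse_def` operand constraint, while the VEX form may name three distinct registers via `vvvv`.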
@@ -2872,30 +2878,21 @@ pub(crate) fn emit( let (prefix, map, opcode) = match op { // vmovd/vmovq are differentiated by `w` AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x6E), - AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A), - AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A), _ => unimplemented!("Opcode {:?} not implemented", op), }; let w = match src_size { OperandSize::Size64 => true, _ => false, }; - let mut insn = VexInstruction::new() + VexInstruction::new() .length(VexVectorLength::V128) .w(w) .prefix(prefix) .map(map) .opcode(opcode) .rm(src) - .reg(dst.to_real_reg().unwrap().hw_enc()); - // These opcodes technically take a second operand which is the - // upper bits to preserve during the float conversion. We don't - // actually use this in this backend right now so reuse the - // destination register. This at least matches what LLVM does. - if let AvxOpcode::Vcvtsi2ss | AvxOpcode::Vcvtsi2sd = op { - insn = insn.vvvv(dst.to_real_reg().unwrap().hw_enc()); - } - insn.encode(sink); + .reg(dst.to_real_reg().unwrap().hw_enc()) + .encode(sink); } Inst::XmmRmREvex { @@ -3200,8 +3197,6 @@ pub(crate) fn emit( // Movd and movq use the same opcode; the presence of the REX prefix (set below) // actually determines which is used. SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E), - SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A), - SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A), _ => panic!("unexpected opcode {:?}", op), }; let rex = RexFlags::from(*src_size); @@ -3239,6 +3234,72 @@ pub(crate) fn emit( } } + Inst::CvtIntToFloat { + op, + src1, + src2, + dst, + src2_size, + } => { + let src1 = allocs.next(src1.to_reg()); + let dst = allocs.next(dst.to_reg().to_reg()); + assert_eq!(src1, dst); + let src2 = src2.clone().to_reg_mem().with_allocs(allocs); + + let (prefix, opcode) = match op { + SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A), + SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A), + _ => panic!("unexpected opcode {:?}", op), + }; + let rex = RexFlags::from(*src2_size); + match src2 { + RegMem::Reg { reg: src2 } => { + emit_std_reg_reg(sink, prefix, opcode, 2, dst, src2, rex); + } + RegMem::Mem { addr } => { + let addr = &addr.finalize(state, sink); + emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex, 0); + } + } + } + + Inst::CvtIntToFloatVex { + op, + src1, + src2, + dst, + src2_size, + } => { + let dst = allocs.next(dst.to_reg().to_reg()); + let src1 = allocs.next(src1.to_reg()); + let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) { + RegMem::Reg { reg } => { + RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into()) + } + RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)), + }; + + let (prefix, map, opcode) = match op { + AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A), + AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A), + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + let w = match src2_size { + OperandSize::Size64 => true, + _ => false, + }; + VexInstruction::new() + .length(VexVectorLength::V128) + .w(w) + .prefix(prefix) + .map(map) + .opcode(opcode) + .rm(src2) + .reg(dst.to_real_reg().unwrap().hw_enc()) + .vvvv(src1.to_real_reg().unwrap().hw_enc()) + .encode(sink); + } + Inst::CvtUint64ToFloatSeq { dst_size, src, diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index d0050b8d4499..147f61deb832 
100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -5051,26 +5051,6 @@ fn test_x64_emit() { "664C0F6EFF", "movq %rdi, %xmm15", )); - insns.push(( - Inst::gpr_to_xmm( - SseOpcode::Cvtsi2ss, - RegMem::reg(rdi), - OperandSize::Size32, - w_xmm15, - ), - "F3440F2AFF", - "cvtsi2ss %edi, %xmm15", - )); - insns.push(( - Inst::gpr_to_xmm( - SseOpcode::Cvtsi2sd, - RegMem::reg(rsi), - OperandSize::Size64, - w_xmm1, - ), - "F2480F2ACE", - "cvtsi2sd %rsi, %xmm1", - )); // ======================================================== // XmmRmi diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index ca72a434bb87..2a321c7f44f6 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -178,7 +178,8 @@ impl Inst { | Inst::XmmToGprImm { op, .. } | Inst::XmmUnaryRmRImm { op, .. } | Inst::XmmUnaryRmRUnaligned { op, .. } - | Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()], + | Inst::XmmUnaryRmR { op, .. } + | Inst::CvtIntToFloat { op, .. } => smallvec![op.available_from()], Inst::XmmUnaryRmREvex { op, .. } | Inst::XmmRmREvex { op, .. } @@ -196,7 +197,8 @@ impl Inst { | Inst::XmmMovRMImmVex { op, .. } | Inst::XmmToGprImmVex { op, .. } | Inst::XmmToGprVex { op, .. } - | Inst::GprToXmmVex { op, .. } => op.available_from(), + | Inst::GprToXmmVex { op, .. } + | Inst::CvtIntToFloatVex { op, .. } => op.available_from(), } } } @@ -1296,6 +1298,34 @@ impl PrettyPrint for Inst { format!("{op} {src}, {dst}") } + Inst::CvtIntToFloat { + op, + src1, + src2, + dst, + src2_size, + } => { + let src1 = pretty_print_reg(src1.to_reg(), 8, allocs); + let dst = pretty_print_reg(*dst.to_reg(), 8, allocs); + let src2 = src2.pretty_print(src2_size.to_bytes(), allocs); + let op = ljustify(op.to_string()); + format!("{op} {src1}, {src2}, {dst}") + } + + Inst::CvtIntToFloatVex { + op, + src1, + src2, + dst, + src2_size, + } => { + let dst = pretty_print_reg(*dst.to_reg(), 8, allocs); + let src1 = pretty_print_reg(src1.to_reg(), 8, allocs); + let src2 = src2.pretty_print(src2_size.to_bytes(), allocs); + let op = ljustify(op.to_string()); + format!("{op} {src1}, {src2}, {dst}") + } + Inst::CvtUint64ToFloatSeq { src, dst, @@ -2164,6 +2194,20 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol collector.reg_def(dst.to_writable_reg()); src.get_operands(collector); } + Inst::CvtIntToFloat { + src1, src2, dst, .. + } => { + collector.reg_use(src1.to_reg()); + collector.reg_reuse_def(dst.to_writable_reg(), 0); + src2.get_operands(collector); + } + Inst::CvtIntToFloatVex { + src1, src2, dst, .. + } => { + collector.reg_def(dst.to_writable_reg()); + collector.reg_use(src1.to_reg()); + src2.get_operands(collector); + } Inst::CvtUint64ToFloatSeq { src, dst, diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 69e9f229f15c..9691cec469ef 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3336,23 +3336,40 @@ ;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Note that the `cvtsi2s{s,d}` instruction is not just an int-to-float +;; conversion instruction in isolation, it also takes the upper 64-bits of an +;; xmm register and places it into the destination. We don't actually want that +;; to happen as it could accidentally create a false dependency with a +;; previous instruction defining the register's upper 64-bits.
See #7085 for +;; an instance of this. +;; +;; This means that the first operand to each of the int-to-float conversions +;; here is an `(xmm_zero)` operand: a guaranteed zero register that has no +;; dependencies on other instructions. +;; +;; Ideally this zeroing would be lifted out to a higher level so that it could +;; be deduplicated between consecutive int-to-float operations, but that's not +;; easy to do at this time. One possibility would be a mid-end rule which +;; rewrites `fcvt_from_sint` to an x86-specific opcode taking an explicit zero +;; constant, which would then be subject to normal LICM, but that's not +;; feasible today. + (rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I8)))) - (x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign)))) + (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign)))) (rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I16)))) - (x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign)))) + (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign)))) (rule 1 (lower (has_type $F32 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty)))))) - (x64_cvtsi2ss ty a)) + (x64_cvtsi2ss ty (xmm_zero $F32X4) a)) (rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I8)))) - (x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign)))) + (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign)))) (rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I16)))) - (x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign)))) + (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign)))) (rule 1 (lower (has_type $F64 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty)))))) - (x64_cvtsi2sd ty a)) + (x64_cvtsi2sd ty (xmm_zero $F64X2) a)) (rule 0 (lower (fcvt_from_sint a @ (value_type $I32X4))) (x64_cvtdq2ps a)) @@ -3363,10 +3380,10 @@ ;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 1 (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty)))))) - (x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero)))) + (x64_cvtsi2ss $I64 (xmm_zero $F32X4) (extend_to_gpr val $I64 (ExtendKind.Zero)))) (rule 1 (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty)))))) - (x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero)))) + (x64_cvtsi2sd $I64 (xmm_zero $F64X2) (extend_to_gpr val $I64 (ExtendKind.Zero)))) (rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64)))) (cvt_u64_to_float_seq ty val)) diff --git a/cranelift/filetests/filetests/isa/x64/fastcall.clif b/cranelift/filetests/filetests/isa/x64/fastcall.clif index 86e865b61493..d83c9fbee0f0 100644 --- a/cranelift/filetests/filetests/isa/x64/fastcall.clif +++ b/cranelift/filetests/filetests/isa/x64/fastcall.clif @@ -241,20 +241,27 @@ block0(v0: i64): ; pushq %rbp ; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } ; movq %rsp, %rbp -; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 16 } +; subq %rsp, $16, %rsp +; movq %rdi, 0(%rsp) +; unwind SaveReg { clobber_offset: 0, reg: p7i } ; block0: -; cvtsi2sd %rcx, %xmm3 +; uninit %xmm3 +; xorpd %xmm3, %xmm3, %xmm3 +; cvtsi2sd %xmm3, %rcx, %xmm3 ; subq %rsp, $48, %rsp ; virtual_sp_offset_adjust 48 ; movq %rcx, 32(%rsp) ; movq %rcx, 40(%rsp) ; movq %rcx, %rdx -; load_ext_name %g+0, %r11 +; load_ext_name %g+0, %rdi ; movq %rdx, %rcx ; movdqa %xmm3,
%xmm2 -; call *%r11 +; call *%rdi ; addq %rsp, $48, %rsp ; virtual_sp_offset_adjust -48 +; movq 0(%rsp), %rdi +; addq %rsp, $16, %rsp ; movq %rbp, %rsp ; popq %rbp ; ret @@ -263,17 +270,22 @@ block0(v0: i64): ; block0: ; offset 0x0 ; pushq %rbp ; movq %rsp, %rbp -; block1: ; offset 0x4 +; subq $0x10, %rsp +; movq %rdi, (%rsp) +; block1: ; offset 0xc +; xorpd %xmm3, %xmm3 ; cvtsi2sdq %rcx, %xmm3 ; subq $0x30, %rsp ; movq %rcx, 0x20(%rsp) ; movq %rcx, 0x28(%rsp) ; movq %rcx, %rdx -; movabsq $0, %r11 ; reloc_external Abs8 %g 0 +; movabsq $0, %rdi ; reloc_external Abs8 %g 0 ; movq %rdx, %rcx ; movdqa %xmm3, %xmm2 -; callq *%r11 +; callq *%rdi ; addq $0x30, %rsp +; movq (%rsp), %rdi +; addq $0x10, %rsp ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/fcvt-avx.clif b/cranelift/filetests/filetests/isa/x64/fcvt-avx.clif index 98e47d2b79f8..d46212e18597 100644 --- a/cranelift/filetests/filetests/isa/x64/fcvt-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/fcvt-avx.clif @@ -11,7 +11,9 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vcvtsi2ss %edi, %xmm0 +; uninit %xmm2 +; vxorps %xmm2, %xmm2, %xmm4 +; vcvtsi2ss %xmm4, %edi, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -21,7 +23,8 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vcvtsi2ssl %edi, %xmm0, %xmm0 +; vxorps %xmm2, %xmm2, %xmm4 +; vcvtsi2ssl %edi, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -36,7 +39,9 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vcvtsi2ss %rdi, %xmm0 +; uninit %xmm2 +; vxorps %xmm2, %xmm2, %xmm4 +; vcvtsi2ss %xmm4, %rdi, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -46,7 +51,8 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vcvtsi2ssq %rdi, %xmm0, %xmm0 +; vxorps %xmm2, %xmm2, %xmm4 +; vcvtsi2ssq %rdi, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -61,7 +67,9 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vcvtsi2sd %edi, %xmm0 +; uninit %xmm2 +; vxorpd %xmm2, %xmm2, %xmm4 +; vcvtsi2sd %xmm4, %edi, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -71,7 +79,8 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vcvtsi2sdl %edi, %xmm0, %xmm0 +; vxorpd %xmm2, %xmm2, %xmm4 +; vcvtsi2sdl %edi, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -86,7 +95,9 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vcvtsi2sd %rdi, %xmm0 +; uninit %xmm2 +; vxorpd %xmm2, %xmm2, %xmm4 +; vcvtsi2sd %xmm4, %rdi, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -96,7 +107,8 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vcvtsi2sdq %rdi, %xmm0, %xmm0 +; vxorpd %xmm2, %xmm2, %xmm4 +; vcvtsi2sdq %rdi, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/fcvt.clif b/cranelift/filetests/filetests/isa/x64/fcvt.clif index 0dfa8637e0cd..1a04881cf022 100644 --- a/cranelift/filetests/filetests/isa/x64/fcvt.clif +++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif @@ -11,8 +11,10 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movsbl %dil, %eax -; cvtsi2ss %eax, %xmm0 +; uninit %xmm0 +; xorps %xmm0, %xmm0, %xmm0 +; movsbl %dil, %r9d +; cvtsi2ss %xmm0, %r9d, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -22,8 +24,9 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movsbl %dil, %eax -; cvtsi2ssl %eax, %xmm0 +; xorps %xmm0, %xmm0 +; movsbl %dil, %r9d +; cvtsi2ssl %r9d, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -38,8 +41,10 @@ block0(v0: 
i16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movswl %di, %eax -; cvtsi2ss %eax, %xmm0 +; uninit %xmm0 +; xorps %xmm0, %xmm0, %xmm0 +; movswl %di, %r9d +; cvtsi2ss %xmm0, %r9d, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -49,8 +54,9 @@ block0(v0: i16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movswl %di, %eax -; cvtsi2ssl %eax, %xmm0 +; xorps %xmm0, %xmm0 +; movswl %di, %r9d +; cvtsi2ssl %r9d, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -65,7 +71,9 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; cvtsi2ss %edi, %xmm0 +; uninit %xmm0 +; xorps %xmm0, %xmm0, %xmm0 +; cvtsi2ss %xmm0, %edi, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -75,6 +83,7 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 +; xorps %xmm0, %xmm0 ; cvtsi2ssl %edi, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -90,7 +99,9 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; cvtsi2ss %rdi, %xmm0 +; uninit %xmm0 +; xorps %xmm0, %xmm0, %xmm0 +; cvtsi2ss %xmm0, %rdi, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -100,6 +111,7 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 +; xorps %xmm0, %xmm0 ; cvtsi2ssq %rdi, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -115,8 +127,10 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movsbl %dil, %eax -; cvtsi2sd %eax, %xmm0 +; uninit %xmm0 +; xorpd %xmm0, %xmm0, %xmm0 +; movsbl %dil, %r9d +; cvtsi2sd %xmm0, %r9d, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -126,8 +140,9 @@ block0(v0: i8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movsbl %dil, %eax -; cvtsi2sdl %eax, %xmm0 +; xorpd %xmm0, %xmm0 +; movsbl %dil, %r9d +; cvtsi2sdl %r9d, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -142,8 +157,10 @@ block0(v0: i16): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movswl %di, %eax -; cvtsi2sd %eax, %xmm0 +; uninit %xmm0 +; xorpd %xmm0, %xmm0, %xmm0 +; movswl %di, %r9d +; cvtsi2sd %xmm0, %r9d, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -153,8 +170,9 @@ block0(v0: i16): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movswl %di, %eax -; cvtsi2sdl %eax, %xmm0 +; xorpd %xmm0, %xmm0 +; movswl %di, %r9d +; cvtsi2sdl %r9d, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq @@ -169,7 +187,9 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; cvtsi2sd %edi, %xmm0 +; uninit %xmm0 +; xorpd %xmm0, %xmm0, %xmm0 +; cvtsi2sd %xmm0, %edi, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -179,6 +199,7 @@ block0(v0: i32): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 +; xorpd %xmm0, %xmm0 ; cvtsi2sdl %edi, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -194,7 +215,9 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; cvtsi2sd %rdi, %xmm0 +; uninit %xmm0 +; xorpd %xmm0, %xmm0, %xmm0 +; cvtsi2sd %xmm0, %rdi, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -204,6 +227,7 @@ block0(v0: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 +; xorpd %xmm0, %xmm0 ; cvtsi2sdq %rdi, %xmm0 ; movq %rbp, %rsp ; popq %rbp @@ -251,16 +275,22 @@ block0(v0: i8, v1: i16, v2: i32, v3: i64): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movzbq %dil, %r9 -; cvtsi2ss %r9, %xmm0 -; movzwq %si, %r9 -; cvtsi2ss %r9, %xmm1 -; movl %edx, %r9d -; cvtsi2ss %r9, %xmm2 -; u64_to_f32_seq %rcx, %xmm6, %r9, %r10 -; addss %xmm0, %xmm1, %xmm0 -; addss %xmm0, %xmm2, %xmm0 +; uninit %xmm0 +; xorps %xmm0, %xmm0, %xmm0 +; movzbq %dil, %r8 +; cvtsi2ss %xmm0, %r8, %xmm0 +; uninit %xmm6 +; xorps %xmm6, %xmm6, %xmm6 +; movzwq %si, %r8 +; cvtsi2ss %xmm6, %r8, %xmm6 +; uninit %xmm7 +; xorps %xmm7, %xmm7, %xmm7 +; movl %edx, 
%r8d +; cvtsi2ss %xmm7, %r8, %xmm7 +; u64_to_f32_seq %rcx, %xmm4, %r8, %rdx ; addss %xmm0, %xmm6, %xmm0 +; addss %xmm0, %xmm7, %xmm0 +; addss %xmm0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -270,26 +300,29 @@ block0(v0: i8, v1: i16, v2: i32, v3: i64): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movzbq %dil, %r9 -; cvtsi2ssq %r9, %xmm0 -; movzwq %si, %r9 -; cvtsi2ssq %r9, %xmm1 -; movl %edx, %r9d -; cvtsi2ssq %r9, %xmm2 +; xorps %xmm0, %xmm0 +; movzbq %dil, %r8 +; cvtsi2ssq %r8, %xmm0 +; xorps %xmm6, %xmm6 +; movzwq %si, %r8 +; cvtsi2ssq %r8, %xmm6 +; xorps %xmm7, %xmm7 +; movl %edx, %r8d +; cvtsi2ssq %r8, %xmm7 ; cmpq $0, %rcx -; jl 0x32 -; cvtsi2ssq %rcx, %xmm6 -; jmp 0x4c -; movq %rcx, %r9 -; shrq $1, %r9 -; movq %rcx, %r10 -; andq $1, %r10 -; orq %r9, %r10 -; cvtsi2ssq %r10, %xmm6 -; addss %xmm6, %xmm6 -; addss %xmm1, %xmm0 -; addss %xmm2, %xmm0 +; jl 0x3b +; cvtsi2ssq %rcx, %xmm4 +; jmp 0x55 +; movq %rcx, %r8 +; shrq $1, %r8 +; movq %rcx, %rdx +; andq $1, %rdx +; orq %r8, %rdx +; cvtsi2ssq %rdx, %xmm4 +; addss %xmm4, %xmm4 ; addss %xmm6, %xmm0 +; addss %xmm7, %xmm0 +; addss %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq
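In sum, the difference visible in the `fcvt.clif` golden output above is one dependency-breaking zeroing instruction ahead of each scalar conversion, e.g. for an `i64` source:

    ; before                        ; after
    cvtsi2sdq %rdi, %xmm0           xorpd     %xmm0, %xmm0
                                    cvtsi2sdq %rdi, %xmm0

Because xor-with-self is a recognized zeroing idiom on modern x86 cores, the added `xorpd`/`xorps` (or `vxorps`/`vxorpd` under AVX) is resolved at register rename, severing the dependency on `%xmm0`'s previous value essentially for free at the cost of a few bytes of code.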