diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index e9fb7779cadf..a3c7bb654c1c 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -437,6 +437,23 @@
             (dst WritableXmm)
             (src_size OperandSize))
 
+    ;; Conversion from signed integers to floats, the `{v,}cvtsi2s{s,d}`
+    ;; instructions.
+    ;;
+    ;; Note that this is special in that `src1` is an xmm/float register
+    ;; while `src2` is a general purpose register, as this is converting an
+    ;; integer in a gpr to an equivalent float in an xmm reg.
+    (CvtIntToFloat (op SseOpcode)
+                   (src1 Xmm)
+                   (src2 GprMem)
+                   (dst WritableXmm)
+                   (src2_size OperandSize))
+    (CvtIntToFloatVex (op AvxOpcode)
+                      (src1 Xmm)
+                      (src2 GprMem)
+                      (dst WritableXmm)
+                      (src2_size OperandSize))
+
     ;; Converts an unsigned int64 to a float32/float64.
     (CvtUint64ToFloatSeq (dst_size OperandSize) ;; 4 or 8
                          (src Gpr)
@@ -2095,6 +2112,18 @@
            (_ Unit (emit (MInst.UnaryRmRImmVex size op src dst imm))))
        dst))
 
+(decl cvt_int_to_float (SseOpcode Xmm GprMem OperandSize) Xmm)
+(rule (cvt_int_to_float op src1 src2 size)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.CvtIntToFloat op src1 src2 dst size))))
+        dst))
+
+(decl cvt_int_to_float_vex (AvxOpcode Xmm GprMem OperandSize) Xmm)
+(rule (cvt_int_to_float_vex op src1 src2 size)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.CvtIntToFloatVex op src1 src2 dst size))))
+        dst))
+
 (decl cvt_u64_to_float_seq (Type Gpr) Xmm)
 (rule (cvt_u64_to_float_seq ty src)
       (let ((size OperandSize (raw_operand_size_of_type ty))
@@ -4351,20 +4380,20 @@
   (xmm_unary_rm_r_vex (AvxOpcode.Vcvtdq2pd) x))
 
 ;; Helper for creating `cvtsi2ss` instructions.
-(decl x64_cvtsi2ss (Type GprMem) Xmm)
-(rule (x64_cvtsi2ss ty x)
-      (gpr_to_xmm (SseOpcode.Cvtsi2ss) x (raw_operand_size_of_type ty)))
-(rule 1 (x64_cvtsi2ss ty x)
+(decl x64_cvtsi2ss (Type Xmm GprMem) Xmm)
+(rule (x64_cvtsi2ss ty x y)
+      (cvt_int_to_float (SseOpcode.Cvtsi2ss) x y (raw_operand_size_of_type ty)))
+(rule 1 (x64_cvtsi2ss ty x y)
       (if-let $true (use_avx))
-      (gpr_to_xmm_vex (AvxOpcode.Vcvtsi2ss) x (raw_operand_size_of_type ty)))
+      (cvt_int_to_float_vex (AvxOpcode.Vcvtsi2ss) x y (raw_operand_size_of_type ty)))
 
 ;; Helper for creating `cvtsi2sd` instructions.
-(decl x64_cvtsi2sd (Type GprMem) Xmm)
-(rule (x64_cvtsi2sd ty x)
-      (gpr_to_xmm (SseOpcode.Cvtsi2sd) x (raw_operand_size_of_type ty)))
-(rule 1 (x64_cvtsi2sd ty x)
+(decl x64_cvtsi2sd (Type Xmm GprMem) Xmm)
+(rule (x64_cvtsi2sd ty x y)
+      (cvt_int_to_float (SseOpcode.Cvtsi2sd) x y (raw_operand_size_of_type ty)))
+(rule 1 (x64_cvtsi2sd ty x y)
       (if-let $true (use_avx))
-      (gpr_to_xmm_vex (AvxOpcode.Vcvtsi2sd) x (raw_operand_size_of_type ty)))
+      (cvt_int_to_float_vex (AvxOpcode.Vcvtsi2sd) x y (raw_operand_size_of_type ty)))
 
 ;; Helper for creating `cvttps2dq` instructions.
 (decl x64_cvttps2dq (XmmMem) Xmm)
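The `src1`/`src2` split above mirrors the actual semantics of the instruction: the conversion result lands in the low lane of `dst`, while the remaining lanes are carried over from the xmm source. A minimal sketch of that merging behavior, using Rust's own SSE2 intrinsics (illustrative only, not part of this patch; `merge_demo` is a made-up name):

    use core::arch::x86_64::{_mm_cvtsd_f64, _mm_cvtsi64_sd, _mm_set_pd, _mm_unpackhi_pd};

    // cvtsi2sd writes only the low lane of its destination; the upper lane
    // comes from the first (xmm) source operand, i.e. `src1` above.
    fn merge_demo() {
        unsafe {
            let src1 = _mm_set_pd(42.0, 0.0); // upper lane = 42.0
            let dst = _mm_cvtsi64_sd(src1, -7); // `src2` is the integer operand
            assert_eq!(_mm_cvtsd_f64(dst), -7.0); // low lane: the converted value
            // The upper lane was inherited from `src1`, not zeroed.
            assert_eq!(_mm_cvtsd_f64(_mm_unpackhi_pd(dst, dst)), 42.0);
        }
    }

    fn main() {
        merge_demo();
    }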
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 0be0db116c2e..6bd65154eb23 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -32,8 +32,14 @@ fn emit_signed_cvt(
     } else {
         SseOpcode::Cvtsi2ss
     };
-    let inst = Inst::gpr_to_xmm(op, RegMem::reg(src), OperandSize::Size64, dst);
-    inst.emit(&[], sink, info, state);
+    Inst::CvtIntToFloat {
+        op,
+        dst: Writable::from_reg(Xmm::new(dst.to_reg()).unwrap()),
+        src1: Xmm::new(dst.to_reg()).unwrap(),
+        src2: GprMem::new(RegMem::reg(src)).unwrap(),
+        src2_size: OperandSize::Size64,
+    }
+    .emit(&[], sink, info, state);
 }
 
 /// Emits a one way conditional jump if CC is set (true).
@@ -2872,30 +2878,21 @@ pub(crate) fn emit(
             let (prefix, map, opcode) = match op {
                 // vmovd/vmovq are differentiated by `w`
                 AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x6E),
-                AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
-                AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
             let w = match src_size {
                 OperandSize::Size64 => true,
                 _ => false,
             };
-            let mut insn = VexInstruction::new()
+            VexInstruction::new()
                 .length(VexVectorLength::V128)
                 .w(w)
                 .prefix(prefix)
                 .map(map)
                 .opcode(opcode)
                 .rm(src)
-                .reg(dst.to_real_reg().unwrap().hw_enc());
-            // These opcodes technically take a second operand which is the
-            // upper bits to preserve during the float conversion. We don't
-            // actually use this in this backend right now so reuse the
-            // destination register. This at least matches what LLVM does.
-            if let AvxOpcode::Vcvtsi2ss | AvxOpcode::Vcvtsi2sd = op {
-                insn = insn.vvvv(dst.to_real_reg().unwrap().hw_enc());
-            }
-            insn.encode(sink);
+                .reg(dst.to_real_reg().unwrap().hw_enc())
+                .encode(sink);
        }
 
         Inst::XmmRmREvex {
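In the VEX encoding, the extra xmm source rides in the prefix's `vvvv` field (stored inverted), which is why the special-cased `.vvvv(...)` call removed above can migrate into the dedicated `CvtIntToFloatVex` arm added further down. A rough sketch of how that byte of a 3-byte VEX prefix is packed, following the standard layout (the helper name is illustrative, not from this patch):

    /// Packs the final byte of a 3-byte VEX prefix: W | ~vvvv | L | pp.
    /// For vcvtsi2s{s,d}, `vvvv` carries the xmm register whose upper
    /// bits are preserved -- the new `src1` operand.
    fn vex_byte3(w: bool, vvvv: u8, l: bool, pp: u8) -> u8 {
        ((w as u8) << 7) | ((!vvvv & 0xf) << 3) | ((l as u8) << 2) | (pp & 0b11)
    }

    fn main() {
        // E.g. W=1 (64-bit gpr source), src1 = %xmm4, scalar length, and
        // pp=0b11 (the F2 mandatory prefix of vcvtsi2sd) pack to 0xDB.
        assert_eq!(vex_byte3(true, 4, false, 0b11), 0xDB);
    }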
@@ -3200,8 +3197,6 @@
                 // Movd and movq use the same opcode; the presence of the REX prefix (set below)
                 // actually determines which is used.
                 SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E),
-                SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
-                SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
                 _ => panic!("unexpected opcode {:?}", op),
             };
             let rex = RexFlags::from(*src_size);
@@ -3239,6 +3234,72 @@
             }
         }
 
+        Inst::CvtIntToFloat {
+            op,
+            src1,
+            src2,
+            dst,
+            src2_size,
+        } => {
+            let src1 = allocs.next(src1.to_reg());
+            let dst = allocs.next(dst.to_reg().to_reg());
+            assert_eq!(src1, dst);
+            let src2 = src2.clone().to_reg_mem().with_allocs(allocs);
+
+            let (prefix, opcode) = match op {
+                SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
+                SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
+                _ => panic!("unexpected opcode {:?}", op),
+            };
+            let rex = RexFlags::from(*src2_size);
+            match src2 {
+                RegMem::Reg { reg: src2 } => {
+                    emit_std_reg_reg(sink, prefix, opcode, 2, dst, src2, rex);
+                }
+                RegMem::Mem { addr } => {
+                    let addr = &addr.finalize(state, sink);
+                    emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex, 0);
+                }
+            }
+        }
+
+        Inst::CvtIntToFloatVex {
+            op,
+            src1,
+            src2,
+            dst,
+            src2_size,
+        } => {
+            let dst = allocs.next(dst.to_reg().to_reg());
+            let src1 = allocs.next(src1.to_reg());
+            let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
+                RegMem::Reg { reg } => {
+                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
+                }
+                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
+            };
+
+            let (prefix, map, opcode) = match op {
+                AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
+                AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
+                _ => unimplemented!("Opcode {:?} not implemented", op),
+            };
+            let w = match src2_size {
+                OperandSize::Size64 => true,
+                _ => false,
+            };
+            VexInstruction::new()
+                .length(VexVectorLength::V128)
+                .w(w)
+                .prefix(prefix)
+                .map(map)
+                .opcode(opcode)
+                .rm(src2)
+                .reg(dst.to_real_reg().unwrap().hw_enc())
+                .vvvv(src1.to_real_reg().unwrap().hw_enc())
+                .encode(sink);
+        }
+
         Inst::CvtUint64ToFloatSeq {
             dst_size,
             src,
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index d0050b8d4499..147f61deb832 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -5051,26 +5051,6 @@ fn test_x64_emit() {
         "664C0F6EFF",
         "movq %rdi, %xmm15",
     ));
-    insns.push((
-        Inst::gpr_to_xmm(
-            SseOpcode::Cvtsi2ss,
-            RegMem::reg(rdi),
-            OperandSize::Size32,
-            w_xmm15,
-        ),
-        "F3440F2AFF",
-        "cvtsi2ss %edi, %xmm15",
-    ));
-    insns.push((
-        Inst::gpr_to_xmm(
-            SseOpcode::Cvtsi2sd,
-            RegMem::reg(rsi),
-            OperandSize::Size64,
-            w_xmm1,
-        ),
-        "F2480F2ACE",
-        "cvtsi2sd %rsi, %xmm1",
-    ));
 
     // ========================================================
     // XmmRmi
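The two emit-test vectors deleted above still pin down the byte sequence the new `CvtIntToFloat` arm must reproduce unchanged: mandatory prefix (F3/F2), optional REX, then `0F 2A` and a ModRM byte. A self-contained sketch (this is not Cranelift's emitter) that reassembles those exact bytes:

    // Assembles the reg-reg form of cvtsi2ss/cvtsi2sd by hand.
    fn cvtsi2s_bytes(double: bool, size64: bool, gpr: u8, xmm: u8) -> Vec<u8> {
        let mut out = vec![if double { 0xF2 } else { 0xF3 }];
        let rex = 0x40
            | ((size64 as u8) << 3)   // REX.W: 64-bit integer source
            | (((xmm >> 3) & 1) << 2) // REX.R: extends reg to xmm8..xmm15
            | ((gpr >> 3) & 1);       // REX.B: extends rm to r8..r15
        if rex != 0x40 {
            out.push(rex);
        }
        out.extend([0x0F, 0x2A, 0xC0 | ((xmm & 7) << 3) | (gpr & 7)]);
        out
    }

    fn main() {
        // cvtsi2ss %edi, %xmm15 was "F3440F2AFF" (rdi = 7, xmm15 = 15).
        assert_eq!(cvtsi2s_bytes(false, false, 7, 15), [0xF3, 0x44, 0x0F, 0x2A, 0xFF]);
        // cvtsi2sd %rsi, %xmm1 was "F2480F2ACE" (rsi = 6, xmm1 = 1).
        assert_eq!(cvtsi2s_bytes(true, true, 6, 1), [0xF2, 0x48, 0x0F, 0x2A, 0xCE]);
    }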
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index ca72a434bb87..2a321c7f44f6 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -178,7 +178,8 @@ impl Inst {
            | Inst::XmmToGprImm { op, .. }
            | Inst::XmmUnaryRmRImm { op, .. }
            | Inst::XmmUnaryRmRUnaligned { op, .. }
-           | Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
+           | Inst::XmmUnaryRmR { op, .. }
+           | Inst::CvtIntToFloat { op, .. } => smallvec![op.available_from()],
 
             Inst::XmmUnaryRmREvex { op, .. }
             | Inst::XmmRmREvex { op, .. }
@@ -196,7 +197,8 @@ impl Inst {
            | Inst::XmmMovRMImmVex { op, .. }
            | Inst::XmmToGprImmVex { op, .. }
            | Inst::XmmToGprVex { op, .. }
-           | Inst::GprToXmmVex { op, .. } => op.available_from(),
+           | Inst::GprToXmmVex { op, .. }
+           | Inst::CvtIntToFloatVex { op, .. } => op.available_from(),
         }
     }
 }
@@ -1296,6 +1298,34 @@ impl PrettyPrint for Inst {
                 format!("{op} {src}, {dst}")
             }
 
+            Inst::CvtIntToFloat {
+                op,
+                src1,
+                src2,
+                dst,
+                src2_size,
+            } => {
+                let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
+                let dst = pretty_print_reg(*dst.to_reg(), 8, allocs);
+                let src2 = src2.pretty_print(src2_size.to_bytes(), allocs);
+                let op = ljustify(op.to_string());
+                format!("{op} {src1}, {src2}, {dst}")
+            }
+
+            Inst::CvtIntToFloatVex {
+                op,
+                src1,
+                src2,
+                dst,
+                src2_size,
+            } => {
+                let dst = pretty_print_reg(*dst.to_reg(), 8, allocs);
+                let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
+                let src2 = src2.pretty_print(src2_size.to_bytes(), allocs);
+                let op = ljustify(op.to_string());
+                format!("{op} {src1}, {src2}, {dst}")
+            }
+
             Inst::CvtUint64ToFloatSeq {
                 src,
                 dst,
@@ -2164,6 +2194,20 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
         collector.reg_def(dst.to_writable_reg());
         src.get_operands(collector);
     }
+    Inst::CvtIntToFloat {
+        src1, src2, dst, ..
+    } => {
+        collector.reg_use(src1.to_reg());
+        collector.reg_reuse_def(dst.to_writable_reg(), 0);
+        src2.get_operands(collector);
+    }
+    Inst::CvtIntToFloatVex {
+        src1, src2, dst, ..
+    } => {
+        collector.reg_def(dst.to_writable_reg());
+        collector.reg_use(src1.to_reg());
+        src2.get_operands(collector);
+    }
     Inst::CvtUint64ToFloatSeq {
         src,
         dst,
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 69e9f229f15c..9691cec469ef 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3336,23 +3336,40 @@
 ;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; Note that `cvtsi2s{s,d}` is not just an int-to-float conversion
+;; instruction in isolation: it also takes the upper bits of an xmm
+;; register and places them into the destination. We don't actually want
+;; that to happen, as it can accidentally create a false dependency on a
+;; previous instruction defining the register's upper bits. See #7085 for
+;; an instance of this.
+;;
+;; This means that the first operand to each of the int-to-float conversions
+;; here is an `(xmm_zero)` operand, a guaranteed zero register that has no
+;; dependencies on other instructions.
+;;
+;; Ideally this zeroing would be lifted out to a higher level so it could be
+;; deduplicated between consecutive int-to-float operations, but that's not
+;; easy to do at this time. One possibility would be a mid-end rule which
+;; rewrites `fcvt_from_sint` to an x86-specific opcode using a zero constant,
+;; which would be subject to normal LICM, but that's not feasible today.
+
 (rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I8))))
-      (x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
+      (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign))))
 
 (rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I16))))
-      (x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
+      (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign))))
 
 (rule 1 (lower (has_type $F32 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty))))))
-      (x64_cvtsi2ss ty a))
+      (x64_cvtsi2ss ty (xmm_zero $F32X4) a))
 
 (rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I8))))
-      (x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
+      (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign))))
 
 (rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I16))))
-      (x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
+      (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign))))
 
 (rule 1 (lower (has_type $F64 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty))))))
-      (x64_cvtsi2sd ty a))
+      (x64_cvtsi2sd ty (xmm_zero $F64X2) a))
 
 (rule 0 (lower (fcvt_from_sint a @ (value_type $I32X4)))
       (x64_cvtdq2ps a))
@@ -3363,10 +3380,10 @@
 ;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule 1 (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
-      (x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
+      (x64_cvtsi2ss $I64 (xmm_zero $F32X4) (extend_to_gpr val $I64 (ExtendKind.Zero))))
 
 (rule 1 (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
-      (x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
+      (x64_cvtsi2sd $I64 (xmm_zero $F64X2) (extend_to_gpr val $I64 (ExtendKind.Zero))))
 
 (rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64))))
       (cvt_u64_to_float_seq ty val))
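The fix is easiest to see in intrinsic form: zeroing the destination first uses an xor-with-self, which CPUs recognize as dependency-breaking, so the convert-with-merge that follows cannot stall on a stale producer of the register. A sketch of the sequence the rules above now emit (illustrative, not part of this patch):

    use core::arch::x86_64::{_mm_cvtsi32_ss, _mm_cvtss_f32, _mm_setzero_ps};

    // Moral equivalent of the new lowering:
    //   xorps    %xmm0, %xmm0   ; known-zero, no dependency on prior defs
    //   cvtsi2ss %edi, %xmm0    ; merge the conversion into zeroed lanes
    fn i32_to_f32(x: i32) -> f32 {
        unsafe {
            let zero = _mm_setzero_ps();
            _mm_cvtss_f32(_mm_cvtsi32_ss(zero, x))
        }
    }

    fn main() {
        assert_eq!(i32_to_f32(-3), -3.0);
    }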
diff --git a/cranelift/filetests/filetests/isa/x64/fastcall.clif b/cranelift/filetests/filetests/isa/x64/fastcall.clif
index 86e865b61493..d83c9fbee0f0 100644
--- a/cranelift/filetests/filetests/isa/x64/fastcall.clif
+++ b/cranelift/filetests/filetests/isa/x64/fastcall.clif
@@ -241,20 +241,27 @@ block0(v0: i64):
 ; pushq %rbp
 ; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
 ; movq %rsp, %rbp
-; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
+; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 16 }
+; subq %rsp, $16, %rsp
+; movq %rdi, 0(%rsp)
+; unwind SaveReg { clobber_offset: 0, reg: p7i }
 ; block0:
-; cvtsi2sd %rcx, %xmm3
+; uninit %xmm3
+; xorpd %xmm3, %xmm3, %xmm3
+; cvtsi2sd %xmm3, %rcx, %xmm3
 ; subq %rsp, $48, %rsp
 ; virtual_sp_offset_adjust 48
 ; movq %rcx, 32(%rsp)
 ; movq %rcx, 40(%rsp)
 ; movq %rcx, %rdx
-; load_ext_name %g+0, %r11
+; load_ext_name %g+0, %rdi
 ; movq %rdx, %rcx
 ; movdqa %xmm3, %xmm2
-; call *%r11
+; call *%rdi
 ; addq %rsp, $48, %rsp
 ; virtual_sp_offset_adjust -48
+; movq 0(%rsp), %rdi
+; addq %rsp, $16, %rsp
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -263,17 +270,22 @@ block0(v0: i64):
 ; block0: ; offset 0x0
 ; pushq %rbp
 ; movq %rsp, %rbp
-; block1: ; offset 0x4
+; subq $0x10, %rsp
+; movq %rdi, (%rsp)
+; block1: ; offset 0xc
+; xorpd %xmm3, %xmm3
 ; cvtsi2sdq %rcx, %xmm3
 ; subq $0x30, %rsp
 ; movq %rcx, 0x20(%rsp)
 ; movq %rcx, 0x28(%rsp)
 ; movq %rcx, %rdx
-; movabsq $0, %r11 ; reloc_external Abs8 %g 0
+; movabsq $0, %rdi ; reloc_external Abs8 %g 0
 ; movq %rdx, %rcx
 ; movdqa %xmm3, %xmm2
-; callq *%r11
+; callq *%rdi
 ; addq $0x30, %rsp
+; movq (%rsp), %rdi
+; addq $0x10, %rsp
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
diff --git a/cranelift/filetests/filetests/isa/x64/fcvt-avx.clif b/cranelift/filetests/filetests/isa/x64/fcvt-avx.clif
index 98e47d2b79f8..d46212e18597 100644
--- a/cranelift/filetests/filetests/isa/x64/fcvt-avx.clif
+++ b/cranelift/filetests/filetests/isa/x64/fcvt-avx.clif
@@ -11,7 +11,9 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; vcvtsi2ss %edi, %xmm0
+; uninit %xmm2
+; vxorps %xmm2, %xmm2, %xmm4
+; vcvtsi2ss %xmm4, %edi, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -21,7 +23,8 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; vcvtsi2ssl %edi, %xmm0, %xmm0
+; vxorps %xmm2, %xmm2, %xmm4
+; vcvtsi2ssl %edi, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -36,7 +39,9 @@ block0(v0: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; vcvtsi2ss %rdi, %xmm0
+; uninit %xmm2
+; vxorps %xmm2, %xmm2, %xmm4
+; vcvtsi2ss %xmm4, %rdi, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -46,7 +51,8 @@ block0(v0: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; vcvtsi2ssq %rdi, %xmm0, %xmm0
+; vxorps %xmm2, %xmm2, %xmm4
+; vcvtsi2ssq %rdi, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -61,7 +67,9 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; vcvtsi2sd %edi, %xmm0
+; uninit %xmm2
+; vxorpd %xmm2, %xmm2, %xmm4
+; vcvtsi2sd %xmm4, %edi, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -71,7 +79,8 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; vcvtsi2sdl %edi, %xmm0, %xmm0
+; vxorpd %xmm2, %xmm2, %xmm4
+; vcvtsi2sdl %edi, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -86,7 +95,9 @@ block0(v0: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; vcvtsi2sd %rdi, %xmm0
+; uninit %xmm2
+; vxorpd %xmm2, %xmm2, %xmm4
+; vcvtsi2sd %xmm4, %rdi, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -96,7 +107,8 @@ block0(v0: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; vcvtsi2sdq %rdi, %xmm0, %xmm0
+; vxorpd %xmm2, %xmm2, %xmm4
+; vcvtsi2sdq %rdi, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
diff --git a/cranelift/filetests/filetests/isa/x64/fcvt.clif b/cranelift/filetests/filetests/isa/x64/fcvt.clif
index 0dfa8637e0cd..1a04881cf022 100644
--- a/cranelift/filetests/filetests/isa/x64/fcvt.clif
+++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif
@@ -11,8 +11,10 @@ block0(v0: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movsbl %dil, %eax
-; cvtsi2ss %eax, %xmm0
+; uninit %xmm0
+; xorps %xmm0, %xmm0, %xmm0
+; movsbl %dil, %r9d
+; cvtsi2ss %xmm0, %r9d, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -22,8 +24,9 @@ block0(v0: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movsbl %dil, %eax
-; cvtsi2ssl %eax, %xmm0
+; xorps %xmm0, %xmm0
+; movsbl %dil, %r9d
+; cvtsi2ssl %r9d, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -38,8 +41,10 @@ block0(v0: i16):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movswl %di, %eax
-; cvtsi2ss %eax, %xmm0
+; uninit %xmm0
+; xorps %xmm0, %xmm0, %xmm0
+; movswl %di, %r9d
+; cvtsi2ss %xmm0, %r9d, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -49,8 +54,9 @@ block0(v0: i16):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movswl %di, %eax
-; cvtsi2ssl %eax, %xmm0
+; xorps %xmm0, %xmm0
+; movswl %di, %r9d
+; cvtsi2ssl %r9d, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -65,7 +71,9 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; cvtsi2ss %edi, %xmm0
+; uninit %xmm0
+; xorps %xmm0, %xmm0, %xmm0
+; cvtsi2ss %xmm0, %edi, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -75,6 +83,7 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
+; xorps %xmm0, %xmm0
 ; cvtsi2ssl %edi, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
@@ -90,7 +99,9 @@ block0(v0: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; cvtsi2ss %rdi, %xmm0
+; uninit %xmm0
+; xorps %xmm0, %xmm0, %xmm0
+; cvtsi2ss %xmm0, %rdi, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -100,6 +111,7 @@ block0(v0: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
+; xorps %xmm0, %xmm0
 ; cvtsi2ssq %rdi, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
@@ -115,8 +127,10 @@ block0(v0: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movsbl %dil, %eax
-; cvtsi2sd %eax, %xmm0
+; uninit %xmm0
+; xorpd %xmm0, %xmm0, %xmm0
+; movsbl %dil, %r9d
+; cvtsi2sd %xmm0, %r9d, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -126,8 +140,9 @@ block0(v0: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movsbl %dil, %eax
-; cvtsi2sdl %eax, %xmm0
+; xorpd %xmm0, %xmm0
+; movsbl %dil, %r9d
+; cvtsi2sdl %r9d, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -142,8 +157,10 @@ block0(v0: i16):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movswl %di, %eax
-; cvtsi2sd %eax, %xmm0
+; uninit %xmm0
+; xorpd %xmm0, %xmm0, %xmm0
+; movswl %di, %r9d
+; cvtsi2sd %xmm0, %r9d, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -153,8 +170,9 @@ block0(v0: i16):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movswl %di, %eax
-; cvtsi2sdl %eax, %xmm0
+; xorpd %xmm0, %xmm0
+; movswl %di, %r9d
+; cvtsi2sdl %r9d, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
@@ -169,7 +187,9 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; cvtsi2sd %edi, %xmm0
+; uninit %xmm0
+; xorpd %xmm0, %xmm0, %xmm0
+; cvtsi2sd %xmm0, %edi, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -179,6 +199,7 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
+; xorpd %xmm0, %xmm0
 ; cvtsi2sdl %edi, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
@@ -194,7 +215,9 @@ block0(v0: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; cvtsi2sd %rdi, %xmm0
+; uninit %xmm0
+; xorpd %xmm0, %xmm0, %xmm0
+; cvtsi2sd %xmm0, %rdi, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -204,6 +227,7 @@ block0(v0: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
+; xorpd %xmm0, %xmm0
 ; cvtsi2sdq %rdi, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
@@ -251,16 +275,22 @@ block0(v0: i8, v1: i16, v2: i32, v3: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movzbq %dil, %r9
-; cvtsi2ss %r9, %xmm0
-; movzwq %si, %r9
-; cvtsi2ss %r9, %xmm1
-; movl %edx, %r9d
-; cvtsi2ss %r9, %xmm2
-; u64_to_f32_seq %rcx, %xmm6, %r9, %r10
-; addss %xmm0, %xmm1, %xmm0
-; addss %xmm0, %xmm2, %xmm0
+; uninit %xmm0
+; xorps %xmm0, %xmm0, %xmm0
+; movzbq %dil, %r8
+; cvtsi2ss %xmm0, %r8, %xmm0
+; uninit %xmm6
+; xorps %xmm6, %xmm6, %xmm6
+; movzwq %si, %r8
+; cvtsi2ss %xmm6, %r8, %xmm6
+; uninit %xmm7
+; xorps %xmm7, %xmm7, %xmm7
+; movl %edx, %r8d
+; cvtsi2ss %xmm7, %r8, %xmm7
+; u64_to_f32_seq %rcx, %xmm4, %r8, %rdx
 ; addss %xmm0, %xmm6, %xmm0
+; addss %xmm0, %xmm7, %xmm0
+; addss %xmm0, %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -270,26 +300,29 @@ block0(v0: i8, v1: i16, v2: i32, v3: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block1: ; offset 0x4
-; movzbq %dil, %r9
-; cvtsi2ssq %r9, %xmm0
-; movzwq %si, %r9
-; cvtsi2ssq %r9, %xmm1
-; movl %edx, %r9d
-; cvtsi2ssq %r9, %xmm2
+; xorps %xmm0, %xmm0
+; movzbq %dil, %r8
+; cvtsi2ssq %r8, %xmm0
+; xorps %xmm6, %xmm6
+; movzwq %si, %r8
+; cvtsi2ssq %r8, %xmm6
+; xorps %xmm7, %xmm7
+; movl %edx, %r8d
+; cvtsi2ssq %r8, %xmm7
 ; cmpq $0, %rcx
-; jl 0x32
-; cvtsi2ssq %rcx, %xmm6
-; jmp 0x4c
-; movq %rcx, %r9
-; shrq $1, %r9
-; movq %rcx, %r10
-; andq $1, %r10
-; orq %r9, %r10
-; cvtsi2ssq %r10, %xmm6
-; addss %xmm6, %xmm6
-; addss %xmm1, %xmm0
-; addss %xmm2, %xmm0
+; jl 0x3b
+; cvtsi2ssq %rcx, %xmm4
+; jmp 0x55
+; movq %rcx, %r8
+; shrq $1, %r8
+; movq %rcx, %rdx
+; andq $1, %rdx
+; orq %r8, %rdx
+; cvtsi2ssq %rdx, %xmm4
+; addss %xmm4, %xmm4
 ; addss %xmm6, %xmm0
+; addss %xmm7, %xmm0
+; addss %xmm4, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; retq
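The branchy `u64_to_f32_seq` expectations above encode the usual unsigned-to-float trick, sketched here in plain Rust (illustrative, not part of this patch): when the sign bit is clear, a signed convert suffices; otherwise halve the input with the low bit folded back in (so the final rounding still sees it), convert, and double.

    fn u64_to_f32(x: u64) -> f32 {
        if (x as i64) >= 0 {
            // cvtsi2ssq works directly when the top bit is clear.
            x as i64 as f32
        } else {
            // shrq $1; andq $1; orq -- halve, keeping the low bit sticky...
            let halved = (x >> 1) | (x & 1);
            // ...convert the now in-range value, then double it
            // (the trailing `addss` of the register to itself).
            let f = halved as i64 as f32;
            f + f
        }
    }

    fn main() {
        // Spot checks: agrees with Rust's own conversion at the extremes.
        assert_eq!(u64_to_f32(u64::MAX), u64::MAX as f32);
        assert_eq!(u64_to_f32(1 << 63), (1u64 << 63) as f32);
    }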