Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[IGNORE] [mono] Add support for SSE1 in JIT mode. #88075

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion src/mono/mono/arch/amd64/amd64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,11 @@ typedef union {
/* Store form of movss: forwards prefix 0xf3 and opcode bytes 0x0f 0x11 to emit_sse_membase_reg. */
#define amd64_sse_movss_membase_reg(inst,basereg,disp,reg) emit_sse_membase_reg ((inst), (basereg), (disp), (reg), 0xf3, 0x0f, 0x11)

/*
 * Load forms (destination register first, then the basereg/disp memory operand).
 * movss passes a 0xf3 prefix; movlps/movhps go through the two-byte (_op2) emitter
 * and pass only the opcode bytes 0x0f 0x12 / 0x0f 0x16.
 */
#define amd64_sse_movss_reg_membase(inst,dreg,basereg,disp) emit_sse_reg_membase ((inst), (dreg), (basereg), (disp), 0xf3, 0x0f, 0x10)
#define amd64_sse_movlps_reg_membase(inst,dreg,basereg,disp) emit_sse_reg_membase_op2 ((inst), (dreg), (basereg), (disp), 0x0f, 0x12)
#define amd64_sse_movhps_reg_membase(inst,dreg,basereg,disp) emit_sse_reg_membase_op2 ((inst), (dreg), (basereg), (disp), 0x0f, 0x16)

/* Store forms of movlps/movhps: same two-byte emitter family, opcode bytes 0x0f 0x13 / 0x0f 0x17. */
#define amd64_sse_movlps_membase_reg(inst,basereg,disp,reg) emit_sse_membase_reg_op2 ((inst), (basereg), (disp), (reg), 0x0f, 0x13)
#define amd64_sse_movhps_membase_reg(inst,basereg,disp,reg) emit_sse_membase_reg_op2 ((inst), (basereg), (disp), (reg), 0x0f, 0x17)

/*
 * Register-register compares sharing opcode bytes 0x0f 0x2f; they differ only in the
 * prefix byte handed to emit_sse_reg_reg (0x66 for comisd, 0x67 for comiss).
 * NOTE(review): 0x67 is not one of the usual SSE prefixes (0x66/0xf2/0xf3) used by the
 * neighbouring macros -- presumably a filler byte because this emitter always takes a
 * prefix argument; confirm against the instruction reference before changing it.
 */
#define amd64_sse_comisd_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst),(dreg),(reg),0x66,0x0f,0x2f)
#define amd64_sse_comiss_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst),(dreg),(reg),0x67,0x0f,0x2f)
Expand All @@ -813,9 +818,11 @@ typedef union {
/*
 * Floating point -> integer conversions.
 *
 * Encodings forwarded to emit_sse_reg_reg_size (prefix, 0x0f, opcode):
 *   cvtss2si   0xf3 0x0f 0x2d   single precision source
 *   cvttss2si  0xf3 0x0f 0x2c   single precision source, truncating
 *   cvttsd2si  0xf2 0x0f 0x2c   double precision source, truncating
 *
 * The macros without a _size suffix are shorthands that pass a size of 8.
 */
#define amd64_sse_cvtss2si_reg_reg(inst,dreg,reg) emit_sse_reg_reg_size ((inst), (dreg), (reg), 0xf3, 0x0f, 0x2d, 8)

/* Explicit-size variants; `size` is forwarded unchanged to the emitter. */
#define amd64_sse_cvttsd2si_reg_reg_size(inst,dreg,reg,size) emit_sse_reg_reg_size ((inst), (dreg), (reg), 0xf2, 0x0f, 0x2c, (size))
/* Exactly one definition per name: cvtss2si uses opcode 0x2d; 0x2c belongs to cvttss2si. */
#define amd64_sse_cvtss2si_reg_reg_size(inst,dreg,reg,size) emit_sse_reg_reg_size ((inst), (dreg), (reg), 0xf3, 0x0f, 0x2d, (size))
#define amd64_sse_cvttss2si_reg_reg_size(inst,dreg,reg,size) emit_sse_reg_reg_size ((inst), (dreg), (reg), 0xf3, 0x0f, 0x2c, (size))

#define amd64_sse_cvttsd2si_reg_reg(inst,dreg,reg) amd64_sse_cvttsd2si_reg_reg_size ((inst), (dreg), (reg), 8)
/* Must forward to the single-precision (ss) encoder, not the double-precision (sd) one. */
#define amd64_sse_cvttss2si_reg_reg(inst,dreg,reg) amd64_sse_cvttss2si_reg_reg_size ((inst), (dreg), (reg), 8)

/* Integer -> double precision conversion: prefix 0xf2, opcode bytes 0x0f 0x2a. */
#define amd64_sse_cvtsi2sd_reg_reg_size(inst,dreg,reg,size) emit_sse_reg_reg_size ((inst), (dreg), (reg), 0xf2, 0x0f, 0x2a, (size))

Expand Down Expand Up @@ -1178,6 +1185,8 @@ typedef union {

/*
 * movntps, spelled with the (dreg, basereg, disp) argument order. It passes the same
 * opcode bytes (0x0f 0x2b) as the membase_reg spelling below; only the helper and the
 * argument order differ.
 */
#define amd64_sse_movntps_reg_membase(inst, dreg, basereg, disp) emit_sse_reg_membase_op2((inst), (dreg), (basereg), (disp), 0x0f, 0x2b)

/* movntps with the store-style (basereg, disp, reg) argument order used by the other *_membase_reg macros. */
#define amd64_sse_movntps_membase_reg(inst, basereg, disp, reg) emit_sse_membase_reg_op2((inst), (basereg), (disp), (reg), 0x0f, 0x2b)

/*
 * Prefetch of the basereg/disp memory operand (opcode bytes 0x0f 0x18). `arg` is forwarded
 * in the slot where the load macros pass a destination register.
 * NOTE(review): presumably `arg` selects the prefetch hint -- confirm against the callers.
 */
#define amd64_sse_prefetch_reg_membase(inst, arg, basereg, disp) emit_sse_reg_membase_op2((inst), (arg), (basereg), (disp), 0x0f, 0x18)

/* lzcnt between two registers: prefix 0xf3, opcode bytes 0x0f 0xbd; `size` is forwarded unchanged. */
#define amd64_sse_lzcnt_reg_reg_size(inst, dreg, reg, size) emit_sse_reg_reg_size((inst), (dreg), (reg), 0xf3, 0x0f, 0xbd, (size))
Expand All @@ -1191,6 +1200,8 @@ typedef union {
/* blendpd with an immediate: four encoding bytes (0x66, 0x0f, 0x3a, 0x0d) followed by `imm`. */
#define amd64_sse_blendpd_reg_reg(inst,dreg,sreg,imm) emit_sse_reg_reg_op4_imm((inst), (dreg), (sreg), 0x66, 0x0f, 0x3a, 0x0d, (imm))
/* Register-to-register movq: prefix 0xf3, opcode bytes 0x0f 0x7e. */
#define amd64_movq_reg_reg(inst,dreg,sreg) emit_sse_reg_reg ((inst), (dreg), (sreg), 0xf3, 0x0f, 0x7e)

/* sfence takes no operands: emits the fixed three-byte sequence 0x0f 0xae 0xf8. */
#define amd64_sse_sfence(inst) emit_opcode3 ((inst), 0x0f, 0xae, 0xf8)

/* Generated from x86-codegen.h */

/* Wraps x86_breakpoint in a single statement; the `size` argument is accepted but not used. */
#define amd64_breakpoint_size(inst,size) do { x86_breakpoint(inst); } while (0)
Expand Down
19 changes: 19 additions & 0 deletions src/mono/mono/mini/cpu-amd64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -825,12 +825,31 @@ expand_i4: dest:x src1:i len:11
expand_i8: dest:x src1:i len:11
expand_r4: dest:x src1:f len:16
expand_r8: dest:x src1:f len:13
xop: len:16
xop_x_x_x: dest:x src1:x src2:x len:16 clob:1
xop_x_x: dest:x src1:x len:16 clob:1
xop_i4_x: dest:i src1:x len:16
xop_i8_x: dest:i src1:x len:16
sse41_dpps_imm: dest:x src1:x src2:x len:7 clob:1
sse41_dppd_imm: dest:x src1:x src2:x len:7 clob:1
vector_andnot: dest:x src1:x src2:x len:7 clob:1

sse_cmpss: dest:x src1:x src2:x len:21 clob:1
sse_comiss: dest:i src1:x src2:x len:7 clob:1
sse_ucomiss: dest:i src1:x src2:x len:7 clob:1
sse_addss: dest:x src1:x src2:x len:7 clob:1
sse_subss: dest:x src1:x src2:x len:7 clob:1
sse_mulss: dest:x src1:x src2:x len:7 clob:1
sse_divss: dest:x src1:x src2:x len:7 clob:1
sse_cvtsi2ss: dest:x src1:x src2:i len:7 clob:1
sse_cvtsi2ss64: dest:x src1:x src2:i len:7 clob:1
sse_movss: dest:x src1:b len:16
sse_movlps_load: dest:x src1:x src2:b len:16 clob:1
sse_movhps_load: dest:x src1:x src2:b len:16 clob:1
sse_movlps_store: src1:i src2:x len:16
sse_movhps_store: src1:i src2:x len:16
sse_movss_store: src1:i src2:x len:16

roundp: dest:x src1:x len:10

liverange_start: len:0
Expand Down
81 changes: 75 additions & 6 deletions src/mono/mono/mini/mini-amd64.c
Original file line number Diff line number Diff line change
Expand Up @@ -5923,23 +5923,23 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
break;

case OP_RCONV_TO_I1:
amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
amd64_sse_cvttss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
amd64_widen_reg (code, ins->dreg, ins->dreg, TRUE, FALSE);
break;
case OP_RCONV_TO_U1:
amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
amd64_sse_cvttss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE);
break;
case OP_RCONV_TO_I2:
amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
amd64_sse_cvttss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
amd64_widen_reg (code, ins->dreg, ins->dreg, TRUE, TRUE);
break;
case OP_RCONV_TO_U2:
amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
amd64_sse_cvttss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, TRUE);
break;
case OP_RCONV_TO_I4:
amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
amd64_sse_cvttss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
break;
case OP_RCONV_TO_U4:
// Use 8 as register size to get Nan/Inf conversion result truncated to 0
Expand Down Expand Up @@ -6712,6 +6712,16 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
case OP_XOP:
switch (ins->inst_c0) {
case INTRINS_SSE_SFENCE:
amd64_sse_sfence (code);
break;
default:
g_assert_not_reached ();
break;
}
break;
case OP_XOP_X_X_X: {
switch (ins->inst_c0) {
case INTRINS_SSE_PHADDW:
Expand Down Expand Up @@ -6746,6 +6756,28 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
case OP_XOP_I4_X:
case OP_XOP_I8_X: {
switch (ins->inst_c0) {
case INTRINS_SSE_CVTSS2SI:
amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
break;
case INTRINS_SSE_CVTTSS2SI:
amd64_sse_cvttss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 4);
break;
case INTRINS_SSE_CVTSS2SI64:
amd64_sse_cvtss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 8);
break;
case INTRINS_SSE_CVTTSS2SI64:
amd64_sse_cvttss2si_reg_reg_size (code, ins->dreg, ins->sreg1, 8);
break;
default:
g_assert_not_reached ();
break;
}
break;
}

case OP_SSE41_DPPS_IMM:
amd64_sse_dpps_reg_reg (code, ins->dreg, ins->sreg2, ins->inst_c0);
break;
Expand Down Expand Up @@ -6936,6 +6968,43 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
amd64_sse_movddup_reg_reg (code, ins->dreg, ins->sreg1);
break;

case OP_SSE_ADDSS:
amd64_sse_addss_reg_reg (code, ins->dreg, ins->sreg2);
break;
case OP_SSE_SUBSS:
amd64_sse_subss_reg_reg (code, ins->dreg, ins->sreg2);
break;
case OP_SSE_MULSS:
amd64_sse_mulss_reg_reg (code, ins->dreg, ins->sreg2);
break;
case OP_SSE_DIVSS:
amd64_sse_divss_reg_reg (code, ins->dreg, ins->sreg2);
break;
case OP_SSE_CVTSI2SS:
amd64_sse_cvtsi2ss_reg_reg_size (code, ins->dreg, ins->sreg2, 4);
break;
case OP_SSE_CVTSI2SS64:
amd64_sse_cvtsi2ss_reg_reg_size (code, ins->dreg, ins->sreg2, 8);
break;
case OP_SSE_MOVSS:
amd64_sse_movss_reg_membase (code, ins->dreg, ins->sreg1, 0);
break;
case OP_SSE_MOVLPS_LOAD:
amd64_sse_movlps_reg_membase (code, ins->dreg, ins->sreg2, 0);
break;
case OP_SSE_MOVHPS_LOAD:
amd64_sse_movhps_reg_membase (code, ins->dreg, ins->sreg2, 0);
break;
case OP_SSE_MOVLPS_STORE:
amd64_sse_movlps_membase_reg (code, ins->sreg1, 0, ins->sreg2);
break;
case OP_SSE_MOVHPS_STORE:
amd64_sse_movhps_membase_reg (code, ins->sreg1, 0, ins->sreg2);
break;
case OP_SSE_MOVSS_STORE:
amd64_sse_movss_membase_reg (code, ins->sreg1, 0, ins->sreg2);
break;

case OP_EXTRACT_MASK:
amd64_sse_pmovmskb_reg_reg (code, ins->dreg, ins->sreg1);
break;
Expand Down Expand Up @@ -7422,7 +7491,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
amd64_sse_movaps_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
break;
case OP_STOREX_NTA_MEMBASE_REG:
amd64_sse_movntps_reg_membase (code, ins->dreg, ins->sreg1, ins->inst_offset);
amd64_sse_movntps_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1);
break;
case OP_PREFETCH_MEMBASE:
amd64_sse_prefetch_reg_membase (code, ins->backend.arg_info, ins->sreg1, ins->inst_offset);
Expand Down
115 changes: 106 additions & 9 deletions src/mono/mono/mini/simd-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -986,6 +986,73 @@ emit_hardware_intrinsics (
goto support_probe_complete;
id = info->id;

#ifdef TARGET_AMD64
if (intrin_group->feature == MONO_CPU_X86_SSE && !COMPILE_LLVM (cfg)) {
switch (id) {
case SN_CompareEqual:
case SN_CompareGreaterThan:
case SN_CompareGreaterThanOrEqual:
case SN_CompareLessThan:
case SN_CompareLessThanOrEqual:
case SN_CompareNotEqual:
case SN_CompareNotGreaterThan:
case SN_CompareNotGreaterThanOrEqual:
case SN_CompareNotLessThan:
case SN_CompareNotLessThanOrEqual:
case SN_CompareOrdered:
case SN_CompareScalarEqual:
case SN_CompareScalarGreaterThan:
case SN_CompareScalarGreaterThanOrEqual:
case SN_CompareScalarLessThan:
case SN_CompareScalarLessThanOrEqual:
case SN_CompareScalarNotEqual:
case SN_CompareScalarNotGreaterThan:
case SN_CompareScalarNotGreaterThanOrEqual:
case SN_CompareScalarNotLessThan:
case SN_CompareScalarNotLessThanOrEqual:
case SN_CompareScalarOrdered:
case SN_CompareScalarOrderedEqual:
case SN_CompareScalarOrderedGreaterThan:
case SN_CompareScalarOrderedGreaterThanOrEqual:
case SN_CompareScalarOrderedLessThan:
case SN_CompareScalarOrderedLessThanOrEqual:
case SN_CompareScalarOrderedNotEqual:
case SN_CompareScalarUnordered:
case SN_CompareScalarUnorderedEqual:
case SN_CompareScalarUnorderedGreaterThan:
case SN_CompareScalarUnorderedGreaterThanOrEqual:
case SN_CompareScalarUnorderedLessThan:
case SN_CompareScalarUnorderedLessThanOrEqual:
case SN_CompareScalarUnorderedNotEqual:
case SN_CompareUnordered:
case SN_Max:
case SN_MaxScalar:
case SN_Min:
case SN_MinScalar:
case SN_MoveHighToLow:
case SN_MoveLowToHigh:
case SN_MoveMask:
case SN_MoveScalar:
case SN_Prefetch0:
case SN_Prefetch1:
case SN_Prefetch2:
case SN_PrefetchNonTemporal:
case SN_Reciprocal:
case SN_ReciprocalScalar:
case SN_ReciprocalSqrt:
case SN_ReciprocalSqrtScalar:
case SN_Shuffle:
case SN_Sqrt:
case SN_SqrtScalar:
case SN_UnpackHigh:
case SN_UnpackLow:
return NULL;
default:
break;
}
}
#endif

#ifdef TARGET_ARM64
if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp) && !intrin_group->jit_supported) {
goto support_probe_complete;
Expand Down Expand Up @@ -4196,7 +4263,7 @@ emit_arm64_intrinsics (
static SimdIntrinsic sse_methods [] = {
{SN_Add, OP_XBINOP, OP_FADD},
{SN_AddScalar, OP_SSE_ADDSS},
{SN_And, OP_SSE_AND},
{SN_And, OP_ANDPS},
{SN_AndNot, OP_VECTOR_ANDN},
{SN_CompareEqual, OP_XCOMPARE_FP, CMP_EQ},
{SN_CompareGreaterThan, OP_XCOMPARE_FP,CMP_GT},
Expand Down Expand Up @@ -4241,11 +4308,11 @@ static SimdIntrinsic sse_methods [] = {
{SN_ConvertToInt64WithTruncation, OP_XOP_I8_X, INTRINS_SSE_CVTTSS2SI64},
{SN_Divide, OP_XBINOP, OP_FDIV},
{SN_DivideScalar, OP_SSE_DIVSS},
{SN_LoadAlignedVector128, OP_SSE_LOADU, 16 /* alignment */},
{SN_LoadAlignedVector128, OP_LOADX_ALIGNED_MEMBASE},
{SN_LoadHigh, OP_SSE_MOVHPS_LOAD},
{SN_LoadLow, OP_SSE_MOVLPS_LOAD},
{SN_LoadScalarVector128, OP_SSE_MOVSS},
{SN_LoadVector128, OP_SSE_LOADU, 1 /* alignment */},
{SN_LoadVector128, OP_LOADX_MEMBASE},
{SN_Max, OP_XOP_X_X_X, INTRINS_SSE_MAXPS},
{SN_MaxScalar, OP_XOP_X_X_X, INTRINS_SSE_MAXSS},
{SN_Min, OP_XOP_X_X_X, INTRINS_SSE_MINPS},
Expand All @@ -4256,7 +4323,7 @@ static SimdIntrinsic sse_methods [] = {
{SN_MoveScalar, OP_SSE_MOVS2},
{SN_Multiply, OP_XBINOP, OP_FMUL},
{SN_MultiplyScalar, OP_SSE_MULSS},
{SN_Or, OP_SSE_OR},
{SN_Or, OP_ORPS},
{SN_Prefetch0, OP_SSE_PREFETCHT0},
{SN_Prefetch1, OP_SSE_PREFETCHT1},
{SN_Prefetch2, OP_SSE_PREFETCHT2},
Expand All @@ -4268,9 +4335,9 @@ static SimdIntrinsic sse_methods [] = {
{SN_Shuffle},
{SN_Sqrt, OP_XOP_X_X, INTRINS_SIMD_SQRT_R4},
{SN_SqrtScalar},
{SN_Store, OP_SIMD_STORE, 1 /* alignment */},
{SN_StoreAligned, OP_SIMD_STORE, 16 /* alignment */},
{SN_StoreAlignedNonTemporal, OP_SSE_MOVNTPS, 16 /* alignment */},
{SN_Store},
{SN_StoreAligned},
{SN_StoreAlignedNonTemporal},
{SN_StoreFence, OP_XOP, INTRINS_SSE_SFENCE},
{SN_StoreHigh, OP_SSE_MOVHPS_STORE},
{SN_StoreLow, OP_SSE_MOVLPS_STORE},
Expand All @@ -4279,7 +4346,7 @@ static SimdIntrinsic sse_methods [] = {
{SN_SubtractScalar, OP_SSE_SUBSS},
{SN_UnpackHigh, OP_SSE_UNPACKHI},
{SN_UnpackLow, OP_SSE_UNPACKLO},
{SN_Xor, OP_SSE_XOR},
{SN_Xor, OP_XORPS},
{SN_get_IsSupported}
};

Expand Down Expand Up @@ -4526,7 +4593,7 @@ static const IntrinGroup supported_x86_intrinsics [] = {
{ "Lzcnt", MONO_CPU_X86_LZCNT, lzcnt_methods, sizeof (lzcnt_methods), TRUE },
{ "Pclmulqdq", MONO_CPU_X86_PCLMUL, pclmulqdq_methods, sizeof (pclmulqdq_methods) },
{ "Popcnt", MONO_CPU_X86_POPCNT, popcnt_methods, sizeof (popcnt_methods), TRUE },
{ "Sse", MONO_CPU_X86_SSE, sse_methods, sizeof (sse_methods) },
{ "Sse", MONO_CPU_X86_SSE, sse_methods, sizeof (sse_methods), TRUE },
{ "Sse2", MONO_CPU_X86_SSE2, sse2_methods, sizeof (sse2_methods) },
{ "Sse3", MONO_CPU_X86_SSE3, sse3_methods, sizeof (sse3_methods) },
{ "Sse41", MONO_CPU_X86_SSE41, sse41_methods, sizeof (sse41_methods) },
Expand Down Expand Up @@ -4576,6 +4643,36 @@ emit_x86_intrinsics (
g_assert_not_reached ();
break;
}
case SN_Store: {
if (!COMPILE_LLVM (cfg)) {
MonoInst *ins;
EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_MEMBASE, args [0]->dreg, 0, args [1]->dreg);
ins->klass = klass;
return ins;
} else {
return emit_simd_ins_for_sig (cfg, klass, OP_SIMD_STORE, 1, arg0_type, fsig, args);
}
}
case SN_StoreAligned: {
if (!COMPILE_LLVM (cfg)) {
MonoInst *ins;
EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_ALIGNED_MEMBASE_REG, args [0]->dreg, 0, args [1]->dreg);
ins->klass = klass;
return ins;
} else {
return emit_simd_ins_for_sig (cfg, klass, OP_SIMD_STORE, 16, arg0_type, fsig, args);
}
}
case SN_StoreAlignedNonTemporal: {
if (!COMPILE_LLVM (cfg)) {
MonoInst *ins;
EMIT_NEW_STORE_MEMBASE (cfg, ins, OP_STOREX_NTA_MEMBASE_REG, args [0]->dreg, 0, args [1]->dreg);
ins->klass = klass;
return ins;
} else {
return emit_simd_ins_for_sig (cfg, klass, OP_SSE_MOVNTPS, 16, arg0_type, fsig, args);
}
}
case SN_LoadScalarVector128:
return NULL;
default:
Expand Down