From f2119844cf453df504cae4211dd889aca18e674f Mon Sep 17 00:00:00 2001
From: anthonycanino
Date: Mon, 10 Apr 2023 16:27:23 -0700
Subject: [PATCH] Adding support for Vector512 `Equals`, `EqualsAny`,
 `op_Equality`, and `op_Inequality`. (#83470)

* Implement `Vector512.Equals` by way of a special `AVX512` intrinsic.

* Add Vector512 `Equals`, `EqualsAll`/`op_Equality`, and `op_Inequality`.

* Fix `vpmovm2x` instructions for `MoveMaskToVectorSpecial`.

* Move `Vector512` `Equals` into `gtNewSimdCmpOpNode`.

* Add `EqualsAny`.

* Fix `kortestq` and `kortestd` opcode gen (W bit).

* Fix merge and update instr flags.

* Addressing review comments.

* Removing unnecessary files.

* Addressing review comments.

* Fixing bug.

* Fixing k reg display on x86.

* Setting Resets_ZF to the right value.

* Fixing extract.

---------

Co-authored-by: Deepak Rajendrakumaran
Co-authored-by: Tanner Gooding
---
 src/coreclr/jit/emitxarch.cpp               | 68 +++++++++++++++-
 src/coreclr/jit/emitxarch.h                 |  7 +-
 src/coreclr/jit/gentree.cpp                 | 42 +++++++++-
 src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 45 ++++++++++-
 src/coreclr/jit/hwintrinsiclistxarch.h      | 14 +++-
 src/coreclr/jit/hwintrinsicxarch.cpp        | 48 ++++++++++++
 src/coreclr/jit/instr.h                     | 59 +++++++-------
 src/coreclr/jit/instrsxarch.h               | 86 ++++++++++++---------
 src/coreclr/jit/lowerxarch.cpp              | 23 +++++-
 src/coreclr/jit/targetx86.h                 |  2 +-
 10 files changed, 311 insertions(+), 83 deletions(-)
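Before the file-by-file changes, it helps to pin down the semantics being wired up. The sketch below is an illustrative scalar model, not JIT code, with the element type and count fixed arbitrarily to int32_t/16: `Equals` yields a per-element result, `EqualsAll`/`op_Equality` asks whether every element matched, and `EqualsAny` whether at least one did. On AVX-512 the per-element result lands as one bit per element in a K mask register (`vpcmp*`), which is then either widened back into a vector (`vpmovm2*`) or tested directly (`kortest*`), as the hunks below implement.

// Illustrative scalar model of the Vector512 operations this patch accelerates.
#include <cstdint>

// Vector512<int>.Equals: per-element compare; each result is one bit of a
// K-register-style mask, as vpcmpd produces.
uint16_t CompareEqualMask(const int32_t a[16], const int32_t b[16])
{
    uint16_t mask = 0;
    for (int i = 0; i < 16; i++)
    {
        mask |= (uint16_t)((a[i] == b[i]) ? 1u : 0u) << i;
    }
    return mask;
}

// EqualsAll / op_Equality: every element matched (mask is all ones).
bool EqualsAll(const int32_t a[16], const int32_t b[16])
{
    return CompareEqualMask(a, b) == 0xFFFF;
}

// EqualsAny: at least one element matched (mask is nonzero).
bool EqualsAny(const int32_t a[16], const int32_t b[16])
{
    return CompareEqualMask(a, b) != 0;
}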
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index d65e84123e3b8..b201609bd23af 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -34,9 +34,19 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins)
     return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION);
 }
 
+//------------------------------------------------------------------------
+// IsKInstruction: Does this instruction require a K register.
+//
+// Arguments:
+//    ins - The instruction to check.
+//
+// Returns:
+//    `true` if this instruction requires a K register.
+//
 bool emitter::IsKInstruction(instruction ins)
 {
-    return (ins >= INS_FIRST_K_INSTRUCTION) && (ins <= INS_LAST_K_INSTRUCTION);
+    insFlags flags = CodeGenInterface::instInfo[ins];
+    return (flags & KInstruction) != 0;
 }
 
 //------------------------------------------------------------------------
@@ -243,6 +253,17 @@ bool emitter::IsDstSrcSrcAVXInstruction(instruction ins) const
     return (flags & INS_Flags_IsDstSrcSrcAVXInstruction) != 0;
 }
 
+bool emitter::IsThreeOperandAVXInstruction(instruction ins) const
+{
+    if (!UseSimdEncoding())
+    {
+        return false;
+    }
+
+    insFlags flags = CodeGenInterface::instInfo[ins];
+    return (flags & INS_Flags_Is3OperandInstructionMask) != 0;
+}
+
 //------------------------------------------------------------------------
 // HasRegularWideForm: Many x86/x64 instructions follow a regular encoding scheme where the
 //    byte-sized version of an instruction has the lowest bit of the opcode cleared
@@ -9849,6 +9870,10 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName)
 #endif // TARGET_AMD64
 
 #ifdef TARGET_X86
+    if (isMaskReg(reg))
+    {
+        return rn;
+    }
     assert(strlen(rn) >= 3);
 
     switch (EA_SIZE(attr))
@@ -18367,7 +18392,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
         case INS_vpmovd2m:
         case INS_vpmovq2m:
         {
-            result.insLatency += PERFSCORE_LATENCY_1C;
+            result.insLatency += PERFSCORE_LATENCY_3C;
             result.insThroughput = PERFSCORE_THROUGHPUT_1C;
             break;
         }
@@ -18386,6 +18411,45 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             break;
         }
 
+        case INS_vpcmpb:
+        case INS_vpcmpw:
+        case INS_vpcmpd:
+        case INS_vpcmpq:
+        case INS_vpcmpub:
+        case INS_vpcmpuw:
+        case INS_vpcmpud:
+        case INS_vpcmpuq:
+        {
+            result.insLatency += PERFSCORE_LATENCY_4C;
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            break;
+        }
+
+        case INS_vpmovm2b:
+        case INS_vpmovm2w:
+        {
+            result.insLatency += PERFSCORE_LATENCY_3C;
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            break;
+        }
+        case INS_vpmovm2d:
+        case INS_vpmovm2q:
+        {
+            result.insLatency += PERFSCORE_LATENCY_1C;
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            break;
+        }
+
+        case INS_kortestb:
+        case INS_kortestw:
+        case INS_kortestd:
+        case INS_kortestq:
+        {
+            result.insLatency += PERFSCORE_LATENCY_1C;
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            break;
+        }
+
         default:
             // unhandled instruction insFmt combination
             perfScoreUnhandledInstruction(id, &result);
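Why `IsKInstruction` stops being a range check is visible in the `instrsxarch.h` hunk later in this patch: the mask instructions no longer sit in one contiguous `FIRST_K_INSTRUCTION`..`LAST_K_INSTRUCTION` block, but are redistributed into the AVX512F/BW/DQ sections next to the ISA that introduces each width, so membership has to travel with the instruction as a flag bit. A minimal standalone model of the change — the enum values and table contents here are hypothetical, only the lookup shape matches the JIT:

#include <cstdint>

enum instruction { INS_kmovw, INS_kortestw, INS_vpcmpd, INS_addps, INS_COUNT };

const uint64_t KInstruction = 1ULL << 41;

// Per-instruction flags table, playing the role of CodeGenInterface::instInfo.
// Only the kmov/kortest rows carry KInstruction, matching the table edits below.
static const uint64_t instInfo[INS_COUNT] = {
    /* INS_kmovw    */ KInstruction,
    /* INS_kortestw */ KInstruction,
    /* INS_vpcmpd   */ 0,
    /* INS_addps    */ 0,
};

bool IsKInstruction(instruction ins)
{
    // A table lookup works wherever the instruction sits in the enum,
    // unlike the old (ins >= FIRST && ins <= LAST) range test.
    return (instInfo[ins] & KInstruction) != 0;
}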
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 435753adb0065..25846609b507f 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -199,6 +199,7 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
 bool HasKMaskRegisterDest(instruction ins) const
 {
     assert(UseEvexEncoding() == true);
+
     switch (ins)
     {
         // Requires KMask.
@@ -403,6 +404,7 @@ void SetContains256bitOrMoreAVX(bool value)
 
 bool IsDstDstSrcAVXInstruction(instruction ins) const;
 bool IsDstSrcSrcAVXInstruction(instruction ins) const;
+bool IsThreeOperandAVXInstruction(instruction ins) const;
 static bool HasRegularWideForm(instruction ins);
 static bool HasRegularWideImmediateForm(instruction ins);
 static bool DoesWriteZeroFlag(instruction ins);
@@ -414,11 +416,6 @@ static bool IsRexW1Instruction(instruction ins);
 static bool IsRexWXInstruction(instruction ins);
 static bool IsRexW1EvexInstruction(instruction ins);
 
-bool IsThreeOperandAVXInstruction(instruction ins)
-{
-    return (IsDstDstSrcAVXInstruction(ins) || IsDstSrcSrcAVXInstruction(ins));
-}
-
 bool isAvxBlendv(instruction ins)
 {
     return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb;
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 6a4a04ab75369..27073d655726d 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -20172,6 +20172,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
 
     NamedIntrinsic intrinsic = NI_Illegal;
 
+    if (simdSize == 64)
+    {
+        assert(op == GT_EQ);
+    }
+
     switch (op)
     {
 #if defined(TARGET_XARCH)
@@ -20191,6 +20196,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
                     intrinsic = NI_AVX2_CompareEqual;
                 }
             }
+            else if (simdSize == 64)
+            {
+                assert(IsBaselineVector512IsaSupportedDebugOnly());
+                intrinsic = NI_AVX512F_CompareEqualSpecial;
+            }
             else if (simdBaseType == TYP_FLOAT)
             {
                 intrinsic = NI_SSE_CompareEqual;
@@ -20760,7 +20770,20 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
     }
 
     assert(intrinsic != NI_Illegal);
+
+#if defined(TARGET_XARCH)
+    if (simdSize != 64)
+    {
+        return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
+    }
+    else
+    {
+        GenTree* cmp = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, intrinsic, simdBaseJitType, simdSize);
+        return gtNewSimdHWIntrinsicNode(type, cmp, NI_AVX512F_MoveMaskToVectorSpecial, simdBaseJitType, simdSize);
+    }
+#else
     return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
+#endif
 }
 
 GenTree* Compiler::gtNewSimdCmpOpAllNode(
@@ -20788,7 +20811,12 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(
 #if defined(TARGET_XARCH)
         case GT_EQ:
         {
-            if (simdSize == 32)
+            if (simdSize == 64)
+            {
+                assert(IsBaselineVector512IsaSupportedDebugOnly());
+                intrinsic = NI_Vector512_op_Equality;
+            }
+            else if (simdSize == 32)
             {
                 assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
                 assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
@@ -20934,6 +20962,11 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(
 
                     intrinsic = NI_Vector256_op_Inequality;
                 }
+                else if (simdSize == 64)
+                {
+                    assert(IsBaselineVector512IsaSupportedDebugOnly());
+                    intrinsic = NI_Vector512_op_Inequality;
+                }
                 else
                 {
                     intrinsic = NI_Vector128_op_Inequality;
@@ -20957,7 +20990,12 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(
 
         case GT_NE:
         {
-            if (simdSize == 32)
+            if (simdSize == 64)
+            {
+                assert(IsBaselineVector512IsaSupportedDebugOnly());
+                intrinsic = NI_Vector512_op_Inequality;
+            }
+            else if (simdSize == 32)
             {
                 assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
                 assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
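The two-node sequence built above is the heart of the change: an EVEX comparison does not produce a vector, it writes one bit per element into a K register, so a 512-bit `Equals` that must still return a `Vector512<T>` needs a second instruction to turn mask bits back into lanes. That is the job of `NI_AVX512F_MoveMaskToVectorSpecial` (`vpmovm2b/w/d/q`). A scalar model of the `vpmovm2d` step, under the same illustrative conventions as the earlier sketch:

#include <cstdint>

// Scalar model of vpmovm2d: each K-mask bit expands to an all-ones or
// all-zeros 32-bit lane, restoring the usual vector-compare convention
// that Vector512<T>.Equals returns.
void MoveMaskToVector(uint16_t kmask, int32_t out[16])
{
    for (int i = 0; i < 16; i++)
    {
        out[i] = ((kmask >> i) & 1) ? -1 : 0; // -1 == all bits set in the lane
    }
}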
diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
index 9448ba8b8d148..069aba27015f4 100644
--- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -1738,15 +1738,54 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
                     unreached();
             }
 
-            // TODO-XARCH-AVX512 remove REG_K1 check when all K registers possible for
-            // allocation.
-            assert(emitter::isMaskReg(maskReg) && maskReg == REG_K1);
+            assert(emitter::isMaskReg(maskReg));
 
             emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg);
             emit->emitIns_Mov(kmovIns, EA_8BYTE, targetReg, maskReg, INS_FLAGS_DONT_CARE);
             break;
         }
 
+        case NI_AVX512F_CompareEqualSpecial:
+        {
+            GenTree*  op2    = node->Op(2);
+            op1Reg           = op1->GetRegNum();
+            regNumber op2Reg = op2->GetRegNum();
+
+            instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareEqualSpecial, baseType);
+
+            assert(compareIns != INS_invalid);
+            assert(emitter::isMaskReg(targetReg));
+
+            emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 0);
+            break;
+        }
+
+        case NI_AVX512F_MoveMaskToVectorSpecial:
+        {
+            op1Reg = op1->GetRegNum();
+
+            instruction maskMovIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_MoveMaskToVectorSpecial, baseType);
+
+            assert(maskMovIns != INS_invalid);
+            assert(emitter::isMaskReg(op1Reg));
+
+            emit->emitIns_R_R(maskMovIns, attr, targetReg, op1Reg);
+            break;
+        }
+
+        case NI_AVX512F_KORTEST:
+        {
+            op1Reg = op1->GetRegNum();
+
+            instruction testIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_KORTEST, baseType);
+
+            assert(testIns != INS_invalid);
+            assert(emitter::isMaskReg(op1Reg));
+
+            emit->emitIns_R_R(testIns, EA_8BYTE, op1Reg, op1Reg);
+            break;
+        }
+
         case NI_AVX512F_ConvertToVector128Int16:
         case NI_AVX512F_ConvertToVector128Int32:
         case NI_AVX512F_ConvertToVector128UInt16:
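One subtlety in the `CompareEqualSpecial` case above: the trailing `0` passed to `emitIns_R_R_R_I` is not a dummy. The EVEX `vpcmp[u]b/w/d/q` family takes an imm8 selecting the comparison relation (unlike the legacy `pcmpeq*` forms, which bake the relation into the opcode), and `0` selects equality. For reference, the predicate encodings per the Intel SDM — only `EQ` is used by this patch:

#include <cstdint>

// vpcmp* imm8 predicate values (Intel SDM); CompareEqualSpecial emits 0 (EQ).
enum VPCmpPredicate : uint8_t
{
    VPCMP_EQ    = 0, // equal                 <- used above
    VPCMP_LT    = 1, // less-than
    VPCMP_LE    = 2, // less-than-or-equal
    VPCMP_FALSE = 3, // always false
    VPCMP_NE    = 4, // not-equal
    VPCMP_NLT   = 5, // not less-than
    VPCMP_NLE   = 6, // not less-than-or-equal
    VPCMP_TRUE  = 7, // always true
};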
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index 53ffea32e6f51..b7ad5a68aa1ef 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -292,6 +292,12 @@ HARDWARE_INTRINSIC(Vector512, WidenUpper,
 HARDWARE_INTRINSIC(Vector512, WithLower, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512, WithUpper, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512, Xor, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, Equals, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
+
+HARDWARE_INTRINSIC(Vector512, EqualsAll, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, EqualsAny, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, op_Equality, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative)
+HARDWARE_INTRINSIC(Vector512, op_Inequality, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative)
 
 // ***************************************************************************************************************************************************************************************************************************************************
 //                ISA           Function name         SIMD size  NumArg  Instructions  Category  Flags
@@ -962,7 +968,13 @@ HARDWARE_INTRINSIC(SSE2, UCOMISD,
 HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 
-HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
+
+HARDWARE_INTRINSIC(AVX512F, CompareEqualSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX512F, MoveMaskToVectorSpecial, 64, 1, {INS_vpmovm2b, INS_vpmovm2b, INS_vpmovm2w, INS_vpmovm2w, INS_vpmovm2d, INS_vpmovm2d, INS_vpmovm2q, INS_vpmovm2q, INS_vpmovm2d, INS_vpmovm2q}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
+
+
+HARDWARE_INTRINSIC(AVX512F, KORTEST, 0, 1, {INS_kortestq, INS_kortestq, INS_kortestd, INS_kortestd, INS_kortestw, INS_kortestw, INS_kortestb, INS_kortestb, INS_kortestw, INS_kortestb}, HW_Category_Special, HW_Flag_NoRMWSemantics)
 
 #endif // FEATURE_HW_INTRINSIC
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
index e9cef9eb2c2e6..cff245b9f1193 100644
--- a/src/coreclr/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -1338,6 +1338,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_Equals:
         case NI_Vector256_Equals:
+        case NI_Vector512_Equals:
         {
             assert(sig->numArgs == 2);
 
@@ -1351,6 +1352,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
             break;
         }
 
+        case NI_Vector512_EqualsAll:
+        case NI_Vector512_op_Equality:
+        {
+            assert(sig->numArgs == 2);
+            assert(IsBaselineVector512IsaSupportedDebugOnly());
+
+            var_types simdType = getSIMDTypeForSize(simdSize);
+
+            op2 = impSIMDPopStack(simdType);
+            op1 = impSIMDPopStack(simdType);
+
+            retNode = gtNewSimdCmpOpAllNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize);
+            break;
+        }
+
         case NI_Vector128_EqualsAll:
         case NI_Vector256_EqualsAll:
         case NI_Vector128_op_Equality:
@@ -1370,6 +1386,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
             break;
         }
 
+        case NI_Vector512_EqualsAny:
+        {
+            assert(sig->numArgs == 2);
+            assert(simdSize == 64);
+            assert(IsBaselineVector512IsaSupportedDebugOnly());
+
+            var_types simdType = getSIMDTypeForSize(simdSize);
+
+            op2 = impSIMDPopStack(simdType);
+            op1 = impSIMDPopStack(simdType);
+
+            retNode = gtNewSimdCmpOpAnyNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize);
+            break;
+        }
+
         case NI_Vector128_EqualsAny:
         case NI_Vector256_EqualsAny:
         {
@@ -2011,6 +2042,23 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
             break;
         }
 
+        case NI_Vector512_op_Inequality:
+        {
+            assert(sig->numArgs == 2);
+
+            if (IsBaselineVector512IsaSupported())
+            {
+                var_types simdType = getSIMDTypeForSize(simdSize);
+
+                op2 = impSIMDPopStack(simdType);
+                op1 = impSIMDPopStack(simdType);
+
+                retNode = gtNewSimdCmpOpAnyNode(GT_NE, retType, op1, op2, simdBaseJitType, simdSize);
+            }
+
+            break;
+        }
+
         case NI_Vector128_op_UnaryPlus:
         case NI_Vector256_op_UnaryPlus:
         {
diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h
index 745f08b29bb2e..ba84dd142c9c9 100644
--- a/src/coreclr/jit/instr.h
+++ b/src/coreclr/jit/instr.h
@@ -124,60 +124,61 @@ enum insFlags : uint64_t
     // Resets
     Resets_OF = 1ULL << 12,
     Resets_SF = 1ULL << 13,
-    Resets_ZF = 1ULL << 39,
-    Resets_AF = 1ULL << 14,
-    Resets_PF = 1ULL << 15,
-    Resets_CF = 1ULL << 16,
+    Resets_ZF = 1ULL << 14,
+    Resets_AF = 1ULL << 15,
+    Resets_PF = 1ULL << 16,
+    Resets_CF = 1ULL << 17,
 
     // Undefined
-    Undefined_OF = 1ULL << 17,
-    Undefined_SF = 1ULL << 18,
-    Undefined_ZF = 1ULL << 19,
-    Undefined_AF = 1ULL << 20,
-    Undefined_PF = 1ULL << 21,
-    Undefined_CF = 1ULL << 22,
+    Undefined_OF = 1ULL << 18,
+    Undefined_SF = 1ULL << 19,
+    Undefined_ZF = 1ULL << 20,
+    Undefined_AF = 1ULL << 21,
+    Undefined_PF = 1ULL << 22,
+    Undefined_CF = 1ULL << 23,
 
     // Restore
-    Restore_SF_ZF_AF_PF_CF = 1ULL << 23,
+    Restore_SF_ZF_AF_PF_CF = 1ULL << 24,
 
     // x87 instruction
-    INS_FLAGS_x87Instr = 1ULL << 24,
+    INS_FLAGS_x87Instr = 1ULL << 25,
 
     // Avx
-    INS_Flags_IsDstDstSrcAVXInstruction = 1ULL << 25,
-    INS_Flags_IsDstSrcSrcAVXInstruction = 1ULL << 26,
+    INS_Flags_IsDstDstSrcAVXInstruction  = 1ULL << 26,
+    INS_Flags_IsDstSrcSrcAVXInstruction  = 1ULL << 27,
+    INS_Flags_IsMskSrcSrcEvexInstruction = 1ULL << 28,
+    INS_Flags_Is3OperandInstructionMask  = (INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_IsDstSrcSrcAVXInstruction | INS_Flags_IsMskSrcSrcEvexInstruction),
 
     // w and s bits
-    INS_FLAGS_Has_Wbit = 1ULL << 27,
-    INS_FLAGS_Has_Sbit = 1ULL << 28,
+    INS_FLAGS_Has_Wbit = 1ULL << 29,
+    INS_FLAGS_Has_Sbit = 1ULL << 30,
 
     // instruction input size
    // if not input size is set, instruction defaults to using
     // the emitAttr for size
-    Input_8Bit  = 1ULL << 29,
-    Input_16Bit = 1ULL << 30,
-    Input_32Bit = 1ULL << 31,
-    Input_64Bit = 1ULL << 32,
-    Input_Mask  = (0xFULL) << 29,
+    Input_8Bit  = 1ULL << 31,
+    Input_16Bit = 1ULL << 32,
+    Input_32Bit = 1ULL << 33,
+    Input_64Bit = 1ULL << 34,
+    Input_Mask  = (0xFULL) << 31,
 
     // encoding of the REX.W-bit
-    REX_W0 = 1ULL << 33,
-    REX_W1 = 1ULL << 34,
-    REX_WX = 1ULL << 35,
+    REX_W0 = 1ULL << 35,
+    REX_W1 = 1ULL << 36,
+    REX_WX = 1ULL << 37,
 
     // encoding of the REX.W-bit is considered for EVEX only and W0 or WIG otherwise
     REX_W0_EVEX = REX_W0,
-    REX_W1_EVEX = 1ULL << 36,
+    REX_W1_EVEX = 1ULL << 38,
 
     // encoding of the REX.W-bit is ignored
     REX_WIG = REX_W0,
 
     // whether VEX or EVEX encodings are directly supported
-    Encoding_VEX  = 1ULL << 37,
-    Encoding_EVEX = 1ULL << 38,
+    Encoding_VEX  = 1ULL << 39,
+    Encoding_EVEX = 1ULL << 40,
 
-    // Listed above so it is "inline" with the other Resets_* flags
-    // Resets_ZF = 1ULL << 39,
+    KInstruction = 1ULL << 41,
 
     // TODO-Cleanup: Remove this flag and its usage from TARGET_XARCH
     INS_FLAGS_DONT_CARE = 0x00ULL,
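The wholesale renumbering above serves two small goals: `Resets_ZF` rejoins the other `Resets_*` bits (it had been parked at bit 39, as the deleted comment records), and the three operand-shape bits become adjacent so they can be OR-ed into `INS_Flags_Is3OperandInstructionMask`; everything above them shifts up and `KInstruction` takes the next free bit, 41. The layout invariants are easy to state as compile-time checks — values copied from the enum above:

#include <cstdint>

constexpr uint64_t Input_8Bit    = 1ULL << 31;
constexpr uint64_t Input_16Bit   = 1ULL << 32;
constexpr uint64_t Input_32Bit   = 1ULL << 33;
constexpr uint64_t Input_64Bit   = 1ULL << 34;
constexpr uint64_t Input_Mask    = 0xFULL << 31;
constexpr uint64_t Encoding_VEX  = 1ULL << 39;
constexpr uint64_t Encoding_EVEX = 1ULL << 40;
constexpr uint64_t KInstruction  = 1ULL << 41;

// Input_Mask must cover exactly the four input-size bits...
static_assert((Input_8Bit | Input_16Bit | Input_32Bit | Input_64Bit) == Input_Mask,
              "Input_Mask covers the input-size bits");
// ...and the bits shifted above it must not collide with it.
static_assert((Input_Mask & (Encoding_VEX | Encoding_EVEX | KInstruction)) == 0,
              "no overlap between input-size bits and encoding/K bits");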
diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h
index 207525db40a50..c16d16b466907 100644
--- a/src/coreclr/jit/instrsxarch.h
+++ b/src/coreclr/jit/instrsxarch.h
@@ -603,53 +603,63 @@ INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE,
 INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
-INST3(FIRST_K_INSTRUCTION, "FIRST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-
-INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX)
-INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX)
-INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX)
-INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W1 | Encoding_VEX)
-
-INST3(kmovb_msk, "kmovb", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX)
-INST3(kmovw_msk, "kmovw", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX)
-INST3(kmovd_msk, "kmovd", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX)
-INST3(kmovq_msk, "kmovq", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX)
-
-INST3(LAST_K_INSTRUCTION, "LAST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-
 INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 // AVX512F
-INST3(vextractf64x4, "extractf64x4", IUM_WR, SSE3A(0x1B), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed double-precision floating point values
-INST3(vextracti64x4, "extracti64x4", IUM_WR, SSE3A(0x3B), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed quadword integer values
-INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
-INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
-INST3(vmovdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_FLAGS_None)
-INST3(vmovdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_FLAGS_None)
-INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs
-INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
-INST3(vpmovdw, "pmovdw", IUM_WR, PSSE38(0xF3, 0x33), BAD_CODE, PSSE38(0xF3, 0x33), INS_TT_HALF_MEM, Input_32Bit | REX_W0_EVEX | Encoding_EVEX)
-INST3(vpmovqd, "pmovqd", IUM_WR, PSSE38(0xF3, 0x35), BAD_CODE, PSSE38(0xF3, 0x35), INS_TT_HALF_MEM, Input_64Bit | REX_W0_EVEX | Encoding_EVEX)
-INST3(vporq, "porq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs
-INST3(vpternlogd, "pternlogd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
-INST3(vpxorq, "pxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs
+INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction)
+INST3(kmovw_msk, "kmovw", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction)
+INST3(kortestw, "kortestw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction)
+INST3(vextractf64x4, "extractf64x4", IUM_WR, SSE3A(0x1B), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed double-precision floating point values
+INST3(vextracti64x4, "extracti64x4", IUM_WR, SSE3A(0x3B), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed quadword integer values
+INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
+INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
+INST3(vmovdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_FLAGS_None)
+INST3(vmovdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_FLAGS_None)
+INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs
+INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs
+INST3(vpmovdw, "pmovdw", IUM_WR, PSSE38(0xF3, 0x33), BAD_CODE, PSSE38(0xF3, 0x33), INS_TT_HALF_MEM, Input_32Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vpmovqd, "pmovqd", IUM_WR, PSSE38(0xF3, 0x35), BAD_CODE, PSSE38(0xF3, 0x35), INS_TT_HALF_MEM, Input_64Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vporq, "porq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs
+INST3(vpternlogd, "pternlogd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x25), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
+INST3(vpxorq, "pxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs
 
 // AVX512BW
-INST3(vmovdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX)
-INST3(vmovdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX)
-INST3(vpmovb2m, "pmovb2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x29), INS_TT_NONE, Input_8Bit | REX_W0_EVEX | Encoding_EVEX)
-INST3(vpmovw2m, "pmovw2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x29), INS_TT_NONE, Input_16Bit | REX_W1_EVEX | Encoding_EVEX)
-INST3(vpmovwb, "pmovwb", IUM_WR, PSSE38(0xF3, 0x30), BAD_CODE, PSSE38(0xF3, 0x30), INS_TT_HALF_MEM, Input_16Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction)
+INST3(kmovd_msk, "kmovd", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction)
+INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction)
+INST3(kmovq_msk, "kmovq", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction)
+INST3(kortestd, "kortestd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction)
+INST3(kortestq, "kortestq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction)
+INST3(vmovdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vmovdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX)
+INST3(vpcmpb, "pcmpb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
+INST3(vpcmpw, "pcmpw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL, Input_16Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
+INST3(vpcmpub, "pcmpub", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
+INST3(vpcmpuw, "pcmpuw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL, Input_16Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
+INST3(vpmovb2m, "pmovb2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x29), INS_TT_NONE, Input_8Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vpmovm2b, "pmovm2b", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x28), INS_TT_NONE, Input_8Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vpmovm2w, "pmovm2w", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x28), INS_TT_NONE, Input_16Bit | REX_W1_EVEX | Encoding_EVEX)
+INST3(vpmovw2m, "pmovw2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x29), INS_TT_NONE, Input_16Bit | REX_W1_EVEX | Encoding_EVEX)
+INST3(vpmovwb, "pmovwb", IUM_WR, PSSE38(0xF3, 0x30), BAD_CODE, PSSE38(0xF3, 0x30), INS_TT_HALF_MEM, Input_16Bit | REX_W0_EVEX | Encoding_EVEX)
 
 // AVX512DQ
-INST3(vextractf32x8, "extractf32x8", IUM_WR, SSE3A(0x1B), BAD_CODE, BAD_CODE, INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed double-precision floating point values
-INST3(vextracti32x8, "extracti32x8", IUM_WR, SSE3A(0x3B), BAD_CODE, BAD_CODE, INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Extract 256-bit packed quadword integer values
-INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
-INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
-INST3(vpmovd2m, "pmovd2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x39), INS_TT_NONE, Input_32Bit | REX_W0_EVEX | Encoding_EVEX)
-INST3(vpmovq2m, "pmovq2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x39), INS_TT_NONE, Input_64Bit | REX_W1_EVEX | Encoding_EVEX)
+INST3(kortestb, "kortestb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction)
+INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction)
+INST3(kmovb_msk, "kmovb", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction)
+INST3(vextractf32x8, "extractf32x8", IUM_WR, SSE3A(0x1B), BAD_CODE, BAD_CODE, INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) // Extract 256-bit packed single-precision floating point values
+INST3(vextracti32x8, "extracti32x8", IUM_WR, SSE3A(0x3B), BAD_CODE, BAD_CODE, INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX) // Extract 256-bit packed doubleword integer values
+INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed single-precision floating point values
+INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed doubleword integer values
+INST3(vpcmpd, "pcmpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1F), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
+INST3(vpcmpq, "pcmpq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1F), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
+INST3(vpcmpud, "pcmpud", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1E), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
+INST3(vpcmpuq, "pcmpuq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1E), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction)
+INST3(vpmovd2m, "pmovd2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x39), INS_TT_NONE, Input_32Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vpmovm2d, "pmovm2d", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x38), INS_TT_NONE, Input_32Bit | REX_W0_EVEX | Encoding_EVEX)
+INST3(vpmovm2q, "pmovm2q", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x38), INS_TT_NONE, Input_64Bit | REX_W1_EVEX | Encoding_EVEX)
+INST3(vpmovq2m, "pmovq2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x39), INS_TT_NONE, Input_64Bit | REX_W1_EVEX | Encoding_EVEX)
 
 INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
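Two details in these table rows carry the commit-message fixes. First, the W-bit fix: `kortestd`/`kortestq` differ from `kortestb`/`kortestw` only in `REX_W1` versus `REX_W0` (with the 66 prefix selected via `PCKDBL`/`PCKFLT`), so a wrong W bit silently encodes the narrower test. Second, the `Writes_ZF`/`Writes_CF` flags on the `kortest*` rows are exactly what the lowering in the next file consumes: with both operands being the same mask, `kortest` sets ZF when the mask is all zeros and CF when it is all ones, which is why `op_Equality` maps to `GenCondition::C` and `op_Inequality` to `GenCondition::NC` below. A scalar model of that flag behavior:

#include <cstdint>

struct KortestFlags
{
    bool ZF; // set when (k1 | k2) == 0       -> no element compared equal
    bool CF; // set when (k1 | k2) == all 1s  -> every element compared equal
};

// Scalar model of `kortestw k1, k2` (16-bit mask width). Lowering emits it
// with k1 == k2, so the OR is just the compare mask itself.
KortestFlags Kortest16(uint16_t k1, uint16_t k2)
{
    uint16_t orred = (uint16_t)(k1 | k2);
    return KortestFlags{orred == 0, orred == 0xFFFF};
}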
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 7e21041683ff6..b9239308fe768 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -890,7 +890,8 @@ void Lowering::LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIn
 {
     GenTreeCC* cc = LowerNodeCC(node, condition);
 
-    assert(HWIntrinsicInfo::lookupNumArgs(newIntrinsicId) == 2);
+    // TODO-XARCH-AVX512 remove the KORTEST check when it's promoted to 2 proper arguments
+    assert(HWIntrinsicInfo::lookupNumArgs(newIntrinsicId) == 2 || newIntrinsicId == NI_AVX512F_KORTEST);
     node->ChangeHWIntrinsicId(newIntrinsicId);
     node->gtType = TYP_VOID;
     node->ClearUnusedValue();
@@ -926,6 +927,7 @@ void Lowering::LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIn
             }
             break;
 
+        case NI_AVX512F_KORTEST:
         case NI_SSE41_PTEST:
         case NI_AVX_PTEST:
             // If we need the Carry flag then we can't swap operands.
@@ -1199,12 +1201,14 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
 
         case NI_Vector128_op_Equality:
        case NI_Vector256_op_Equality:
+        case NI_Vector512_op_Equality:
         {
             return LowerHWIntrinsicCmpOp(node, GT_EQ);
         }
 
         case NI_Vector128_op_Inequality:
         case NI_Vector256_op_Inequality:
+        case NI_Vector512_op_Inequality:
         {
             return LowerHWIntrinsicCmpOp(node, GT_NE);
         }
@@ -1657,7 +1661,8 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
     var_types simdType = Compiler::getSIMDTypeForSize(simdSize);
 
     assert((intrinsicId == NI_Vector128_op_Equality) || (intrinsicId == NI_Vector128_op_Inequality) ||
-           (intrinsicId == NI_Vector256_op_Equality) || (intrinsicId == NI_Vector256_op_Inequality));
+           (intrinsicId == NI_Vector256_op_Equality) || (intrinsicId == NI_Vector256_op_Inequality) ||
+           (intrinsicId == NI_Vector512_op_Equality) || (intrinsicId == NI_Vector512_op_Inequality));
 
     assert(varTypeIsSIMD(simdType));
     assert(varTypeIsArithmetic(simdBaseType));
@@ -1673,6 +1678,20 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
     GenTree* op1 = node->Op(1);
     GenTree* op2 = node->Op(2);
 
+    if (simdSize == 64)
+    {
+        GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512F_CompareEqualSpecial,
+                                                      simdBaseJitType, simdSize);
+        BlockRange().InsertBefore(node, cmp);
+        LowerNode(cmp);
+
+        node->ResetHWIntrinsicId(NI_AVX512F_KORTEST, cmp);
+        GenCondition cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
+        LowerHWIntrinsicCC(node, NI_AVX512F_KORTEST, cmpCnd);
+
+        return node->gtNext;
+    }
+
     GenCondition cmpCnd = (cmpOp == GT_EQ) ? GenCondition::EQ : GenCondition::NE;
 
     if (!varTypeIsFloating(simdBaseType) && op2->IsVectorZero() &&
diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h
index f2833fa4b51ae..e6916ed952001 100644
--- a/src/coreclr/jit/targetx86.h
+++ b/src/coreclr/jit/targetx86.h
@@ -93,7 +93,7 @@
   #define RBM_ALLDOUBLE            RBM_ALLFLOAT
 
 #if !defined(UNIX_X86_ABI)
-  #define RBM_ALLMASK              REG_K1
+  #define RBM_ALLMASK              RBM_K1
 #else
   #define RBM_ALLMASK              (0)
 #endif
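Finally, the one-token `targetx86.h` fix deserves a gloss: `REG_K1` is a `regNumber` (an index identifying a register), while the `RBM_*` names are register masks (a set with one bit per register), and using the index where the set is expected compiles fine yet builds the wrong set. A hypothetical sketch of the distinction — the constant values here are made up, only the shape of the bug matters:

#include <cstdint>

typedef unsigned regNumber; // index of one register
typedef uint64_t regMaskTP; // set of registers, one bit per register

const regNumber REG_K1 = 33;             // hypothetical register index
const regMaskTP RBM_K1 = 1ULL << REG_K1; // the matching single-register set

// Correct: RBM_ALLMASK is a set containing exactly K1.
const regMaskTP RBM_ALLMASK = RBM_K1;

// The old definition used REG_K1 (the number 33, i.e. bits 0 and 5 as a mask)
// as if it were a set -- a silently wrong register mask, corrected above.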