Skip to content

Commit

Permalink
Adding support for Vector512 Equals, EqualsAny, op_Equality, an…
Browse files Browse the repository at this point in the history
…d `op_Inequality`. (#83470)

* Implement `Vector512.Equals` by way of special `AVX512` intrinsic.

* Add Vector512 `Equals`, `EqualsAll`/`op_Equality`, and `op_Inequality`.

* Fix `vpmovm2x` instructions for `MoveMaskToVectorSpecial`.

* Move `Vector512` Equals into `gtNewSimdCmpOpNode`.

* Adds `EqualsAny`.

* Fix `kortestq` and `kortestd` opcode gen (W bit).

* Fix merge and update instr flags.

* Addressing review comments.

* Removing unnecessary files.

* Addressing review comments.

* Fixing bug.

* Fixing k reg display on x86.

* Setting Resets_ZF to the right value.

* Fixing extract.

---------

Co-authored-by: Deepak Rajendrakumaran <deepak.rajendrakumaran@intel.com>
Co-authored-by: Tanner Gooding <tagoo@outlook.com>
  • Loading branch information
3 people authored Apr 10, 2023
1 parent 02e5b58 commit f211984
Show file tree
Hide file tree
Showing 10 changed files with 311 additions and 83 deletions.
68 changes: 66 additions & 2 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,19 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins)
return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION);
}

//------------------------------------------------------------------------
// IsKInstruction: Does this instruction require a K (mask) register?
//
// Arguments:
//    ins - The instruction to check.
//
// Returns:
//    `true` if this instruction requires a K register.
//
// Notes:
//    NOTE(review): two implementations appear back to back in the body below:
//    a range check against INS_FIRST_K_INSTRUCTION / INS_LAST_K_INSTRUCTION,
//    followed by a test of the KInstruction bit in CodeGenInterface::instInfo.
//    This looks like the removed and added sides of a diff rendered without
//    +/- markers. As written, the first return makes the flag lookup
//    unreachable - confirm which implementation is current.
//
bool emitter::IsKInstruction(instruction ins)
{
// Range-based form: true when `ins` lies within the inclusive
// [INS_FIRST_K_INSTRUCTION, INS_LAST_K_INSTRUCTION] span.
return (ins >= INS_FIRST_K_INSTRUCTION) && (ins <= INS_LAST_K_INSTRUCTION);
// Flag-based form: look up the per-instruction flags and test the
// KInstruction bit.
insFlags flags = CodeGenInterface::instInfo[ins];
return (flags & KInstruction) != 0;
}

//------------------------------------------------------------------------
Expand Down Expand Up @@ -243,6 +253,17 @@ bool emitter::IsDstSrcSrcAVXInstruction(instruction ins) const
return (flags & INS_Flags_IsDstSrcSrcAVXInstruction) != 0;
}

//------------------------------------------------------------------------
// IsThreeOperandAVXInstruction: Is this instruction flagged as a three-operand
// instruction while SIMD encoding is in use?
//
// Arguments:
//    ins - The instruction to check.
//
// Returns:
//    `false` whenever UseSimdEncoding() is false; otherwise `true` if any bit
//    of INS_Flags_Is3OperandInstructionMask is set in the instruction's flags.
//
bool emitter::IsThreeOperandAVXInstruction(instruction ins) const
{
    if (UseSimdEncoding())
    {
        // The per-instruction flag table is only consulted once we know
        // SIMD encoding is in use.
        insFlags insProps = CodeGenInterface::instInfo[ins];
        return (insProps & INS_Flags_Is3OperandInstructionMask) != 0;
    }

    return false;
}

//------------------------------------------------------------------------
// HasRegularWideForm: Many x86/x64 instructions follow a regular encoding scheme where the
// byte-sized version of an instruction has the lowest bit of the opcode cleared
Expand Down Expand Up @@ -9849,6 +9870,10 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName)
#endif // TARGET_AMD64

#ifdef TARGET_X86
if (isMaskReg(reg))
{
return rn;
}
assert(strlen(rn) >= 3);

switch (EA_SIZE(attr))
Expand Down Expand Up @@ -18367,7 +18392,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_vpmovd2m:
case INS_vpmovq2m:
{
result.insLatency += PERFSCORE_LATENCY_1C;
result.insLatency += PERFSCORE_LATENCY_3C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}
Expand All @@ -18386,6 +18411,45 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
break;
}

case INS_vpcmpb:
case INS_vpcmpw:
case INS_vpcmpd:
case INS_vpcmpq:
case INS_vpcmpub:
case INS_vpcmpuw:
case INS_vpcmpud:
case INS_vpcmpuq:
{
result.insLatency += PERFSCORE_LATENCY_4C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}

case INS_vpmovm2b:
case INS_vpmovm2w:
{
result.insLatency += PERFSCORE_LATENCY_3C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}
case INS_vpmovm2d:
case INS_vpmovm2q:
{
result.insLatency += PERFSCORE_LATENCY_1C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}

case INS_kortestb:
case INS_kortestw:
case INS_kortestd:
case INS_kortestq:
{
result.insLatency += PERFSCORE_LATENCY_1C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}

default:
// unhandled instruction insFmt combination
perfScoreUnhandledInstruction(id, &result);
Expand Down
7 changes: 2 additions & 5 deletions src/coreclr/jit/emitxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
bool HasKMaskRegisterDest(instruction ins) const
{
assert(UseEvexEncoding() == true);

switch (ins)
{
// Requires KMask.
Expand Down Expand Up @@ -403,6 +404,7 @@ void SetContains256bitOrMoreAVX(bool value)

bool IsDstDstSrcAVXInstruction(instruction ins) const;
bool IsDstSrcSrcAVXInstruction(instruction ins) const;
bool IsThreeOperandAVXInstruction(instruction ins) const;
static bool HasRegularWideForm(instruction ins);
static bool HasRegularWideImmediateForm(instruction ins);
static bool DoesWriteZeroFlag(instruction ins);
Expand All @@ -414,11 +416,6 @@ static bool IsRexW1Instruction(instruction ins);
static bool IsRexWXInstruction(instruction ins);
static bool IsRexW1EvexInstruction(instruction ins);

// Returns true when `ins` is classified as either a dst/dst/src or a
// dst/src/src AVX instruction, checking the dst/dst/src form first.
bool IsThreeOperandAVXInstruction(instruction ins)
{
    if (IsDstDstSrcAVXInstruction(ins))
    {
        return true;
    }

    return IsDstSrcSrcAVXInstruction(ins);
}

bool isAvxBlendv(instruction ins)
{
return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb;
Expand Down
42 changes: 40 additions & 2 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20172,6 +20172,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode(

NamedIntrinsic intrinsic = NI_Illegal;

if (simdSize == 64)
{
assert(op == GT_EQ);
}

switch (op)
{
#if defined(TARGET_XARCH)
Expand All @@ -20191,6 +20196,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
intrinsic = NI_AVX2_CompareEqual;
}
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_AVX512F_CompareEqualSpecial;
}
else if (simdBaseType == TYP_FLOAT)
{
intrinsic = NI_SSE_CompareEqual;
Expand Down Expand Up @@ -20760,7 +20770,20 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
}

assert(intrinsic != NI_Illegal);

#if defined(TARGET_XARCH)
if (simdSize != 64)
{
return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
}
else
{
GenTree* cmp = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, intrinsic, simdBaseJitType, simdSize);
return gtNewSimdHWIntrinsicNode(type, cmp, NI_AVX512F_MoveMaskToVectorSpecial, simdBaseJitType, simdSize);
}
#else
return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
#endif
}

GenTree* Compiler::gtNewSimdCmpOpAllNode(
Expand Down Expand Up @@ -20788,7 +20811,12 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(
#if defined(TARGET_XARCH)
case GT_EQ:
{
if (simdSize == 32)
if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_Vector512_op_Equality;
}
else if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
Expand Down Expand Up @@ -20934,6 +20962,11 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(

intrinsic = NI_Vector256_op_Inequality;
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_Vector512_op_Inequality;
}
else
{
intrinsic = NI_Vector128_op_Inequality;
Expand All @@ -20957,7 +20990,12 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(

case GT_NE:
{
if (simdSize == 32)
if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_Vector512_op_Inequality;
}
else if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
Expand Down
45 changes: 42 additions & 3 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1738,15 +1738,54 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
unreached();
}

// TODO-XARCH-AVX512 remove REG_K1 check when all K registers possible for
// allocation.
assert(emitter::isMaskReg(maskReg) && maskReg == REG_K1);
assert(emitter::isMaskReg(maskReg));

emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg);
emit->emitIns_Mov(kmovIns, EA_8BYTE, targetReg, maskReg, INS_FLAGS_DONT_CARE);
break;
}

case NI_AVX512F_CompareEqualSpecial:
{
GenTree* op2 = node->Op(2);
op1Reg = op1->GetRegNum();
regNumber op2Reg = op2->GetRegNum();

instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareEqualSpecial, baseType);

assert(compareIns != INS_invalid);
assert(emitter::isMaskReg(targetReg));

emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 0);
break;
}

case NI_AVX512F_MoveMaskToVectorSpecial:
{
op1Reg = op1->GetRegNum();

instruction maskMovIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_MoveMaskToVectorSpecial, baseType);

assert(maskMovIns != INS_invalid);
assert(emitter::isMaskReg(op1Reg));

emit->emitIns_R_R(maskMovIns, attr, targetReg, op1Reg);
break;
}

case NI_AVX512F_KORTEST:
{
op1Reg = op1->GetRegNum();

instruction testIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_KORTEST, baseType);

assert(testIns != INS_invalid);
assert(emitter::isMaskReg(op1Reg));

emit->emitIns_R_R(testIns, EA_8BYTE, op1Reg, op1Reg);
break;
}

case NI_AVX512F_ConvertToVector128Int16:
case NI_AVX512F_ConvertToVector128Int32:
case NI_AVX512F_ConvertToVector128UInt16:
Expand Down
14 changes: 13 additions & 1 deletion src/coreclr/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,12 @@ HARDWARE_INTRINSIC(Vector512, WidenUpper,
HARDWARE_INTRINSIC(Vector512, WithLower, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, WithUpper, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, Xor, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, Equals, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)

HARDWARE_INTRINSIC(Vector512, EqualsAll, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, EqualsAny, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, op_Equality, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative)
HARDWARE_INTRINSIC(Vector512, op_Inequality, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative)

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size NumArg Instructions Category Flags
Expand Down Expand Up @@ -962,7 +968,13 @@ HARDWARE_INTRINSIC(SSE2, UCOMISD,
HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)

HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)

HARDWARE_INTRINSIC(AVX512F, CompareEqualSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX512F, MoveMaskToVectorSpecial, 64, 1, {INS_vpmovm2b, INS_vpmovm2b, INS_vpmovm2w, INS_vpmovm2w, INS_vpmovm2d, INS_vpmovm2d, INS_vpmovm2q, INS_vpmovm2q, INS_vpmovm2d, INS_vpmovm2q}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)


HARDWARE_INTRINSIC(AVX512F, KORTEST, 0, 1, {INS_kortestq, INS_kortestq, INS_kortestd, INS_kortestd, INS_kortestw, INS_kortestw, INS_kortestb, INS_kortestb, INS_kortestw, INS_kortestb}, HW_Category_Special, HW_Flag_NoRMWSemantics)

#endif // FEATURE_HW_INTRINSIC

Expand Down
48 changes: 48 additions & 0 deletions src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1338,6 +1338,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

case NI_Vector128_Equals:
case NI_Vector256_Equals:
case NI_Vector512_Equals:
{
assert(sig->numArgs == 2);

Expand All @@ -1351,6 +1352,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_Vector512_EqualsAll:
case NI_Vector512_op_Equality:
{
assert(sig->numArgs == 2);
assert(IsBaselineVector512IsaSupportedDebugOnly());

var_types simdType = getSIMDTypeForSize(simdSize);

op2 = impSIMDPopStack(simdType);
op1 = impSIMDPopStack(simdType);

retNode = gtNewSimdCmpOpAllNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize);
break;
}

case NI_Vector128_EqualsAll:
case NI_Vector256_EqualsAll:
case NI_Vector128_op_Equality:
Expand All @@ -1370,6 +1386,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_Vector512_EqualsAny:
{
assert(sig->numArgs == 2);
assert(simdSize == 64);
assert(IsBaselineVector512IsaSupportedDebugOnly());

var_types simdType = getSIMDTypeForSize(simdSize);

op2 = impSIMDPopStack(simdType);
op1 = impSIMDPopStack(simdType);

retNode = gtNewSimdCmpOpAnyNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize);
break;
}

case NI_Vector128_EqualsAny:
case NI_Vector256_EqualsAny:
{
Expand Down Expand Up @@ -2011,6 +2042,23 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_Vector512_op_Inequality:
{
assert(sig->numArgs == 2);

if (IsBaselineVector512IsaSupported())
{
var_types simdType = getSIMDTypeForSize(simdSize);

op2 = impSIMDPopStack(simdType);
op1 = impSIMDPopStack(simdType);

retNode = gtNewSimdCmpOpAnyNode(GT_NE, retType, op1, op2, simdBaseJitType, simdSize);
}

break;
}

case NI_Vector128_op_UnaryPlus:
case NI_Vector256_op_UnaryPlus:
{
Expand Down
Loading

0 comments on commit f211984

Please sign in to comment.