Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support for Vector512 Equals, EqualsAny, op_Equality, and op_Inequality. #83470

Merged
merged 18 commits into from
Apr 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 66 additions & 2 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,19 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins)
return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION);
}

//------------------------------------------------------------------------
// IsKInstruction: Does this instruction require a K register?
//
// Arguments:
// ins - The instruction to check.
//
// Returns:
// `true` if this instruction requires a K register.
//
// Notes:
// NOTE(review): two alternative implementations appear back to back in the
// body - a range check against INS_FIRST_K_INSTRUCTION/INS_LAST_K_INSTRUCTION
// followed by a KInstruction flag lookup in CodeGenInterface::instInfo. This
// looks like the removed and added sides of a diff whose +/- markers were
// lost; as written, the flag lookup is unreachable after the first return.
// Confirm which of the two is intended.
//
bool emitter::IsKInstruction(instruction ins)
{
return (ins >= INS_FIRST_K_INSTRUCTION) && (ins <= INS_LAST_K_INSTRUCTION);
insFlags flags = CodeGenInterface::instInfo[ins];
return (flags & KInstruction) != 0;
}

//------------------------------------------------------------------------
Expand Down Expand Up @@ -243,6 +253,17 @@ bool emitter::IsDstSrcSrcAVXInstruction(instruction ins) const
return (flags & INS_Flags_IsDstSrcSrcAVXInstruction) != 0;
}

bool emitter::IsThreeOperandAVXInstruction(instruction ins) const
{
if (!UseSimdEncoding())
{
return false;
}
Comment on lines +258 to +261
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't expect you to do this here, since this is consistent with the other methods.

But I think we can in general simplify this and several of the other SIMD only flags to something like:

insFlags flags = CodeGenInterface::instInfo[ins];

if ((flags & INS_Flags_Is3OperandInstructionMask) != 0)
{
    assert(UseSimdEncoding());
    return true;
}

return false;

UseSimdEncoding() is itself a flag check now, and we should never be setting these SIMD-only flags on non-SIMD instructions.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll log an issue for us to look at that as a cleanup item in a follow-up PR.


insFlags flags = CodeGenInterface::instInfo[ins];
return (flags & INS_Flags_Is3OperandInstructionMask) != 0;
}

//------------------------------------------------------------------------
// HasRegularWideForm: Many x86/x64 instructions follow a regular encoding scheme where the
// byte-sized version of an instruction has the lowest bit of the opcode cleared
Expand Down Expand Up @@ -9849,6 +9870,10 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName)
#endif // TARGET_AMD64

#ifdef TARGET_X86
if (isMaskReg(reg))
{
return rn;
}
assert(strlen(rn) >= 3);

switch (EA_SIZE(attr))
Expand Down Expand Up @@ -18367,7 +18392,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_vpmovd2m:
case INS_vpmovq2m:
{
result.insLatency += PERFSCORE_LATENCY_1C;
result.insLatency += PERFSCORE_LATENCY_3C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}
Expand All @@ -18386,6 +18411,45 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
break;
}

case INS_vpcmpb:
case INS_vpcmpw:
case INS_vpcmpd:
case INS_vpcmpq:
case INS_vpcmpub:
case INS_vpcmpuw:
case INS_vpcmpud:
case INS_vpcmpuq:
{
result.insLatency += PERFSCORE_LATENCY_4C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}

case INS_vpmovm2b:
case INS_vpmovm2w:
{
result.insLatency += PERFSCORE_LATENCY_3C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}
case INS_vpmovm2d:
case INS_vpmovm2q:
{
result.insLatency += PERFSCORE_LATENCY_1C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}

case INS_kortestb:
case INS_kortestw:
case INS_kortestd:
case INS_kortestq:
{
result.insLatency += PERFSCORE_LATENCY_1C;
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
break;
}

default:
// unhandled instruction insFmt combination
perfScoreUnhandledInstruction(id, &result);
Expand Down
7 changes: 2 additions & 5 deletions src/coreclr/jit/emitxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
bool HasKMaskRegisterDest(instruction ins) const
{
assert(UseEvexEncoding() == true);

switch (ins)
{
// Requires KMask.
Expand Down Expand Up @@ -403,6 +404,7 @@ void SetContains256bitOrMoreAVX(bool value)

bool IsDstDstSrcAVXInstruction(instruction ins) const;
bool IsDstSrcSrcAVXInstruction(instruction ins) const;
bool IsThreeOperandAVXInstruction(instruction ins) const;
static bool HasRegularWideForm(instruction ins);
static bool HasRegularWideImmediateForm(instruction ins);
static bool DoesWriteZeroFlag(instruction ins);
Expand All @@ -414,11 +416,6 @@ static bool IsRexW1Instruction(instruction ins);
static bool IsRexWXInstruction(instruction ins);
static bool IsRexW1EvexInstruction(instruction ins);

// IsThreeOperandAVXInstruction: returns true when `ins` is reported as either
// a dst-dst-src or a dst-src-src AVX instruction by IsDstDstSrcAVXInstruction
// or IsDstSrcSrcAVXInstruction.
// NOTE(review): a `const` out-of-line declaration with the same name appears
// earlier in this header view; this inline definition looks like the removed
// side of the diff - confirm before relying on it.
bool IsThreeOperandAVXInstruction(instruction ins)
{
return (IsDstDstSrcAVXInstruction(ins) || IsDstSrcSrcAVXInstruction(ins));
}

bool isAvxBlendv(instruction ins)
{
return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb;
Expand Down
42 changes: 40 additions & 2 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20172,6 +20172,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode(

NamedIntrinsic intrinsic = NI_Illegal;

if (simdSize == 64)
{
assert(op == GT_EQ);
}

switch (op)
{
#if defined(TARGET_XARCH)
Expand All @@ -20191,6 +20196,11 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
intrinsic = NI_AVX2_CompareEqual;
}
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_AVX512F_CompareEqualSpecial;
}
else if (simdBaseType == TYP_FLOAT)
{
intrinsic = NI_SSE_CompareEqual;
Expand Down Expand Up @@ -20760,7 +20770,20 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
}

assert(intrinsic != NI_Illegal);

#if defined(TARGET_XARCH)
if (simdSize != 64)
{
return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
}
else
{
GenTree* cmp = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, intrinsic, simdBaseJitType, simdSize);
return gtNewSimdHWIntrinsicNode(type, cmp, NI_AVX512F_MoveMaskToVectorSpecial, simdBaseJitType, simdSize);
}
#else
return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
#endif
}

GenTree* Compiler::gtNewSimdCmpOpAllNode(
Expand Down Expand Up @@ -20788,7 +20811,12 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(
#if defined(TARGET_XARCH)
case GT_EQ:
{
if (simdSize == 32)
if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_Vector512_op_Equality;
}
else if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
Expand Down Expand Up @@ -20934,6 +20962,11 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(

intrinsic = NI_Vector256_op_Inequality;
}
else if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_Vector512_op_Inequality;
}
else
{
intrinsic = NI_Vector128_op_Inequality;
Expand All @@ -20957,7 +20990,12 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(

case GT_NE:
{
if (simdSize == 32)
if (simdSize == 64)
{
assert(IsBaselineVector512IsaSupportedDebugOnly());
intrinsic = NI_Vector512_op_Inequality;
}
else if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
Expand Down
45 changes: 42 additions & 3 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1738,15 +1738,54 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
unreached();
}

// TODO-XARCH-AVX512 remove REG_K1 check when all K registers possible for
// allocation.
assert(emitter::isMaskReg(maskReg) && maskReg == REG_K1);
assert(emitter::isMaskReg(maskReg));

emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg);
emit->emitIns_Mov(kmovIns, EA_8BYTE, targetReg, maskReg, INS_FLAGS_DONT_CARE);
break;
}

case NI_AVX512F_CompareEqualSpecial:
{
// Emits the compare instruction that the intrinsic table maps to this
// base type, comparing op1 with op2 and writing the result to targetReg,
// which must be a mask register.
GenTree* op2 = node->Op(2);
op1Reg = op1->GetRegNum();
regNumber op2Reg = op2->GetRegNum();

instruction compareIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_CompareEqualSpecial, baseType);

assert(compareIns != INS_invalid);
assert(emitter::isMaskReg(targetReg));

// NOTE(review): the trailing immediate 0 is presumably the "equal"
// comparison predicate for the compare instruction - confirm against the
// instruction reference.
emit->emitIns_R_R_R_I(compareIns, attr, targetReg, op1Reg, op2Reg, 0);
break;
}

case NI_AVX512F_MoveMaskToVectorSpecial:
{
// Emits the mask-move instruction that the intrinsic table maps to this
// base type, with targetReg as the destination and op1's register, which
// must be a mask register, as the source.
op1Reg = op1->GetRegNum();

instruction maskMovIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_MoveMaskToVectorSpecial, baseType);

assert(maskMovIns != INS_invalid);
assert(emitter::isMaskReg(op1Reg));

emit->emitIns_R_R(maskMovIns, attr, targetReg, op1Reg);
break;
}

case NI_AVX512F_KORTEST:
{
// Emits the kortest variant that the intrinsic table maps to this base
// type. op1 must live in a mask register.
op1Reg = op1->GetRegNum();

instruction testIns = HWIntrinsicInfo::lookupIns(NI_AVX512F_KORTEST, baseType);

assert(testIns != INS_invalid);
assert(emitter::isMaskReg(op1Reg));

// The same mask register is passed as both operands, and no targetReg is
// used here.
// NOTE(review): presumably the result is consumed through the CPU flags -
// confirm against the instruction reference.
emit->emitIns_R_R(testIns, EA_8BYTE, op1Reg, op1Reg);
break;
}

case NI_AVX512F_ConvertToVector128Int16:
case NI_AVX512F_ConvertToVector128Int32:
case NI_AVX512F_ConvertToVector128UInt16:
Expand Down
14 changes: 13 additions & 1 deletion src/coreclr/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,12 @@ HARDWARE_INTRINSIC(Vector512, WidenUpper,
HARDWARE_INTRINSIC(Vector512, WithLower, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, WithUpper, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, Xor, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, Equals, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)

HARDWARE_INTRINSIC(Vector512, EqualsAll, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, EqualsAny, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, op_Equality, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative)
HARDWARE_INTRINSIC(Vector512, op_Inequality, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative)

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size NumArg Instructions Category Flags
Expand Down Expand Up @@ -962,7 +968,13 @@ HARDWARE_INTRINSIC(SSE2, UCOMISD,
HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)

HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)

HARDWARE_INTRINSIC(AVX512F, CompareEqualSpecial, 64, 2, {INS_vpcmpb, INS_vpcmpub, INS_vpcmpw, INS_vpcmpuw, INS_vpcmpd, INS_vpcmpud, INS_vpcmpq, INS_vpcmpuq, INS_cmpps, INS_cmppd}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX512F, MoveMaskToVectorSpecial, 64, 1, {INS_vpmovm2b, INS_vpmovm2b, INS_vpmovm2w, INS_vpmovm2w, INS_vpmovm2d, INS_vpmovm2d, INS_vpmovm2q, INS_vpmovm2q, INS_vpmovm2d, INS_vpmovm2q}, HW_Category_Special, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)


HARDWARE_INTRINSIC(AVX512F, KORTEST, 0, 1, {INS_kortestq, INS_kortestq, INS_kortestd, INS_kortestd, INS_kortestw, INS_kortestw, INS_kortestb, INS_kortestb, INS_kortestw, INS_kortestb}, HW_Category_Special, HW_Flag_NoRMWSemantics)

#endif // FEATURE_HW_INTRINSIC

Expand Down
48 changes: 48 additions & 0 deletions src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1338,6 +1338,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

case NI_Vector128_Equals:
case NI_Vector256_Equals:
case NI_Vector512_Equals:
{
assert(sig->numArgs == 2);

Expand All @@ -1351,6 +1352,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_Vector512_EqualsAll:
case NI_Vector512_op_Equality:
{
assert(sig->numArgs == 2);
assert(IsBaselineVector512IsaSupportedDebugOnly());

var_types simdType = getSIMDTypeForSize(simdSize);

op2 = impSIMDPopStack(simdType);
op1 = impSIMDPopStack(simdType);

retNode = gtNewSimdCmpOpAllNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize);
break;
}

case NI_Vector128_EqualsAll:
case NI_Vector256_EqualsAll:
case NI_Vector128_op_Equality:
Expand All @@ -1370,6 +1386,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_Vector512_EqualsAny:
{
assert(sig->numArgs == 2);
assert(simdSize == 64);
assert(IsBaselineVector512IsaSupportedDebugOnly());

var_types simdType = getSIMDTypeForSize(simdSize);

op2 = impSIMDPopStack(simdType);
op1 = impSIMDPopStack(simdType);

retNode = gtNewSimdCmpOpAnyNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize);
break;
}

case NI_Vector128_EqualsAny:
case NI_Vector256_EqualsAny:
{
Expand Down Expand Up @@ -2011,6 +2042,23 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_Vector512_op_Inequality:
{
// Imports Vector512 op_Inequality as an "any element differs" comparison
// (gtNewSimdCmpOpAnyNode with GT_NE).
assert(sig->numArgs == 2);

// NOTE(review): unlike the other Vector512 equality cases in this view,
// which assert IsBaselineVector512IsaSupportedDebugOnly(), this case
// checks support at run time and does not assign retNode when the check
// fails - confirm that the difference is intentional.
if (IsBaselineVector512IsaSupported())
{
var_types simdType = getSIMDTypeForSize(simdSize);

// Operands are popped in reverse order: op2 first, then op1.
op2 = impSIMDPopStack(simdType);
op1 = impSIMDPopStack(simdType);

retNode = gtNewSimdCmpOpAnyNode(GT_NE, retType, op1, op2, simdBaseJitType, simdSize);
}

break;
}

case NI_Vector128_op_UnaryPlus:
case NI_Vector256_op_UnaryPlus:
{
Expand Down
Loading