Allow morph to recognize more places that it's beneficial to produce masks
tannergooding committed Jul 16, 2024
1 parent 6a9a0b0 commit f45c339
Showing 3 changed files with 284 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsic.h
@@ -543,6 +543,7 @@ struct HWIntrinsicInfo
FloatComparisonMode comparison,
var_types simdBaseType,
unsigned simdSize);
static NamedIntrinsic lookupEvexMaskId(NamedIntrinsic intrinsic);
#endif

// Member lookup
166 changes: 166 additions & 0 deletions src/coreclr/jit/hwintrinsicxarch.cpp
@@ -806,6 +806,172 @@ NamedIntrinsic HWIntrinsicInfo::lookupIdForFloatComparisonMode(NamedIntrinsic
}
}

//------------------------------------------------------------------------
// lookupEvexMaskId: Get the EVEX mask intrinsic ID to use for a given intrinsic
//
// Arguments:
// intrinsic -- The base intrinsic that is being converted to its EVEX form
//
// Return Value:
// The EVEX mask intrinsic ID to use instead of intrinsic
//
NamedIntrinsic HWIntrinsicInfo::lookupEvexMaskId(NamedIntrinsic intrinsic)
{
switch (intrinsic)
{
case NI_SSE_And:
case NI_SSE2_And:
case NI_AVX_And:
case NI_AVX2_And:
case NI_AVX512F_And:
case NI_AVX512DQ_And:
case NI_AVX10v1_V512_And:
{
return NI_EVEX_AndMask;
}

case NI_SSE_AndNot:
case NI_SSE2_AndNot:
case NI_AVX_AndNot:
case NI_AVX2_AndNot:
case NI_AVX512F_AndNot:
case NI_AVX512DQ_AndNot:
case NI_AVX10v1_V512_AndNot:
{
return NI_EVEX_AndNotMask;
}

case NI_SSE41_BlendVariable:
case NI_AVX_BlendVariable:
case NI_AVX2_BlendVariable:
{
return NI_EVEX_BlendVariableMask;
}

case NI_AVX_Compare:
{
return NI_EVEX_CompareMask;
}

case NI_SSE_CompareEqual:
case NI_SSE2_CompareEqual:
case NI_SSE41_CompareEqual:
case NI_AVX_CompareEqual:
case NI_AVX2_CompareEqual:
{
return NI_EVEX_CompareEqualMask;
}

case NI_SSE_CompareGreaterThan:
case NI_SSE2_CompareGreaterThan:
case NI_SSE42_CompareGreaterThan:
case NI_AVX_CompareGreaterThan:
case NI_AVX2_CompareGreaterThan:
{
return NI_EVEX_CompareGreaterThanMask;
}

case NI_SSE_CompareGreaterThanOrEqual:
case NI_SSE2_CompareGreaterThanOrEqual:
case NI_AVX_CompareGreaterThanOrEqual:
{
return NI_EVEX_CompareGreaterThanOrEqualMask;
}

case NI_SSE_CompareLessThan:
case NI_SSE2_CompareLessThan:
case NI_SSE42_CompareLessThan:
case NI_AVX_CompareLessThan:
{
return NI_EVEX_CompareLessThanMask;
}

case NI_SSE_CompareLessThanOrEqual:
case NI_SSE2_CompareLessThanOrEqual:
case NI_AVX_CompareLessThanOrEqual:
{
return NI_EVEX_CompareLessThanOrEqualMask;
}

case NI_SSE_CompareNotEqual:
case NI_SSE2_CompareNotEqual:
case NI_AVX_CompareNotEqual:
{
return NI_EVEX_CompareNotEqualMask;
}

case NI_SSE_CompareNotGreaterThan:
case NI_SSE2_CompareNotGreaterThan:
case NI_AVX_CompareNotGreaterThan:
{
return NI_EVEX_CompareNotGreaterThanMask;
}

case NI_SSE_CompareNotGreaterThanOrEqual:
case NI_SSE2_CompareNotGreaterThanOrEqual:
case NI_AVX_CompareNotGreaterThanOrEqual:
{
return NI_EVEX_CompareNotGreaterThanOrEqualMask;
}

case NI_SSE_CompareNotLessThan:
case NI_SSE2_CompareNotLessThan:
case NI_AVX_CompareNotLessThan:
{
return NI_EVEX_CompareNotLessThanMask;
}

case NI_SSE_CompareNotLessThanOrEqual:
case NI_SSE2_CompareNotLessThanOrEqual:
case NI_AVX_CompareNotLessThanOrEqual:
{
return NI_EVEX_CompareNotLessThanOrEqualMask;
}

case NI_SSE_CompareOrdered:
case NI_SSE2_CompareOrdered:
case NI_AVX_CompareOrdered:
{
return NI_EVEX_CompareOrderedMask;
}

case NI_SSE_CompareUnordered:
case NI_SSE2_CompareUnordered:
case NI_AVX_CompareUnordered:
{
return NI_EVEX_CompareUnorderedMask;
}

case NI_SSE_Or:
case NI_SSE2_Or:
case NI_AVX_Or:
case NI_AVX2_Or:
case NI_AVX512F_Or:
case NI_AVX512DQ_Or:
case NI_AVX10v1_V512_Or:
{
return NI_EVEX_OrMask;
}

case NI_SSE_Xor:
case NI_SSE2_Xor:
case NI_AVX_Xor:
case NI_AVX2_Xor:
case NI_AVX512F_Xor:
case NI_AVX512DQ_Xor:
case NI_AVX10v1_V512_Xor:
{
return NI_EVEX_XorMask;
}

default:
{
assert(!"Unexpected intrinsic when resolving EVEX alternative");
return NI_Illegal;
}
}
}

//------------------------------------------------------------------------
// isFullyImplementedIsa: Gets a value that indicates whether the InstructionSet is fully implemented
//
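For reference, a minimal usage sketch of the new lookup (the calling context here is hypothetical; the mappings themselves come straight from the switch above):

NamedIntrinsic maskId = HWIntrinsicInfo::lookupEvexMaskId(NI_AVX_CompareEqual);
assert(maskId == NI_EVEX_CompareEqualMask);

maskId = HWIntrinsicInfo::lookupEvexMaskId(NI_SSE2_And);
assert(maskId == NI_EVEX_AndMask);

// Intrinsics with no EVEX mask form fall to the default case, which asserts
// in debug builds and returns NI_Illegal; callers (such as morph, below)
// check for NI_Illegal before rewriting the node.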
117 changes: 117 additions & 0 deletions src/coreclr/jit/morph.cpp
@@ -9918,6 +9918,117 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
default:
{
#if defined(FEATURE_MASKED_HW_INTRINSICS)
#if defined(TARGET_XARCH)
bool isCndSel = (intrinsicId == NI_Vector128_ConditionalSelect) ||
(intrinsicId == NI_Vector256_ConditionalSelect) ||
(intrinsicId == NI_Vector512_ConditionalSelect);

if (isCndSel || node->OperIsConvertVectorToMask())
{
GenTree* op1 = node->Op(1);

if (!op1->IsVectorPerElementMask(simdBaseType, simdSize))
{
break;
}

if (!op1->OperIsHWIntrinsic())
{
break;
}

GenTreeHWIntrinsic* op1Intrin = op1->AsHWIntrinsic();

if (!isCndSel)
{
// CndSel knows how to handle mismatched mask sizes, but not all consumers can

if (genTypeSize(op1Intrin->GetSimdBaseType()) != genTypeSize(simdBaseType))
{
break;
}
}

if (!canUseEvexEncoding())
{
break;
}

// We have something expecting a mask and have a case where we could be producing a mask directly
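                // (e.g. the mask operand of a ConditionalSelect, or the operand of an
                // explicit ConvertVectorToMask node)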

NamedIntrinsic op1IntrinId = op1Intrin->GetHWIntrinsicId();

NamedIntrinsic evexIntrinId = HWIntrinsicInfo::lookupEvexMaskId(op1IntrinId);

if (evexIntrinId != NI_Illegal)
{
GenTree* cvtNode;

op1Intrin->ChangeHWIntrinsicId(evexIntrinId);
op1Intrin->gtType = TYP_MASK;

#ifdef DEBUG
// We want to remorph the nodes introduced below, so clear the flag

auto resetMorphedFlag = [](GenTree** slot, fgWalkData* data) -> fgWalkResult {
(*slot)->gtDebugFlags &= ~GTF_DEBUG_NODE_MORPHED;
return WALK_CONTINUE;
};

fgWalkTreePost(&op1, resetMorphedFlag);
#endif // DEBUG

                    switch (evexIntrinId)
{
case NI_EVEX_AndMask:
case NI_EVEX_AndNotMask:
case NI_EVEX_OrMask:
case NI_EVEX_XorMask:
{
                            // There are a few special nodes that are allowed to combine masks,
                            // so we handle these by inserting a CvtVectorToMask over each
                            // operand and remorphing, which gets us the optimized sequence

cvtNode = op1Intrin->Op(1);
cvtNode = gtNewSimdCvtVectorToMaskNode(TYP_MASK, cvtNode, simdBaseJitType, simdSize);
cvtNode = fgMorphHWIntrinsic(cvtNode->AsHWIntrinsic());

op1Intrin->Op(1) = cvtNode;

cvtNode = op1Intrin->Op(2);
cvtNode = gtNewSimdCvtVectorToMaskNode(TYP_MASK, cvtNode, simdBaseJitType, simdSize);
cvtNode = fgMorphHWIntrinsic(cvtNode->AsHWIntrinsic());

op1Intrin->Op(2) = cvtNode;

op1 = fgMorphHWIntrinsic(op1Intrin);
break;
}

default:
{
break;
}
}

if (isCndSel)
{
// This will allow lowering to emit a vblendm and potentially do embedded masking
cvtNode = gtNewSimdCvtMaskToVectorNode(retType, op1, simdBaseJitType, simdSize);
cvtNode = fgMorphHWIntrinsic(cvtNode->AsHWIntrinsic());

node->Op(1) = cvtNode;
return node;
}
else
{
DEBUG_DESTROY_NODE(node);
return op1;
}
}
}
#endif // TARGET_XARCH

bool isScalar = false;
genTreeOps actualOper = node->GetOperForHWIntrinsicId(&isScalar);
genTreeOps oper = actualOper;
@@ -9945,6 +10056,12 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)

// We need both operands to be ConvertMaskToVector in
// order to optimize this to a direct mask operation
            //
            // Note that we could handle broader scenarios by checking
            // IsVectorPerElementMask instead, but that could regress code
            // size unnecessarily if the result isn't also consumed as a
            // mask. The case where we are consumed as a mask is handled
            // elsewhere in morph instead.
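            //
            // For example, And(CvtMaskToVector(m1), CvtMaskToVector(m2)) can be
            // rewritten as CvtMaskToVector(AndMask(m1, m2)), dropping a
            // conversion outright when the result is consumed as a mask again.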

if (!op1->OperIsConvertMaskToVector())
{
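To summarize the transformation, a hand-written sketch of the IR shapes involved (not literal JIT dump output; node names abbreviated):

// Before: the comparison materializes a vector-shaped mask that
// ConditionalSelect then has to reinterpret.
//
//   ConditionalSelect(simd)
//     op1: AVX_CompareEqual(simd)   ; all-ones/all-zeros per element
//     op2: value, op3: value
//
// After: the comparison produces a TYP_MASK directly, with a single
// ConvertMaskToVector inserted for the select's benefit.
//
//   ConditionalSelect(simd)
//     op1: ConvertMaskToVector(EVEX_CompareEqualMask(mask))
//     op2: value, op3: value
//
// Lowering can then contract this into vblendm and potentially apply
// embedded masking, rather than emitting a vector compare plus a blend.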
