Optimize comparisons against AllBitsSet on pre-AVX512 hardware
tannergooding committed Jul 16, 2024
1 parent 7e9cc55 commit 6a9a0b0
Showing 1 changed file with 97 additions and 88 deletions.
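
For context: the pattern this change targets is an integer vector equality test against AllBitsSet (and, as before, against Zero). On SSE4.1 and later, both can be answered from PTEST's flags instead of a full compare-plus-extract, because ptest sets ZF when (x & y) == 0 and CF when (~x & y) == 0. Below is a minimal standalone C++ sketch of those flag checks using plain SSE4.1 intrinsics rather than anything from the JIT; the helper names are invented for illustration.

// Illustrative sketch only (not JIT code, helper names invented): how an
// equality test against AllBitsSet or Zero maps onto PTEST's flags on
// SSE4.1 hardware.
#include <smmintrin.h> // SSE4.1: _mm_testc_si128 / _mm_testz_si128
#include <cstdio>

static bool IsAllBitsSet(__m128i v)
{
    // ptest sets CF when (~v & mask) == 0; against an all-ones mask that
    // holds exactly when every bit of v is set.
    return _mm_testc_si128(v, _mm_set1_epi32(-1)) != 0;
}

static bool IsZero(__m128i v)
{
    // ptest sets ZF when (v & mask) == 0; testing v against itself holds
    // exactly when every bit of v is clear.
    return _mm_testz_si128(v, v) != 0;
}

int main()
{
    __m128i allOnes = _mm_set1_epi32(-1);
    __m128i mixed   = _mm_setr_epi32(-1, 0, -1, -1);

    std::printf("%d %d\n", IsAllBitsSet(allOnes), IsAllBitsSet(mixed)); // 1 0
    std::printf("%d %d\n", IsZero(_mm_setzero_si128()), IsZero(mixed)); // 1 0
    return 0;
}

The lowering in the diff below wires the same conditions up through LowerHWIntrinsicCC, using GenCondition::C for GT_EQ and GenCondition::NC for GT_NE on the AllBitsSet path.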
185 changes: 97 additions & 88 deletions src/coreclr/jit/lowerxarch.cpp
@@ -2488,7 +2488,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
     CorInfoType maskBaseJitType = simdBaseJitType;
     var_types   maskBaseType    = simdBaseType;
 
-    if (op1Msk->OperIsHWIntrinsic(NI_EVEX_ConvertMaskToVector))
+    if (op1Msk->OperIsConvertMaskToVector())
     {
         GenTreeHWIntrinsic* cvtMaskToVector = op1Msk->AsHWIntrinsic();
 
@@ -2499,122 +2499,131 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp)
         maskBaseType    = cvtMaskToVector->GetSimdBaseType();
     }
 
-    if (!varTypeIsFloating(simdBaseType) && (simdSize != 64) && op2->IsVectorZero() &&
-        comp->compOpportunisticallyDependsOn(InstructionSet_SSE41) && !varTypeIsMask(op1Msk))
+    if (!varTypeIsFloating(simdBaseType) && (simdSize != 64) && !varTypeIsMask(op1Msk))
     {
-        // On SSE4.1 or higher we can optimize comparisons against zero to
-        // just use PTEST. We can't support it for floating-point, however,
-        // as it has both +0.0 and -0.0 where +0.0 == -0.0
-
-        bool skipReplaceOperands = false;
-
-        if (op1->OperIsHWIntrinsic())
-        {
-            GenTreeHWIntrinsic* op1Intrinsic   = op1->AsHWIntrinsic();
-            NamedIntrinsic      op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId();
-
-            GenTree* nestedOp1           = nullptr;
-            GenTree* nestedOp2           = nullptr;
-            bool     isEmbeddedBroadcast = false;
-
-            if (op1Intrinsic->GetOperandCount() == 2)
-            {
-                nestedOp1 = op1Intrinsic->Op(1);
-                nestedOp2 = op1Intrinsic->Op(2);
-
-                assert(!nestedOp1->isContained());
-                isEmbeddedBroadcast = nestedOp2->isContained() && nestedOp2->OperIsHWIntrinsic();
-            }
-
-            switch (op1IntrinsicId)
-            {
-                case NI_SSE_And:
-                case NI_SSE2_And:
-                case NI_AVX_And:
-                case NI_AVX2_And:
-                {
-                    // We can optimize to TestZ(op1.op1, op1.op2)
-
-                    if (isEmbeddedBroadcast)
-                    {
-                        // PTEST doesn't support embedded broadcast
-                        break;
-                    }
-
-                    node->Op(1) = nestedOp1;
-                    node->Op(2) = nestedOp2;
-
-                    BlockRange().Remove(op1);
-                    BlockRange().Remove(op2);
-
-                    skipReplaceOperands = true;
-                    break;
-                }
-
-                case NI_SSE_AndNot:
-                case NI_SSE2_AndNot:
-                case NI_AVX_AndNot:
-                case NI_AVX2_AndNot:
-                {
-                    // We can optimize to TestC(op1.op1, op1.op2)
-
-                    if (isEmbeddedBroadcast)
-                    {
-                        // PTEST doesn't support embedded broadcast
-                        break;
-                    }
-
-                    cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
-
-                    node->Op(1) = nestedOp1;
-                    node->Op(2) = nestedOp2;
-
-                    BlockRange().Remove(op1);
-                    BlockRange().Remove(op2);
-
-                    skipReplaceOperands = true;
-                    break;
-                }
-
-                default:
-                {
-                    break;
-                }
-            }
-        }
-
-        if (!skipReplaceOperands)
-        {
-            // Default handler, emit a TestZ(op1, op1)
-
-            node->Op(1) = op1;
-            BlockRange().Remove(op2);
-
-            LIR::Use op1Use(BlockRange(), &node->Op(1), node);
-            ReplaceWithLclVar(op1Use);
-            op1 = node->Op(1);
-
-            op2 = comp->gtClone(op1);
-            BlockRange().InsertAfter(op1, op2);
-            node->Op(2) = op2;
-        }
-
-        if (simdSize == 32)
-        {
-            // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
-            node->ChangeHWIntrinsicId(NI_AVX_TestZ);
-            LowerHWIntrinsicCC(node, NI_AVX_PTEST, cmpCnd);
-        }
-        else
-        {
-            assert(simdSize == 16);
-
-            // TODO-Review: LowerHWIntrinsicCC resets the id again, so why is this needed?
-            node->ChangeHWIntrinsicId(NI_SSE41_TestZ);
-            LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd);
-        }
-
-        return LowerNode(node);
+        bool isOp2VectorZero = op2->IsVectorZero();
+
+        if ((isOp2VectorZero || op2->IsVectorAllBitsSet()) &&
+            comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
+        {
+            // On SSE4.1 or higher we can optimize comparisons against Zero or AllBitsSet to
+            // just use PTEST. We can't support it for floating-point, however, as it has
+            // both +0.0 and -0.0 where +0.0 == -0.0
+
+            bool skipReplaceOperands = false;
+
+            if (!isOp2VectorZero)
+            {
+                // We can optimize to TestC(op1, allbitsset)
+                //
+                // This works out because TestC sets CF if (~x & y) == 0, so:
+                // ~00 & 11 = 11; 11 & 11 = 11; NC
+                // ~01 & 11 = 01; 10 & 11 = 10; NC
+                // ~10 & 11 = 10; 01 & 11 = 01; NC
+                // ~11 & 11 = 11; 00 & 11 = 00; C
+
+                assert(op2->IsVectorAllBitsSet());
+                cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
+
+                skipReplaceOperands = true;
+            }
+            else if (op1->OperIsHWIntrinsic())
+            {
+                assert(op2->IsVectorZero());
+
+                GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic();
+
+                if (op1Intrinsic->GetOperandCount() == 2)
+                {
+                    GenTree* nestedOp1 = op1Intrinsic->Op(1);
+                    GenTree* nestedOp2 = op1Intrinsic->Op(2);
+
+                    assert(!nestedOp1->isContained());
+                    bool isEmbeddedBroadcast = nestedOp2->isContained() && nestedOp2->OperIsHWIntrinsic();
+
+                    bool       isScalar = false;
+                    genTreeOps oper     = op1Intrinsic->GetOperForHWIntrinsicId(&isScalar);
+
+                    switch (oper)
+                    {
+                        case GT_AND:
+                        {
+                            // We can optimize to TestZ(op1.op1, op1.op2)
+
+                            if (isEmbeddedBroadcast)
+                            {
+                                // PTEST doesn't support embedded broadcast
+                                break;
+                            }
+
+                            node->Op(1) = nestedOp1;
+                            node->Op(2) = nestedOp2;
+
+                            BlockRange().Remove(op1);
+                            BlockRange().Remove(op2);
+
+                            skipReplaceOperands = true;
+                            break;
+                        }
+
+                        case GT_AND_NOT:
+                        {
+                            // We can optimize to TestC(op1.op1, op1.op2)
+
+                            if (isEmbeddedBroadcast)
+                            {
+                                // PTEST doesn't support embedded broadcast
+                                break;
+                            }
+
+                            cmpCnd = (cmpOp == GT_EQ) ? GenCondition::C : GenCondition::NC;
+
+                            node->Op(1) = nestedOp1;
+                            node->Op(2) = nestedOp2;
+
+                            BlockRange().Remove(op1);
+                            BlockRange().Remove(op2);
+
+                            skipReplaceOperands = true;
+                            break;
+                        }
+
+                        default:
+                        {
+                            break;
+                        }
+                    }
+                }
+            }
+
+            if (!skipReplaceOperands)
+            {
+                // Default handler, emit a TestZ(op1, op1)
+                assert(op2->IsVectorZero());
+
+                node->Op(1) = op1;
+                BlockRange().Remove(op2);
+
+                LIR::Use op1Use(BlockRange(), &node->Op(1), node);
+                ReplaceWithLclVar(op1Use);
+                op1 = node->Op(1);
+
+                op2 = comp->gtClone(op1);
+                BlockRange().InsertAfter(op1, op2);
+                node->Op(2) = op2;
+            }
+
+            if (simdSize == 32)
+            {
+                LowerHWIntrinsicCC(node, NI_AVX_PTEST, cmpCnd);
+            }
+            else
+            {
+                assert(simdSize == 16);
+                LowerHWIntrinsicCC(node, NI_SSE41_PTEST, cmpCnd);
+            }
+
+            return LowerNode(node);
+        }
     }
 
     // TODO-XARCH-AVX512: We should handle TYP_SIMD12 here under the EVEX path, but doing
@@ -3490,7 +3499,7 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node)
         }
     }
 
-    if (condition->OperIsHWIntrinsic(NI_EVEX_ConvertMaskToVector))
+    if (condition->OperIsConvertMaskToVector())
    {
         GenTree* tmp = condition->AsHWIntrinsic()->Op(1);
         BlockRange().Remove(condition);
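
The GT_AND and GT_AND_NOT cases in the first hunk lean on the same two flags with both PTEST operands live: a comparison of (x & y) against Zero never needs the AND materialized because ptest already computes it for ZF, and the and-not form falls out of CF. A rough sketch of those equivalences with raw SSE4.1 intrinsics follows (illustrative only; the JIT's actual operand ordering for its and-not node is not modeled here).

// Illustrative sketch only: the PTEST equivalences behind the GT_AND and
// GT_AND_NOT cases, written with plain SSE4.1 intrinsics.
#include <smmintrin.h>
#include <cassert>

// (x & y) == Zero   <=>  TestZ(x, y): ptest sets ZF when (x & y) == 0.
static bool AndIsZero(__m128i x, __m128i y)
{
    return _mm_testz_si128(x, y) != 0;
}

// (~x & y) == Zero  <=>  TestC(x, y): ptest sets CF when (~x & y) == 0.
static bool AndNotIsZero(__m128i x, __m128i y)
{
    return _mm_testc_si128(x, y) != 0;
}

int main()
{
    __m128i lo = _mm_set1_epi32(0x0F);
    __m128i hi = _mm_set1_epi32(0xF0);

    assert(AndIsZero(lo, hi));     // disjoint bits: lo & hi == 0
    assert(!AndIsZero(lo, lo));    // lo & lo != 0
    assert(AndNotIsZero(lo, lo));  // y's set bits are a subset of x's, so ~x & y == 0
    assert(!AndNotIsZero(lo, hi)); // ~lo still contains every bit of hi
    return 0;
}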
