Skip to content

Commit

Permalink
Optimize VectorX<T>.ConditionalSelect for constant masks (#104092)
Browse files Browse the repository at this point in the history
* Optimize ConditionalSelect for const mask

This adds a check in the JIT for constant masks (`GT_CNS_VEC`, everything else gets lowered to it) and enables optimization to `BlendVariable` (`(v)pblendvb` instruction).
This currently does not work for masks loaded from an array stored in a field or variable.
This optimization is also not triggered on platforms supporting AVX512F(/VL?), since the node is optimized earlier into a `vpternlogd` instruction.

* Cleanup code and separate it into functions

* fix build

* Misc fixes

* Final build fixes

* Address review comments

* Address review comments again

* address the rest of the comments

* Remove scalar assertion

Co-authored-by: Tanner Gooding <tagoo@outlook.com>

---------

Co-authored-by: Tanner Gooding <tagoo@outlook.com>
  • Loading branch information
ezhevita and tannergooding authored Jul 4, 2024
1 parent 8432f0d commit 284aeaf
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 4 deletions.
86 changes: 86 additions & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29707,6 +29707,92 @@ bool GenTree::IsInvariant() const
return OperIsConst() || OperIs(GT_LCL_ADDR) || OperIs(GT_FTN_ADDR);
}

//------------------------------------------------------------------------
// IsVectorPerElementMask: returns true if this node is a vector constant per-element mask
// (every element has either all bits set or none of them).
//
// Arguments:
//    simdBaseType - the base type of the constant being checked.
//    simdSize     - the size of the SIMD type of the intrinsic.
//
// Returns:
//    True if this node is a vector constant per-element mask.
//
bool GenTree::IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) const
{
#ifdef FEATURE_SIMD
    if (IsCnsVec())
    {
        // For a constant vector, inspect the raw bits directly: each lane must
        // be either all-ones or all-zeros for the constant to be a valid mask.
        const GenTreeVecCon* vecCon  = AsVecCon();
        const int            numElts = vecCon->ElementCount(simdSize, simdBaseType);

        switch (simdBaseType)
        {
            case TYP_BYTE:
            case TYP_UBYTE:
                return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u8[0], numElts);

            case TYP_SHORT:
            case TYP_USHORT:
                return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u16[0], numElts);

            case TYP_INT:
            case TYP_UINT:
            case TYP_FLOAT:
                return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u32[0], numElts);

            case TYP_LONG:
            case TYP_ULONG:
            case TYP_DOUBLE:
                return ElementsAreAllBitsSetOrZero(&vecCon->gtSimdVal.u64[0], numElts);

            default:
                unreached();
        }
    }

    if (OperIsHWIntrinsic())
    {
        const GenTreeHWIntrinsic* hwNode = AsHWIntrinsic();

        // Some intrinsics are known to produce a per-element mask directly.
        if (HWIntrinsicInfo::ReturnsPerElementMask(hwNode->GetHWIntrinsicId()))
        {
            return true;
        }

        // Bitwise combinations of per-element masks are themselves
        // per-element masks, so recurse into the operands.
        bool       isScalar = false;
        genTreeOps oper     = hwNode->HWOperGet(&isScalar);

        switch (oper)
        {
            case GT_AND:
            case GT_AND_NOT:
            case GT_OR:
            case GT_XOR:
            {
                // Binary bitwise operation: a mask only if both inputs are masks.
                return hwNode->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize) &&
                       hwNode->Op(2)->IsVectorPerElementMask(simdBaseType, simdSize);
            }

            case GT_NOT:
            {
                // A unary bitwise operation: a mask only if the input is a mask.
                return hwNode->Op(1)->IsVectorPerElementMask(simdBaseType, simdSize);
            }

            default:
            {
                // Any other bitwise oper should have been handled above.
                assert(!GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper));
                break;
            }
        }
    }
#endif // FEATURE_SIMD

    return false;
}

//------------------------------------------------------------------------
// IsNeverNegative: returns true if the given tree is known to be never
// negative, i. e. the upper bit will always be zero.
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/gentree.h
Original file line number Diff line number Diff line change
Expand Up @@ -2318,6 +2318,7 @@ struct GenTree
bool Precedes(GenTree* other);

bool IsInvariant() const;
bool IsVectorPerElementMask(var_types simdBaseType, unsigned simdSize) const;

bool IsNeverNegative(Compiler* comp) const;
bool IsNeverNegativeOne(Compiler* comp) const;
Expand Down
8 changes: 4 additions & 4 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3034,12 +3034,12 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node)
GenTree* op3 = node->Op(3);

// If the condition vector comes from a hardware intrinsic that
// returns a per-element mask (marked with HW_Flag_ReturnsPerElementMask),
// we can optimize the entire conditional select to
// a single BlendVariable instruction (if supported by the architecture)
// returns a per-element mask, we can optimize the entire
// conditional select to a single BlendVariable instruction
// (if supported by the architecture)

// First, determine if the condition is a per-element mask
if (op1->OperIsHWIntrinsic() && HWIntrinsicInfo::ReturnsPerElementMask(op1->AsHWIntrinsic()->GetHWIntrinsicId()))
if (op1->IsVectorPerElementMask(simdBaseType, simdSize))
{
// Next, determine if the target architecture supports BlendVariable
NamedIntrinsic blendVariableId = NI_Illegal;
Expand Down
11 changes: 11 additions & 0 deletions src/coreclr/jit/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,17 @@ static bool ElementsAreSame(T* array, size_t size)
return true;
}

// ElementsAreAllBitsSetOrZero: checks whether every element of the given array
// is either all-bits-set (~0) or zero, i.e. the array forms a valid
// per-element mask. An empty array trivially satisfies the condition.
template <typename T>
static bool ElementsAreAllBitsSetOrZero(T* array, size_t size)
{
    for (T* cur = array; cur != (array + size); cur++)
    {
        const bool isZero    = (*cur == static_cast<T>(0));
        const bool isAllBits = (*cur == static_cast<T>(~0));

        if (!isZero && !isAllBits)
        {
            return false;
        }
    }

    return true;
}

struct simd8_t
{
union
Expand Down

0 comments on commit 284aeaf

Please sign in to comment.