Skip to content

Commit

Permalink
Add SIMD RMW comments
Browse files Browse the repository at this point in the history
  • Loading branch information
mikedn committed Jan 4, 2020
1 parent 059dd21 commit 45d9986
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 98 deletions.
6 changes: 2 additions & 4 deletions src/coreclr/src/jit/lsra.h
Original file line number Diff line number Diff line change
Expand Up @@ -1522,10 +1522,6 @@ class LinearScan : public LinearScanInterface
int BuildBinaryUses(GenTreeOp* node, regMaskTP candidates = RBM_NONE);
#ifdef _TARGET_XARCH_
int BuildRMWUses(GenTreeOp* node, regMaskTP candidates = RBM_NONE);
#ifdef FEATURE_SIMD
int BuildRMWUsesSIMDUnary(GenTreeSIMD* node);
int BuildRMWUsesSIMDBinary(GenTreeSIMD* node);
#endif
#endif // !_TARGET_XARCH_
// This is the main entry point for building the RefPositions for a node.
// These methods return the number of sources.
Expand Down Expand Up @@ -1596,6 +1592,8 @@ class LinearScan : public LinearScanInterface

#ifdef FEATURE_SIMD
int BuildSIMD(GenTreeSIMD* tree);
int BuildSIMDUnaryRMWUses(GenTreeSIMD* node);
int BuildSIMDBinaryRMWUses(GenTreeSIMD* node);
#endif // FEATURE_SIMD

#ifdef FEATURE_HW_INTRINSICS
Expand Down
216 changes: 122 additions & 94 deletions src/coreclr/src/jit/lsraxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -904,97 +904,6 @@ int LinearScan::BuildRMWUses(GenTreeOp* node, regMaskTP candidates)
return srcCount;
}

#ifdef FEATURE_SIMD
int LinearScan::BuildRMWUsesSIMDUnary(GenTreeSIMD* node)
{
assert(node->IsUnary());

GenTree* op1 = node->GetOp(0);

if (op1->isContained())
{
return BuildOperandUses(op1);
}

tgtPrefUse = BuildUse(op1);
return 1;
}

int LinearScan::BuildRMWUsesSIMDBinary(GenTreeSIMD* node)
{
assert(node->IsBinary());

GenTree* op1 = node->GetOp(0);
GenTree* op2 = node->GetOp(1);

// Determine which operand, if any, should be delayRegFree. Normally, this would be op2,
// but if we have a commutative operator and op1 is a contained memory op, it would be op1.
// We need to make the delayRegFree operand remain live until the op is complete, by marking
// the source(s) associated with op2 as "delayFree".
// Note that if op2 of a binary RMW operator is a memory op, even if the operator
// is commutative, codegen cannot reverse them.
// TODO-XArch-CQ: This is not actually the case for all RMW binary operators, but there's
// more work to be done to correctly reverse the operands if they involve memory
// operands. Also, we may need to handle more cases than GT_IND, especially once
// we've modified the register allocator to not require all nodes to be assigned
// a register (e.g. a spilled lclVar can often be referenced directly from memory).
// Note that we may have a null op2, even with 2 sources, if op1 is a base/index memory op.
GenTree* delayUseOperand = op2;

if (node->isCommutativeSIMDIntrinsic())
{
if (op1->isContained())
{
delayUseOperand = op1;
}
else if (!op2->isContained() || op2->IsCnsIntOrI())
{
// If we have a commutative operator and op2 is not a memory op, we don't need
// to set delayRegFree on either operand because codegen can swap them.
delayUseOperand = nullptr;
}
}
else if (op1->isContained())
{
delayUseOperand = nullptr;
}

int srcCount = 0;

// Build first use
if (!op1->isContained())
{
tgtPrefUse = BuildUse(op1);
srcCount++;
}
else if (delayUseOperand == op1)
{
srcCount += BuildDelayFreeUses(op1);
}
else
{
srcCount += BuildOperandUses(op1);
}

// Build second use
if (node->isCommutativeSIMDIntrinsic() && !op2->isContained())
{
tgtPrefUse2 = BuildUse(op2);
srcCount++;
}
else if (delayUseOperand == op2)
{
srcCount += BuildDelayFreeUses(op2);
}
else
{
srcCount += BuildOperandUses(op2);
}

return srcCount;
}
#endif // FEATURE_SIMD

//------------------------------------------------------------------------
// BuildShiftRotate: Set the NodeInfo for a shift or rotate.
//
Expand Down Expand Up @@ -1932,7 +1841,7 @@ int LinearScan::BuildIntrinsic(GenTree* tree)
// BuildSIMD: Set the NodeInfo for a GT_SIMD tree.
//
// Arguments:
// tree - The GT_SIMD node of interest
// simdTree - The GT_SIMD node of interest
//
// Return Value:
// The number of sources consumed by this node.
Expand Down Expand Up @@ -2321,11 +2230,11 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)

if (simdTree->IsUnary())
{
srcCount = BuildRMWUsesSIMDUnary(simdTree);
srcCount = BuildSIMDUnaryRMWUses(simdTree);
}
else
{
srcCount = BuildRMWUsesSIMDBinary(simdTree);
srcCount = BuildSIMDBinaryRMWUses(simdTree);
}
}

Expand All @@ -2340,6 +2249,125 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
}
return srcCount;
}

//------------------------------------------------------------------------
// BuildSIMDUnaryRMWUses: Build uses for a RMW unary SIMD node.
//
// Arguments:
// node - The GT_SIMD node of interest
//
// Return Value:
// The number of sources consumed by this node.
//
// Notes:
// SSE unary instructions (sqrtps, cvttps2dq etc.) aren't really RMW,
// they have "dst, src" forms even when the VEX encoding is not available.
// However, it seems that SIMDIntrinsicConvertToSingle with UINT base type
// could benefit from getting the same register for both destination and
// source because its rather complicated codegen expansion starts by copying
// the source to the destination register.
//
int LinearScan::BuildSIMDUnaryRMWUses(GenTreeSIMD* node)
{
assert(node->IsUnary());

GenTree* op1 = node->GetOp(0);

if (op1->isContained())
{
return BuildOperandUses(op1);
}

tgtPrefUse = BuildUse(op1);
return 1;
}

//------------------------------------------------------------------------
// BuildSIMDBinaryRMWUses: Build uses for a RMW binary SIMD node.
//
// Arguments:
// node - The GT_SIMD node of interest
//
// Return Value:
// The number of sources consumed by this node.
//
int LinearScan::BuildSIMDBinaryRMWUses(GenTreeSIMD* node)
{
assert(node->IsBinary());

GenTree* op1 = node->GetOp(0);
GenTree* op2 = node->GetOp(1);

// Determine which operand, if any, should be delayRegFree. Normally, this would be op2,
// but if we have a commutative operator codegen can swap the operands, avoiding the need
// for delayRegFree.
//
// TODO-XArch-CQ: This should not be necessary when VEX encoding is available, at least
// for those intrinsics that directly map to SSE/AVX instructions. Intrinsics that require
// custom codegen expansion may still neeed this.
//
// Also, this doesn't check if the intrinsic is really RMW:
// - SIMDIntrinsicOpEquality/SIMDIntrinsicOpInEquality - they don't have a register def.
// - SIMDIntrinsicShuffleSSE2 - the second operand is always a contained immediate so they're
// really unary as far as the register allocator is concerned.
// - SIMDIntrinsicGetItem - the second operand is always an integer but the first may be a float
// and in that case delayRegFree is not needed. Either way, it's not a real RMW operation. It's
// also the only binary SIMD intrinsic that can have a contained op1.

GenTree* delayUseOperand = op2;

if (node->isCommutativeSIMDIntrinsic())
{
if (op1->isContained())
{
delayUseOperand = op1;
}
else if (!op2->isContained() || op2->IsCnsIntOrI())
{
// If we have a commutative operator and op2 is not a memory op, we don't need
// to set delayRegFree on either operand because codegen can swap them.
delayUseOperand = nullptr;
}
}
else if (op1->isContained())
{
delayUseOperand = nullptr;
}

int srcCount = 0;

// Build first use
if (!op1->isContained())
{
tgtPrefUse = BuildUse(op1);
srcCount++;
}
else if (delayUseOperand == op1)
{
srcCount += BuildDelayFreeUses(op1);
}
else
{
srcCount += BuildOperandUses(op1);
}

// Build second use
if (node->isCommutativeSIMDIntrinsic() && !op2->isContained())
{
tgtPrefUse2 = BuildUse(op2);
srcCount++;
}
else if (delayUseOperand == op2)
{
srcCount += BuildDelayFreeUses(op2);
}
else
{
srcCount += BuildOperandUses(op2);
}

return srcCount;
}
#endif // FEATURE_SIMD

#ifdef FEATURE_HW_INTRINSICS
Expand Down

0 comments on commit 45d9986

Please sign in to comment.