
Various fixes for consecutive registers found with jitstressregs #84824

Merged
merged 12 commits into from Apr 20, 2023
158 changes: 88 additions & 70 deletions src/coreclr/jit/lsra.cpp
@@ -363,6 +363,45 @@ void LinearScan::updateSpillCost(regNumber reg, Interval* interval)
#endif
}

//------------------------------------------------------------------------
// updateRegsFreeBusyState: Update various register masks and global state to track
// registers that are free and busy.
//
// Arguments:
// refPosition - RefPosition for which we need to update the state.
// regsBusy - Mask of registers that are busy.
// regsToFree - Mask of registers that are to be freed.
// delayRegsToFree - Mask of registers whose freeing should be delayed.
// interval - Interval of the RefPosition.
// assignedReg - Register assigned to this RefPosition.
//
void LinearScan::updateRegsFreeBusyState(RefPosition& refPosition,
regMaskTP regsBusy,
regMaskTP* regsToFree,
regMaskTP* delayRegsToFree DEBUG_ARG(Interval* interval)
DEBUG_ARG(regNumber assignedReg))
{
regsInUseThisLocation |= regsBusy;
if (refPosition.lastUse)
{
if (refPosition.delayRegFree)
{
INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE_DELAYED, interval, assignedReg));
*delayRegsToFree |= regsBusy;
regsInUseNextLocation |= regsBusy;
}
else
{
INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE, interval, assignedReg));
*regsToFree |= regsBusy;
}
}
else if (refPosition.delayRegFree)
{
regsInUseNextLocation |= regsBusy;
}
}
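For reference, a minimal sketch of the call-site pattern this helper factors out (mirroring the copyReg handling later in this diff); the locals used here (`currentRefPosition`, `copyRegMask`, `assignedRegMask`, `regsToFree`, `delayRegsToFree`, `currentInterval`, `assignedRegister`) are the ones from allocateRegisters:

// Both the copy register and the previously assigned register are busy at this location;
// the lastUse/delayRegFree flags on the RefPosition decide whether they are freed now,
// freed at the next location, or kept alive.
updateRegsFreeBusyState(currentRefPosition, assignedRegMask | copyRegMask, &regsToFree,
                        &delayRegsToFree DEBUG_ARG(currentInterval) DEBUG_ARG(assignedRegister));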

//------------------------------------------------------------------------
// internalFloatRegCandidates: Return the set of registers that are appropriate
// for use as internal float registers.
@@ -445,6 +484,17 @@ regMaskTP LinearScan::getConstrainedRegMask(regMaskTP regMaskActual, regMaskTP r

regMaskTP LinearScan::stressLimitRegs(RefPosition* refPosition, regMaskTP mask)
{
#ifdef TARGET_ARM64
if ((refPosition != nullptr) && refPosition->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation))
{
// If we are assigning for refPositions that have consecutive-register requirements, skip the
// limit stress for them: there is a high chance that many registers are busy satisfying the
// consecutive requirement and not enough would remain to assign the other refPositions (such as
// operands). Likewise, skip the definition node that comes right after, for which all of those
// registers are in the "delayRegFree" state.
return mask;
}
#endif
if (getStressLimitRegs() != LSRA_LIMIT_NONE)
{
// The refPosition could be null, for example when called
@@ -5394,34 +5444,18 @@ void LinearScan::allocateRegisters()
regMaskTP copyRegMask = getRegMask(copyReg, currentInterval->registerType);
regMaskTP assignedRegMask = getRegMask(assignedRegister, currentInterval->registerType);

// For consecutive register, it doesn't matter what the assigned register was.
// We have just assigned it `copyRegMask` and that's the one in-use, and not the
// one that was assigned previously.
// For consecutive registers, it normally doesn't matter what the assigned register was,
// because we have just assigned `copyReg` and that is the register in use, not the one
// assigned previously. However, if an upper-vector restore happened to target assignedReg,
// we need assignedReg to stay alive, because we will copy the entire vector value from it
// to `copyReg`.

regsInUseThisLocation |= copyRegMask;
if (currentRefPosition.lastUse)
{
if (currentRefPosition.delayRegFree)
{
INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE_DELAYED, currentInterval,
assignedRegister));
delayRegsToFree |= copyRegMask | assignedRegMask;
regsInUseNextLocation |= copyRegMask | assignedRegMask;
}
else
{
INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE, currentInterval,
assignedRegister));
regsToFree |= copyRegMask | assignedRegMask;
}
}
else
updateRegsFreeBusyState(currentRefPosition, assignedRegMask | copyRegMask, &regsToFree,
&delayRegsToFree DEBUG_ARG(currentInterval)
DEBUG_ARG(assignedRegister));
if (!currentRefPosition.lastUse)
{
copyRegsToFree |= copyRegMask;
if (currentRefPosition.delayRegFree)
{
regsInUseNextLocation |= copyRegMask | assignedRegMask;
}
}

// If this is a tree temp (non-localVar) interval, we will need an explicit move.
@@ -5508,40 +5542,18 @@ void LinearScan::allocateRegisters()
// registers.
assignConsecutiveRegisters(&currentRefPosition, copyReg);
}

// For consecutive register, it doesn't matter what the assigned register was.
// We have just assigned it `copyRegMask` and that's the one in-use, and not the
// one that was assigned previously.

regsInUseThisLocation |= copyRegMask;
}
else
#endif
{
regsInUseThisLocation |= copyRegMask | assignedRegMask;
}
if (currentRefPosition.lastUse)
{
if (currentRefPosition.delayRegFree)
{
INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE_DELAYED, currentInterval,
assignedRegister));
delayRegsToFree |= copyRegMask | assignedRegMask;
regsInUseNextLocation |= copyRegMask | assignedRegMask;
}
else
{
INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE, currentInterval, assignedRegister));
regsToFree |= copyRegMask | assignedRegMask;
}
}
else
// For consecutive registers, it normally doesn't matter what the assigned register was,
// because we have just assigned `copyReg` and that is the register in use, not the one
// assigned previously. However, if an upper-vector restore happened to target assignedReg,
// we need assignedReg to stay alive, because we will copy the entire vector value from it
// to `copyReg`.
updateRegsFreeBusyState(currentRefPosition, assignedRegMask | copyRegMask, &regsToFree,
&delayRegsToFree DEBUG_ARG(currentInterval) DEBUG_ARG(assignedRegister));
if (!currentRefPosition.lastUse)
{
copyRegsToFree |= copyRegMask;
if (currentRefPosition.delayRegFree)
{
regsInUseNextLocation |= copyRegMask | assignedRegMask;
}
}

// If this is a tree temp (non-localVar) interval, we will need an explicit move.
@@ -6675,12 +6687,30 @@ void LinearScan::insertUpperVectorRestore(GenTree* tree,
LIR::Range& blockRange = LIR::AsRange(block);
if (tree != nullptr)
{
JITDUMP("before %d.%s:\n", tree->gtTreeID, GenTree::OpName(tree->gtOper));
LIR::Use treeUse;
GenTree* useNode = nullptr;
bool foundUse = blockRange.TryGetUse(tree, &treeUse);
useNode = treeUse.User();

#ifdef TARGET_ARM64
if (refPosition->needsConsecutive && useNode->OperIs(GT_FIELD_LIST))
{
// Tree nodes requiring consecutive registers are represented as a GT_FIELD_LIST.
// When restoring the upper vector, make sure to restore it at the point where the
// GT_FIELD_LIST is consumed, rather than where the individual field is consumed
// (which will always be at GT_FIELD_LIST creation time). That way, the upper vector
// is restored just before the intrinsic uses it.
LIR::Use fieldListUse;
foundUse = blockRange.TryGetUse(useNode, &fieldListUse);
treeUse = fieldListUse;
useNode = treeUse.User();
}
#endif
assert(foundUse);
JITDUMP("before %d.%s:\n", useNode->gtTreeID, GenTree::OpName(useNode->gtOper));

// We need to insert the restore prior to the use, not (necessarily) immediately after the lclVar.
blockRange.InsertBefore(treeUse.User(), LIR::SeqTree(compiler, simdUpperRestore));
blockRange.InsertBefore(useNode, LIR::SeqTree(compiler, simdUpperRestore));
}
else
{
@@ -12066,19 +12096,7 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval,
}

#ifdef DEBUG
#ifdef TARGET_ARM64
if (!refPosition->needsConsecutive && (linearScan->consecutiveRegistersLocation == refPosition->nodeLocation))
{
// If a method has consecutive registers and we are assigning to refPositions that are not part
// of consecutive registers, but are live at same location, skip the limit stress for them, because
// there are high chances that many registers are busy for consecutive requirements and we don't
// have enough remaining for other refpositions (like operands).
}
else
#endif
{
candidates = linearScan->stressLimitRegs(refPosition, candidates);
}
candidates = linearScan->stressLimitRegs(refPosition, candidates);
#endif
assert(candidates != RBM_NONE);

10 changes: 10 additions & 0 deletions src/coreclr/jit/lsra.h
@@ -1784,6 +1784,12 @@ class LinearScan : public LinearScanInterface
void clearSpillCost(regNumber reg, var_types regType);
void updateSpillCost(regNumber reg, Interval* interval);

FORCEINLINE void updateRegsFreeBusyState(RefPosition& refPosition,
regMaskTP regsBusy,
regMaskTP* regsToFree,
regMaskTP* delayRegsToFree DEBUG_ARG(Interval* interval)
DEBUG_ARG(regNumber assignedReg));

Member: Is it really necessary to use FORCEINLINE?

Member Author: This is hot functionality used whenever we assign copyRegs, with or without consecutive registers. I didn't want to impact the TP just for extracting this functionality out into a method.
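The trailing `DEBUG_ARG` parameters exist only in DEBUG builds, which is how the extra Interval*/regNumber arguments avoid any cost in release builds (the TP concern raised above). A standalone sketch of that pattern follows; the macro name matches the JIT's, but this simplified definition and the example function are illustrative, not the JIT's actual code:

#include <cstdio>

// Simplified stand-in for the JIT's DEBUG_ARG: the argument exists only in DEBUG builds.
#ifdef DEBUG
#define DEBUG_ARG(x) , x
#else
#define DEBUG_ARG(x)
#endif

// The trailing parameter is used purely for diagnostics and compiles away in release builds.
static void traceFree(unsigned regMask DEBUG_ARG(const char* why))
{
#ifdef DEBUG
    printf("freeing mask 0x%x (%s)\n", regMask, why);
#else
    printf("freeing mask 0x%x\n", regMask);
#endif
}

int main()
{
    traceFree(0x5 DEBUG_ARG("last use"));
    return 0;
}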

regMaskTP m_RegistersWithConstants;
void clearConstantReg(regNumber reg, var_types regType)
{
@@ -2652,6 +2658,10 @@ class RefPosition
}
return false;
}

#ifdef DEBUG
bool isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegistersLocation);
#endif
#endif // TARGET_ARM64

#ifdef DEBUG
51 changes: 51 additions & 0 deletions src/coreclr/jit/lsraarm64.cpp
@@ -417,6 +417,15 @@ regMaskTP LinearScan::getConsecutiveCandidates(regMaskTP allCandidates,
assert(refPosition->isFirstRefPositionOfConsecutiveRegisters());
regMaskTP freeCandidates = allCandidates & m_AvailableRegs;

#ifdef DEBUG
if (getStressLimitRegs() != LSRA_LIMIT_NONE)
{
// For stress, make only alternate registers available so we can stress the selection of free/busy registers.
freeCandidates &= (RBM_V0 | RBM_V2 | RBM_V4 | RBM_V6 | RBM_V8 | RBM_V10 | RBM_V12 | RBM_V14 | RBM_V16 |
RBM_V18 | RBM_V20 | RBM_V22 | RBM_V24 | RBM_V26 | RBM_V28 | RBM_V30);
}
#endif

*busyCandidates = RBM_NONE;
regMaskTP overallResult;
unsigned int registersNeeded = refPosition->regCount;
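To see why keeping only alternate registers free stresses the busy-register path in getConsecutiveCandidates, here is a small standalone model (plain C++, not JIT code; the bit positions stand in for v0..v31):

#include <cstdint>
#include <cstdio>

int main()
{
    // Only even-numbered vector registers are free, as under the stress mask above.
    uint64_t freeRegs = 0;
    for (int i = 0; i < 32; i += 2)
    {
        freeRegs |= (1ull << i); // v0, v2, ..., v30
    }

    // A run of `count` consecutive registers starting at v<start> is entirely free
    // only if every bit in the needed range is set in freeRegs.
    auto consecutiveFree = [&](int start, int count) {
        uint64_t need = ((1ull << count) - 1) << start;
        return (freeRegs & need) == need;
    };

    printf("v4 alone free:   %d\n", consecutiveFree(4, 1)); // 1
    printf("v4..v5 all free: %d\n", consecutiveFree(4, 2)); // 0 -> busy registers must be considered
    printf("v4..v7 all free: %d\n", consecutiveFree(4, 4)); // 0 -> busy registers must be considered
    return 0;
}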
@@ -1802,6 +1811,48 @@ int LinearScan::BuildConsecutiveRegistersForUse(GenTree* treeNode, GenTree* rmwN

return srcCount;
}

#ifdef DEBUG
//------------------------------------------------------------------------
// isLiveAtConsecutiveRegistersLoc: Check if the refPosition is live at the location
// where consecutive registers are needed. This is used during JitStressRegs so that we
// do not constrain the register requirements for such refPositions, because many
// registers will be busy. For RefTypeUse, it simply checks whether the nodeLocation
// matches the tracked `consecutiveRegistersLocation`. For a Def, it checks the underlying
// `GenTree*` to see whether the tree that produced it required consecutive registers.
//
// Arguments:
// consecutiveRegistersLocation - The most recent location where consecutive
// registers were needed.
//
// Returns: True if the refPosition is live at the same location that has the requirement of
// consecutive registers.
//
bool RefPosition::isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegistersLocation)
{
if (needsConsecutive)
{
return true;
}

if (refType == RefTypeDef)
{
if (treeNode->OperIsHWIntrinsic())
{
const HWIntrinsic intrin(treeNode->AsHWIntrinsic());
return HWIntrinsicInfo::NeedsConsecutiveRegisters(intrin.id);
}
}
else if ((refType == RefTypeUse) || (refType == RefTypeUpperVectorRestore))
{
return consecutiveRegistersLocation == nodeLocation;
}
return false;
}
#endif

#endif

#endif // TARGET_ARM64
46 changes: 41 additions & 5 deletions src/coreclr/jit/lsrabuild.cpp
@@ -1811,8 +1811,15 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc
assert(newRefPosition->refType == RefTypeUpperVectorRestore);
minRegCount++;
}
#endif
#endif
#endif // TARGET_ARM64
#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE

#ifdef TARGET_ARM64
if (newRefPosition->needsConsecutive)
{
consecutiveRegistersLocation = newRefPosition->nodeLocation;
}
#endif // TARGET_ARM64
if (newRefPosition->getInterval()->isSpecialPutArg)
{
minRegCount++;
@@ -1864,16 +1871,45 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc
Interval* interval = newRefPosition->getInterval();
regMaskTP oldAssignment = newRefPosition->registerAssignment;
regMaskTP calleeSaveMask = calleeSaveRegs(interval->registerType);
newRefPosition->registerAssignment =
getConstrainedRegMask(oldAssignment, calleeSaveMask, minRegCountForRef);
#ifdef TARGET_ARM64
if (newRefPosition->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation))
{
// If we are assigning to refPositions that have consecutive-register requirements, skip the
// limit stress for them: there is a high chance that many registers are busy satisfying the
// consecutive requirement and not enough would remain for the other refPositions (such as
// operands). Likewise, skip the definition node that comes right after, for which all of
// those registers are in the "delayRegFree" state.
}
else
#endif // TARGET_ARM64
{
newRefPosition->registerAssignment =
getConstrainedRegMask(oldAssignment, calleeSaveMask, minRegCountForRef);
}

if ((newRefPosition->registerAssignment != oldAssignment) && (newRefPosition->refType == RefTypeUse) &&
!interval->isLocalVar)
{
checkConflictingDefUse(newRefPosition);
#ifdef TARGET_ARM64
RefPosition* defRefPos = interval->firstRefPosition;
assert(defRefPos->treeNode != nullptr);
if (defRefPos->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation))
{
// If a method has consecutive registers and we are assigning to a use refPosition whose
// definition came from a location that has consecutive registers, skip the limit stress
// for it: there is a high chance that many registers are busy for the consecutive
// requirement and marked "delayRegFree", leaving not enough remaining for the other
// refPositions.
}
else
#endif // TARGET_ARM64
{
checkConflictingDefUse(newRefPosition);
}
}
}
}
consecutiveRegistersLocation = MinLocation;
}
#endif // DEBUG
JITDUMP("\n");
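Taken together, the debug-only tracking added in this file works roughly as follows (a summary sketch, not code from the PR): while building RefPositions for a node, any RefPosition with needsConsecutive records its nodeLocation into consecutiveRegistersLocation; other RefPositions built for that same location consult isLiveAtConsecutiveRegistersLoc and, when it returns true, skip getConstrainedRegMask and checkConflictingDefUse so that register stress does not over-constrain them; once the node's RefPositions are built, the tracker is reset to MinLocation.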