From 1462dc7b7c543d8aee270ec443f77dee50f03910 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Fri, 14 Apr 2023 00:38:48 -0700 Subject: [PATCH 01/11] restore the upper vector at the use of GT_FIELD_LIST --- src/coreclr/jit/lsra.cpp | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 68d9c3baa163d..b1efb03588af1 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -6675,12 +6675,32 @@ void LinearScan::insertUpperVectorRestore(GenTree* tree, LIR::Range& blockRange = LIR::AsRange(block); if (tree != nullptr) { - JITDUMP("before %d.%s:\n", tree->gtTreeID, GenTree::OpName(tree->gtOper)); LIR::Use treeUse; + GenTree* useNode = nullptr; bool foundUse = blockRange.TryGetUse(tree, &treeUse); assert(foundUse); + useNode = treeUse.User(); + +#ifdef TARGET_ARM64 + if (refPosition->needsConsecutive && useNode->OperIs(GT_FIELD_LIST)) + { + // The tree node requiring consecutive registers are represented as GT_FIELD_LIST. + // When restoring the upper vector, make sure to restore it at the point where + // GT_FIELD_LIST is consumed instead where the individual field is consumed, which + // will always be at GT_FIELD_LIST creation time. That way, we will restore the + // upper vector just before the use of them in the intrinsic. + LIR::Use fieldListUse; + if (blockRange.TryGetUse(useNode, &fieldListUse)) + { + treeUse = fieldListUse; + useNode = treeUse.User(); + } + } +#endif + JITDUMP("before %d.%s:\n", useNode->gtTreeID, GenTree::OpName(useNode->gtOper)); + // We need to insert the restore prior to the use, not (necessarily) immediately after the lclVar. - blockRange.InsertBefore(treeUse.User(), LIR::SeqTree(compiler, simdUpperRestore)); + blockRange.InsertBefore(useNode, LIR::SeqTree(compiler, simdUpperRestore)); } else { From 0d740b2dc42ae739622c7853eb50a6bbba24449a Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Fri, 14 Apr 2023 10:23:41 -0700 Subject: [PATCH 02/11] Introduce isLiveAtConsecutiveRegistersLoc and fix #84747 This method will track if the defs/uses are live at the same location as where the consecutive registers were allocated. If yes, it will skip the constraint imposition on it during JitStressRegs --- src/coreclr/jit/lsra.cpp | 9 ++++---- src/coreclr/jit/lsra.h | 4 ++++ src/coreclr/jit/lsraarm64.cpp | 39 +++++++++++++++++++++++++++++++++++ src/coreclr/jit/lsrabuild.cpp | 10 ++++++++- 4 files changed, 57 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index b1efb03588af1..d7d28e0d43a0e 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -12087,12 +12087,13 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, #ifdef DEBUG #ifdef TARGET_ARM64 - if (!refPosition->needsConsecutive && (linearScan->consecutiveRegistersLocation == refPosition->nodeLocation)) + if (!refPosition->needsConsecutive && refPosition->isLiveAtConsecutiveRegistersLoc(linearScan->consecutiveRegistersLocation)) { // If a method has consecutive registers and we are assigning to refPositions that are not part - // of consecutive registers, but are live at same location, skip the limit stress for them, because - // there are high chances that many registers are busy for consecutive requirements and we don't - // have enough remaining for other refpositions (like operands). + // of consecutive registers, but are live at the same location, skip the limit stress for them, + // because there are high chances that many registers are busy for consecutive requirements and + // we do not have enough remaining for other refpositions (like operands). Likewise, skip for the + // definition node that comes after that, for which, all the registers are in "delayRegFree" state. } else #endif diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 0aef405f35a9f..297a6956b9099 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -2652,6 +2652,10 @@ class RefPosition } return false; } + +#ifdef DEBUG + bool isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegistersLocation); +#endif #endif // TARGET_ARM64 #ifdef DEBUG diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 5e0385b7fc1b2..64ff8351bc95f 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1802,6 +1802,45 @@ int LinearScan::BuildConsecutiveRegistersForUse(GenTree* treeNode, GenTree* rmwN return srcCount; } + +#ifdef DEBUG +//------------------------------------------------------------------------ +// isLiveAtConsecutiveRegistersLoc: Check if the refPosition is live at the location +// where consecutive registers are needed. This is used during JitStressRegs to +// not constrained the register requirements for such refpositions, because lot +// of registers will be busy. For RefTypeUse, it will just see if the nodeLocation +// matches with the tracking `consecutiveRegistersLocation`. For Def, it will check +// the underlying `GenTree*` to see if the tree that produced it had consecutive +// registers requirement. +// +// +// Arguments: +// consecutiveRegistersLocation - The most recent location where consecutive +// registers were needed. +// +bool RefPosition::isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegistersLocation) +{ + if (needsConsecutive) + { + return true; + } + + if (refType == RefTypeDef) + { + if (treeNode->OperIsHWIntrinsic()) + { + const HWIntrinsic intrin(treeNode->AsHWIntrinsic()); + return HWIntrinsicInfo::NeedsConsecutiveRegisters(intrin.id); + } + } + else if ((refType == RefTypeUse) || (refType == RefTypeUpperVectorRestore)) + { + return consecutiveRegistersLocation == nodeLocation; + } + return false; +} +#endif + #endif #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 9ff7f83b3ab6f..4be40ebb30ea8 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -423,7 +423,13 @@ void LinearScan::checkConflictingDefUse(RefPosition* useRP) { if (!isSingleRegister(newAssignment) || !theInterval->hasInterferingUses) { - defRP->registerAssignment = newAssignment; +#ifdef TARGET_ARM64 + if (!compiler->info.compNeedsConsecutiveRegisters || + !defRP->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation)) +#endif + { + defRP->registerAssignment = newAssignment; + } } } else @@ -1808,7 +1814,9 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc #ifdef TARGET_ARM64 else if (newRefPosition->needsConsecutive) { +#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE assert(newRefPosition->refType == RefTypeUpperVectorRestore); +#endif minRegCount++; } #endif From 89d39b8c5656afdff973da53110e63071036a915 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Fri, 14 Apr 2023 23:56:33 -0700 Subject: [PATCH 03/11] Handle tracking of previously assigned register for copyReg When we have copyReg that was just restored or previously assigned to a different register, also track it as live at the location so it doesn't get allocated again for different refposition at the same location. --- src/coreclr/jit/lsra.cpp | 15 ++++----------- src/coreclr/jit/lsraarm64.cpp | 2 +- src/coreclr/jit/lsrabuild.cpp | 32 ++++++++++++++++++++++++++------ 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index d7d28e0d43a0e..e492f9bd7af22 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -5394,11 +5394,7 @@ void LinearScan::allocateRegisters() regMaskTP copyRegMask = getRegMask(copyReg, currentInterval->registerType); regMaskTP assignedRegMask = getRegMask(assignedRegister, currentInterval->registerType); - // For consecutive register, it doesn't matter what the assigned register was. - // We have just assigned it `copyRegMask` and that's the one in-use, and not the - // one that was assigned previously. - - regsInUseThisLocation |= copyRegMask; + regsInUseThisLocation |= copyRegMask | assignedRegMask; if (currentRefPosition.lastUse) { if (currentRefPosition.delayRegFree) @@ -5509,11 +5505,7 @@ void LinearScan::allocateRegisters() assignConsecutiveRegisters(¤tRefPosition, copyReg); } - // For consecutive register, it doesn't matter what the assigned register was. - // We have just assigned it `copyRegMask` and that's the one in-use, and not the - // one that was assigned previously. - - regsInUseThisLocation |= copyRegMask; + regsInUseThisLocation |= copyRegMask | assignedRegMask; } else #endif @@ -12087,7 +12079,8 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, #ifdef DEBUG #ifdef TARGET_ARM64 - if (!refPosition->needsConsecutive && refPosition->isLiveAtConsecutiveRegistersLoc(linearScan->consecutiveRegistersLocation)) + if (!refPosition->needsConsecutive && + refPosition->isLiveAtConsecutiveRegistersLoc(linearScan->consecutiveRegistersLocation)) { // If a method has consecutive registers and we are assigning to refPositions that are not part // of consecutive registers, but are live at the same location, skip the limit stress for them, diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 64ff8351bc95f..b5ace134bfb62 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1812,7 +1812,7 @@ int LinearScan::BuildConsecutiveRegistersForUse(GenTree* treeNode, GenTree* rmwN // matches with the tracking `consecutiveRegistersLocation`. For Def, it will check // the underlying `GenTree*` to see if the tree that produced it had consecutive // registers requirement. -// +// // // Arguments: // consecutiveRegistersLocation - The most recent location where consecutive diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 4be40ebb30ea8..a5dcb9c5c76d8 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1814,13 +1814,18 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc #ifdef TARGET_ARM64 else if (newRefPosition->needsConsecutive) { -#if FEATURE_PARTIAL_SIMD_CALLEE_SAVE assert(newRefPosition->refType == RefTypeUpperVectorRestore); -#endif minRegCount++; } -#endif -#endif +#endif // TARGET_ARM64 +#endif // FEATURE_PARTIAL_SIMD_CALLEE_SAVE + +#ifdef TARGET_ARM64 + if (newRefPosition->needsConsecutive) + { + consecutiveRegistersLocation = newRefPosition->nodeLocation; + } +#endif // TARGET_ARM64 if (newRefPosition->getInterval()->isSpecialPutArg) { minRegCount++; @@ -1872,8 +1877,22 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc Interval* interval = newRefPosition->getInterval(); regMaskTP oldAssignment = newRefPosition->registerAssignment; regMaskTP calleeSaveMask = calleeSaveRegs(interval->registerType); - newRefPosition->registerAssignment = - getConstrainedRegMask(oldAssignment, calleeSaveMask, minRegCountForRef); +#ifdef TARGET_ARM64 + if (!newRefPosition->needsConsecutive && + newRefPosition->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation)) + { + // If a method has consecutive registers and we are assigning to refPositions that are not part + // of consecutive registers, but are live at the same location, skip the limit stress for them, + // because there are high chances that many registers are busy for consecutive requirements and + // we do not have enough remaining for other refpositions (like operands). Likewise, skip for the + // definition node that comes after that, for which, all the registers are in "delayRegFree" state. + } + else +#endif // TARGET_ARM64 + { + newRefPosition->registerAssignment = + getConstrainedRegMask(oldAssignment, calleeSaveMask, minRegCountForRef); + } if ((newRefPosition->registerAssignment != oldAssignment) && (newRefPosition->refType == RefTypeUse) && !interval->isLocalVar) @@ -1882,6 +1901,7 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc } } } + consecutiveRegistersLocation = MinLocation; } #endif // DEBUG JITDUMP("\n"); From 27ddde6eb8082b88a223868dff49271a88952d24 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Sun, 16 Apr 2023 12:31:26 -0700 Subject: [PATCH 04/11] fix the release build errors --- src/coreclr/jit/lsrabuild.cpp | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index a5dcb9c5c76d8..60b490a5030d9 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -423,10 +423,6 @@ void LinearScan::checkConflictingDefUse(RefPosition* useRP) { if (!isSingleRegister(newAssignment) || !theInterval->hasInterferingUses) { -#ifdef TARGET_ARM64 - if (!compiler->info.compNeedsConsecutiveRegisters || - !defRP->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation)) -#endif { defRP->registerAssignment = newAssignment; } @@ -1897,7 +1893,22 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc if ((newRefPosition->registerAssignment != oldAssignment) && (newRefPosition->refType == RefTypeUse) && !interval->isLocalVar) { - checkConflictingDefUse(newRefPosition); +#ifdef TARGET_ARM64 + RefPosition* defRefPos = interval->firstRefPosition; + assert(defRefPos->treeNode != nullptr); + if (defRefPos->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation)) + { + // If a method has consecutive registers and we are assigning to use refPosition whose + // definition + // was from a location that has consecutive registers, skip the limit stress for them, + // because there are high chances that many registers are busy for consecutive requirements and + // marked as "delayRegFree" state. We do not have enough remaining for other refpositions. + } + else +#endif // TARGET_ARM64 + { + checkConflictingDefUse(newRefPosition); + } } } } From 2cb19379048c79fadbfd87678a31619cd7136b25 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Sun, 16 Apr 2023 13:22:55 -0700 Subject: [PATCH 05/11] Mark consecutive refpositions registers as busy --- src/coreclr/jit/lsra.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index e492f9bd7af22..c52b3b393d8ba 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -12253,6 +12253,30 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, // When we allocate for USE, we see that the register is busy at current location // and we end up with that candidate is no longer available. regMaskTP busyRegs = linearScan->regsBusyUntilKill | linearScan->regsInUseThisLocation; +#ifdef TARGET_ARM64 + if (needsConsecutiveRegisters && refPosition->isFirstRefPositionOfConsecutiveRegisters()) + { + // For consecutive registers refpositions, go through the subsequent refpositions + // and mark the registers assigned to them as busy (at this location). This is + // to prevent assigning such a register to the first refposition, whose consecutive + // register gets busy for subsequent refposition because of a copy involved. + // If (I3, I5 and I6) are three intervals that needs consecutive registers (v0, v1, v2) + // but if they are present in (v0, v2, v1), for I5, we will insert a copy from v2->v1 + // and need to keep both v1 and v2 live. Now, this becomes problem while assigning to + // I6 because, there v2 is also busy, because the value of I5 is getting copied from v2. + RefPosition* consecutiveRefPosition = linearScan->getNextConsecutiveRefPosition(refPosition); + while (consecutiveRefPosition != nullptr) + { + assert(consecutiveRefPosition->isIntervalRef()); + Interval* interval = consecutiveRefPosition->getInterval(); + if (interval->isActive) + { + busyRegs |= genRegMask(interval->physReg); + } + consecutiveRefPosition = linearScan->getNextConsecutiveRefPosition(consecutiveRefPosition); + } + } +#endif candidates &= ~busyRegs; #ifdef DEBUG From 092a72c5b091378d2a876cd4aef75b68121fd57d Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Sun, 16 Apr 2023 19:26:01 -0700 Subject: [PATCH 06/11] Update the comments --- src/coreclr/jit/lsra.cpp | 6 ++++++ src/coreclr/jit/lsra.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index c52b3b393d8ba..d79bcb204a069 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -12264,6 +12264,12 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, // but if they are present in (v0, v2, v1), for I5, we will insert a copy from v2->v1 // and need to keep both v1 and v2 live. Now, this becomes problem while assigning to // I6 because, there v2 is also busy, because the value of I5 is getting copied from v2. + // + // This is somewhat conservative appoach. It is possible that we first see the register + // assigned to the first RefPosition and if overlaps with registers in which subsequent + // refPositions are live, reallocate for the first RefPosition excluding the registers + // that are busy for subsequent refpositions (whats being done here), but that needs another + // round of allocation for the same RefPosition, and it doesn't fit our existing model. RefPosition* consecutiveRefPosition = linearScan->getNextConsecutiveRefPosition(refPosition); while (consecutiveRefPosition != nullptr) { diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 297a6956b9099..23fcc8226a252 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1416,7 +1416,7 @@ class LinearScan : public LinearScanInterface } return nextConsecutiveRefPositionMap; } - FORCEINLINE RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition); + RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition); #endif #ifdef DEBUG From 77ccc27d463dd960733d0dc34f1778f88dd651b6 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Mon, 17 Apr 2023 10:57:12 -0700 Subject: [PATCH 07/11] Stop stresslimiting registerAssignment and instead limit the free registers Under JitStressRegs, there are multiple ways in which consecutive registers demand cannot be met. So skip restricting the registers for `registerAssignment` of a refPosition (which are allowable candidates that can be assigned to the given refposition). Instead limit the free registers to alternate under stress mode, so we can verify the code if it can handle situation where it needs to pick from a mix of free/busy registers. --- src/coreclr/jit/lsra.cpp | 57 ++++++++--------------------------- src/coreclr/jit/lsraarm64.cpp | 8 +++++ src/coreclr/jit/lsrabuild.cpp | 17 +++++------ 3 files changed, 27 insertions(+), 55 deletions(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index d79bcb204a069..aace75ee59d01 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -445,6 +445,17 @@ regMaskTP LinearScan::getConstrainedRegMask(regMaskTP regMaskActual, regMaskTP r regMaskTP LinearScan::stressLimitRegs(RefPosition* refPosition, regMaskTP mask) { +#ifdef TARGET_ARM64 + if ((refPosition != nullptr) && refPosition->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation)) + { + // If we are assigning for the refPositions that has consecutive registers requirements, skip the + // limit stress for them, because there are high chances that many registers are busy for consecutive + // requirements and we do not have enough remaining to assign other refpositions (like operands). + // Likewise, skip for the definition node that comes after that, for which, all the registers are in + // the "delayRegFree" state. + return mask; + } +#endif if (getStressLimitRegs() != LSRA_LIMIT_NONE) { // The refPosition could be null, for example when called @@ -12078,21 +12089,7 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, } #ifdef DEBUG -#ifdef TARGET_ARM64 - if (!refPosition->needsConsecutive && - refPosition->isLiveAtConsecutiveRegistersLoc(linearScan->consecutiveRegistersLocation)) - { - // If a method has consecutive registers and we are assigning to refPositions that are not part - // of consecutive registers, but are live at the same location, skip the limit stress for them, - // because there are high chances that many registers are busy for consecutive requirements and - // we do not have enough remaining for other refpositions (like operands). Likewise, skip for the - // definition node that comes after that, for which, all the registers are in "delayRegFree" state. - } - else -#endif - { - candidates = linearScan->stressLimitRegs(refPosition, candidates); - } + candidates = linearScan->stressLimitRegs(refPosition, candidates); #endif assert(candidates != RBM_NONE); @@ -12253,36 +12250,6 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, // When we allocate for USE, we see that the register is busy at current location // and we end up with that candidate is no longer available. regMaskTP busyRegs = linearScan->regsBusyUntilKill | linearScan->regsInUseThisLocation; -#ifdef TARGET_ARM64 - if (needsConsecutiveRegisters && refPosition->isFirstRefPositionOfConsecutiveRegisters()) - { - // For consecutive registers refpositions, go through the subsequent refpositions - // and mark the registers assigned to them as busy (at this location). This is - // to prevent assigning such a register to the first refposition, whose consecutive - // register gets busy for subsequent refposition because of a copy involved. - // If (I3, I5 and I6) are three intervals that needs consecutive registers (v0, v1, v2) - // but if they are present in (v0, v2, v1), for I5, we will insert a copy from v2->v1 - // and need to keep both v1 and v2 live. Now, this becomes problem while assigning to - // I6 because, there v2 is also busy, because the value of I5 is getting copied from v2. - // - // This is somewhat conservative appoach. It is possible that we first see the register - // assigned to the first RefPosition and if overlaps with registers in which subsequent - // refPositions are live, reallocate for the first RefPosition excluding the registers - // that are busy for subsequent refpositions (whats being done here), but that needs another - // round of allocation for the same RefPosition, and it doesn't fit our existing model. - RefPosition* consecutiveRefPosition = linearScan->getNextConsecutiveRefPosition(refPosition); - while (consecutiveRefPosition != nullptr) - { - assert(consecutiveRefPosition->isIntervalRef()); - Interval* interval = consecutiveRefPosition->getInterval(); - if (interval->isActive) - { - busyRegs |= genRegMask(interval->physReg); - } - consecutiveRefPosition = linearScan->getNextConsecutiveRefPosition(consecutiveRefPosition); - } - } -#endif candidates &= ~busyRegs; #ifdef DEBUG diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index b5ace134bfb62..b0aa7a851c37a 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -417,6 +417,14 @@ regMaskTP LinearScan::getConsecutiveCandidates(regMaskTP allCandidates, assert(refPosition->isFirstRefPositionOfConsecutiveRegisters()); regMaskTP freeCandidates = allCandidates & m_AvailableRegs; +#ifdef DEBUG + if (getStressLimitRegs() != LSRA_LIMIT_NONE) + { + freeCandidates &= (RBM_V0 | RBM_V2 | RBM_V4 | RBM_V6 | RBM_V8 | RBM_V10 | RBM_V12 | RBM_V14 | RBM_V16 | + RBM_V18 | RBM_V20 | RBM_V22 | RBM_V24 | RBM_V26 | RBM_V28 | RBM_V30); + } +#endif + *busyCandidates = RBM_NONE; regMaskTP overallResult; unsigned int registersNeeded = refPosition->regCount; diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 60b490a5030d9..2a23e056c2c23 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -423,9 +423,7 @@ void LinearScan::checkConflictingDefUse(RefPosition* useRP) { if (!isSingleRegister(newAssignment) || !theInterval->hasInterferingUses) { - { - defRP->registerAssignment = newAssignment; - } + defRP->registerAssignment = newAssignment; } } else @@ -1874,12 +1872,11 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc regMaskTP oldAssignment = newRefPosition->registerAssignment; regMaskTP calleeSaveMask = calleeSaveRegs(interval->registerType); #ifdef TARGET_ARM64 - if (!newRefPosition->needsConsecutive && - newRefPosition->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation)) + if (newRefPosition->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation)) { - // If a method has consecutive registers and we are assigning to refPositions that are not part - // of consecutive registers, but are live at the same location, skip the limit stress for them, - // because there are high chances that many registers are busy for consecutive requirements and + // If we are assigning to refPositions that has consecutive registers requirements, skip the + // limit stress for them, because there are high chances that many registers are busy for + // consecutive requirements and // we do not have enough remaining for other refpositions (like operands). Likewise, skip for the // definition node that comes after that, for which, all the registers are in "delayRegFree" state. } @@ -1899,8 +1896,8 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc if (defRefPos->isLiveAtConsecutiveRegistersLoc(consecutiveRegistersLocation)) { // If a method has consecutive registers and we are assigning to use refPosition whose - // definition - // was from a location that has consecutive registers, skip the limit stress for them, + // definition was from a location that has consecutive registers, skip the limit stress for + // them, // because there are high chances that many registers are busy for consecutive requirements and // marked as "delayRegFree" state. We do not have enough remaining for other refpositions. } From f9cf985783923ba721ef57a0f85e4d152008de48 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Mon, 17 Apr 2023 11:27:55 -0700 Subject: [PATCH 08/11] Introduce updateRegsFreeBusyState() for common trackign --- src/coreclr/jit/lsra.cpp | 110 ++++++++++++++++++++++----------------- src/coreclr/jit/lsra.h | 6 +++ 2 files changed, 68 insertions(+), 48 deletions(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index aace75ee59d01..485088897dd7d 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -363,6 +363,50 @@ void LinearScan::updateSpillCost(regNumber reg, Interval* interval) #endif } +//------------------------------------------------------------------------ +// updateRegsFreeBusyState: Update various register masks and global state to track +// registers that are free and busy. +// +// Arguments: +// refPosition - +// assignedRegMask - +// copyRegMask - +// regsToFree - +// delayRegsToFree - +// +// Notes: +// compFloatingPointUsed is only required to be set if it is possible that we +// will use floating point callee-save registers. +// It is unlikely, if an internal register is the only use of floating point, +// that it will select a callee-save register. But to be safe, we restrict +// the set of candidates if compFloatingPointUsed is not already set. +void LinearScan::updateRegsFreeBusyState(RefPosition& refPosition, + regMaskTP regsBusy, + regMaskTP* regsToFree, + regMaskTP* delayRegsToFree DEBUG_ARG(Interval* interval) + DEBUG_ARG(regNumber assignedReg)) +{ + regsInUseThisLocation |= regsBusy; + if (refPosition.lastUse) + { + if (refPosition.delayRegFree) + { + INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE_DELAYED, interval, assignedReg)); + *delayRegsToFree |= regsBusy; + regsInUseNextLocation |= regsBusy; + } + else + { + INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE, interval, assignedReg)); + *regsToFree |= regsBusy; + } + } + else if (refPosition.delayRegFree) + { + regsInUseNextLocation |= regsBusy; + } +} + //------------------------------------------------------------------------ // internalFloatRegCandidates: Return the set of registers that are appropriate // for use as internal float registers. @@ -5405,30 +5449,18 @@ void LinearScan::allocateRegisters() regMaskTP copyRegMask = getRegMask(copyReg, currentInterval->registerType); regMaskTP assignedRegMask = getRegMask(assignedRegister, currentInterval->registerType); - regsInUseThisLocation |= copyRegMask | assignedRegMask; - if (currentRefPosition.lastUse) - { - if (currentRefPosition.delayRegFree) - { - INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE_DELAYED, currentInterval, - assignedRegister)); - delayRegsToFree |= copyRegMask | assignedRegMask; - regsInUseNextLocation |= copyRegMask | assignedRegMask; - } - else - { - INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE, currentInterval, - assignedRegister)); - regsToFree |= copyRegMask | assignedRegMask; - } - } - else + // For consecutive register, although it shouldn't matter what the assigned register was, + // because we have just assigned it `copyReg` and that's the one in-use, and not the + // one that was assigned previously. However, in situation where an upper-vector restore + // happened to be restored in assignedReg, we would need assignedReg to stay alive because + // we will copy the entire vector value from it to the `copyReg`. + + updateRegsFreeBusyState(currentRefPosition, assignedRegMask | copyRegMask, ®sToFree, + &delayRegsToFree DEBUG_ARG(currentInterval) + DEBUG_ARG(assignedRegister)); + if (!currentRefPosition.lastUse) { copyRegsToFree |= copyRegMask; - if (currentRefPosition.delayRegFree) - { - regsInUseNextLocation |= copyRegMask | assignedRegMask; - } } // If this is a tree temp (non-localVar) interval, we will need an explicit move. @@ -5515,36 +5547,18 @@ void LinearScan::allocateRegisters() // registers. assignConsecutiveRegisters(¤tRefPosition, copyReg); } - - regsInUseThisLocation |= copyRegMask | assignedRegMask; } - else #endif - { - regsInUseThisLocation |= copyRegMask | assignedRegMask; - } - if (currentRefPosition.lastUse) - { - if (currentRefPosition.delayRegFree) - { - INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE_DELAYED, currentInterval, - assignedRegister)); - delayRegsToFree |= copyRegMask | assignedRegMask; - regsInUseNextLocation |= copyRegMask | assignedRegMask; - } - else - { - INDEBUG(dumpLsraAllocationEvent(LSRA_EVENT_LAST_USE, currentInterval, assignedRegister)); - regsToFree |= copyRegMask | assignedRegMask; - } - } - else + // For consecutive register, although it shouldn't matter what the assigned register was, + // because we have just assigned it `copyReg` and that's the one in-use, and not the + // one that was assigned previously. However, in situation where an upper-vector restore + // happened to be restored in assignedReg, we would need assignedReg to stay alive because + // we will copy the entire vector value from it to the `copyReg`. + updateRegsFreeBusyState(currentRefPosition, assignedRegMask | copyRegMask, ®sToFree, + &delayRegsToFree DEBUG_ARG(currentInterval) DEBUG_ARG(assignedRegister)); + if (!currentRefPosition.lastUse) { copyRegsToFree |= copyRegMask; - if (currentRefPosition.delayRegFree) - { - regsInUseNextLocation |= copyRegMask | assignedRegMask; - } } // If this is a tree temp (non-localVar) interval, we will need an explicit move. diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 23fcc8226a252..a939ccee28857 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1784,6 +1784,12 @@ class LinearScan : public LinearScanInterface void clearSpillCost(regNumber reg, var_types regType); void updateSpillCost(regNumber reg, Interval* interval); + FORCEINLINE void updateRegsFreeBusyState(RefPosition& refPosition, + regMaskTP regsBusy, + regMaskTP* regsToFree, + regMaskTP* delayRegsToFree DEBUG_ARG(Interval* interval) + DEBUG_ARG(regNumber assignedReg)); + regMaskTP m_RegistersWithConstants; void clearConstantReg(regNumber reg, var_types regType) { From d2960c67f1922cc2b22ed3e249bc77dce3aac293 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Mon, 17 Apr 2023 11:32:21 -0700 Subject: [PATCH 09/11] Update comment --- src/coreclr/jit/lsraarm64.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index b0aa7a851c37a..cd2ec9205babc 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -420,6 +420,7 @@ regMaskTP LinearScan::getConsecutiveCandidates(regMaskTP allCandidates, #ifdef DEBUG if (getStressLimitRegs() != LSRA_LIMIT_NONE) { + // For stress, make only alternate registers available so we can stress the selectoin of free/busy registers. freeCandidates &= (RBM_V0 | RBM_V2 | RBM_V4 | RBM_V6 | RBM_V8 | RBM_V10 | RBM_V12 | RBM_V14 | RBM_V16 | RBM_V18 | RBM_V20 | RBM_V22 | RBM_V24 | RBM_V26 | RBM_V28 | RBM_V30); } From ab5a9c9de75cbb0573743fd7220e128406fcfc56 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Tue, 18 Apr 2023 06:43:58 -0700 Subject: [PATCH 10/11] misc. changes --- src/coreclr/jit/lsra.h | 2 +- src/coreclr/jit/lsraarm64.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index a939ccee28857..f9de491dfc199 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1416,7 +1416,7 @@ class LinearScan : public LinearScanInterface } return nextConsecutiveRefPositionMap; } - RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition); + FORCEINLINE RefPosition* getNextConsecutiveRefPosition(RefPosition* refPosition); #endif #ifdef DEBUG diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index cd2ec9205babc..1e90c1c9f0365 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1827,6 +1827,9 @@ int LinearScan::BuildConsecutiveRegistersForUse(GenTree* treeNode, GenTree* rmwN // consecutiveRegistersLocation - The most recent location where consecutive // registers were needed. // +// Returns: If the refposition is live at same location which has the requirement of +// consecutive registers. +// bool RefPosition::isLiveAtConsecutiveRegistersLoc(LsraLocation consecutiveRegistersLocation) { if (needsConsecutive) From 3e1ec418fe0a305657566a621fcc6bf0ce099a88 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Wed, 19 Apr 2023 13:56:58 -0700 Subject: [PATCH 11/11] review feedback --- src/coreclr/jit/lsra.cpp | 29 +++++++++++------------------ src/coreclr/jit/lsraarm64.cpp | 4 ++-- src/coreclr/jit/lsrabuild.cpp | 6 +++--- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 485088897dd7d..64fb68cba56c7 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -368,18 +368,13 @@ void LinearScan::updateSpillCost(regNumber reg, Interval* interval) // registers that are free and busy. // // Arguments: -// refPosition - -// assignedRegMask - -// copyRegMask - -// regsToFree - -// delayRegsToFree - +// refPosition - RefPosition for which we need to update the state. +// regsBusy - Mask of registers that are busy. +// regsToFree - Mask of registers that are set to be free. +// delayRegsToFree - Mask of registers that are set to be delayed free. +// interval - Interval of Refposition. +// assignedReg - Assigned register for this refposition. // -// Notes: -// compFloatingPointUsed is only required to be set if it is possible that we -// will use floating point callee-save registers. -// It is unlikely, if an internal register is the only use of floating point, -// that it will select a callee-save register. But to be safe, we restrict -// the set of candidates if compFloatingPointUsed is not already set. void LinearScan::updateRegsFreeBusyState(RefPosition& refPosition, regMaskTP regsBusy, regMaskTP* regsToFree, @@ -6695,8 +6690,7 @@ void LinearScan::insertUpperVectorRestore(GenTree* tree, LIR::Use treeUse; GenTree* useNode = nullptr; bool foundUse = blockRange.TryGetUse(tree, &treeUse); - assert(foundUse); - useNode = treeUse.User(); + useNode = treeUse.User(); #ifdef TARGET_ARM64 if (refPosition->needsConsecutive && useNode->OperIs(GT_FIELD_LIST)) @@ -6707,13 +6701,12 @@ void LinearScan::insertUpperVectorRestore(GenTree* tree, // will always be at GT_FIELD_LIST creation time. That way, we will restore the // upper vector just before the use of them in the intrinsic. LIR::Use fieldListUse; - if (blockRange.TryGetUse(useNode, &fieldListUse)) - { - treeUse = fieldListUse; - useNode = treeUse.User(); - } + foundUse = blockRange.TryGetUse(useNode, &fieldListUse); + treeUse = fieldListUse; + useNode = treeUse.User(); } #endif + assert(foundUse); JITDUMP("before %d.%s:\n", useNode->gtTreeID, GenTree::OpName(useNode->gtOper)); // We need to insert the restore prior to the use, not (necessarily) immediately after the lclVar. diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 1e90c1c9f0365..ebb0588b62b35 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -420,7 +420,7 @@ regMaskTP LinearScan::getConsecutiveCandidates(regMaskTP allCandidates, #ifdef DEBUG if (getStressLimitRegs() != LSRA_LIMIT_NONE) { - // For stress, make only alternate registers available so we can stress the selectoin of free/busy registers. + // For stress, make only alternate registers available so we can stress the selection of free/busy registers. freeCandidates &= (RBM_V0 | RBM_V2 | RBM_V4 | RBM_V6 | RBM_V8 | RBM_V10 | RBM_V12 | RBM_V14 | RBM_V16 | RBM_V18 | RBM_V20 | RBM_V22 | RBM_V24 | RBM_V26 | RBM_V28 | RBM_V30); } @@ -1816,7 +1816,7 @@ int LinearScan::BuildConsecutiveRegistersForUse(GenTree* treeNode, GenTree* rmwN //------------------------------------------------------------------------ // isLiveAtConsecutiveRegistersLoc: Check if the refPosition is live at the location // where consecutive registers are needed. This is used during JitStressRegs to -// not constrained the register requirements for such refpositions, because lot +// not constrain the register requirements for such refpositions, because a lot // of registers will be busy. For RefTypeUse, it will just see if the nodeLocation // matches with the tracking `consecutiveRegistersLocation`. For Def, it will check // the underlying `GenTree*` to see if the tree that produced it had consecutive diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 2a23e056c2c23..78b06dae9d646 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1897,9 +1897,9 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc { // If a method has consecutive registers and we are assigning to use refPosition whose // definition was from a location that has consecutive registers, skip the limit stress for - // them, - // because there are high chances that many registers are busy for consecutive requirements and - // marked as "delayRegFree" state. We do not have enough remaining for other refpositions. + // them, because there are high chances that many registers are busy for consecutive + // requirements and marked as "delayRegFree" state. We do not have enough remaining for other + // refpositions. } else #endif // TARGET_ARM64