-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VPlan] Model branch cond to enter scalar epilogue in VPlan. #92651
Changes from 23 commits
53e8e56
3a4ecfc
10e747a
4dfba5d
fe03d51
2372421
ab78e2a
446b313
5c6e07e
0a34761
a5c1fd6
045f051
2e4bb36
189aa60
014c393
c983acf
b165cca
852d28d
b506722
3236f1d
a3c7747
8e7f8dd
0e74f48
bd99d7b
887a56d
8cab6d2
2ba349c
fd9975c
dee166d
d5cd98d
ab6c49a
7867bfa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2970,34 +2970,6 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { | |
LoopScalarPreHeader = | ||
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, | ||
nullptr, Twine(Prefix) + "scalar.ph"); | ||
|
||
// Set up the middle block terminator. Two cases: | ||
// 1) If we know that we must execute the scalar epilogue, retain the existing | ||
// unconditional branch from the middle block to the scalar preheader. In that | ||
// case, there's no edge from the middle block to exit blocks and thus no | ||
// need to update the immediate dominator of the exit blocks. | ||
if (Cost->requiresScalarEpilogue(VF.isVector())) { | ||
assert( | ||
LoopMiddleBlock->getSingleSuccessor() == LoopScalarPreHeader && | ||
" middle block should have the scalar preheader as single successor"); | ||
return; | ||
} | ||
|
||
// 2) Otherwise, we must have a single unique exit block (due to how we | ||
// implement the multiple exit case). In this case, set up a conditional | ||
// branch from the middle block to the loop scalar preheader, and the | ||
// exit block. completeLoopSkeleton will update the condition to use an | ||
// iteration check, if required to decide whether to execute the remainder. | ||
BranchInst *BrInst = | ||
BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); | ||
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); | ||
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); | ||
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); | ||
|
||
// Update dominator for loop exit. During skeleton creation, only the vector | ||
// pre-header and the middle block are created. The vector loop is entirely | ||
// created during VPlan exection. | ||
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); | ||
} | ||
|
||
PHINode *InnerLoopVectorizer::createInductionResumeValue( | ||
|
@@ -3094,51 +3066,6 @@ void InnerLoopVectorizer::createInductionResumeValues( | |
} | ||
} | ||
|
||
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { | ||
// The trip counts should be cached by now. | ||
Value *Count = getTripCount(); | ||
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); | ||
|
||
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); | ||
|
||
// Add a check in the middle block to see if we have completed | ||
// all of the iterations in the first vector loop. Three cases: | ||
// 1) If we require a scalar epilogue, there is no conditional branch as | ||
// we unconditionally branch to the scalar preheader. Do nothing. | ||
// 2) If (N - N%VF) == N, then we *don't* need to run the remainder. | ||
// Thus if tail is to be folded, we know we don't need to run the | ||
// remainder and we can use the previous value for the condition (true). | ||
// 3) Otherwise, construct a runtime check. | ||
if (!Cost->requiresScalarEpilogue(VF.isVector()) && | ||
!Cost->foldTailByMasking()) { | ||
// Here we use the same DebugLoc as the scalar loop latch terminator instead | ||
// of the corresponding compare because they may have ended up with | ||
// different line numbers and we want to avoid awkward line stepping while | ||
// debugging. Eg. if the compare has got a line number inside the loop. | ||
// TODO: At the moment, CreateICmpEQ will simplify conditions with constant | ||
// operands. Perform simplification directly on VPlan once the branch is | ||
// modeled there. | ||
IRBuilder<> B(LoopMiddleBlock->getTerminator()); | ||
B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc()); | ||
Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n"); | ||
BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator()); | ||
BI.setCondition(CmpN); | ||
if (hasBranchWeightMD(*ScalarLatchTerm)) { | ||
// Assume that `Count % VectorTripCount` is equally distributed. | ||
unsigned TripCount = UF * VF.getKnownMinValue(); | ||
assert(TripCount > 0 && "trip count should not be zero"); | ||
const uint32_t Weights[] = {1, TripCount - 1}; | ||
setBranchWeights(BI, Weights, /*IsExpected=*/false); | ||
} | ||
} | ||
|
||
#ifdef EXPENSIVE_CHECKS | ||
assert(DT->verify(DominatorTree::VerificationLevel::Fast)); | ||
#endif | ||
|
||
return LoopVectorPreHeader; | ||
} | ||
|
||
std::pair<BasicBlock *, Value *> | ||
InnerLoopVectorizer::createVectorizedLoopSkeleton( | ||
const SCEV2ValueTy &ExpandedSCEVs) { | ||
|
@@ -3161,17 +3088,18 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton( | |
| [ ]_| <-- vector loop (created during VPlan execution). | ||
| | | ||
| v | ||
\ -[ ] <--- middle-block. | ||
\ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to | ||
| | successors created during VPlan execution) | ||
\/ | | ||
/\ v | ||
| ->[ ] <--- new preheader. | ||
| ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). | ||
| | | ||
(opt) v <-- edge from middle to exit iff epilogue is not required. | ||
| [ ] \ | ||
| [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). | ||
\ | | ||
\ v | ||
>[ ] <-- exit block(s). | ||
>[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) | ||
... | ||
*/ | ||
|
||
|
@@ -3198,7 +3126,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton( | |
// Emit phis for the new starting index of the scalar loop. | ||
createInductionResumeValues(ExpandedSCEVs); | ||
|
||
return {completeLoopSkeleton(), nullptr}; | ||
return {LoopVectorPreHeader, nullptr}; | ||
} | ||
|
||
// Fix up external users of the induction variable. At this point, we are | ||
|
@@ -7467,6 +7395,9 @@ LoopVectorizationPlanner::executePlan( | |
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = | ||
ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs | ||
: State.ExpandedSCEVs); | ||
#ifdef EXPENSIVE_CHECKS | ||
assert(DT->verify(DominatorTree::VerificationLevel::Fast)); | ||
#endif | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good to add some explanation somewhere what IR blocks are wrapped when building VPlan(s), what IR blocks are built when creating the Skeleton and need VPlan to be wrap them now, and what blocks are created during VPlan execution. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Extended comment in createVectorizaedLoopSkeleton, moved the replacement to VPlan::execute and added comment there as well. |
||
// Only use noalias metadata when using memory checks guaranteeing no overlap | ||
// across all iterations. | ||
|
@@ -7547,6 +7478,18 @@ LoopVectorizationPlanner::executePlan( | |
|
||
ILV.printDebugTracesAtEnd(); | ||
|
||
// 4. Adjust branch weight of the branch in the middle block. | ||
auto *MiddleTerm = | ||
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); | ||
if (MiddleTerm->isConditional() && | ||
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thoughts independent of this patch:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where should we record it? Ideally on the branch-on-cond VPInst, but that would require supporting metadata on VPinst at least. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Assigning weights deserves further discussion; the indicator could be recorded in VPlan itself - whether weights should be assigned to all new branches (possibly adjusting weights of existing branches). |
||
// Assume that `Count % VectorTripCount` is equally distributed. | ||
unsigned TripCount = State.UF * State.VF.getKnownMinValue(); | ||
assert(TripCount > 0 && "trip count should not be zero"); | ||
const uint32_t Weights[] = {1, TripCount - 1}; | ||
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); | ||
} | ||
|
||
return {State.ExpandedSCEVs, ReductionResumeValues}; | ||
} | ||
|
||
|
@@ -7603,7 +7546,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( | |
// inductions in the epilogue loop are created before executing the plan for | ||
// the epilogue loop. | ||
|
||
return {completeLoopSkeleton(), nullptr}; | ||
return {LoopVectorPreHeader, nullptr}; | ||
} | ||
|
||
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { | ||
|
@@ -7792,7 +7735,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( | |
{VecEpilogueIterationCountCheck, | ||
EPI.VectorTripCount} /* AdditionalBypass */); | ||
|
||
return {completeLoopSkeleton(), EPResumeVal}; | ||
return {LoopVectorPreHeader, EPResumeVal}; | ||
} | ||
|
||
BasicBlock * | ||
|
@@ -7837,7 +7780,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( | |
setBranchWeights(BI, Weights, /*IsExpected=*/false); | ||
} | ||
ReplaceInstWithInst(Insert->getTerminator(), &BI); | ||
|
||
LoopBypassBlocks.push_back(Insert); | ||
return Insert; | ||
} | ||
|
@@ -8542,9 +8484,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { | |
// modified; a basic block for the vector pre-header, followed by a region for | ||
// the vector loop, followed by the middle basic block. The skeleton vector | ||
// loop region contains a header and latch basic blocks. | ||
|
||
bool RequiresScalarEpilogueCheck = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. More consistent to set There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changing the logic leads to a few regressions, better to change this as follow up? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree to have a follow up patch deal with regressions, but does retaining the original condition alone leads to any? I.e., There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now after updating all tests, there appears to be some changes only in the VPlan-native outer-loop vectorization path. Need to take a closer look. |
||
LoopVectorizationPlanner::getDecisionAndClampRange( | ||
[this](ElementCount VF) { | ||
return !CM.requiresScalarEpilogue(VF.isVector()); | ||
}, | ||
Range); | ||
VPlanPtr Plan = VPlan::createInitialVPlan( | ||
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), | ||
*PSE.getSE(), OrigLoop->getLoopPreheader()); | ||
*PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(), | ||
OrigLoop); | ||
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); | ||
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); | ||
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); | ||
|
@@ -8792,7 +8742,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { | |
// Create new empty VPlan | ||
auto Plan = VPlan::createInitialVPlan( | ||
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), | ||
*PSE.getSE(), OrigLoop->getLoopPreheader()); | ||
*PSE.getSE(), true, false, OrigLoop); | ||
|
||
// Build hierarchical CFG | ||
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); | ||
|
@@ -9001,6 +8951,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( | |
} | ||
} | ||
Builder.setInsertPoint(&*LatchVPBB->begin()); | ||
VPBasicBlock *MiddleVPBB = | ||
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor()); | ||
VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could (some of) this be pushed independently, earlier, to clarify change in insertion points, if any? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in 8299bfa, thanks |
||
for (VPRecipeBase &R : | ||
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { | ||
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); | ||
|
@@ -9109,8 +9062,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( | |
// also modeled in VPlan. | ||
auto *FinalReductionResult = new VPInstruction( | ||
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); | ||
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor()) | ||
->appendRecipe(FinalReductionResult); | ||
FinalReductionResult->insertBefore(*MiddleVPBB, IP); | ||
IP = std::next(FinalReductionResult->getIterator()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this std::next needed - having inserted FinalReductionResult before IP implies that the latter already follows the former? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not needed in the current version, it is sufficient to use the original IP |
||
OrigExitingVPV->replaceUsesWithIf( | ||
FinalReductionResult, | ||
[](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); }); | ||
|
@@ -10155,6 +10108,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { | |
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); | ||
} | ||
|
||
assert(DT->verify(DominatorTree::VerificationLevel::Fast) && | ||
"DT not preserved correctly"); | ||
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, | ||
DT, true, &ExpandedSCEVs); | ||
++LoopsEpilogueVectorized; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better move the expensive assert to the end of createVectorizedLoopSkeleton()'s than here after calling it? This appeared at the end of completeLoopSkeleton(), called at the end of all three create[Epilogue]VectorizedLoopSkeleton()'s.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we would have to move it to all implementations of
createVectorizedLoopSkeleton
. Left here for now, as this keeps it at a single place and makes sure it is called fro all all different ILV specializations.