From c0227b4b3e17b55681f17c4f8de4123028a44819 Mon Sep 17 00:00:00 2001
From: Andy Ayers
Date: Mon, 30 May 2022 10:18:40 -0700
Subject: [PATCH] JIT: defer some flow graph reordering until after loop recognition (#69878)

The JIT currently will aggressively reorder the flow graph before running
its loop recognition phases. When there is PGO data, this sometimes perturbs
the block order so that loops are no longer recognized, and we miss out on
some loop optimizations.

This change defers most block reordering until after the JIT has gone
through the optimization phases. There is still a limited form of flow
cleanup done early on.

There is also a compensating change in loop recognition in one place where
it was relying on adjacent blocks being combined.

Fixes #67318.
---
 src/coreclr/jit/compiler.cpp  | 11 ++++-
 src/coreclr/jit/compiler.h    |  3 +-
 src/coreclr/jit/compphases.h  |  1 +
 src/coreclr/jit/fgopt.cpp     | 37 +++++++++++----
 src/coreclr/jit/flowgraph.cpp |  2 +-
 src/coreclr/jit/optimizer.cpp | 89 +++++++++++++++++++++++++++++++++--
 6 files changed, 124 insertions(+), 19 deletions(-)
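For background, a toy model (entirely invented names, not JIT code) of why
block order matters here: a lexically driven loop recognizer in this style
only accepts a back edge whose target precedes it in the block list, so
profile-driven reordering ahead of loop recognition can hide a loop.

// Toy sketch only -- all types and functions below are invented for
// illustration and do not correspond to actual JIT data structures.
#include <cstdio>
#include <vector>

struct Block
{
    int id;         // block identity
    int backEdgeTo; // -1, or the id of the block this block jumps back to
};

// Returns true if 'b' closes a recognizable loop in the given block order:
// the back-edge target must appear at or before 'b' lexically.
static bool isRecognizedLoop(const std::vector<Block>& order, const Block& b)
{
    if (b.backEdgeTo < 0)
    {
        return false;
    }

    int posTop    = -1;
    int posBottom = -1;
    for (int i = 0; i < (int)order.size(); i++)
    {
        if (order[i].id == b.backEdgeTo)
        {
            posTop = i;
        }
        if (order[i].id == b.id)
        {
            posBottom = i;
        }
    }
    return (posTop >= 0) && (posTop <= posBottom);
}

int main()
{
    // Block 2 jumps back to block 1: a natural loop over blocks 1..2.
    std::vector<Block> original  = {{0, -1}, {1, -1}, {2, 1}};
    // The same blocks after a hypothetical profile-driven reordering
    // hoisted block 2 ahead of block 1.
    std::vector<Block> reordered = {{0, -1}, {2, 1}, {1, -1}};

    printf("original : %s\n", isRecognizedLoop(original, original[2]) ? "loop" : "no loop");
    printf("reordered: %s\n", isRecognizedLoop(reordered, reordered[1]) ? "loop" : "no loop");
    return 0;
}

Deferring the layout pass, as the diffs below do, keeps the order stable
until after loops have been found.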
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 00dfb10b8072b..5702221c02294 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -4796,9 +4796,9 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
     //
     DoPhase(this, PHASE_INVERT_LOOPS, &Compiler::optInvertLoops);
 
-    // Optimize block order
+    // Run some flow graph optimizations (but don't reorder)
     //
-    DoPhase(this, PHASE_OPTIMIZE_LAYOUT, &Compiler::optOptimizeLayout);
+    DoPhase(this, PHASE_OPTIMIZE_FLOW, &Compiler::optOptimizeFlow);
 
     // Compute reachability sets and dominators.
     //
@@ -4988,6 +4988,13 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
     // Insert GC Polls
     DoPhase(this, PHASE_INSERT_GC_POLLS, &Compiler::fgInsertGCPolls);
 
+    // Optimize block order
+    //
+    if (opts.OptimizationEnabled())
+    {
+        DoPhase(this, PHASE_OPTIMIZE_LAYOUT, &Compiler::optOptimizeLayout);
+    }
+
     // Determine start of cold region if we are hot/cold splitting
     //
     DoPhase(this, PHASE_DETERMINE_FIRST_COLD_BLOCK, &Compiler::fgDetermineFirstColdBlock);
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 99b5906a58c72..34bb62a0daafe 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -5223,7 +5223,7 @@ class Compiler
     void fgComputeCalledCount(weight_t returnWeight);
     void fgComputeEdgeWeights();
 
-    bool fgReorderBlocks();
+    bool fgReorderBlocks(bool useProfile);
 
    PhaseStatus fgDetermineFirstColdBlock();
 
@@ -5998,6 +5998,7 @@ class Compiler
 public:
     PhaseStatus optInvertLoops();    // Invert loops so they're entered at top and tested at bottom.
+    PhaseStatus optOptimizeFlow();   // Simplify flow graph and do tail duplication
     PhaseStatus optOptimizeLayout(); // Optimize the BasicBlock layout of the method
     PhaseStatus optSetBlockWeights();
     PhaseStatus optFindLoopsPhase(); // Finds loops and records them in the loop table
diff --git a/src/coreclr/jit/compphases.h b/src/coreclr/jit/compphases.h
index 3b1907e86fde0..cb0d284ff56ad 100644
--- a/src/coreclr/jit/compphases.h
+++ b/src/coreclr/jit/compphases.h
@@ -55,6 +55,7 @@ CompPhaseNameMacro(PHASE_CREATE_FUNCLETS, "Create EH funclets",
 #endif // FEATURE_EH_FUNCLETS
 CompPhaseNameMacro(PHASE_MERGE_THROWS, "Merge throw blocks", "MRGTHROW", false, -1, false)
 CompPhaseNameMacro(PHASE_INVERT_LOOPS, "Invert loops", "LOOP-INV", false, -1, false)
+CompPhaseNameMacro(PHASE_OPTIMIZE_FLOW, "Optimize control flow", "OPT-FLOW", false, -1, false)
 CompPhaseNameMacro(PHASE_OPTIMIZE_LAYOUT, "Optimize layout", "LAYOUT", false, -1, false)
 CompPhaseNameMacro(PHASE_COMPUTE_REACHABILITY, "Compute blocks reachability", "BL_REACH", false, -1, false)
 CompPhaseNameMacro(PHASE_SET_BLOCK_WEIGHTS, "Set block weights", "BL-WEIGHTS", false, -1, false)
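The fgopt.cpp change that follows threads the new useProfile flag through
fgReorderBlocks. A rough standalone sketch (invented names and a placeholder
heuristic, not JIT code) of the gating pattern it introduces:

// Invented sketch: models how profile-driven layout decisions are only
// considered when the caller opts in, while rarely-run blocks can still
// be moved during early flow cleanup.
#include <cstdio>

struct ToyBlock
{
    bool   isRunRarely;      // block is known to be cold
    bool   hasProfileWeight; // block has PGO data
    double weight;           // PGO weight, if any
};

static bool shouldConsiderReordering(const ToyBlock& block, bool useProfile)
{
    // Mirrors 'bool reorderBlock = useProfile;' in the hunk below: without
    // the flag, we start from "don't reorder" rather than "do reorder".
    bool reorderBlock = useProfile;

    if (block.isRunRarely)
    {
        // Rare blocks may be moved to the end of the region either way.
        reorderBlock = true;
    }
    else if (useProfile && block.hasProfileWeight)
    {
        // Profile-driven decisions happen only when the caller opted in.
        reorderBlock = (block.weight > 0.0); // placeholder heuristic
    }

    return reorderBlock;
}

int main()
{
    ToyBlock hot = {false, true, 100.0};
    printf("early cleanup: %d\n", shouldConsiderReordering(hot, false)); // 0
    printf("late layout  : %d\n", shouldConsiderReordering(hot, true));  // 1
    return 0;
}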
diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp
index 6caaf338ffea9..228e01cbf5ad4 100644
--- a/src/coreclr/jit/fgopt.cpp
+++ b/src/coreclr/jit/fgopt.cpp
@@ -4639,14 +4639,22 @@ bool Compiler::fgExpandRarelyRunBlocks()
 
 //-----------------------------------------------------------------------------
 // fgReorderBlocks: reorder blocks to favor frequent fall through paths,
-// move rare blocks to the end of the method/eh region, and move
-// funclets to the ends of methods.
+//   move rare blocks to the end of the method/eh region, and move
+//   funclets to the ends of methods.
+//
+// Arguments:
+//   useProfile - if true, use profile data (if available) to more aggressively
+//     reorder the blocks.
 //
 // Returns:
-// True if anything got reordered. Reordering blocks may require changing
-// IR to reverse branch conditions.
+//   True if anything got reordered. Reordering blocks may require changing
+//   IR to reverse branch conditions.
 //
-bool Compiler::fgReorderBlocks()
+// Notes:
+//   We currently allow profile-driven switch opts even when useProfile is false,
+//   as they are unlikely to lead to reordering.
+//
+bool Compiler::fgReorderBlocks(bool useProfile)
 {
     noway_assert(opts.compDbgCode == false);
 
@@ -4726,7 +4734,7 @@ bool Compiler::fgReorderBlocks()
             continue;
         }
 
-        bool        reorderBlock   = true; // This is set to false if we decide not to reorder 'block'
+        bool        reorderBlock   = useProfile;
         bool        isRare         = block->isRunRarely();
         BasicBlock* bDest          = nullptr;
         bool        forwardBranch  = false;
@@ -4752,7 +4760,8 @@ bool Compiler::fgReorderBlocks()
 
             weight_t profHotWeight = -1;
 
-            if (bPrev->hasProfileWeight() && block->hasProfileWeight() && ((bDest == nullptr) || bDest->hasProfileWeight()))
+            if (useProfile && bPrev->hasProfileWeight() && block->hasProfileWeight() &&
+                ((bDest == nullptr) || bDest->hasProfileWeight()))
             {
                 //
                 // All blocks have profile information
@@ -5682,13 +5691,21 @@ bool Compiler::fgReorderBlocks()
             if (bPrev->bbJumpKind == BBJ_COND)
             {
                 /* Reverse the bPrev jump condition */
-                Statement* condTestStmt = bPrev->lastStmt();
+                Statement* const condTestStmt = bPrev->lastStmt();
+                GenTree* const   condTest     = condTestStmt->GetRootNode();
 
-                GenTree* condTest = condTestStmt->GetRootNode();
                 noway_assert(condTest->gtOper == GT_JTRUE);
-
                 condTest->AsOp()->gtOp1 = gtReverseCond(condTest->AsOp()->gtOp1);
 
+                // may need to rethread
+                //
+                if (fgStmtListThreaded)
+                {
+                    JITDUMP("Rethreading " FMT_STMT "\n", condTestStmt->GetID());
+                    gtSetStmtInfo(condTestStmt);
+                    fgSetStmtSeq(condTestStmt);
+                }
+
                 if (bStart2 == nullptr)
                 {
                     /* Set the new jump dest for bPrev to the rarely run or uncommon block(s) */
diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index a512f918efb74..0f70f0d619df5 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -175,7 +175,7 @@ PhaseStatus Compiler::fgInsertGCPolls()
     if (createdPollBlocks)
     {
         noway_assert(opts.OptimizationEnabled());
-        fgReorderBlocks();
+        fgReorderBlocks(/* useProfile */ false);
         constexpr bool computePreds = true;
         constexpr bool computeDoms  = false;
         fgUpdateChangedFlowGraph(computePreds, computeDoms);
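The optimizer.cpp hunk below splices newNext's statement list onto block's
and hands the combined list to newNext. For illustration, a self-contained
toy version of that splice (an invented Stmt type, not the JIT's Statement
class), using the same convention the real lists follow: a list head's prev
link points at the tail.

// Simplified standalone model of the statement-list splice in the hunk
// below; mirrors stmtLast1->SetNextStmt(stmtList2) and friends.
#include <cassert>
#include <cstdio>

struct Stmt
{
    int   id;
    Stmt* next;
    Stmt* prev; // head->prev points to the tail (circular prev link)
};

// Append the statements of 'list2' onto 'list1'; returns the new head.
static Stmt* spliceStmtLists(Stmt* list1, Stmt* list2)
{
    if (list1 == nullptr)
        return list2;
    if (list2 == nullptr)
        return list1;

    Stmt* last1 = list1->prev; // tail of the first list
    Stmt* last2 = list2->prev; // tail of the second list

    last1->next = list2; // link first list's tail to second list's head
    list2->prev = last1; // second list's head now points back into the first
    list1->prev = last2; // overall head's prev is the overall tail

    return list1;
}

int main()
{
    Stmt a{1, nullptr, nullptr}, b{2, nullptr, nullptr};
    Stmt c{3, nullptr, nullptr}, d{4, nullptr, nullptr};
    a.next = &b; a.prev = &b; b.prev = &a; // list1: a -> b
    c.next = &d; c.prev = &d; d.prev = &c; // list2: c -> d

    Stmt* merged = spliceStmtLists(&a, &c);
    for (Stmt* s = merged; s != nullptr; s = s->next)
        printf("%d ", s->id); // prints: 1 2 3 4
    printf("\n");
    assert(merged->prev == &d); // head->prev is the tail
    return 0;
}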
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 6006907e6933e..e58f1c908bcd9 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2330,6 +2330,51 @@ class LoopSearch
             // prevent assertions from complaining about it.
             block->bbFlags |= BBF_KEEP_BBJ_ALWAYS;
         }
+
+        // If block is newNext's only predecessor, move the IR from block to newNext,
+        // but keep the now-empty block around.
+        //
+        // We move the IR because loop recognition has a very limited search capability and
+        // won't walk from one block's statements to another, even if the blocks form
+        // a linear chain. So this IR move enhances counted loop recognition.
+        //
+        // The logic here is essentially echoing fgCompactBlocks... but we don't call
+        // that here because we don't want to delete block and do the necessary updates
+        // to all the other data in flight, and we'd also prefer that newNext be the
+        // survivor, not block.
+        //
+        if ((newNext->bbRefs == 1) && comp->fgCanCompactBlocks(block, newNext))
+        {
+            JITDUMP("Moving stmts from " FMT_BB " to " FMT_BB "\n", block->bbNum, newNext->bbNum);
+            Statement* stmtList1 = block->firstStmt();
+            Statement* stmtList2 = newNext->firstStmt();
+
+            // Is there anything to move?
+            //
+            if (stmtList1 != nullptr)
+            {
+                // Append newNext's stmts to block's stmts.
+                //
+                if (stmtList2 != nullptr)
+                {
+                    Statement* stmtLast1 = block->lastStmt();
+                    Statement* stmtLast2 = newNext->lastStmt();
+
+                    stmtLast1->SetNextStmt(stmtList2);
+                    stmtList2->SetPrevStmt(stmtLast1);
+                    stmtList1->SetPrevStmt(stmtLast2);
+                }
+
+                // Move block's stmts to newNext
+                //
+                newNext->bbStmtList = stmtList1;
+                block->bbStmtList   = nullptr;
+
+                // Update newNext's block flags
+                //
+                newNext->bbFlags |= (block->bbFlags & BBF_COMPACT_UPD);
+            }
+        }
     }
 
     // Make sure we don't leave around a goto-next unless it's marked KEEP_BBJ_ALWAYS.
@@ -4797,22 +4842,56 @@ PhaseStatus Compiler::optInvertLoops()
     return madeChanges ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING;
 }
 
+//-----------------------------------------------------------------------------
+// optOptimizeFlow: simplify flow graph
+//
+// Returns:
+//   suitable phase status
+//
+// Notes:
+//   Does not do profile-based reordering, to try and ensure that we
+//   recognize and represent as many loops as possible.
+//
+PhaseStatus Compiler::optOptimizeFlow()
+{
+    noway_assert(opts.OptimizationEnabled());
+    noway_assert(fgModified == false);
+
+    bool madeChanges = false;
+
+    madeChanges |= fgUpdateFlowGraph(/* allowTailDuplication */ true);
+    madeChanges |= fgReorderBlocks(/* useProfile */ false);
+    madeChanges |= fgUpdateFlowGraph();
+
+    // fgReorderBlocks can cause IR changes even if it does not modify
+    // the flow graph. It calls gtPrepareCost which can cause operand swapping.
+    // Work around this for now.
+    //
+    // Note phase status only impacts dumping and checking done post-phase,
+    // it has no impact on a release build.
+    //
+    madeChanges = true;
+
+    return madeChanges ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING;
+}
+
 //-----------------------------------------------------------------------------
 // optOptimizeLayout: reorder blocks to reduce cost of control flow
 //
 // Returns:
 //   suitable phase status
 //
+// Notes:
+//   Reorders using profile data, if available.
+//
 PhaseStatus Compiler::optOptimizeLayout()
 {
     noway_assert(opts.OptimizationEnabled());
-    noway_assert(fgModified == false);
 
-    bool       madeChanges          = false;
-    const bool allowTailDuplication = true;
+    bool madeChanges = false;
 
-    madeChanges |= fgUpdateFlowGraph(allowTailDuplication);
-    madeChanges |= fgReorderBlocks();
+    madeChanges |= fgUpdateFlowGraph(/* allowTailDuplication */ false);
+    madeChanges |= fgReorderBlocks(/* useProfile */ true);
     madeChanges |= fgUpdateFlowGraph();
 
     // fgReorderBlocks can cause IR changes even if it does not modify