From 5813e1aaa6b5ae5997ed0f8a2960acf7016c847f Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 13 Jun 2024 14:05:42 -0400 Subject: [PATCH 01/40] Implement k-opt for non-EH methods --- src/coreclr/jit/compiler.h | 2 + src/coreclr/jit/fgopt.cpp | 201 ++++++++++++++++++++++++++++++++++++- 2 files changed, 199 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 75fa2d9e8e2a4..96cb20ce92813 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6140,6 +6140,8 @@ class Compiler void fgDoReversePostOrderLayout(); void fgMoveColdBlocks(); + void fgSearchImprovedLayout(); + template void fgMoveBackwardJumpsToSuccessors(); diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index de2b6a8f37c93..d523395072491 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -3421,6 +3421,8 @@ bool Compiler::fgReorderBlocks(bool useProfile) fgDoReversePostOrderLayout(); fgMoveColdBlocks(); + fgSearchImprovedLayout(); + // Renumber blocks to facilitate LSRA's order of block visitation // TODO: Consider removing this, and using traversal order in lSRA // @@ -4677,8 +4679,12 @@ void Compiler::fgDoReversePostOrderLayout() { BasicBlock* const block = dfsTree->GetPostOrder(i); BasicBlock* const blockToMove = dfsTree->GetPostOrder(i - 1); - fgUnlinkBlock(blockToMove); - fgInsertBBafter(block, blockToMove); + + if (!block->NextIs(blockToMove)) + { + fgUnlinkBlock(blockToMove); + fgInsertBBafter(block, blockToMove); + } } // The RPO established a good base layout, but in some cases, it might produce a subpar layout for loops. @@ -4761,8 +4767,11 @@ void Compiler::fgDoReversePostOrderLayout() continue; } - fgUnlinkBlock(blockToMove); - fgInsertBBafter(block, blockToMove); + if (!block->NextIs(blockToMove)) + { + fgUnlinkBlock(blockToMove); + fgInsertBBafter(block, blockToMove); + } } } @@ -5106,6 +5115,190 @@ void Compiler::fgMoveColdBlocks() ehUpdateTryLasts(getTryLast, setTryLast); } +void Compiler::fgSearchImprovedLayout() +{ +#ifdef DEBUG + if (verbose) + { + printf("*************** In fgSearchImprovedLayout()\n"); + + printf("\nInitial BasicBlocks"); + fgDispBasicBlocks(verboseTrees); + printf("\n"); + } +#endif // DEBUG + + // TODO: Enable for methods with EH + // + if (compHndBBtabCount != 0) + { + return; + } + + BlockSet visitedBlocks(BlockSetOps::MakeEmpty(this)); + BasicBlock* startBlock = nullptr; + weight_t layoutScore = 0.0; + weight_t minLayoutScore = 0.0; + + for (BasicBlock* const block : Blocks(fgFirstBB, fgLastBBInMainFunction())) + { + BlockSetOps::AddElemD(this, visitedBlocks, block->bbNum); + BasicBlock* heaviestSucc = nullptr; + weight_t maxEdgeCost = 0.0; + weight_t fallthroughEdgeCost = 0.0; + + for (FlowEdge* const succEdge : block->SuccEdges(this)) + { + BasicBlock* const succ = succEdge->getDestinationBlock(); + const bool isForwardJump = !BlockSetOps::IsMember(this, visitedBlocks, succ->bbNum); + + if (isForwardJump) + { + const weight_t edgeCost = succEdge->getLikelyWeight(); + + if (block->NextIs(succ)) + { + assert(fallthroughEdgeCost == 0.0); + fallthroughEdgeCost = edgeCost; + } + + if (edgeCost > maxEdgeCost) + { + maxEdgeCost = edgeCost; + heaviestSucc = succ; + } + } + } + + if (block->NumSucc() > 0) + { + layoutScore += (block->bbWeight - fallthroughEdgeCost); + minLayoutScore += (block->bbWeight - maxEdgeCost); + + if ((startBlock == nullptr) && !block->NextIs(heaviestSucc)) + { + startBlock = block; + } + } + } + + JITDUMP("Layout score: %f, Min score: %f", layoutScore, minLayoutScore); + + if ((startBlock == nullptr) || Compiler::fgProfileWeightsEqual(layoutScore, minLayoutScore, 0.001)) + { + JITDUMP("\nSkipping reordering"); + return; + } + + BasicBlock** blockVector = new BasicBlock*[fgBBNumMax]; + BasicBlock** tempBlockVector = new BasicBlock*[fgBBNumMax]; + unsigned blockCount = 0; + + for (BasicBlock* const block : Blocks(startBlock, fgLastBBInMainFunction())) + { + if (block->isRunRarely()) + { + break; + } + + blockVector[blockCount] = block; + tempBlockVector[blockCount++] = block; + } + + if (blockCount < 3) + { + JITDUMP("\nNot enough interesting blocks; skipping reordering"); + return; + } + + JITDUMP("\nInteresting blocks: [" FMT_BB ", " FMT_BB "]", startBlock->bbNum, blockVector[blockCount - 1]->bbNum); + + auto evaluateCost = [](BasicBlock* const block, BasicBlock* const next) -> weight_t { + assert(block != nullptr); + + if ((block->NumSucc() == 0) || (next == nullptr)) + { + return 0.0; + } + + const weight_t cost = block->bbWeight; + + for (FlowEdge* const edge : block->SuccEdges()) + { + if (edge->getDestinationBlock() == next) + { + return cost - edge->getLikelyWeight(); + } + } + + return cost; + }; + + BasicBlock* const finalBlock = blockVector[blockCount - 1]->Next(); + bool improvedLayout = true; + + for (unsigned numIter = 0; improvedLayout && (numIter < 20); numIter++) + { + JITDUMP("\n\n--Iteration %d--", (numIter + 1)); + improvedLayout = false; + BasicBlock* const exitBlock = blockVector[blockCount - 1]; + + for (unsigned i = 1; i < (blockCount - 1); i++) + { + BasicBlock* const blockI = blockVector[i]; + BasicBlock* const blockIPrev = blockVector[i - 1]; + + for (unsigned j = i + 1; j < blockCount; j++) + { + // Evaluate the current partition at (i,j) + // S1: 0 ~ i-1 + // S2: i ~ j-1 + // S3: j ~ exit + + BasicBlock* const blockJ = blockVector[j]; + BasicBlock* const blockJPrev = blockVector[j - 1]; + + const weight_t oldScore = evaluateCost(blockIPrev, blockI) + evaluateCost(blockJPrev, blockJ) + evaluateCost(exitBlock, finalBlock); + const weight_t newScore = evaluateCost(blockIPrev, blockJ) + evaluateCost(exitBlock, blockI) + evaluateCost(blockJPrev, finalBlock); + + if ((newScore < oldScore) && !Compiler::fgProfileWeightsEqual(oldScore, newScore, 0.001)) + { + JITDUMP("\nFound better layout by partitioning at i=%d, j=%d", i, j); + JITDUMP("\nOld score: %f, New score: %f", oldScore, newScore); + const unsigned part1Size = i; + const unsigned part2Size = j - i; + const unsigned part3Size = blockCount - j; + + memcpy(tempBlockVector, blockVector, sizeof(BasicBlock*) * part1Size); + memcpy(tempBlockVector + part1Size, blockVector + part1Size + part2Size, sizeof(BasicBlock*) * part3Size); + memcpy(tempBlockVector + part1Size + part3Size, blockVector + part1Size, sizeof(BasicBlock*) * part2Size); + + std::swap(blockVector, tempBlockVector); + improvedLayout = true; + break; + } + } + + if (improvedLayout) + { + break; + } + } + } + + for (unsigned i = 1; i < blockCount; i++) + { + BasicBlock* const block = blockVector[i - 1]; + BasicBlock* const next = blockVector[i]; + + if (!block->NextIs(next)) + { + fgUnlinkBlock(next); + fgInsertBBafter(block, next); + } + } +} + //------------------------------------------------------------- // ehUpdateTryLasts: Iterates EH descriptors, updating each try region's // end block as determined by getTryLast. From 3f4e749eb5acb1fa9a03f5d10d44ac580d564760 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 13 Jun 2024 17:33:56 -0400 Subject: [PATCH 02/40] Enable for methods with EH --- src/coreclr/jit/fgopt.cpp | 78 +++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index d523395072491..8c433c5693dd4 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4651,6 +4651,20 @@ void Compiler::fgMoveBackwardJumpsToSuccessors() } } +struct CallFinallyPair +{ + BasicBlock* callFinally; + BasicBlock* callFinallyRet; + + // Constructor provided so we can call ArrayStack::Emplace + // + CallFinallyPair(BasicBlock* first, BasicBlock* second) + : callFinally(first) + , callFinallyRet(second) + { + } +}; + //----------------------------------------------------------------------------- // fgDoReversePostOrderLayout: Reorder blocks using a greedy RPO traversal. // @@ -4715,20 +4729,6 @@ void Compiler::fgDoReversePostOrderLayout() // The RPO will break up call-finally pairs, so save them before re-ordering // - struct CallFinallyPair - { - BasicBlock* callFinally; - BasicBlock* callFinallyRet; - - // Constructor provided so we can call ArrayStack::Emplace - // - CallFinallyPair(BasicBlock* first, BasicBlock* second) - : callFinally(first) - , callFinallyRet(second) - { - } - }; - ArrayStack callFinallyPairs(getAllocator()); for (EHblkDsc* const HBtab : EHClauses(this)) @@ -4776,6 +4776,7 @@ void Compiler::fgDoReversePostOrderLayout() } // Fix up call-finally pairs + // (We assume the RPO will mess these up, so don't bother checking if the blocks are still adjacent) // for (int i = 0; i < callFinallyPairs.Height(); i++) { @@ -5128,13 +5129,6 @@ void Compiler::fgSearchImprovedLayout() } #endif // DEBUG - // TODO: Enable for methods with EH - // - if (compHndBBtabCount != 0) - { - return; - } - BlockSet visitedBlocks(BlockSetOps::MakeEmpty(this)); BasicBlock* startBlock = nullptr; weight_t layoutScore = 0.0; @@ -5142,14 +5136,27 @@ void Compiler::fgSearchImprovedLayout() for (BasicBlock* const block : Blocks(fgFirstBB, fgLastBBInMainFunction())) { + if (block->hasHndIndex()) + { + continue; + } + BlockSetOps::AddElemD(this, visitedBlocks, block->bbNum); BasicBlock* heaviestSucc = nullptr; weight_t maxEdgeCost = 0.0; weight_t fallthroughEdgeCost = 0.0; + bool hasNonEHSuccs = false; for (FlowEdge* const succEdge : block->SuccEdges(this)) { BasicBlock* const succ = succEdge->getDestinationBlock(); + + if (succ->hasHndIndex()) + { + continue; + } + + hasNonEHSuccs = true; const bool isForwardJump = !BlockSetOps::IsMember(this, visitedBlocks, succ->bbNum); if (isForwardJump) @@ -5170,12 +5177,12 @@ void Compiler::fgSearchImprovedLayout() } } - if (block->NumSucc() > 0) + if (hasNonEHSuccs) { layoutScore += (block->bbWeight - fallthroughEdgeCost); minLayoutScore += (block->bbWeight - maxEdgeCost); - if ((startBlock == nullptr) && !block->NextIs(heaviestSucc)) + if ((startBlock == nullptr) && !block->NextIs(heaviestSucc) && !block->hasTryIndex()) { startBlock = block; } @@ -5190,12 +5197,18 @@ void Compiler::fgSearchImprovedLayout() return; } + ArrayStack callFinallyPairs(getAllocator()); BasicBlock** blockVector = new BasicBlock*[fgBBNumMax]; BasicBlock** tempBlockVector = new BasicBlock*[fgBBNumMax]; unsigned blockCount = 0; for (BasicBlock* const block : Blocks(startBlock, fgLastBBInMainFunction())) { + if (block->hasTryIndex() || block->hasHndIndex()) + { + continue; + } + if (block->isRunRarely()) { break; @@ -5203,6 +5216,11 @@ void Compiler::fgSearchImprovedLayout() blockVector[blockCount] = block; tempBlockVector[blockCount++] = block; + + if (block->isBBCallFinallyPair()) + { + callFinallyPairs.Emplace(block, block->Next()); + } } if (blockCount < 3) @@ -5211,7 +5229,7 @@ void Compiler::fgSearchImprovedLayout() return; } - JITDUMP("\nInteresting blocks: [" FMT_BB ", " FMT_BB "]", startBlock->bbNum, blockVector[blockCount - 1]->bbNum); + JITDUMP("\nInteresting blocks: [" FMT_BB "-" FMT_BB "]", startBlock->bbNum, blockVector[blockCount - 1]->bbNum); auto evaluateCost = [](BasicBlock* const block, BasicBlock* const next) -> weight_t { assert(block != nullptr); @@ -5290,6 +5308,7 @@ void Compiler::fgSearchImprovedLayout() { BasicBlock* const block = blockVector[i - 1]; BasicBlock* const next = blockVector[i]; + assert(BasicBlock::sameEHRegion(block, next)); if (!block->NextIs(next)) { @@ -5297,6 +5316,17 @@ void Compiler::fgSearchImprovedLayout() fgInsertBBafter(block, next); } } + + for (int i = 0; i < callFinallyPairs.Height(); i++) + { + const CallFinallyPair& pair = callFinallyPairs.BottomRef(i); + + if (!pair.callFinally->NextIs(pair.callFinallyRet)) + { + fgUnlinkBlock(pair.callFinallyRet); + fgInsertBBafter(pair.callFinally, pair.callFinallyRet); + } + } } //------------------------------------------------------------- From 699714b9347b91a3d267a85475ba025844d2317b Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 13 Jun 2024 18:30:21 -0400 Subject: [PATCH 03/40] Add comments --- src/coreclr/jit/fgopt.cpp | 48 ++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 8c433c5693dd4..dcfddd4e56657 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5116,6 +5116,14 @@ void Compiler::fgMoveColdBlocks() ehUpdateTryLasts(getTryLast, setTryLast); } +//----------------------------------------------------------------------------- +// fgSearchImprovedLayout: Try to improve upon RPO-based layout with the 3-opt method: +// - Identify a subset of "interesting" (not cold, has branches, etc.) blocks to move +// - Partition this set into three segments: S1 - S2 - S3 +// - Evaluate cost of swapped layout: S1 - S3 - S2 +// - If the cost improves, keep this layout +// - Repeat for a certain number of iterations, or until no improvements are made +// void Compiler::fgSearchImprovedLayout() { #ifdef DEBUG @@ -5134,15 +5142,21 @@ void Compiler::fgSearchImprovedLayout() weight_t layoutScore = 0.0; weight_t minLayoutScore = 0.0; + // Evaluate initial minimum layout costs + // (Minimum layout cost may not actually be possible to achieve, + // as we cannot always fall into a block's hottest successor) + // for (BasicBlock* const block : Blocks(fgFirstBB, fgLastBBInMainFunction())) { + // Ignore EH blocks + // if (block->hasHndIndex()) { continue; } BlockSetOps::AddElemD(this, visitedBlocks, block->bbNum); - BasicBlock* heaviestSucc = nullptr; + BasicBlock* hottestSucc = nullptr; weight_t maxEdgeCost = 0.0; weight_t fallthroughEdgeCost = 0.0; bool hasNonEHSuccs = false; @@ -5151,6 +5165,8 @@ void Compiler::fgSearchImprovedLayout() { BasicBlock* const succ = succEdge->getDestinationBlock(); + // Ignore EH successors + // if (succ->hasHndIndex()) { continue; @@ -5165,24 +5181,28 @@ void Compiler::fgSearchImprovedLayout() if (block->NextIs(succ)) { - assert(fallthroughEdgeCost == 0.0); fallthroughEdgeCost = edgeCost; } if (edgeCost > maxEdgeCost) { maxEdgeCost = edgeCost; - heaviestSucc = succ; + hottestSucc = succ; } } } + // Only factor costs of non-EH edges into layout cost calculations + // if (hasNonEHSuccs) { layoutScore += (block->bbWeight - fallthroughEdgeCost); minLayoutScore += (block->bbWeight - maxEdgeCost); - if ((startBlock == nullptr) && !block->NextIs(heaviestSucc) && !block->hasTryIndex()) + // We have found our first "interesting" block in the main method body, + // as this block does not fall into its hottest successor + // + if ((startBlock == nullptr) && !block->NextIs(hottestSucc) && !block->hasTryIndex()) { startBlock = block; } @@ -5197,18 +5217,26 @@ void Compiler::fgSearchImprovedLayout() return; } - ArrayStack callFinallyPairs(getAllocator()); + // blockVector will contain the set of interesting blocks to move. + // tempBlockVector will assist with moving segments of interesting blocks. + // BasicBlock** blockVector = new BasicBlock*[fgBBNumMax]; BasicBlock** tempBlockVector = new BasicBlock*[fgBBNumMax]; + ArrayStack callFinallyPairs(getAllocator()); unsigned blockCount = 0; for (BasicBlock* const block : Blocks(startBlock, fgLastBBInMainFunction())) { + // Don't consider blocks in EH regions + // if (block->hasTryIndex() || block->hasHndIndex()) { continue; } + // We've reached the cold section of the main method body; + // nothing is interesting at this point + // if (block->isRunRarely()) { break; @@ -5252,6 +5280,10 @@ void Compiler::fgSearchImprovedLayout() return cost; }; + // finalBlock is the first block after the set of interesting blocks. + // We will need to keep track of it to compute the cost of creating/breaking fallthrough into it. + // finalBlock can be null. + // BasicBlock* const finalBlock = blockVector[blockCount - 1]->Next(); bool improvedLayout = true; @@ -5271,7 +5303,7 @@ void Compiler::fgSearchImprovedLayout() // Evaluate the current partition at (i,j) // S1: 0 ~ i-1 // S2: i ~ j-1 - // S3: j ~ exit + // S3: j ~ exitBlock BasicBlock* const blockJ = blockVector[j]; BasicBlock* const blockJPrev = blockVector[j - 1]; @@ -5304,6 +5336,8 @@ void Compiler::fgSearchImprovedLayout() } } + // Rearrange blocks + // for (unsigned i = 1; i < blockCount; i++) { BasicBlock* const block = blockVector[i - 1]; @@ -5317,6 +5351,8 @@ void Compiler::fgSearchImprovedLayout() } } + // Fix call-finally pairs + // for (int i = 0; i < callFinallyPairs.Height(); i++) { const CallFinallyPair& pair = callFinallyPairs.BottomRef(i); From 0849f768d184bf6a5cad5c967c24f7482e8f9bd9 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 13 Jun 2024 18:31:22 -0400 Subject: [PATCH 04/40] Style --- src/coreclr/jit/fgopt.cpp | 52 +++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index dcfddd4e56657..bd68e86db26ec 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5137,10 +5137,10 @@ void Compiler::fgSearchImprovedLayout() } #endif // DEBUG - BlockSet visitedBlocks(BlockSetOps::MakeEmpty(this)); - BasicBlock* startBlock = nullptr; - weight_t layoutScore = 0.0; - weight_t minLayoutScore = 0.0; + BlockSet visitedBlocks(BlockSetOps::MakeEmpty(this)); + BasicBlock* startBlock = nullptr; + weight_t layoutScore = 0.0; + weight_t minLayoutScore = 0.0; // Evaluate initial minimum layout costs // (Minimum layout cost may not actually be possible to achieve, @@ -5156,10 +5156,10 @@ void Compiler::fgSearchImprovedLayout() } BlockSetOps::AddElemD(this, visitedBlocks, block->bbNum); - BasicBlock* hottestSucc = nullptr; - weight_t maxEdgeCost = 0.0; - weight_t fallthroughEdgeCost = 0.0; - bool hasNonEHSuccs = false; + BasicBlock* hottestSucc = nullptr; + weight_t maxEdgeCost = 0.0; + weight_t fallthroughEdgeCost = 0.0; + bool hasNonEHSuccs = false; for (FlowEdge* const succEdge : block->SuccEdges(this)) { @@ -5172,7 +5172,7 @@ void Compiler::fgSearchImprovedLayout() continue; } - hasNonEHSuccs = true; + hasNonEHSuccs = true; const bool isForwardJump = !BlockSetOps::IsMember(this, visitedBlocks, succ->bbNum); if (isForwardJump) @@ -5220,10 +5220,10 @@ void Compiler::fgSearchImprovedLayout() // blockVector will contain the set of interesting blocks to move. // tempBlockVector will assist with moving segments of interesting blocks. // - BasicBlock** blockVector = new BasicBlock*[fgBBNumMax]; - BasicBlock** tempBlockVector = new BasicBlock*[fgBBNumMax]; + BasicBlock** blockVector = new BasicBlock*[fgBBNumMax]; + BasicBlock** tempBlockVector = new BasicBlock*[fgBBNumMax]; ArrayStack callFinallyPairs(getAllocator()); - unsigned blockCount = 0; + unsigned blockCount = 0; for (BasicBlock* const block : Blocks(startBlock, fgLastBBInMainFunction())) { @@ -5242,7 +5242,7 @@ void Compiler::fgSearchImprovedLayout() break; } - blockVector[blockCount] = block; + blockVector[blockCount] = block; tempBlockVector[blockCount++] = block; if (block->isBBCallFinallyPair()) @@ -5268,7 +5268,7 @@ void Compiler::fgSearchImprovedLayout() } const weight_t cost = block->bbWeight; - + for (FlowEdge* const edge : block->SuccEdges()) { if (edge->getDestinationBlock() == next) @@ -5284,18 +5284,18 @@ void Compiler::fgSearchImprovedLayout() // We will need to keep track of it to compute the cost of creating/breaking fallthrough into it. // finalBlock can be null. // - BasicBlock* const finalBlock = blockVector[blockCount - 1]->Next(); - bool improvedLayout = true; + BasicBlock* const finalBlock = blockVector[blockCount - 1]->Next(); + bool improvedLayout = true; for (unsigned numIter = 0; improvedLayout && (numIter < 20); numIter++) { JITDUMP("\n\n--Iteration %d--", (numIter + 1)); - improvedLayout = false; + improvedLayout = false; BasicBlock* const exitBlock = blockVector[blockCount - 1]; for (unsigned i = 1; i < (blockCount - 1); i++) { - BasicBlock* const blockI = blockVector[i]; + BasicBlock* const blockI = blockVector[i]; BasicBlock* const blockIPrev = blockVector[i - 1]; for (unsigned j = i + 1; j < blockCount; j++) @@ -5305,11 +5305,13 @@ void Compiler::fgSearchImprovedLayout() // S2: i ~ j-1 // S3: j ~ exitBlock - BasicBlock* const blockJ = blockVector[j]; + BasicBlock* const blockJ = blockVector[j]; BasicBlock* const blockJPrev = blockVector[j - 1]; - const weight_t oldScore = evaluateCost(blockIPrev, blockI) + evaluateCost(blockJPrev, blockJ) + evaluateCost(exitBlock, finalBlock); - const weight_t newScore = evaluateCost(blockIPrev, blockJ) + evaluateCost(exitBlock, blockI) + evaluateCost(blockJPrev, finalBlock); + const weight_t oldScore = evaluateCost(blockIPrev, blockI) + evaluateCost(blockJPrev, blockJ) + + evaluateCost(exitBlock, finalBlock); + const weight_t newScore = evaluateCost(blockIPrev, blockJ) + evaluateCost(exitBlock, blockI) + + evaluateCost(blockJPrev, finalBlock); if ((newScore < oldScore) && !Compiler::fgProfileWeightsEqual(oldScore, newScore, 0.001)) { @@ -5320,8 +5322,10 @@ void Compiler::fgSearchImprovedLayout() const unsigned part3Size = blockCount - j; memcpy(tempBlockVector, blockVector, sizeof(BasicBlock*) * part1Size); - memcpy(tempBlockVector + part1Size, blockVector + part1Size + part2Size, sizeof(BasicBlock*) * part3Size); - memcpy(tempBlockVector + part1Size + part3Size, blockVector + part1Size, sizeof(BasicBlock*) * part2Size); + memcpy(tempBlockVector + part1Size, blockVector + part1Size + part2Size, + sizeof(BasicBlock*) * part3Size); + memcpy(tempBlockVector + part1Size + part3Size, blockVector + part1Size, + sizeof(BasicBlock*) * part2Size); std::swap(blockVector, tempBlockVector); improvedLayout = true; @@ -5341,7 +5345,7 @@ void Compiler::fgSearchImprovedLayout() for (unsigned i = 1; i < blockCount; i++) { BasicBlock* const block = blockVector[i - 1]; - BasicBlock* const next = blockVector[i]; + BasicBlock* const next = blockVector[i]; assert(BasicBlock::sameEHRegion(block, next)); if (!block->NextIs(next)) From 044b33222ec1cc6807082618cae7ab858f24745e Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 13 Jun 2024 18:55:15 -0400 Subject: [PATCH 05/40] Only one iteration for now; try to reduce TP cost --- src/coreclr/jit/fgopt.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index bd68e86db26ec..c4b280cbaf076 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5148,9 +5148,9 @@ void Compiler::fgSearchImprovedLayout() // for (BasicBlock* const block : Blocks(fgFirstBB, fgLastBBInMainFunction())) { - // Ignore EH blocks + // Ignore EH and cold blocks -- neither should contribute to the layout cost of the main method body // - if (block->hasHndIndex()) + if (block->hasHndIndex() || block->isRunRarely()) { continue; } @@ -5284,10 +5284,11 @@ void Compiler::fgSearchImprovedLayout() // We will need to keep track of it to compute the cost of creating/breaking fallthrough into it. // finalBlock can be null. // - BasicBlock* const finalBlock = blockVector[blockCount - 1]->Next(); - bool improvedLayout = true; + BasicBlock* const finalBlock = blockVector[blockCount - 1]->Next(); + bool improvedLayout = true; + constexpr unsigned maxIter = 1; // TODO: Reconsider? - for (unsigned numIter = 0; improvedLayout && (numIter < 20); numIter++) + for (unsigned numIter = 0; improvedLayout && (numIter < maxIter); numIter++) { JITDUMP("\n\n--Iteration %d--", (numIter + 1)); improvedLayout = false; From f0e7f6bd0e420dc2da326920fcfafab9dc66341a Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 13 Jun 2024 19:38:23 -0400 Subject: [PATCH 06/40] Remove initial layout cost calculation --- src/coreclr/jit/fgopt.cpp | 62 +++++++++++---------------------------- 1 file changed, 17 insertions(+), 45 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index c4b280cbaf076..2e47c18db4bfb 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5138,80 +5138,52 @@ void Compiler::fgSearchImprovedLayout() #endif // DEBUG BlockSet visitedBlocks(BlockSetOps::MakeEmpty(this)); - BasicBlock* startBlock = nullptr; - weight_t layoutScore = 0.0; - weight_t minLayoutScore = 0.0; + BasicBlock* startBlock = nullptr; - // Evaluate initial minimum layout costs - // (Minimum layout cost may not actually be possible to achieve, - // as we cannot always fall into a block's hottest successor) + // Find the first block that doesn't fall into its hottest successor. + // This will be our first "interesting" block. // for (BasicBlock* const block : Blocks(fgFirstBB, fgLastBBInMainFunction())) { - // Ignore EH and cold blocks -- neither should contribute to the layout cost of the main method body - // - if (block->hasHndIndex() || block->isRunRarely()) + // Ignore try/handler blocks + if (block->hasTryIndex() || block->hasHndIndex()) { continue; } BlockSetOps::AddElemD(this, visitedBlocks, block->bbNum); - BasicBlock* hottestSucc = nullptr; - weight_t maxEdgeCost = 0.0; - weight_t fallthroughEdgeCost = 0.0; - bool hasNonEHSuccs = false; + FlowEdge* hottestSuccEdge = nullptr; for (FlowEdge* const succEdge : block->SuccEdges(this)) { BasicBlock* const succ = succEdge->getDestinationBlock(); - // Ignore EH successors + // Ignore try/handler successors // - if (succ->hasHndIndex()) + if (succ->hasTryIndex() || succ->hasHndIndex()) { continue; } - hasNonEHSuccs = true; const bool isForwardJump = !BlockSetOps::IsMember(this, visitedBlocks, succ->bbNum); - if (isForwardJump) + if (isForwardJump && + ((hottestSuccEdge == nullptr) || (succEdge->getLikelihood() > hottestSuccEdge->getLikelihood()))) { - const weight_t edgeCost = succEdge->getLikelyWeight(); - - if (block->NextIs(succ)) - { - fallthroughEdgeCost = edgeCost; - } - - if (edgeCost > maxEdgeCost) - { - maxEdgeCost = edgeCost; - hottestSucc = succ; - } + hottestSuccEdge = succEdge; } } - // Only factor costs of non-EH edges into layout cost calculations - // - if (hasNonEHSuccs) + if ((hottestSuccEdge != nullptr) && !block->NextIs(hottestSuccEdge->getDestinationBlock())) { - layoutScore += (block->bbWeight - fallthroughEdgeCost); - minLayoutScore += (block->bbWeight - maxEdgeCost); - - // We have found our first "interesting" block in the main method body, - // as this block does not fall into its hottest successor + // We found the first "interesting" block that doesn't fall into its hottest successor // - if ((startBlock == nullptr) && !block->NextIs(hottestSucc) && !block->hasTryIndex()) - { - startBlock = block; - } + startBlock = block; + break; } } - JITDUMP("Layout score: %f, Min score: %f", layoutScore, minLayoutScore); - - if ((startBlock == nullptr) || Compiler::fgProfileWeightsEqual(layoutScore, minLayoutScore, 0.001)) + if (startBlock == nullptr) { JITDUMP("\nSkipping reordering"); return; @@ -5286,7 +5258,7 @@ void Compiler::fgSearchImprovedLayout() // BasicBlock* const finalBlock = blockVector[blockCount - 1]->Next(); bool improvedLayout = true; - constexpr unsigned maxIter = 1; // TODO: Reconsider? + constexpr unsigned maxIter = 5; // TODO: Reconsider? for (unsigned numIter = 0; improvedLayout && (numIter < maxIter); numIter++) { From 41efb9b8552110f72abf08dd16617a2f80c4d0d2 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 13 Jun 2024 19:54:51 -0400 Subject: [PATCH 07/40] Conditionalize EH checks --- src/coreclr/jit/arraystack.h | 4 ++-- src/coreclr/jit/compiler.h | 1 + src/coreclr/jit/fgopt.cpp | 25 ++++++++++++++++++------- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/arraystack.h b/src/coreclr/jit/arraystack.h index 5d8a697a3820d..269a23402519c 100644 --- a/src/coreclr/jit/arraystack.h +++ b/src/coreclr/jit/arraystack.h @@ -7,9 +7,9 @@ template class ArrayStack { - static const int builtinSize = 8; - public: + static constexpr int builtinSize = 8; + explicit ArrayStack(CompAllocator alloc, int initialCapacity = builtinSize) : m_alloc(alloc) { diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index c9648aec35c7e..35256b591cfe7 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6147,6 +6147,7 @@ class Compiler void fgDoReversePostOrderLayout(); void fgMoveColdBlocks(); + template void fgSearchImprovedLayout(); template diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 2e47c18db4bfb..b8edc113e421c 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -3421,7 +3421,14 @@ bool Compiler::fgReorderBlocks(bool useProfile) fgDoReversePostOrderLayout(); fgMoveColdBlocks(); - fgSearchImprovedLayout(); + if (compHndBBtabCount != 0) + { + fgSearchImprovedLayout(); + } + else + { + fgSearchImprovedLayout(); + } // Renumber blocks to facilitate LSRA's order of block visitation // TODO: Consider removing this, and using traversal order in lSRA @@ -5124,6 +5131,10 @@ void Compiler::fgMoveColdBlocks() // - If the cost improves, keep this layout // - Repeat for a certain number of iterations, or until no improvements are made // +// Template parameters: +// hasEH - If true, method has EH regions, so check that we don't try to move blocks in different regions +// +template void Compiler::fgSearchImprovedLayout() { #ifdef DEBUG @@ -5146,7 +5157,7 @@ void Compiler::fgSearchImprovedLayout() for (BasicBlock* const block : Blocks(fgFirstBB, fgLastBBInMainFunction())) { // Ignore try/handler blocks - if (block->hasTryIndex() || block->hasHndIndex()) + if (hasEH && (block->hasTryIndex() || block->hasHndIndex())) { continue; } @@ -5160,7 +5171,7 @@ void Compiler::fgSearchImprovedLayout() // Ignore try/handler successors // - if (succ->hasTryIndex() || succ->hasHndIndex()) + if (hasEH && (succ->hasTryIndex() || succ->hasHndIndex())) { continue; } @@ -5194,8 +5205,8 @@ void Compiler::fgSearchImprovedLayout() // BasicBlock** blockVector = new BasicBlock*[fgBBNumMax]; BasicBlock** tempBlockVector = new BasicBlock*[fgBBNumMax]; - ArrayStack callFinallyPairs(getAllocator()); - unsigned blockCount = 0; + unsigned blockCount = 0; + ArrayStack callFinallyPairs(getAllocator(), hasEH ? ArrayStack::builtinSize : 0); for (BasicBlock* const block : Blocks(startBlock, fgLastBBInMainFunction())) { @@ -5217,7 +5228,7 @@ void Compiler::fgSearchImprovedLayout() blockVector[blockCount] = block; tempBlockVector[blockCount++] = block; - if (block->isBBCallFinallyPair()) + if (hasEH && block->isBBCallFinallyPair()) { callFinallyPairs.Emplace(block, block->Next()); } @@ -5330,7 +5341,7 @@ void Compiler::fgSearchImprovedLayout() // Fix call-finally pairs // - for (int i = 0; i < callFinallyPairs.Height(); i++) + for (int i = 0; hasEH && (i < callFinallyPairs.Height()); i++) { const CallFinallyPair& pair = callFinallyPairs.BottomRef(i); From 94c22721335db9ee980e6856599af601255618e9 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Wed, 18 Sep 2024 13:40:25 -0400 Subject: [PATCH 08/40] Add priority queue impl --- src/coreclr/jit/CMakeLists.txt | 1 + src/coreclr/jit/compiler.h | 1 + src/coreclr/jit/priorityqueue.h | 113 ++++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+) create mode 100644 src/coreclr/jit/priorityqueue.h diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt index 68155021d8eb7..a4e83a35b8c9d 100644 --- a/src/coreclr/jit/CMakeLists.txt +++ b/src/coreclr/jit/CMakeLists.txt @@ -361,6 +361,7 @@ set( JIT_HEADERS opcode.h optcse.h phase.h + priorityqueue.h promotion.h rangecheck.h rationalize.h diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index c0b67e8996db2..e3421324dafe6 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -38,6 +38,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "cycletimer.h" #include "blockset.h" #include "arraystack.h" +#include "priorityqueue.h" #include "hashbv.h" #include "jitexpandarray.h" #include "valuenum.h" diff --git a/src/coreclr/jit/priorityqueue.h b/src/coreclr/jit/priorityqueue.h new file mode 100644 index 0000000000000..e9f5df2f55d61 --- /dev/null +++ b/src/coreclr/jit/priorityqueue.h @@ -0,0 +1,113 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +// +// PriorityQueue: A priority queue implemented as a max-heap + +template +class PriorityQueue +{ +private: + jitstd::vector data; + Compare comp; + + // Returns true only if each element has a higher priority than its children. + bool verifyMaxHeap() const + { + for (size_t i = 0; i < data.size(); i++) + { + const size_t leftChild = (2 * i) + 1; + const size_t rightChild = leftChild + 1; + + if (rightChild < data.size()) + { + if (comp(data[i], data[leftChild]) || comp(data[i], data[rightChild])) + { + return false; + } + } + else if (leftChild < data.size()) + { + if (comp(data[i], data[leftChild])) + { + return false; + } + } + } + + return true; + } + +public: + PriorityQueue(const jitstd::allocator& allocator, const Compare& compare) + : data(allocator) + , comp(compare) + { + } + + const T& top() const + { + assert(!data.empty()); + return data.front(); + } + + bool empty() const + { + return data.empty(); + } + + size_t size() const + { + return data.size(); + } + + // Insert new element at the back of the vector. + // Then, while the new element has a higher priority than its parent, swap them. + void push(const T& value) + { + size_t i = data.size(); + data.push_back(value); + + auto getParent = [](const size_t i) -> size_t { + return (i - 1) / 2; + }; + + for (size_t parent = getParent(i); (i != 0) && comp(data[parent], data[i]); i = parent, parent = getParent(i)) + { + std::swap(data[parent], data[i]); + } + + assert(verifyMaxHeap()); + } + + // Swap the root and last element to facilitate removing the former. + // Then, while the new root element has a lower priority than its children, + // swap the element with its highest-priority child. + void pop() + { + assert(!data.empty()); + std::swap(data.front(), data.back()); + data.pop_back(); + + auto getLeftChild = [](const size_t i) -> size_t { + return (2 * i) + 1; + }; + + for (size_t i = 0, maxChild = getLeftChild(i); maxChild < data.size(); i = maxChild, maxChild = getLeftChild(i)) + { + const size_t rightChild = maxChild + 1; + maxChild = ((rightChild < data.size()) && comp(data[maxChild], data[rightChild])) ? rightChild : maxChild; + + if (comp(data[i], data[maxChild])) + { + std::swap(data[i], data[maxChild]); + } + else + { + break; + } + } + + assert(verifyMaxHeap()); + } +}; From 0081d9b1fb4d46486285ea92d0dedfb4dada84de Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Fri, 20 Sep 2024 09:42:28 -0400 Subject: [PATCH 09/40] wip --- src/coreclr/jit/fgopt.cpp | 530 ++++++++++++++++++++++---------- src/coreclr/jit/priorityqueue.h | 21 +- 2 files changed, 369 insertions(+), 182 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 030cc688dc5f3..b1bac29c6d54f 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5128,236 +5128,426 @@ void Compiler::fgMoveColdBlocks() ehUpdateTryLasts(getTryLast, setTryLast); } -//----------------------------------------------------------------------------- -// fgSearchImprovedLayout: Try to improve upon RPO-based layout with the 3-opt method: -// - Identify a subset of "interesting" (not cold, has branches, etc.) blocks to move -// - Partition this set into three segments: S1 - S2 - S3 -// - Evaluate cost of swapped layout: S1 - S3 - S2 -// - If the cost improves, keep this layout -// - Repeat for a certain number of iterations, or until no improvements are made -// -// Template parameters: -// hasEH - If true, method has EH regions, so check that we don't try to move blocks in different regions -// template void Compiler::fgSearchImprovedLayout() { -#ifdef DEBUG - if (verbose) + if (hasEH) { - printf("*************** In fgSearchImprovedLayout()\n"); - - printf("\nInitial BasicBlocks"); - fgDispBasicBlocks(verboseTrees); - printf("\n"); + return; } -#endif // DEBUG - BlockSet visitedBlocks(BlockSetOps::MakeEmpty(this)); - BasicBlock* startBlock = nullptr; + auto edgeWeightComp = [](const FlowEdge* left, const FlowEdge* right) -> bool { + return left->getLikelyWeight() < right->getLikelyWeight(); + }; - // Find the first block that doesn't fall into its hottest successor. - // This will be our first "interesting" block. - // - for (BasicBlock* const block : Blocks(fgFirstBB, fgLastBBInMainFunction())) + unsigned numHotBlocks = 0; + unsigned maxBBNum = 0; + PriorityQueue cutPoints(getAllocator(CMK_FlowEdge), edgeWeightComp); + BasicBlock* finalBlock = fgLastBBInMainFunction(); + + for (BasicBlock* const block : Blocks(fgFirstBB, finalBlock)) { - // Ignore try/handler blocks - if (hasEH && (block->hasTryIndex() || block->hasHndIndex())) + if (block->isBBWeightCold(this)) { - continue; + finalBlock = block->Prev(); + break; } - BlockSetOps::AddElemD(this, visitedBlocks, block->bbNum); - FlowEdge* hottestSuccEdge = nullptr; + maxBBNum = max(maxBBNum, block->bbNum); + numHotBlocks++; - for (FlowEdge* const succEdge : block->SuccEdges(this)) + if (block->KindIs(BBJ_ALWAYS) && !block->JumpsToNext()) { - BasicBlock* const succ = succEdge->getDestinationBlock(); - - // Ignore try/handler successors - // - if (hasEH && (succ->hasTryIndex() || succ->hasHndIndex())) - { - continue; - } - - const bool isForwardJump = !BlockSetOps::IsMember(this, visitedBlocks, succ->bbNum); - - if (isForwardJump && - ((hottestSuccEdge == nullptr) || (succEdge->getLikelihood() > hottestSuccEdge->getLikelihood()))) - { - hottestSuccEdge = succEdge; - } + cutPoints.Push(block->GetTargetEdge()); } + // else if (block->KindIs(BBJ_COND)) + // { + // FlowEdge* likelyEdge; + // FlowEdge* unlikelyEdge; - if ((hottestSuccEdge != nullptr) && !block->NextIs(hottestSuccEdge->getDestinationBlock())) - { - // We found the first "interesting" block that doesn't fall into its hottest successor - // - startBlock = block; - break; - } + // if (block->GetTrueEdge()->getLikelihood() > 0.5) + // { + // likelyEdge = block->GetTrueEdge(); + // unlikelyEdge = block->GetFalseEdge(); + // } + // else + // { + // likelyEdge = block->GetFalseEdge(); + // unlikelyEdge = block->GetTrueEdge(); + // } + + // if (!block->NextIs(likelyEdge->getDestinationBlock())) + // { + // if ((likelyEdge->getLikelihood() > 0.5) || !block->NextIs()) + // } + // } } - if (startBlock == nullptr) + if (numHotBlocks < 3) { - JITDUMP("\nSkipping reordering"); + JITDUMP("Not enough hot blocks; skipping reordering\n"); return; } - // blockVector will contain the set of interesting blocks to move. - // tempBlockVector will assist with moving segments of interesting blocks. - // - BasicBlock** blockVector = new BasicBlock*[fgBBNumMax]; - BasicBlock** tempBlockVector = new BasicBlock*[fgBBNumMax]; - unsigned blockCount = 0; - ArrayStack callFinallyPairs(getAllocator(), hasEH ? ArrayStack::builtinSize : 0); + assert(finalBlock != nullptr); + const bool hasExitBlock = !finalBlock->IsLast(); + unsigned* const ordinals = new unsigned[maxBBNum + 1]; + BasicBlock** blockOrder = new BasicBlock*[numHotBlocks + (int)hasExitBlock]; + BasicBlock** tempOrder = new BasicBlock*[numHotBlocks + (int)hasExitBlock]; + unsigned position = 0; - for (BasicBlock* const block : Blocks(startBlock, fgLastBBInMainFunction())) + for (BasicBlock* const block : Blocks(fgFirstBB, finalBlock)) { - // Don't consider blocks in EH regions - // - if (block->hasTryIndex() || block->hasHndIndex()) - { - continue; - } - - // We've reached the cold section of the main method body; - // nothing is interesting at this point - // - if (block->isRunRarely()) - { - break; - } - - blockVector[blockCount] = block; - tempBlockVector[blockCount++] = block; - - if (hasEH && block->isBBCallFinallyPair()) - { - callFinallyPairs.Emplace(block, block->Next()); - } + blockOrder[position] = tempOrder[position] = block; + ordinals[block->bbNum] = position++; } - if (blockCount < 3) + assert(position == numHotBlocks); + + if (hasExitBlock) { - JITDUMP("\nNot enough interesting blocks; skipping reordering"); - return; + blockOrder[numHotBlocks] = tempOrder[numHotBlocks] = finalBlock->Next(); } - JITDUMP("\nInteresting blocks: [" FMT_BB "-" FMT_BB "]", startBlock->bbNum, blockVector[blockCount - 1]->bbNum); + const unsigned lastHotIndex = numHotBlocks - 1; + const unsigned firstColdIndex = numHotBlocks; - auto evaluateCost = [](BasicBlock* const block, BasicBlock* const next) -> weight_t { - assert(block != nullptr); - - if ((block->NumSucc() == 0) || (next == nullptr)) + weight_t cost; + auto getEdgeAndUpdateCost = [this, &cost, blockOrder, firstColdIndex] (unsigned srcPos, unsigned dstPos) -> FlowEdge* { + if ((srcPos > firstColdIndex) || (dstPos > firstColdIndex)) { - return 0.0; + return nullptr; } - const weight_t cost = block->bbWeight; + FlowEdge* const edge = fgGetPredForBlock(blockOrder[dstPos], blockOrder[srcPos]); - for (FlowEdge* const edge : block->SuccEdges()) + if (edge != nullptr) { - if (edge->getDestinationBlock() == next) - { - return cost - edge->getLikelyWeight(); - } + cost += edge->getLikelyWeight(); } - return cost; + return edge; }; - // finalBlock is the first block after the set of interesting blocks. - // We will need to keep track of it to compute the cost of creating/breaking fallthrough into it. - // finalBlock can be null. - // - BasicBlock* const finalBlock = blockVector[blockCount - 1]->Next(); - bool improvedLayout = true; - constexpr unsigned maxIter = 5; // TODO: Reconsider? + // S ~~~~ s | ~~~~ | d ~~~~ E + // S ~~~~ s | d ~~~~ | ~~~~ E + bool modified = false; - for (unsigned numIter = 0; improvedLayout && (numIter < maxIter); numIter++) + while (!cutPoints.Empty()) { - JITDUMP("\n\n--Iteration %d--", (numIter + 1)); - improvedLayout = false; - BasicBlock* const exitBlock = blockVector[blockCount - 1]; + FlowEdge* const candidateEdge = cutPoints.Top(); + cutPoints.Pop(); + const weight_t improvement = candidateEdge->getLikelyWeight(); + cost = 0.0; - for (unsigned i = 1; i < (blockCount - 1); i++) + BasicBlock* const srcBlk = candidateEdge->getSourceBlock(); + BasicBlock* const dstBlk = candidateEdge->getDestinationBlock(); + const unsigned srcPos = ordinals[srcBlk->bbNum]; + const unsigned dstPos = ordinals[dstBlk->bbNum]; + assert((srcPos + 1) != dstPos); + assert(srcPos < numHotBlocks); + assert(dstPos < numHotBlocks); + + if (srcPos == dstPos) { - BasicBlock* const blockI = blockVector[i]; - BasicBlock* const blockIPrev = blockVector[i - 1]; + continue; + } - for (unsigned j = i + 1; j < blockCount; j++) - { - // Evaluate the current partition at (i,j) - // S1: 0 ~ i-1 - // S2: i ~ j-1 - // S3: j ~ exitBlock + const bool isForwardJump = (srcPos < dstPos); + if (!isForwardJump) + { + // TODO: Move backward jumps + continue; + } - BasicBlock* const blockJ = blockVector[j]; - BasicBlock* const blockJPrev = blockVector[j - 1]; + FlowEdge* const srcNextEdge = getEdgeAndUpdateCost(srcPos, srcPos + 1); + FlowEdge* const dstPrevEdge = getEdgeAndUpdateCost(dstPos - 1, dstPos); - const weight_t oldScore = evaluateCost(blockIPrev, blockI) + evaluateCost(blockJPrev, blockJ) + - evaluateCost(exitBlock, finalBlock); - const weight_t newScore = evaluateCost(blockIPrev, blockJ) + evaluateCost(exitBlock, blockI) + - evaluateCost(blockJPrev, finalBlock); + if (hasExitBlock) + { + getEdgeAndUpdateCost(lastHotIndex, firstColdIndex); + } - if ((newScore < oldScore) && !Compiler::fgProfileWeightsEqual(oldScore, newScore, 0.001)) - { - JITDUMP("\nFound better layout by partitioning at i=%d, j=%d", i, j); - JITDUMP("\nOld score: %f, New score: %f", oldScore, newScore); - const unsigned part1Size = i; - const unsigned part2Size = j - i; - const unsigned part3Size = blockCount - j; - - memcpy(tempBlockVector, blockVector, sizeof(BasicBlock*) * part1Size); - memcpy(tempBlockVector + part1Size, blockVector + part1Size + part2Size, - sizeof(BasicBlock*) * part3Size); - memcpy(tempBlockVector + part1Size + part3Size, blockVector + part1Size, - sizeof(BasicBlock*) * part2Size); - - std::swap(blockVector, tempBlockVector); - improvedLayout = true; - break; - } - } + if ((improvement <= cost) || Compiler::fgProfileWeightsEqual(improvement, cost, 0.001)) + { + continue; + } - if (improvedLayout) - { - break; - } + modified = true; + + const unsigned part1Size = srcPos + 1; + const unsigned part2Size = dstPos - srcPos - 1; + const unsigned part3Size = numHotBlocks - dstPos; + + memcpy(tempOrder, blockOrder, sizeof(BasicBlock*) * part1Size); + memcpy(tempOrder + part1Size, blockOrder + part1Size + part2Size, sizeof(BasicBlock*) * part3Size); + memcpy(tempOrder + part1Size + part3Size, blockOrder + part1Size, sizeof(BasicBlock*) * part2Size); + std::swap(blockOrder, tempOrder); + + for (unsigned i = part1Size; i < numHotBlocks; i++) + { + ordinals[blockOrder[i]->bbNum] = i; } - } - // Rearrange blocks - // - for (unsigned i = 1; i < blockCount; i++) - { - BasicBlock* const block = blockVector[i - 1]; - BasicBlock* const next = blockVector[i]; - assert(BasicBlock::sameEHRegion(block, next)); + assert((ordinals[srcBlk->bbNum] + 1) == ordinals[dstBlk->bbNum]); + + if (srcNextEdge != nullptr) + { + cutPoints.Push(srcNextEdge); + } - if (!block->NextIs(next)) + if (dstPrevEdge != nullptr) { - fgUnlinkBlock(next); - fgInsertBBafter(block, next); + cutPoints.Push(dstPrevEdge); } } - // Fix call-finally pairs - // - for (int i = 0; hasEH && (i < callFinallyPairs.Height()); i++) + if (modified) { - const CallFinallyPair& pair = callFinallyPairs.BottomRef(i); - - if (!pair.callFinally->NextIs(pair.callFinallyRet)) + for (unsigned i = 1; i < numHotBlocks; i++) { - fgUnlinkBlock(pair.callFinallyRet); - fgInsertBBafter(pair.callFinally, pair.callFinallyRet); + BasicBlock* const block = blockOrder[i - 1]; + BasicBlock* const next = blockOrder[i]; + if (!block->NextIs(next)) + { + fgUnlinkBlock(next); + fgInsertBBafter(block, next); + } } } } +//----------------------------------------------------------------------------- +// fgSearchImprovedLayout: Try to improve upon RPO-based layout with the 3-opt method: +// - Identify a subset of "interesting" (not cold, has branches, etc.) blocks to move +// - Partition this set into three segments: S1 - S2 - S3 +// - Evaluate cost of swapped layout: S1 - S3 - S2 +// - If the cost improves, keep this layout +// - Repeat for a certain number of iterations, or until no improvements are made +// +// Template parameters: +// hasEH - If true, method has EH regions, so check that we don't try to move blocks in different regions +// +// template +// void Compiler::fgSearchImprovedLayout() +// { +// #ifdef DEBUG +// if (verbose) +// { +// printf("*************** In fgSearchImprovedLayout()\n"); + +// printf("\nInitial BasicBlocks"); +// fgDispBasicBlocks(verboseTrees); +// printf("\n"); +// } +// #endif // DEBUG + +// BlockSet visitedBlocks(BlockSetOps::MakeEmpty(this)); +// BasicBlock* startBlock = nullptr; + +// // Find the first block that doesn't fall into its hottest successor. +// // This will be our first "interesting" block. +// // +// for (BasicBlock* const block : Blocks(fgFirstBB, fgLastBBInMainFunction())) +// { +// // Ignore try/handler blocks +// if (hasEH && (block->hasTryIndex() || block->hasHndIndex())) +// { +// continue; +// } + +// BlockSetOps::AddElemD(this, visitedBlocks, block->bbNum); +// FlowEdge* hottestSuccEdge = nullptr; + +// for (FlowEdge* const succEdge : block->SuccEdges(this)) +// { +// BasicBlock* const succ = succEdge->getDestinationBlock(); + +// // Ignore try/handler successors +// // +// if (hasEH && (succ->hasTryIndex() || succ->hasHndIndex())) +// { +// continue; +// } + +// const bool isForwardJump = !BlockSetOps::IsMember(this, visitedBlocks, succ->bbNum); + +// if (isForwardJump && +// ((hottestSuccEdge == nullptr) || (succEdge->getLikelihood() > hottestSuccEdge->getLikelihood()))) +// { +// hottestSuccEdge = succEdge; +// } +// } + +// if ((hottestSuccEdge != nullptr) && !block->NextIs(hottestSuccEdge->getDestinationBlock())) +// { +// // We found the first "interesting" block that doesn't fall into its hottest successor +// // +// startBlock = block; +// break; +// } +// } + +// if (startBlock == nullptr) +// { +// JITDUMP("\nSkipping reordering"); +// return; +// } + +// // blockVector will contain the set of interesting blocks to move. +// // tempBlockVector will assist with moving segments of interesting blocks. +// // +// BasicBlock** blockVector = new BasicBlock*[fgBBNumMax]; +// BasicBlock** tempBlockVector = new BasicBlock*[fgBBNumMax]; +// unsigned blockCount = 0; +// ArrayStack callFinallyPairs(getAllocator(), hasEH ? ArrayStack::builtinSize : +// 0); + +// for (BasicBlock* const block : Blocks(startBlock, fgLastBBInMainFunction())) +// { +// // Don't consider blocks in EH regions +// // +// if (block->hasTryIndex() || block->hasHndIndex()) +// { +// continue; +// } + +// // We've reached the cold section of the main method body; +// // nothing is interesting at this point +// // +// if (block->isRunRarely()) +// { +// break; +// } + +// blockVector[blockCount] = block; +// tempBlockVector[blockCount++] = block; + +// if (hasEH && block->isBBCallFinallyPair()) +// { +// callFinallyPairs.Emplace(block, block->Next()); +// } +// } + +// if (blockCount < 3) +// { +// JITDUMP("\nNot enough interesting blocks; skipping reordering"); +// return; +// } + +// JITDUMP("\nInteresting blocks: [" FMT_BB "-" FMT_BB "]", startBlock->bbNum, blockVector[blockCount - 1]->bbNum); + + // auto evaluateCost = [](BasicBlock* const block, BasicBlock* const next) -> weight_t { + // assert(block != nullptr); + + // if ((block->NumSucc() == 0) || (next == nullptr)) + // { + // return 0.0; + // } + + // const weight_t cost = block->bbWeight; + + // for (FlowEdge* const edge : block->SuccEdges()) + // { + // if (edge->getDestinationBlock() == next) + // { + // return cost - edge->getLikelyWeight(); + // } + // } + +// return cost; +// }; + +// // finalBlock is the first block after the set of interesting blocks. +// // We will need to keep track of it to compute the cost of creating/breaking fallthrough into it. +// // finalBlock can be null. +// // +// BasicBlock* const finalBlock = blockVector[blockCount - 1]->Next(); +// bool improvedLayout = true; +// constexpr unsigned maxIter = 5; // TODO: Reconsider? + +// for (unsigned numIter = 0; improvedLayout && (numIter < maxIter); numIter++) +// { +// JITDUMP("\n\n--Iteration %d--", (numIter + 1)); +// improvedLayout = false; +// BasicBlock* const exitBlock = blockVector[blockCount - 1]; + +// for (unsigned i = 1; i < (blockCount - 1); i++) +// { +// BasicBlock* const blockI = blockVector[i]; +// BasicBlock* const blockIPrev = blockVector[i - 1]; + +// for (unsigned j = i + 1; j < blockCount; j++) +// { +// // Evaluate the current partition at (i,j) +// // S1: 0 ~ i-1 +// // S2: i ~ j-1 +// // S3: j ~ exitBlock + +// BasicBlock* const blockJ = blockVector[j]; +// BasicBlock* const blockJPrev = blockVector[j - 1]; + +// const weight_t oldScore = evaluateCost(blockIPrev, blockI) + evaluateCost(blockJPrev, blockJ) + +// evaluateCost(exitBlock, finalBlock); +// const weight_t newScore = evaluateCost(blockIPrev, blockJ) + evaluateCost(exitBlock, blockI) + +// evaluateCost(blockJPrev, finalBlock); + +// if ((newScore < oldScore) && !Compiler::fgProfileWeightsEqual(oldScore, newScore, 0.001)) +// { +// JITDUMP("\nFound better layout by partitioning at i=%d, j=%d", i, j); +// JITDUMP("\nOld score: %f, New score: %f", oldScore, newScore); +// const unsigned part1Size = i; +// const unsigned part2Size = j - i; +// const unsigned part3Size = blockCount - j; + +// memcpy(tempBlockVector, blockVector, sizeof(BasicBlock*) * part1Size); +// memcpy(tempBlockVector + part1Size, blockVector + part1Size + part2Size, +// sizeof(BasicBlock*) * part3Size); +// memcpy(tempBlockVector + part1Size + part3Size, blockVector + part1Size, +// sizeof(BasicBlock*) * part2Size); + +// std::swap(blockVector, tempBlockVector); +// improvedLayout = true; +// break; +// } +// } + +// if (improvedLayout) +// { +// break; +// } +// } +// } + +// // Rearrange blocks +// // +// for (unsigned i = 1; i < blockCount; i++) +// { +// BasicBlock* const block = blockVector[i - 1]; +// BasicBlock* const next = blockVector[i]; +// assert(BasicBlock::sameEHRegion(block, next)); + +// if (!block->NextIs(next)) +// { +// fgUnlinkBlock(next); +// fgInsertBBafter(block, next); +// } +// } + +// // Fix call-finally pairs +// // +// for (int i = 0; hasEH && (i < callFinallyPairs.Height()); i++) +// { +// const CallFinallyPair& pair = callFinallyPairs.BottomRef(i); + +// if (!pair.callFinally->NextIs(pair.callFinallyRet)) +// { +// fgUnlinkBlock(pair.callFinallyRet); +// fgInsertBBafter(pair.callFinally, pair.callFinallyRet); +// } +// } +// } + //------------------------------------------------------------- // ehUpdateTryLasts: Iterates EH descriptors, updating each try region's // end block as determined by getTryLast. diff --git a/src/coreclr/jit/priorityqueue.h b/src/coreclr/jit/priorityqueue.h index e9f5df2f55d61..f1bace10dca76 100644 --- a/src/coreclr/jit/priorityqueue.h +++ b/src/coreclr/jit/priorityqueue.h @@ -11,8 +11,9 @@ class PriorityQueue jitstd::vector data; Compare comp; +#ifdef DEBUG // Returns true only if each element has a higher priority than its children. - bool verifyMaxHeap() const + bool VerifyMaxHeap() const { for (size_t i = 0; i < data.size(); i++) { @@ -37,6 +38,7 @@ class PriorityQueue return true; } +#endif // DEBUG public: PriorityQueue(const jitstd::allocator& allocator, const Compare& compare) @@ -45,25 +47,20 @@ class PriorityQueue { } - const T& top() const + const T& Top() const { assert(!data.empty()); return data.front(); } - bool empty() const + bool Empty() const { return data.empty(); } - size_t size() const - { - return data.size(); - } - // Insert new element at the back of the vector. // Then, while the new element has a higher priority than its parent, swap them. - void push(const T& value) + void Push(const T& value) { size_t i = data.size(); data.push_back(value); @@ -77,13 +74,13 @@ class PriorityQueue std::swap(data[parent], data[i]); } - assert(verifyMaxHeap()); + assert(VerifyMaxHeap()); } // Swap the root and last element to facilitate removing the former. // Then, while the new root element has a lower priority than its children, // swap the element with its highest-priority child. - void pop() + void Pop() { assert(!data.empty()); std::swap(data.front(), data.back()); @@ -108,6 +105,6 @@ class PriorityQueue } } - assert(verifyMaxHeap()); + assert(VerifyMaxHeap()); } }; From 4700e65f0413e4181ed9dd47d7360624493be058 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Fri, 20 Sep 2024 15:31:14 -0400 Subject: [PATCH 10/40] Fix lambda capture --- src/coreclr/jit/fgopt.cpp | 23 ++++++++++++++++++++--- src/coreclr/jit/priorityqueue.h | 4 ++-- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index b1bac29c6d54f..c316dd1dc5d09 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5213,7 +5213,7 @@ void Compiler::fgSearchImprovedLayout() const unsigned firstColdIndex = numHotBlocks; weight_t cost; - auto getEdgeAndUpdateCost = [this, &cost, blockOrder, firstColdIndex] (unsigned srcPos, unsigned dstPos) -> FlowEdge* { + auto getEdgeAndUpdateCost = [this, &cost, &blockOrder, firstColdIndex] (unsigned srcPos, unsigned dstPos) -> FlowEdge* { if ((srcPos > firstColdIndex) || (dstPos > firstColdIndex)) { return nullptr; @@ -5244,15 +5244,27 @@ void Compiler::fgSearchImprovedLayout() BasicBlock* const dstBlk = candidateEdge->getDestinationBlock(); const unsigned srcPos = ordinals[srcBlk->bbNum]; const unsigned dstPos = ordinals[dstBlk->bbNum]; - assert((srcPos + 1) != dstPos); + + // Why are we considering an edge from a block that isn't hot? assert(srcPos < numHotBlocks); - assert(dstPos < numHotBlocks); + // Why are we considering an edge that already falls through? + assert((srcPos + 1) != dstPos); + + // Don't consider backedges for single-block loops if (srcPos == dstPos) { continue; } + // Don't consider jumps from hot blocks to cold blocks. + // We can get such candidates if the edge has a low likelihood, + // or if weight wasn't propagated correctly from srcBlk to dstBlk. + if (dstPos >= numHotBlocks) + { + continue; + } + const bool isForwardJump = (srcPos < dstPos); if (!isForwardJump) { @@ -5260,6 +5272,10 @@ void Compiler::fgSearchImprovedLayout() continue; } + // Before getting any edges, make sure 'ordinals' is up-to-date + assert(blockOrder[srcPos] == srcBlk); + assert(blockOrder[dstPos] == dstBlk); + FlowEdge* const srcNextEdge = getEdgeAndUpdateCost(srcPos, srcPos + 1); FlowEdge* const dstPrevEdge = getEdgeAndUpdateCost(dstPos - 1, dstPos); @@ -5289,6 +5305,7 @@ void Compiler::fgSearchImprovedLayout() ordinals[blockOrder[i]->bbNum] = i; } + // Ensure this move created fallthrough from srcBlk to dstBlk assert((ordinals[srcBlk->bbNum] + 1) == ordinals[dstBlk->bbNum]); if (srcNextEdge != nullptr) diff --git a/src/coreclr/jit/priorityqueue.h b/src/coreclr/jit/priorityqueue.h index f1bace10dca76..72cc3f85d815e 100644 --- a/src/coreclr/jit/priorityqueue.h +++ b/src/coreclr/jit/priorityqueue.h @@ -74,7 +74,7 @@ class PriorityQueue std::swap(data[parent], data[i]); } - assert(VerifyMaxHeap()); + // assert(VerifyMaxHeap()); } // Swap the root and last element to facilitate removing the former. @@ -105,6 +105,6 @@ class PriorityQueue } } - assert(VerifyMaxHeap()); + // assert(VerifyMaxHeap()); } }; From ebb7e6a7b170d2734dfcde3d4d748e25e7b8ef27 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Tue, 24 Sep 2024 14:08:29 -0400 Subject: [PATCH 11/40] Consider forward conditional jumps --- src/coreclr/jit/fgopt.cpp | 128 +++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 57 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 929897c63368c..82fb4babdfcbe 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5140,10 +5140,9 @@ void Compiler::fgSearchImprovedLayout() return left->getLikelyWeight() < right->getLikelyWeight(); }; - unsigned numHotBlocks = 0; - unsigned maxBBNum = 0; + unsigned numHotBlocks = 0; PriorityQueue cutPoints(getAllocator(CMK_FlowEdge), edgeWeightComp); - BasicBlock* finalBlock = fgLastBBInMainFunction(); + BasicBlock* finalBlock = fgLastBBInMainFunction(); for (BasicBlock* const block : Blocks(fgFirstBB, finalBlock)) { @@ -5153,34 +5152,40 @@ void Compiler::fgSearchImprovedLayout() break; } - maxBBNum = max(maxBBNum, block->bbNum); numHotBlocks++; if (block->KindIs(BBJ_ALWAYS) && !block->JumpsToNext()) { cutPoints.Push(block->GetTargetEdge()); } - // else if (block->KindIs(BBJ_COND)) - // { - // FlowEdge* likelyEdge; - // FlowEdge* unlikelyEdge; - - // if (block->GetTrueEdge()->getLikelihood() > 0.5) - // { - // likelyEdge = block->GetTrueEdge(); - // unlikelyEdge = block->GetFalseEdge(); - // } - // else - // { - // likelyEdge = block->GetFalseEdge(); - // unlikelyEdge = block->GetTrueEdge(); - // } - - // if (!block->NextIs(likelyEdge->getDestinationBlock())) - // { - // if ((likelyEdge->getLikelihood() > 0.5) || !block->NextIs()) - // } - // } + else if (block->KindIs(BBJ_COND)) + { + FlowEdge* likelyEdge; + FlowEdge* unlikelyEdge; + + if (block->GetTrueEdge()->getLikelihood() > 0.5) + { + likelyEdge = block->GetTrueEdge(); + unlikelyEdge = block->GetFalseEdge(); + } + else + { + likelyEdge = block->GetFalseEdge(); + unlikelyEdge = block->GetTrueEdge(); + } + + assert(likelyEdge != unlikelyEdge); + + if (!block->NextIs(likelyEdge->getDestinationBlock())) + { + cutPoints.Push(likelyEdge); + + if (!block->NextIs(unlikelyEdge->getDestinationBlock())) + { + cutPoints.Push(unlikelyEdge); + } + } + } } if (numHotBlocks < 3) @@ -5190,16 +5195,16 @@ void Compiler::fgSearchImprovedLayout() } assert(finalBlock != nullptr); - const bool hasExitBlock = !finalBlock->IsLast(); - unsigned* const ordinals = new unsigned[maxBBNum + 1]; - BasicBlock** blockOrder = new BasicBlock*[numHotBlocks + (int)hasExitBlock]; - BasicBlock** tempOrder = new BasicBlock*[numHotBlocks + (int)hasExitBlock]; - unsigned position = 0; + const bool hasExitBlock = !finalBlock->IsLast(); + unsigned* const ordinals = new (this, CMK_Generic) unsigned[fgBBNumMax + 1]{}; + BasicBlock** blockOrder = new (this, CMK_BasicBlock) BasicBlock*[numHotBlocks + (int)hasExitBlock]; + BasicBlock** tempOrder = new (this, CMK_BasicBlock) BasicBlock*[numHotBlocks + (int)hasExitBlock]; + unsigned position = 0; for (BasicBlock* const block : Blocks(fgFirstBB, finalBlock)) { blockOrder[position] = tempOrder[position] = block; - ordinals[block->bbNum] = position++; + ordinals[block->bbNum] = position++; } assert(position == numHotBlocks); @@ -5209,11 +5214,12 @@ void Compiler::fgSearchImprovedLayout() blockOrder[numHotBlocks] = tempOrder[numHotBlocks] = finalBlock->Next(); } - const unsigned lastHotIndex = numHotBlocks - 1; + const unsigned lastHotIndex = numHotBlocks - 1; const unsigned firstColdIndex = numHotBlocks; + weight_t cost; - weight_t cost; - auto getEdgeAndUpdateCost = [this, &cost, &blockOrder, firstColdIndex] (unsigned srcPos, unsigned dstPos) -> FlowEdge* { + auto getEdgeAndUpdateCost = [this, &cost, &blockOrder, firstColdIndex](unsigned srcPos, + unsigned dstPos) -> FlowEdge* { if ((srcPos > firstColdIndex) || (dstPos > firstColdIndex)) { return nullptr; @@ -5238,18 +5244,26 @@ void Compiler::fgSearchImprovedLayout() FlowEdge* const candidateEdge = cutPoints.Top(); cutPoints.Pop(); const weight_t improvement = candidateEdge->getLikelyWeight(); + printf("%f\n", improvement); cost = 0.0; BasicBlock* const srcBlk = candidateEdge->getSourceBlock(); BasicBlock* const dstBlk = candidateEdge->getDestinationBlock(); - const unsigned srcPos = ordinals[srcBlk->bbNum]; - const unsigned dstPos = ordinals[dstBlk->bbNum]; + const unsigned srcPos = ordinals[srcBlk->bbNum]; + const unsigned dstPos = ordinals[dstBlk->bbNum]; - // Why are we considering an edge from a block that isn't hot? assert(srcPos < numHotBlocks); + assert(dstPos < numHotBlocks); - // Why are we considering an edge that already falls through? - assert((srcPos + 1) != dstPos); + // Don't consider jumps from hot blocks to cold blocks. + // We can get such candidates if weight wasn't propagated correctly from 'srcBlk' to 'dstBlk'. + // We know 'dstPos' is cold if its position is not set in 'ordinals'. + // ('fgFirstBB' has a position of 0 too, but we don't want to change the entry point anyway, + // so skip any attempts to do so here) + if ((dstPos == 0) && !dstBlk->IsFirst()) + { + continue; + } // Don't consider backedges for single-block loops if (srcPos == dstPos) @@ -5257,11 +5271,11 @@ void Compiler::fgSearchImprovedLayout() continue; } - // Don't consider jumps from hot blocks to cold blocks. - // We can get such candidates if the edge has a low likelihood, - // or if weight wasn't propagated correctly from srcBlk to dstBlk. - if (dstPos >= numHotBlocks) + // Previous moves might have inadvertently created fallthrough from srcBlk to dstBlk, + // so there's nothing to do this round. + if ((srcPos + 1) == dstPos) { + assert(modified); continue; } @@ -5454,23 +5468,23 @@ void Compiler::fgSearchImprovedLayout() // JITDUMP("\nInteresting blocks: [" FMT_BB "-" FMT_BB "]", startBlock->bbNum, blockVector[blockCount - 1]->bbNum); - // auto evaluateCost = [](BasicBlock* const block, BasicBlock* const next) -> weight_t { - // assert(block != nullptr); +// auto evaluateCost = [](BasicBlock* const block, BasicBlock* const next) -> weight_t { +// assert(block != nullptr); - // if ((block->NumSucc() == 0) || (next == nullptr)) - // { - // return 0.0; - // } +// if ((block->NumSucc() == 0) || (next == nullptr)) +// { +// return 0.0; +// } - // const weight_t cost = block->bbWeight; +// const weight_t cost = block->bbWeight; - // for (FlowEdge* const edge : block->SuccEdges()) - // { - // if (edge->getDestinationBlock() == next) - // { - // return cost - edge->getLikelyWeight(); - // } - // } +// for (FlowEdge* const edge : block->SuccEdges()) +// { +// if (edge->getDestinationBlock() == next) +// { +// return cost - edge->getLikelyWeight(); +// } +// } // return cost; // }; From 4eb44719c04b0f2ff4c161e2ac53dceb8d6ce203 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Tue, 24 Sep 2024 14:08:57 -0400 Subject: [PATCH 12/40] Remove debug print --- src/coreclr/jit/fgopt.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 82fb4babdfcbe..bd63f9cada0b1 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5244,7 +5244,6 @@ void Compiler::fgSearchImprovedLayout() FlowEdge* const candidateEdge = cutPoints.Top(); cutPoints.Pop(); const weight_t improvement = candidateEdge->getLikelyWeight(); - printf("%f\n", improvement); cost = 0.0; BasicBlock* const srcBlk = candidateEdge->getSourceBlock(); From 0175b18f29fe907dc2409829b196245a12076fc2 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Wed, 25 Sep 2024 15:55:06 -0400 Subject: [PATCH 13/40] Consider backward jumps; find more initial candidates --- src/coreclr/jit/compiler.h | 2 - src/coreclr/jit/fgopt.cpp | 595 +++++++++++++++---------------------- 2 files changed, 237 insertions(+), 360 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index e3daf692b480f..62b253cb9fb74 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6258,8 +6258,6 @@ class Compiler bool fgReorderBlocks(bool useProfile); void fgDoReversePostOrderLayout(); void fgMoveColdBlocks(); - - template void fgSearchImprovedLayout(); template diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index bd63f9cada0b1..51632d7223d1a 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -3348,15 +3348,7 @@ bool Compiler::fgReorderBlocks(bool useProfile) { fgDoReversePostOrderLayout(); fgMoveColdBlocks(); - - if (compHndBBtabCount != 0) - { - fgSearchImprovedLayout(); - } - else - { - fgSearchImprovedLayout(); - } + fgSearchImprovedLayout(); // Renumber blocks to facilitate LSRA's order of block visitation // TODO: Consider removing this, and using traversal order in lSRA @@ -5128,64 +5120,40 @@ void Compiler::fgMoveColdBlocks() ehUpdateTryLasts(getTryLast, setTryLast); } -template +//----------------------------------------------------------------------------- +// fgSearchImprovedLayout: Try to improve upon RPO-based layout with the 3-opt method: +// - Identify a range of hot blocks to reorder within +// - Partition this set into three segments: S1 - S2 - S3 +// - Evaluate cost of swapped layout: S1 - S3 - S2 +// - If the cost improves, keep this layout +// void Compiler::fgSearchImprovedLayout() { - if (hasEH) + if (compHndBBtabCount != 0) { + // TODO: Reorder methods with EH regions return; } - auto edgeWeightComp = [](const FlowEdge* left, const FlowEdge* right) -> bool { - return left->getLikelyWeight() < right->getLikelyWeight(); - }; - - unsigned numHotBlocks = 0; - PriorityQueue cutPoints(getAllocator(CMK_FlowEdge), edgeWeightComp); - BasicBlock* finalBlock = fgLastBBInMainFunction(); + // Note that we're default-initializing 'ordinals' with zeros. + // Block reordering shouldn't change the method's entry point, + // so if a block has an ordinal of zero and it's not 'fgFirstBB', + // the block wasn't visited below, so it's not in the range of hot blocks. + unsigned* const ordinals = new (this, CMK_Generic) unsigned[fgBBNumMax + 1]{}; + unsigned numHotBlocks = 0; + BasicBlock* finalBlock = fgLastBBInMainFunction(); + BasicBlock* firstColdBlock = nullptr; for (BasicBlock* const block : Blocks(fgFirstBB, finalBlock)) { if (block->isBBWeightCold(this)) { - finalBlock = block->Prev(); + firstColdBlock = block; + finalBlock = block->Prev(); break; } - numHotBlocks++; - - if (block->KindIs(BBJ_ALWAYS) && !block->JumpsToNext()) - { - cutPoints.Push(block->GetTargetEdge()); - } - else if (block->KindIs(BBJ_COND)) - { - FlowEdge* likelyEdge; - FlowEdge* unlikelyEdge; - - if (block->GetTrueEdge()->getLikelihood() > 0.5) - { - likelyEdge = block->GetTrueEdge(); - unlikelyEdge = block->GetFalseEdge(); - } - else - { - likelyEdge = block->GetFalseEdge(); - unlikelyEdge = block->GetTrueEdge(); - } - - assert(likelyEdge != unlikelyEdge); - - if (!block->NextIs(likelyEdge->getDestinationBlock())) - { - cutPoints.Push(likelyEdge); - - if (!block->NextIs(unlikelyEdge->getDestinationBlock())) - { - cutPoints.Push(unlikelyEdge); - } - } - } + ordinals[block->bbNum] = numHotBlocks++; } if (numHotBlocks < 3) @@ -5194,57 +5162,78 @@ void Compiler::fgSearchImprovedLayout() return; } - assert(finalBlock != nullptr); - const bool hasExitBlock = !finalBlock->IsLast(); - unsigned* const ordinals = new (this, CMK_Generic) unsigned[fgBBNumMax + 1]{}; - BasicBlock** blockOrder = new (this, CMK_BasicBlock) BasicBlock*[numHotBlocks + (int)hasExitBlock]; - BasicBlock** tempOrder = new (this, CMK_BasicBlock) BasicBlock*[numHotBlocks + (int)hasExitBlock]; - unsigned position = 0; - - for (BasicBlock* const block : Blocks(fgFirstBB, finalBlock)) - { - blockOrder[position] = tempOrder[position] = block; - ordinals[block->bbNum] = position++; - } - - assert(position == numHotBlocks); + // We will use a priority queue to greedily get the next hottest edge to try to create fallthrough for + auto edgeWeightComp = [](const FlowEdge* left, const FlowEdge* right) -> bool { + return left->getLikelyWeight() < right->getLikelyWeight(); + }; + PriorityQueue cutPoints(getAllocator(CMK_FlowEdge), edgeWeightComp); - if (hasExitBlock) - { - blockOrder[numHotBlocks] = tempOrder[numHotBlocks] = finalBlock->Next(); - } + // Since adding to priority queues has logarithmic time complexity, + // try to avoid adding edges that we obviously won't consider when reordering. + auto considerEdge = [&cutPoints, ordinals](FlowEdge* edge) { + assert(edge != nullptr); - const unsigned lastHotIndex = numHotBlocks - 1; - const unsigned firstColdIndex = numHotBlocks; - weight_t cost; + BasicBlock* const srcBlk = edge->getSourceBlock(); + BasicBlock* const dstBlk = edge->getDestinationBlock(); + const unsigned srcPos = ordinals[srcBlk->bbNum]; + const unsigned dstPos = ordinals[dstBlk->bbNum]; - auto getEdgeAndUpdateCost = [this, &cost, &blockOrder, firstColdIndex](unsigned srcPos, - unsigned dstPos) -> FlowEdge* { - if ((srcPos > firstColdIndex) || (dstPos > firstColdIndex)) + // Don't consider edges from outside the hot range. + // If 'srcBlk' has an ordinal of zero and it isn't the first block, + // it's not tracked by 'ordinals', so it's not in the hot section. + if ((srcPos == 0) && !srcBlk->IsFirst()) { - return nullptr; + return; } - FlowEdge* const edge = fgGetPredForBlock(blockOrder[dstPos], blockOrder[srcPos]); + // Don't consider edges to blocks outside the hot range, + // or backedges to 'fgFirstBB'; we don't want to change the entry point. + if (dstPos == 0) + { + return; + } - if (edge != nullptr) + // Don't consider backedges for single-block loops + if (srcPos == dstPos) { - cost += edge->getLikelyWeight(); + return; } - return edge; + cutPoints.Push(edge); }; - // S ~~~~ s | ~~~~ | d ~~~~ E - // S ~~~~ s | d ~~~~ | ~~~~ E - bool modified = false; + assert(finalBlock != nullptr); + BasicBlock** blockOrder = new (this, CMK_BasicBlock) BasicBlock*[numHotBlocks]; + BasicBlock** tempOrder = new (this, CMK_BasicBlock) BasicBlock*[numHotBlocks]; + unsigned position = 0; + + // Initialize current block order, and find potential cut points in the hot section + for (BasicBlock* const block : Blocks(fgFirstBB, finalBlock)) + { + blockOrder[position] = block; + position++; + + for (FlowEdge* const succEdge : block->SuccEdges(this)) + { + if (!block->NextIs(succEdge->getDestinationBlock())) + { + considerEdge(succEdge); + } + } + } + assert(position == numHotBlocks); + const unsigned lastHotIndex = numHotBlocks - 1; + bool modified = false; + + // For each candidate edge, determine if it's profitable to partition after the source block + // and before the destination block, and swapping the partitions to create fallthrough. + // If it is, do the swap, and for the blocks before/after each cut point that lost fallthrough, + // consider adding their successors/predecessors to the worklist. while (!cutPoints.Empty()) { FlowEdge* const candidateEdge = cutPoints.Top(); cutPoints.Pop(); - const weight_t improvement = candidateEdge->getLikelyWeight(); - cost = 0.0; BasicBlock* const srcBlk = candidateEdge->getSourceBlock(); BasicBlock* const dstBlk = candidateEdge->getDestinationBlock(); @@ -5254,21 +5243,12 @@ void Compiler::fgSearchImprovedLayout() assert(srcPos < numHotBlocks); assert(dstPos < numHotBlocks); - // Don't consider jumps from hot blocks to cold blocks. - // We can get such candidates if weight wasn't propagated correctly from 'srcBlk' to 'dstBlk'. - // We know 'dstPos' is cold if its position is not set in 'ordinals'. - // ('fgFirstBB' has a position of 0 too, but we don't want to change the entry point anyway, - // so skip any attempts to do so here) - if ((dstPos == 0) && !dstBlk->IsFirst()) - { - continue; - } + // 'dstBlk' better be hot, and better not be 'fgFirstBB' + // (we don't want to change the entry point) + assert(dstPos != 0); - // Don't consider backedges for single-block loops - if (srcPos == dstPos) - { - continue; - } + // 'srcBlk' and 'dstBlk' better be distinct + assert(srcPos != dstPos); // Previous moves might have inadvertently created fallthrough from srcBlk to dstBlk, // so there's nothing to do this round. @@ -5278,41 +5258,180 @@ void Compiler::fgSearchImprovedLayout() continue; } - const bool isForwardJump = (srcPos < dstPos); - if (!isForwardJump) - { - // TODO: Move backward jumps - continue; - } - - // Before getting any edges, make sure 'ordinals' is up-to-date + // Before getting any edges, make sure 'ordinals' is accurate assert(blockOrder[srcPos] == srcBlk); assert(blockOrder[dstPos] == dstBlk); - FlowEdge* const srcNextEdge = getEdgeAndUpdateCost(srcPos, srcPos + 1); - FlowEdge* const dstPrevEdge = getEdgeAndUpdateCost(dstPos - 1, dstPos); + auto getCost = [this](BasicBlock* srcBlk, BasicBlock* dstBlk) -> weight_t { + assert(srcBlk != nullptr); + assert(dstBlk != nullptr); + FlowEdge* const edge = fgGetPredForBlock(dstBlk, srcBlk); + return (edge != nullptr) ? edge->getLikelyWeight() : 0.0; + }; + + const bool isForwardJump = (srcPos < dstPos); + weight_t srcPrevCost = 0.0; + weight_t srcNextCost = 0.0; + weight_t dstPrevCost = 0.0; + weight_t cost, improvement; + unsigned part1Size, part2Size, part3Size; + + if (isForwardJump) + { + // Here is the proposed partition: + // S1: 0 ~ srcPos + // S2: srcPos+1 ~ dstPos-1 + // S3: dstPos ~ lastHotIndex + // S4: cold section + // + // After the swap: + // S1: 0 ~ srcPos + // S3: dstPos ~ lastHotIndex + // S2: srcPos+1 ~ dstPos-1 + // S4: cold section + part1Size = srcPos + 1; + part2Size = dstPos - srcPos - 1; + part3Size = numHotBlocks - dstPos; + + srcNextCost = getCost(srcBlk, blockOrder[srcPos + 1]); + dstPrevCost = getCost(blockOrder[dstPos - 1], dstBlk); + cost = srcNextCost + dstPrevCost; + improvement = candidateEdge->getLikelyWeight() + getCost(blockOrder[lastHotIndex], blockOrder[srcPos + 1]); - if (hasExitBlock) + if (firstColdBlock != nullptr) + { + cost += getCost(blockOrder[lastHotIndex], firstColdBlock); + improvement += getCost(blockOrder[dstPos - 1], firstColdBlock); + } + } + else { - getEdgeAndUpdateCost(lastHotIndex, firstColdIndex); + assert(dstPos < srcPos); + + // Here is the proposed partition: + // S1: 0 ~ dstPos-1 + // S2: dstPos ~ srcPos-1 + // S3: srcPos + // S4: srcPos+1 ~ remaining code + // + // After the swap: + // S1: 0 ~ dstPos-1 + // S3: srcPos + // S2: dstPos ~ srcPos-1 + // S4: srcPos+1 ~ remaining code + part1Size = dstPos; + part2Size = srcPos - dstPos; + part3Size = 1; + + srcPrevCost = getCost(blockOrder[srcPos - 1], srcBlk); + dstPrevCost = getCost(blockOrder[dstPos - 1], dstBlk); + cost = srcPrevCost + dstPrevCost; + improvement = candidateEdge->getLikelyWeight() + getCost(blockOrder[dstPos - 1], srcBlk); + + if (srcPos != lastHotIndex) + { + srcNextCost = getCost(srcBlk, blockOrder[srcPos + 1]); + cost += srcNextCost; + improvement += getCost(blockOrder[srcPos - 1], blockOrder[srcPos + 1]); + } } + // Continue evaluating candidates if this one isn't profitable if ((improvement <= cost) || Compiler::fgProfileWeightsEqual(improvement, cost, 0.001)) { continue; } - modified = true; + // We've found a profitable cut point. Continue with the swap. + + auto getHottestPred = [](BasicBlock* block) -> FlowEdge* { + assert(block != nullptr); + FlowEdge* hottestPred = block->bbPreds; + + for (FlowEdge* const predEdge : block->PredEdges()) + { + if (predEdge->getLikelyWeight() > hottestPred->getLikelyWeight()) + { + hottestPred = predEdge; + } + } + + return hottestPred; + }; + + auto getHottestSucc = [](BasicBlock* block) -> FlowEdge* { + assert(block != nullptr); + FlowEdge* hottestSucc = nullptr; + + for (FlowEdge* const succEdge : block->SuccEdges()) + { + if ((hottestSucc == nullptr) || (succEdge->getLikelihood() > hottestSucc->getLikelihood())) + { + hottestSucc = succEdge; + } + } + + return hottestSucc; + }; + + // If we're going to incur cost from moving srcBlk, + // find a hot edge out of srcBlk's predecessor to consider + if (srcPrevCost != 0.0) + { + FlowEdge* const hottestSucc = getHottestSucc(blockOrder[srcPos - 1]); + if ((hottestSucc != nullptr) && (hottestSucc->getDestinationBlock() != srcBlk)) + { + considerEdge(hottestSucc); + } + } - const unsigned part1Size = srcPos + 1; - const unsigned part2Size = dstPos - srcPos - 1; - const unsigned part3Size = numHotBlocks - dstPos; + // If we're going to break up fallthrough out of srcBlk, + // find a hot predecessor for the current fallthrough successor to consider + if (srcNextCost != 0.0) + { + FlowEdge* const hottestPred = getHottestPred(blockOrder[srcPos + 1]); + if ((hottestPred != nullptr) && (hottestPred->getSourceBlock() != srcBlk)) + { + considerEdge(hottestPred); + } + } + + // If we're going to break up fallthrough into dstBlk, + // find a hot successor for the current fallthrough predecessor to consider + if (dstPrevCost != 0.0) + { + FlowEdge* const hottestSucc = getHottestSucc(blockOrder[dstPos - 1]); + if ((hottestSucc != nullptr) && (hottestSucc->getDestinationBlock() != dstBlk)) + { + considerEdge(hottestSucc); + } + } + // Swap the partitions memcpy(tempOrder, blockOrder, sizeof(BasicBlock*) * part1Size); memcpy(tempOrder + part1Size, blockOrder + part1Size + part2Size, sizeof(BasicBlock*) * part3Size); memcpy(tempOrder + part1Size + part3Size, blockOrder + part1Size, sizeof(BasicBlock*) * part2Size); + + if (!isForwardJump) + { + const unsigned remainingCodeSize = numHotBlocks - srcPos - 1; + if (remainingCodeSize != 0) + { + memcpy(tempOrder + srcPos + 1, blockOrder + srcPos + 1, sizeof(BasicBlock*) * remainingCodeSize); + } + else + { + assert(srcPos == lastHotIndex); + } + } + else + { + assert((part1Size + part2Size + part3Size) == numHotBlocks); + } + std::swap(blockOrder, tempOrder); + // Update the positions of the blocks we moved for (unsigned i = part1Size; i < numHotBlocks; i++) { ordinals[blockOrder[i]->bbNum] = i; @@ -5320,16 +5439,7 @@ void Compiler::fgSearchImprovedLayout() // Ensure this move created fallthrough from srcBlk to dstBlk assert((ordinals[srcBlk->bbNum] + 1) == ordinals[dstBlk->bbNum]); - - if (srcNextEdge != nullptr) - { - cutPoints.Push(srcNextEdge); - } - - if (dstPrevEdge != nullptr) - { - cutPoints.Push(dstPrevEdge); - } + modified = true; } if (modified) @@ -5347,237 +5457,6 @@ void Compiler::fgSearchImprovedLayout() } } -//----------------------------------------------------------------------------- -// fgSearchImprovedLayout: Try to improve upon RPO-based layout with the 3-opt method: -// - Identify a subset of "interesting" (not cold, has branches, etc.) blocks to move -// - Partition this set into three segments: S1 - S2 - S3 -// - Evaluate cost of swapped layout: S1 - S3 - S2 -// - If the cost improves, keep this layout -// - Repeat for a certain number of iterations, or until no improvements are made -// -// Template parameters: -// hasEH - If true, method has EH regions, so check that we don't try to move blocks in different regions -// -// template -// void Compiler::fgSearchImprovedLayout() -// { -// #ifdef DEBUG -// if (verbose) -// { -// printf("*************** In fgSearchImprovedLayout()\n"); - -// printf("\nInitial BasicBlocks"); -// fgDispBasicBlocks(verboseTrees); -// printf("\n"); -// } -// #endif // DEBUG - -// BlockSet visitedBlocks(BlockSetOps::MakeEmpty(this)); -// BasicBlock* startBlock = nullptr; - -// // Find the first block that doesn't fall into its hottest successor. -// // This will be our first "interesting" block. -// // -// for (BasicBlock* const block : Blocks(fgFirstBB, fgLastBBInMainFunction())) -// { -// // Ignore try/handler blocks -// if (hasEH && (block->hasTryIndex() || block->hasHndIndex())) -// { -// continue; -// } - -// BlockSetOps::AddElemD(this, visitedBlocks, block->bbNum); -// FlowEdge* hottestSuccEdge = nullptr; - -// for (FlowEdge* const succEdge : block->SuccEdges(this)) -// { -// BasicBlock* const succ = succEdge->getDestinationBlock(); - -// // Ignore try/handler successors -// // -// if (hasEH && (succ->hasTryIndex() || succ->hasHndIndex())) -// { -// continue; -// } - -// const bool isForwardJump = !BlockSetOps::IsMember(this, visitedBlocks, succ->bbNum); - -// if (isForwardJump && -// ((hottestSuccEdge == nullptr) || (succEdge->getLikelihood() > hottestSuccEdge->getLikelihood()))) -// { -// hottestSuccEdge = succEdge; -// } -// } - -// if ((hottestSuccEdge != nullptr) && !block->NextIs(hottestSuccEdge->getDestinationBlock())) -// { -// // We found the first "interesting" block that doesn't fall into its hottest successor -// // -// startBlock = block; -// break; -// } -// } - -// if (startBlock == nullptr) -// { -// JITDUMP("\nSkipping reordering"); -// return; -// } - -// // blockVector will contain the set of interesting blocks to move. -// // tempBlockVector will assist with moving segments of interesting blocks. -// // -// BasicBlock** blockVector = new BasicBlock*[fgBBNumMax]; -// BasicBlock** tempBlockVector = new BasicBlock*[fgBBNumMax]; -// unsigned blockCount = 0; -// ArrayStack callFinallyPairs(getAllocator(), hasEH ? ArrayStack::builtinSize : -// 0); - -// for (BasicBlock* const block : Blocks(startBlock, fgLastBBInMainFunction())) -// { -// // Don't consider blocks in EH regions -// // -// if (block->hasTryIndex() || block->hasHndIndex()) -// { -// continue; -// } - -// // We've reached the cold section of the main method body; -// // nothing is interesting at this point -// // -// if (block->isRunRarely()) -// { -// break; -// } - -// blockVector[blockCount] = block; -// tempBlockVector[blockCount++] = block; - -// if (hasEH && block->isBBCallFinallyPair()) -// { -// callFinallyPairs.Emplace(block, block->Next()); -// } -// } - -// if (blockCount < 3) -// { -// JITDUMP("\nNot enough interesting blocks; skipping reordering"); -// return; -// } - -// JITDUMP("\nInteresting blocks: [" FMT_BB "-" FMT_BB "]", startBlock->bbNum, blockVector[blockCount - 1]->bbNum); - -// auto evaluateCost = [](BasicBlock* const block, BasicBlock* const next) -> weight_t { -// assert(block != nullptr); - -// if ((block->NumSucc() == 0) || (next == nullptr)) -// { -// return 0.0; -// } - -// const weight_t cost = block->bbWeight; - -// for (FlowEdge* const edge : block->SuccEdges()) -// { -// if (edge->getDestinationBlock() == next) -// { -// return cost - edge->getLikelyWeight(); -// } -// } - -// return cost; -// }; - -// // finalBlock is the first block after the set of interesting blocks. -// // We will need to keep track of it to compute the cost of creating/breaking fallthrough into it. -// // finalBlock can be null. -// // -// BasicBlock* const finalBlock = blockVector[blockCount - 1]->Next(); -// bool improvedLayout = true; -// constexpr unsigned maxIter = 5; // TODO: Reconsider? - -// for (unsigned numIter = 0; improvedLayout && (numIter < maxIter); numIter++) -// { -// JITDUMP("\n\n--Iteration %d--", (numIter + 1)); -// improvedLayout = false; -// BasicBlock* const exitBlock = blockVector[blockCount - 1]; - -// for (unsigned i = 1; i < (blockCount - 1); i++) -// { -// BasicBlock* const blockI = blockVector[i]; -// BasicBlock* const blockIPrev = blockVector[i - 1]; - -// for (unsigned j = i + 1; j < blockCount; j++) -// { -// // Evaluate the current partition at (i,j) -// // S1: 0 ~ i-1 -// // S2: i ~ j-1 -// // S3: j ~ exitBlock - -// BasicBlock* const blockJ = blockVector[j]; -// BasicBlock* const blockJPrev = blockVector[j - 1]; - -// const weight_t oldScore = evaluateCost(blockIPrev, blockI) + evaluateCost(blockJPrev, blockJ) + -// evaluateCost(exitBlock, finalBlock); -// const weight_t newScore = evaluateCost(blockIPrev, blockJ) + evaluateCost(exitBlock, blockI) + -// evaluateCost(blockJPrev, finalBlock); - -// if ((newScore < oldScore) && !Compiler::fgProfileWeightsEqual(oldScore, newScore, 0.001)) -// { -// JITDUMP("\nFound better layout by partitioning at i=%d, j=%d", i, j); -// JITDUMP("\nOld score: %f, New score: %f", oldScore, newScore); -// const unsigned part1Size = i; -// const unsigned part2Size = j - i; -// const unsigned part3Size = blockCount - j; - -// memcpy(tempBlockVector, blockVector, sizeof(BasicBlock*) * part1Size); -// memcpy(tempBlockVector + part1Size, blockVector + part1Size + part2Size, -// sizeof(BasicBlock*) * part3Size); -// memcpy(tempBlockVector + part1Size + part3Size, blockVector + part1Size, -// sizeof(BasicBlock*) * part2Size); - -// std::swap(blockVector, tempBlockVector); -// improvedLayout = true; -// break; -// } -// } - -// if (improvedLayout) -// { -// break; -// } -// } -// } - -// // Rearrange blocks -// // -// for (unsigned i = 1; i < blockCount; i++) -// { -// BasicBlock* const block = blockVector[i - 1]; -// BasicBlock* const next = blockVector[i]; -// assert(BasicBlock::sameEHRegion(block, next)); - -// if (!block->NextIs(next)) -// { -// fgUnlinkBlock(next); -// fgInsertBBafter(block, next); -// } -// } - -// // Fix call-finally pairs -// // -// for (int i = 0; hasEH && (i < callFinallyPairs.Height()); i++) -// { -// const CallFinallyPair& pair = callFinallyPairs.BottomRef(i); - -// if (!pair.callFinally->NextIs(pair.callFinallyRet)) -// { -// fgUnlinkBlock(pair.callFinallyRet); -// fgInsertBBafter(pair.callFinally, pair.callFinallyRet); -// } -// } -// } - //------------------------------------------------------------- // ehUpdateTryLasts: Iterates EH descriptors, updating each try region's // end block as determined by getTryLast. From bbc28df5946730e737733de2796c75bc5bbb8dd9 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Wed, 25 Sep 2024 16:02:04 -0400 Subject: [PATCH 14/40] Revert irrelevant changes --- src/coreclr/jit/arraystack.h | 2 +- src/coreclr/jit/fgopt.cpp | 43 ++++++++++++++---------------------- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/src/coreclr/jit/arraystack.h b/src/coreclr/jit/arraystack.h index 626c66ee8acdc..08b2a798666f4 100644 --- a/src/coreclr/jit/arraystack.h +++ b/src/coreclr/jit/arraystack.h @@ -7,9 +7,9 @@ template class ArrayStack { -public: static constexpr int builtinSize = 8; +public: explicit ArrayStack(CompAllocator alloc, int initialCapacity = builtinSize) : m_alloc(alloc) { diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 51632d7223d1a..1f3721d081c68 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4658,20 +4658,6 @@ void Compiler::fgMoveHotJumps() } } -struct CallFinallyPair -{ - BasicBlock* callFinally; - BasicBlock* callFinallyRet; - - // Constructor provided so we can call ArrayStack::Emplace - // - CallFinallyPair(BasicBlock* first, BasicBlock* second) - : callFinally(first) - , callFinallyRet(second) - { - } -}; - //----------------------------------------------------------------------------- // fgDoReversePostOrderLayout: Reorder blocks using a greedy RPO traversal. // @@ -4700,12 +4686,8 @@ void Compiler::fgDoReversePostOrderLayout() { BasicBlock* const block = dfsTree->GetPostOrder(i); BasicBlock* const blockToMove = dfsTree->GetPostOrder(i - 1); - - if (!block->NextIs(blockToMove)) - { - fgUnlinkBlock(blockToMove); - fgInsertBBafter(block, blockToMove); - } + fgUnlinkBlock(blockToMove); + fgInsertBBafter(block, blockToMove); } fgMoveHotJumps(); @@ -4731,6 +4713,19 @@ void Compiler::fgDoReversePostOrderLayout() // The RPO will break up call-finally pairs, so save them before re-ordering // + struct CallFinallyPair + { + BasicBlock* callFinally; + BasicBlock* callFinallyRet; + + // Constructor provided so we can call ArrayStack::Emplace + // + CallFinallyPair(BasicBlock* first, BasicBlock* second) + : callFinally(first) + , callFinallyRet(second) + { + } + }; ArrayStack callFinallyPairs(getAllocator()); for (EHblkDsc* const HBtab : EHClauses(this)) @@ -4769,16 +4764,12 @@ void Compiler::fgDoReversePostOrderLayout() continue; } - if (!block->NextIs(blockToMove)) - { - fgUnlinkBlock(blockToMove); - fgInsertBBafter(block, blockToMove); - } + fgUnlinkBlock(blockToMove); + fgInsertBBafter(block, blockToMove); } } // Fix up call-finally pairs - // (We assume the RPO will mess these up, so don't bother checking if the blocks are still adjacent) // for (int i = 0; i < callFinallyPairs.Height(); i++) { From 2e507bea5d65b6053a1c086558365136b748adb5 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Wed, 25 Sep 2024 16:03:07 -0400 Subject: [PATCH 15/40] Missed a few --- src/coreclr/jit/arraystack.h | 2 +- src/coreclr/jit/fgopt.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/arraystack.h b/src/coreclr/jit/arraystack.h index 08b2a798666f4..872e64f3c8c99 100644 --- a/src/coreclr/jit/arraystack.h +++ b/src/coreclr/jit/arraystack.h @@ -7,7 +7,7 @@ template class ArrayStack { - static constexpr int builtinSize = 8; + static const int builtinSize = 8; public: explicit ArrayStack(CompAllocator alloc, int initialCapacity = builtinSize) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 1f3721d081c68..81637cad738c7 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4726,6 +4726,7 @@ void Compiler::fgDoReversePostOrderLayout() { } }; + ArrayStack callFinallyPairs(getAllocator()); for (EHblkDsc* const HBtab : EHClauses(this)) From 9ed6452a76bd4f3fd9b47f01046e6d48b10318b6 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Wed, 25 Sep 2024 16:56:39 -0400 Subject: [PATCH 16/40] Add JitDump check --- src/coreclr/jit/fgopt.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 81637cad738c7..e153d705022ab 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5127,6 +5127,17 @@ void Compiler::fgSearchImprovedLayout() return; } +#ifdef DEBUG + if (verbose) + { + printf("*************** In fgSearchImprovedLayout()\n"); + + printf("\nInitial BasicBlocks"); + fgDispBasicBlocks(verboseTrees); + printf("\n"); + } +#endif // DEBUG + // Note that we're default-initializing 'ordinals' with zeros. // Block reordering shouldn't change the method's entry point, // so if a block has an ordinal of zero and it's not 'fgFirstBB', From 40fc6bcc02f84a930c8c05d28037440477e3f9b3 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Wed, 25 Sep 2024 17:28:08 -0400 Subject: [PATCH 17/40] Add more candidate edges when reordering --- src/coreclr/jit/fgopt.cpp | 93 ++++++++++++++------------------------- 1 file changed, 34 insertions(+), 59 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index e153d705022ab..2697ef16044e3 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5205,6 +5205,30 @@ void Compiler::fgSearchImprovedLayout() cutPoints.Push(edge); }; + auto addNonFallthroughSuccs = [this, considerEdge](BasicBlock* block, BasicBlock* next) { + assert(block != nullptr); + + for (FlowEdge* const succEdge : block->SuccEdges(this)) + { + if (succEdge->getDestinationBlock() != next) + { + considerEdge(succEdge); + } + } + }; + + auto addNonFallthroughPreds = [considerEdge](BasicBlock* block, BasicBlock* prev) { + assert(block != nullptr); + + for (FlowEdge* const predEdge : block->PredEdges()) + { + if (predEdge->getSourceBlock() != prev) + { + considerEdge(predEdge); + } + } + }; + assert(finalBlock != nullptr); BasicBlock** blockOrder = new (this, CMK_BasicBlock) BasicBlock*[numHotBlocks]; BasicBlock** tempOrder = new (this, CMK_BasicBlock) BasicBlock*[numHotBlocks]; @@ -5215,14 +5239,7 @@ void Compiler::fgSearchImprovedLayout() { blockOrder[position] = block; position++; - - for (FlowEdge* const succEdge : block->SuccEdges(this)) - { - if (!block->NextIs(succEdge->getDestinationBlock())) - { - considerEdge(succEdge); - } - } + addNonFallthroughSuccs(block, block->Next()); } assert(position == numHotBlocks); @@ -5347,67 +5364,25 @@ void Compiler::fgSearchImprovedLayout() // We've found a profitable cut point. Continue with the swap. - auto getHottestPred = [](BasicBlock* block) -> FlowEdge* { - assert(block != nullptr); - FlowEdge* hottestPred = block->bbPreds; - - for (FlowEdge* const predEdge : block->PredEdges()) - { - if (predEdge->getLikelyWeight() > hottestPred->getLikelyWeight()) - { - hottestPred = predEdge; - } - } - - return hottestPred; - }; - - auto getHottestSucc = [](BasicBlock* block) -> FlowEdge* { - assert(block != nullptr); - FlowEdge* hottestSucc = nullptr; - - for (FlowEdge* const succEdge : block->SuccEdges()) - { - if ((hottestSucc == nullptr) || (succEdge->getLikelihood() > hottestSucc->getLikelihood())) - { - hottestSucc = succEdge; - } - } - - return hottestSucc; - }; - - // If we're going to incur cost from moving srcBlk, - // find a hot edge out of srcBlk's predecessor to consider + // If we're going to break up fallthrough into 'srcBlk', + // consider all other edges out of 'srcBlk''s current fallthrough predecessor if (srcPrevCost != 0.0) { - FlowEdge* const hottestSucc = getHottestSucc(blockOrder[srcPos - 1]); - if ((hottestSucc != nullptr) && (hottestSucc->getDestinationBlock() != srcBlk)) - { - considerEdge(hottestSucc); - } + addNonFallthroughSuccs(blockOrder[srcPos - 1], srcBlk); } - // If we're going to break up fallthrough out of srcBlk, - // find a hot predecessor for the current fallthrough successor to consider + // If we're going to break up fallthrough out of 'srcBlk', + // consider all other edges into 'srcBlk''s current fallthrough successor if (srcNextCost != 0.0) { - FlowEdge* const hottestPred = getHottestPred(blockOrder[srcPos + 1]); - if ((hottestPred != nullptr) && (hottestPred->getSourceBlock() != srcBlk)) - { - considerEdge(hottestPred); - } + addNonFallthroughPreds(blockOrder[srcPos + 1], srcBlk); } - // If we're going to break up fallthrough into dstBlk, - // find a hot successor for the current fallthrough predecessor to consider + // If we're going to break up fallthrough into 'dstBlk', + // consider all other edges out of 'dstBlk''s current fallthrough predecessor if (dstPrevCost != 0.0) { - FlowEdge* const hottestSucc = getHottestSucc(blockOrder[dstPos - 1]); - if ((hottestSucc != nullptr) && (hottestSucc->getDestinationBlock() != dstBlk)) - { - considerEdge(hottestSucc); - } + addNonFallthroughSuccs(blockOrder[dstPos - 1], dstBlk); } // Swap the partitions From a3b7392ce5362a3c1e7b20aa67f8e06f358b4816 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 26 Sep 2024 11:15:47 -0400 Subject: [PATCH 18/40] Don't add duplicate edges to cutPoints --- src/coreclr/jit/fgopt.cpp | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 2697ef16044e3..3f3507123811f 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5171,9 +5171,13 @@ void Compiler::fgSearchImprovedLayout() }; PriorityQueue cutPoints(getAllocator(CMK_FlowEdge), edgeWeightComp); + // We will also maintain a set of candidate edges to avoid adding duplicate edges to cutPoints. + // For large flowgraphs, we risk exploding 'cutPoints' with duplicate edges. + JitHashTable, bool> candidateSet(getAllocator(CMK_FlowEdge)); + // Since adding to priority queues has logarithmic time complexity, // try to avoid adding edges that we obviously won't consider when reordering. - auto considerEdge = [&cutPoints, ordinals](FlowEdge* edge) { + auto considerEdge = [&cutPoints, &candidateSet, ordinals](FlowEdge* edge) { assert(edge != nullptr); BasicBlock* const srcBlk = edge->getSourceBlock(); @@ -5202,6 +5206,22 @@ void Compiler::fgSearchImprovedLayout() return; } + // Check if the edge is already in 'cutPoints'. + // If it isn't, update the set. + bool* const lookupPtr = candidateSet.LookupPointer(edge); + if (lookupPtr == nullptr) + { + candidateSet.Set(edge, true); + } + else if (*lookupPtr == false) + { + *lookupPtr = true; + } + else + { + return; + } + cutPoints.Push(edge); }; @@ -5254,6 +5274,8 @@ void Compiler::fgSearchImprovedLayout() { FlowEdge* const candidateEdge = cutPoints.Top(); cutPoints.Pop(); + assert(candidateSet[candidateEdge]); + candidateSet[candidateEdge] = false; BasicBlock* const srcBlk = candidateEdge->getSourceBlock(); BasicBlock* const dstBlk = candidateEdge->getDestinationBlock(); @@ -5363,6 +5385,8 @@ void Compiler::fgSearchImprovedLayout() } // We've found a profitable cut point. Continue with the swap. + JITDUMP("Creating fallthrough for " FMT_BB " -> " FMT_BB " (cost=%f, improvement=%f)\n", srcBlk->bbNum, + dstBlk->bbNum, cost, improvement); // If we're going to break up fallthrough into 'srcBlk', // consider all other edges out of 'srcBlk''s current fallthrough predecessor From d468a8a8b7d9d1cba9c9f5ab1629aefba105704b Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Fri, 27 Sep 2024 16:35:50 -0400 Subject: [PATCH 19/40] Consider each candidate edge at most once --- src/coreclr/jit/fgopt.cpp | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 3f3507123811f..6987f5719bfa4 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5173,11 +5173,11 @@ void Compiler::fgSearchImprovedLayout() // We will also maintain a set of candidate edges to avoid adding duplicate edges to cutPoints. // For large flowgraphs, we risk exploding 'cutPoints' with duplicate edges. - JitHashTable, bool> candidateSet(getAllocator(CMK_FlowEdge)); + JitHashTable, bool> usedCandidates(getAllocator(CMK_FlowEdge)); // Since adding to priority queues has logarithmic time complexity, // try to avoid adding edges that we obviously won't consider when reordering. - auto considerEdge = [&cutPoints, &candidateSet, ordinals](FlowEdge* edge) { + auto considerEdge = [&cutPoints, &usedCandidates, ordinals](FlowEdge* edge) { assert(edge != nullptr); BasicBlock* const srcBlk = edge->getSourceBlock(); @@ -5206,23 +5206,11 @@ void Compiler::fgSearchImprovedLayout() return; } - // Check if the edge is already in 'cutPoints'. - // If it isn't, update the set. - bool* const lookupPtr = candidateSet.LookupPointer(edge); - if (lookupPtr == nullptr) + // Don't add an edge that we've already considered + if (!usedCandidates.Set(edge, true, usedCandidates.SetKind::Overwrite)) { - candidateSet.Set(edge, true); + cutPoints.Push(edge); } - else if (*lookupPtr == false) - { - *lookupPtr = true; - } - else - { - return; - } - - cutPoints.Push(edge); }; auto addNonFallthroughSuccs = [this, considerEdge](BasicBlock* block, BasicBlock* next) { @@ -5274,8 +5262,7 @@ void Compiler::fgSearchImprovedLayout() { FlowEdge* const candidateEdge = cutPoints.Top(); cutPoints.Pop(); - assert(candidateSet[candidateEdge]); - candidateSet[candidateEdge] = false; + assert(usedCandidates[candidateEdge]); BasicBlock* const srcBlk = candidateEdge->getSourceBlock(); BasicBlock* const dstBlk = candidateEdge->getDestinationBlock(); From 3ee5c42dea8bd245862525689a1fd92a4cd1b7b9 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 17 Oct 2024 13:05:32 -0400 Subject: [PATCH 20/40] Don't factor cold branches into cost calculatioN --- src/coreclr/jit/fgopt.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 46716340c2d19..554e28f730e53 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5024,17 +5024,15 @@ void Compiler::fgSearchImprovedLayout() // Block reordering shouldn't change the method's entry point, // so if a block has an ordinal of zero and it's not 'fgFirstBB', // the block wasn't visited below, so it's not in the range of hot blocks. - unsigned* const ordinals = new (this, CMK_Generic) unsigned[fgBBNumMax + 1]{}; - unsigned numHotBlocks = 0; - BasicBlock* finalBlock = fgLastBBInMainFunction(); - BasicBlock* firstColdBlock = nullptr; + unsigned* const ordinals = new (this, CMK_Generic) unsigned[fgBBNumMax + 1]{}; + unsigned numHotBlocks = 0; + BasicBlock* finalBlock = fgLastBBInMainFunction(); for (BasicBlock* const block : Blocks(fgFirstBB, finalBlock)) { if (block->isBBWeightCold(this)) { - firstColdBlock = block; - finalBlock = block->Prev(); + finalBlock = block->Prev(); break; } @@ -5209,11 +5207,8 @@ void Compiler::fgSearchImprovedLayout() cost = srcNextCost + dstPrevCost; improvement = candidateEdge->getLikelyWeight() + getCost(blockOrder[lastHotIndex], blockOrder[srcPos + 1]); - if (firstColdBlock != nullptr) - { - cost += getCost(blockOrder[lastHotIndex], firstColdBlock); - improvement += getCost(blockOrder[dstPos - 1], firstColdBlock); - } + // Don't include branches to the cold section (S4) in the cost/improvement calculation. + // If the section is truly cold, branches into it should have negligible cost. } else { From d6fea98ce7a0a73e13f70f9de97da55bbbd99d7d Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Mon, 21 Oct 2024 10:59:15 -0400 Subject: [PATCH 21/40] Remove used candidates set --- src/coreclr/jit/fgopt.cpp | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index a9500d8b89dfe..e1cb691f2bf14 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4912,13 +4912,9 @@ void Compiler::fgSearchImprovedLayout() }; PriorityQueue cutPoints(getAllocator(CMK_FlowEdge), edgeWeightComp); - // We will also maintain a set of candidate edges to avoid adding duplicate edges to cutPoints. - // For large flowgraphs, we risk exploding 'cutPoints' with duplicate edges. - JitHashTable, bool> usedCandidates(getAllocator(CMK_FlowEdge)); - // Since adding to priority queues has logarithmic time complexity, // try to avoid adding edges that we obviously won't consider when reordering. - auto considerEdge = [&cutPoints, &usedCandidates, ordinals](FlowEdge* edge) { + auto considerEdge = [&cutPoints, ordinals](FlowEdge* edge) { assert(edge != nullptr); BasicBlock* const srcBlk = edge->getSourceBlock(); @@ -4947,11 +4943,7 @@ void Compiler::fgSearchImprovedLayout() return; } - // Don't add an edge that we've already considered - if (!usedCandidates.Set(edge, true, usedCandidates.SetKind::Overwrite)) - { - cutPoints.Push(edge); - } + cutPoints.Push(edge); }; auto addNonFallthroughSuccs = [this, considerEdge](BasicBlock* block, BasicBlock* next) { @@ -5003,7 +4995,6 @@ void Compiler::fgSearchImprovedLayout() { FlowEdge* const candidateEdge = cutPoints.Top(); cutPoints.Pop(); - assert(usedCandidates[candidateEdge]); BasicBlock* const srcBlk = candidateEdge->getSourceBlock(); BasicBlock* const dstBlk = candidateEdge->getDestinationBlock(); From 872b4316378833b736e5021df0758a7e908dd732 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Mon, 21 Oct 2024 13:29:41 -0400 Subject: [PATCH 22/40] Revert "Remove used candidates set" This reverts commit d6fea98ce7a0a73e13f70f9de97da55bbbd99d7d. --- src/coreclr/jit/fgopt.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index e1cb691f2bf14..a9500d8b89dfe 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4912,9 +4912,13 @@ void Compiler::fgSearchImprovedLayout() }; PriorityQueue cutPoints(getAllocator(CMK_FlowEdge), edgeWeightComp); + // We will also maintain a set of candidate edges to avoid adding duplicate edges to cutPoints. + // For large flowgraphs, we risk exploding 'cutPoints' with duplicate edges. + JitHashTable, bool> usedCandidates(getAllocator(CMK_FlowEdge)); + // Since adding to priority queues has logarithmic time complexity, // try to avoid adding edges that we obviously won't consider when reordering. - auto considerEdge = [&cutPoints, ordinals](FlowEdge* edge) { + auto considerEdge = [&cutPoints, &usedCandidates, ordinals](FlowEdge* edge) { assert(edge != nullptr); BasicBlock* const srcBlk = edge->getSourceBlock(); @@ -4943,7 +4947,11 @@ void Compiler::fgSearchImprovedLayout() return; } - cutPoints.Push(edge); + // Don't add an edge that we've already considered + if (!usedCandidates.Set(edge, true, usedCandidates.SetKind::Overwrite)) + { + cutPoints.Push(edge); + } }; auto addNonFallthroughSuccs = [this, considerEdge](BasicBlock* block, BasicBlock* next) { @@ -4995,6 +5003,7 @@ void Compiler::fgSearchImprovedLayout() { FlowEdge* const candidateEdge = cutPoints.Top(); cutPoints.Pop(); + assert(usedCandidates[candidateEdge]); BasicBlock* const srcBlk = candidateEdge->getSourceBlock(); BasicBlock* const dstBlk = candidateEdge->getDestinationBlock(); From fadaebab41190998a301c03c24c50b0eb8085a13 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Mon, 21 Oct 2024 14:00:40 -0400 Subject: [PATCH 23/40] Tweak cold block finding --- src/coreclr/jit/fgopt.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index a9500d8b89dfe..bb3c930ed2839 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4887,16 +4887,18 @@ void Compiler::fgSearchImprovedLayout() // the block wasn't visited below, so it's not in the range of hot blocks. unsigned* const ordinals = new (this, CMK_Generic) unsigned[fgBBNumMax + 1]{}; unsigned numHotBlocks = 0; - BasicBlock* finalBlock = fgLastBBInMainFunction(); + BasicBlock* finalBlock; + // Walk backwards through the main method body, looking for the last hot block. + // Since we moved all cold blocks to the end of the method already, + // we should have a span of hot blocks to consider reordering at the beginning of the method. + for (finalBlock = fgLastBBInMainFunction(); !finalBlock->IsFirst() && finalBlock->isBBWeightCold(this); + finalBlock = finalBlock->Prev()) + ; + + // Initialize the ordinal numbers for the hot blocks for (BasicBlock* const block : Blocks(fgFirstBB, finalBlock)) { - if (block->isBBWeightCold(this)) - { - finalBlock = block->Prev(); - break; - } - ordinals[block->bbNum] = numHotBlocks++; } From e17b16971a96cb00c9bfbc915ff4613d6df0fff9 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 24 Oct 2024 23:08:22 -0400 Subject: [PATCH 24/40] Refactor into ThreeOptLayout class --- src/coreclr/jit/compiler.h | 21 ++ src/coreclr/jit/fgopt.cpp | 405 ++++++++++++++++++++++++------------- 2 files changed, 283 insertions(+), 143 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 1a5849d309c5c..52029ea404626 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6252,6 +6252,27 @@ class Compiler void fgMoveColdBlocks(); void fgSearchImprovedLayout(); + class ThreeOptLayout + { + static bool EdgeCmp(const FlowEdge* left, const FlowEdge* right); + + Compiler* compiler; + PriorityQueue cutPoints; + JitHashTable, bool> usedCandidates; + unsigned* ordinals; + BasicBlock** blockOrder; + BasicBlock** tempOrder; + + void ConsiderEdge(FlowEdge* edge, unsigned startPos); + void AddNonFallthroughSuccs(BasicBlock* block, BasicBlock* next, unsigned startPos); + void AddNonFallthroughPreds(BasicBlock* block, BasicBlock* prev, unsigned startPos); + bool RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock); + + public: + ThreeOptLayout(Compiler* comp); + void Run(); + }; + template void fgMoveHotJumps(); diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index bb3c930ed2839..837e9fcc00f9a 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4856,48 +4856,159 @@ void Compiler::fgMoveColdBlocks() } //----------------------------------------------------------------------------- -// fgSearchImprovedLayout: Try to improve upon RPO-based layout with the 3-opt method: -// - Identify a range of hot blocks to reorder within -// - Partition this set into three segments: S1 - S2 - S3 -// - Evaluate cost of swapped layout: S1 - S3 - S2 -// - If the cost improves, keep this layout +// Compiler::ThreeOptLayout::EdgeCmp: Comparator for the 'cutPoints' priority queue. +// If 'left' has a bigger edge weight than 'right', 3-opt will consider it first. +// Else, 3-opt will consider 'right' first. // -void Compiler::fgSearchImprovedLayout() +// Parameters: +// left - One of the two edges to compare +// right - The other edge to compare +// +// Returns: +// True if 'right' should be considered before 'left', and false otherwise +// +/* static */ bool Compiler::ThreeOptLayout::EdgeCmp(const FlowEdge* left, const FlowEdge* right) { - if (compHndBBtabCount != 0) + return left->getLikelyWeight() < right->getLikelyWeight(); +} + +//----------------------------------------------------------------------------- +// Compiler::ThreeOptLayout::ThreeOptLayout: Constructs a ThreeOptLayout instance. +// +// Parameters: +// comp - The Compiler instance +// +Compiler::ThreeOptLayout::ThreeOptLayout(Compiler* comp) + : compiler(comp) + , cutPoints(comp->getAllocator(CMK_FlowEdge), &ThreeOptLayout::EdgeCmp) + , usedCandidates(comp->getAllocator(CMK_FlowEdge)) + , ordinals(new(comp, CMK_Generic) unsigned[comp->fgBBNumMax + 1]{}) + , blockOrder(nullptr) + , tempOrder(nullptr) +{ +} + +//----------------------------------------------------------------------------- +// Compiler::ThreeOptLayout::ConsiderEdge: Adds 'edge' to 'cutPoints' for later consideration +// if 'edge' looks promising, and it hasn't been considered already. +// Since adding to 'cutPoints' has logarithmic time complexity and might cause a heap allocation, +// avoid adding edges that 3-opt obviously won't consider later. +// +// Parameters: +// edge - The branch to consider creating fallthrough for +// startPos - The start index of the region currently being reordered +// +void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge, unsigned startPos) +{ + // Since adding to priority queues has logarithmic time complexity, + // try to avoid adding edges that we obviously won't consider when reordering. + assert(edge != nullptr); + + BasicBlock* const srcBlk = edge->getSourceBlock(); + BasicBlock* const dstBlk = edge->getDestinationBlock(); + + // Ignore cross-region branches + if (!BasicBlock::sameEHRegion(srcBlk, dstBlk)) { - // TODO: Reorder methods with EH regions return; } -#ifdef DEBUG - if (verbose) + const unsigned srcPos = ordinals[srcBlk->bbNum]; + const unsigned dstPos = ordinals[dstBlk->bbNum]; + + // Don't consider edges from outside the hot range. + // If 'srcBlk' has an ordinal of zero and it isn't the first block, + // it's not tracked by 'ordinals', so it's not in the hot section. + if ((srcPos == 0) && !srcBlk->IsFirst()) { - printf("*************** In fgSearchImprovedLayout()\n"); + return; + } - printf("\nInitial BasicBlocks"); - fgDispBasicBlocks(verboseTrees); - printf("\n"); + // Don't consider edges to blocks outside the hot range, + // or backedges to the first block in a region; we don't want to change the entry point. + if (dstPos == startPos) + { + return; } -#endif // DEBUG - // Note that we're default-initializing 'ordinals' with zeros. - // Block reordering shouldn't change the method's entry point, - // so if a block has an ordinal of zero and it's not 'fgFirstBB', - // the block wasn't visited below, so it's not in the range of hot blocks. - unsigned* const ordinals = new (this, CMK_Generic) unsigned[fgBBNumMax + 1]{}; - unsigned numHotBlocks = 0; - BasicBlock* finalBlock; + // Don't consider backedges for single-block loops + if (srcPos == dstPos) + { + return; + } + + // Don't add an edge that we've already considered + if (!usedCandidates.Set(edge, true, usedCandidates.SetKind::Overwrite)) + { + cutPoints.Push(edge); + } +} + +//----------------------------------------------------------------------------- +// Compiler::ThreeOptLayout::AddNonFallthroughSuccs: Considers every edge out of 'block' +// that doesn't fall through as a future cut point. +// +// Parameters: +// block - The source of edges to consider +// next - The lexical successor to 'block' in 3-opt's current ordering +// startPos - The start index of the region currently being reordered +// +void Compiler::ThreeOptLayout::AddNonFallthroughSuccs(BasicBlock* block, BasicBlock* next, unsigned startPos) +{ + assert(block != nullptr); + for (FlowEdge* const succEdge : block->SuccEdges(compiler)) + { + if (succEdge->getDestinationBlock() != next) + { + ConsiderEdge(succEdge, startPos); + } + } +} + +//----------------------------------------------------------------------------- +// Compiler::ThreeOptLayout::AddNonFallthroughPreds: Considers every edge into 'block' +// that doesn't fall through as a future cut point. +// +// Parameters: +// block - The destination of edges to consider +// prev - The lexical predecessor to 'block' in 3-opt's current ordering +// startPos - The start index of the region currently being reordered +// +void Compiler::ThreeOptLayout::AddNonFallthroughPreds(BasicBlock* block, BasicBlock* prev, unsigned startPos) +{ + assert(block != nullptr); + + for (FlowEdge* const predEdge : block->PredEdges()) + { + if (predEdge->getSourceBlock() != prev) + { + ConsiderEdge(predEdge, startPos); + } + } +} +//----------------------------------------------------------------------------- +// Compiler::ThreeOptLayout::Run: Runs 3-opt for each contiguous region of the block list +// we're interested in reordering. +// Skips handler regions for now, as these are assumed to be cold. +// +void Compiler::ThreeOptLayout::Run() +{ // Walk backwards through the main method body, looking for the last hot block. // Since we moved all cold blocks to the end of the method already, // we should have a span of hot blocks to consider reordering at the beginning of the method. - for (finalBlock = fgLastBBInMainFunction(); !finalBlock->IsFirst() && finalBlock->isBBWeightCold(this); - finalBlock = finalBlock->Prev()) + BasicBlock* finalBlock; + for (finalBlock = compiler->fgLastBBInMainFunction(); + !finalBlock->IsFirst() && finalBlock->isBBWeightCold(compiler); finalBlock = finalBlock->Prev()) ; - // Initialize the ordinal numbers for the hot blocks - for (BasicBlock* const block : Blocks(fgFirstBB, finalBlock)) + // Initialize the ordinal numbers for the hot blocks. + // Note that we default-initialized 'ordinals' with zeros. + // Block reordering shouldn't change the method's entry point, + // so if a block has an ordinal of zero and it's not 'fgFirstBB', + // the block wasn't visited below, so it's not in the range of candidate blocks. + unsigned numHotBlocks = 0; + for (BasicBlock* const block : compiler->Blocks(compiler->fgFirstBB, finalBlock)) { ordinals[block->bbNum] = numHotBlocks++; } @@ -4908,94 +5019,85 @@ void Compiler::fgSearchImprovedLayout() return; } - // We will use a priority queue to greedily get the next hottest edge to try to create fallthrough for - auto edgeWeightComp = [](const FlowEdge* left, const FlowEdge* right) -> bool { - return left->getLikelyWeight() < right->getLikelyWeight(); - }; - PriorityQueue cutPoints(getAllocator(CMK_FlowEdge), edgeWeightComp); - - // We will also maintain a set of candidate edges to avoid adding duplicate edges to cutPoints. - // For large flowgraphs, we risk exploding 'cutPoints' with duplicate edges. - JitHashTable, bool> usedCandidates(getAllocator(CMK_FlowEdge)); - - // Since adding to priority queues has logarithmic time complexity, - // try to avoid adding edges that we obviously won't consider when reordering. - auto considerEdge = [&cutPoints, &usedCandidates, ordinals](FlowEdge* edge) { - assert(edge != nullptr); - - BasicBlock* const srcBlk = edge->getSourceBlock(); - BasicBlock* const dstBlk = edge->getDestinationBlock(); - const unsigned srcPos = ordinals[srcBlk->bbNum]; - const unsigned dstPos = ordinals[dstBlk->bbNum]; + assert(finalBlock != nullptr); + blockOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numHotBlocks]; + tempOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numHotBlocks]; + unsigned position = 0; - // Don't consider edges from outside the hot range. - // If 'srcBlk' has an ordinal of zero and it isn't the first block, - // it's not tracked by 'ordinals', so it's not in the hot section. - if ((srcPos == 0) && !srcBlk->IsFirst()) - { - return; - } + // Initialize current block order + for (BasicBlock* const block : compiler->Blocks(compiler->fgFirstBB, finalBlock)) + { + blockOrder[position++] = block; - // Don't consider edges to blocks outside the hot range, - // or backedges to 'fgFirstBB'; we don't want to change the entry point. - if (dstPos == 0) + EHblkDsc* const HBtab = compiler->ehGetBlockTryDsc(block); + if (HBtab != nullptr) { - return; + HBtab->ebdTryLast = block; } + } - // Don't consider backedges for single-block loops - if (srcPos == dstPos) + bool modified = false; + BasicBlock* lastTryBeg = nullptr; + for (EHblkDsc* const HBtab : EHClauses(compiler)) + { + BasicBlock* const tryBeg = HBtab->ebdTryBeg; + if ((tryBeg != lastTryBeg) && (ordinals[tryBeg->bbNum] != 0)) { - return; + modified |= RunThreeOptPass(tryBeg, HBtab->ebdTryLast); } - // Don't add an edge that we've already considered - if (!usedCandidates.Set(edge, true, usedCandidates.SetKind::Overwrite)) - { - cutPoints.Push(edge); - } - }; + lastTryBeg = tryBeg; + } - auto addNonFallthroughSuccs = [this, considerEdge](BasicBlock* block, BasicBlock* next) { - assert(block != nullptr); + if (compiler->fgFirstBB != lastTryBeg) + { + modified |= RunThreeOptPass(compiler->fgFirstBB, finalBlock); + } - for (FlowEdge* const succEdge : block->SuccEdges(this)) + if (modified) + { + for (unsigned i = 1; i < numHotBlocks; i++) { - if (succEdge->getDestinationBlock() != next) + BasicBlock* const block = blockOrder[i - 1]; + BasicBlock* const next = blockOrder[i]; + if (!block->NextIs(next)) { - considerEdge(succEdge); + compiler->fgUnlinkBlock(next); + compiler->fgInsertBBafter(block, next); } } - }; - - auto addNonFallthroughPreds = [considerEdge](BasicBlock* block, BasicBlock* prev) { - assert(block != nullptr); + } +} - for (FlowEdge* const predEdge : block->PredEdges()) - { - if (predEdge->getSourceBlock() != prev) - { - considerEdge(predEdge); - } - } - }; +//----------------------------------------------------------------------------- +// Compiler::ThreeOptLayout::RunThreeOptPass: Runs 3-opt for the given block range. +// +// Parameters: +// startBlock - The first block of the range to reorder +// endBlock - The last block (inclusive) of the range to reorder +// +bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock) +{ + assert(startBlock != nullptr); + assert(endBlock != nullptr); + assert((ordinals[startBlock->bbNum] != 0) || startBlock->IsFirst()); + assert(ordinals[endBlock->bbNum] != 0); + assert(cutPoints.Empty()); - assert(finalBlock != nullptr); - BasicBlock** blockOrder = new (this, CMK_BasicBlock) BasicBlock*[numHotBlocks]; - BasicBlock** tempOrder = new (this, CMK_BasicBlock) BasicBlock*[numHotBlocks]; - unsigned position = 0; + bool modified = false; + const unsigned startPos = ordinals[startBlock->bbNum]; + const unsigned endPos = ordinals[endBlock->bbNum]; + const unsigned numCandidateBlocks = endPos - startPos + 1; + assert(startPos <= endPos); - // Initialize current block order, and find potential cut points in the hot section - for (BasicBlock* const block : Blocks(fgFirstBB, finalBlock)) + // Initialize cutPoints with candidate branches in this section + for (unsigned position = startPos; position < endPos; position++) { - blockOrder[position] = block; - position++; - addNonFallthroughSuccs(block, block->Next()); + AddNonFallthroughSuccs(blockOrder[position], blockOrder[position + 1], startPos); } - assert(position == numHotBlocks); - const unsigned lastHotIndex = numHotBlocks - 1; - bool modified = false; + // Also consider any backedges out of the last block in this region + AddNonFallthroughSuccs(blockOrder[endPos], nullptr, startPos); // For each candidate edge, determine if it's profitable to partition after the source block // and before the destination block, and swapping the partitions to create fallthrough. @@ -5012,12 +5114,12 @@ void Compiler::fgSearchImprovedLayout() const unsigned srcPos = ordinals[srcBlk->bbNum]; const unsigned dstPos = ordinals[dstBlk->bbNum]; - assert(srcPos < numHotBlocks); - assert(dstPos < numHotBlocks); + // This edge better be between blocks in the current region + assert((srcPos >= startPos) && (srcPos <= endPos)); + assert((dstPos >= startPos) && (dstPos <= endPos)); - // 'dstBlk' better be hot, and better not be 'fgFirstBB' - // (we don't want to change the entry point) - assert(dstPos != 0); + // 'dstBlk' better not be the region's entry point + assert(dstPos != startPos); // 'srcBlk' and 'dstBlk' better be distinct assert(srcPos != dstPos); @@ -5037,7 +5139,7 @@ void Compiler::fgSearchImprovedLayout() auto getCost = [this](BasicBlock* srcBlk, BasicBlock* dstBlk) -> weight_t { assert(srcBlk != nullptr); assert(dstBlk != nullptr); - FlowEdge* const edge = fgGetPredForBlock(dstBlk, srcBlk); + FlowEdge* const edge = compiler->fgGetPredForBlock(dstBlk, srcBlk); return (edge != nullptr) ? edge->getLikelyWeight() : 0.0; }; @@ -5051,44 +5153,44 @@ void Compiler::fgSearchImprovedLayout() if (isForwardJump) { // Here is the proposed partition: - // S1: 0 ~ srcPos + // S1: startPos ~ srcPos // S2: srcPos+1 ~ dstPos-1 - // S3: dstPos ~ lastHotIndex - // S4: cold section + // S3: dstPos ~ endPos + // S4: remaining blocks // // After the swap: - // S1: 0 ~ srcPos - // S3: dstPos ~ lastHotIndex + // S1: startPos ~ srcPos + // S3: dstPos ~ endPos // S2: srcPos+1 ~ dstPos-1 - // S4: cold section - part1Size = srcPos + 1; + // S4: remaining blocks + part1Size = srcPos - startPos + 1; part2Size = dstPos - srcPos - 1; - part3Size = numHotBlocks - dstPos; + part3Size = endPos - dstPos + 1; srcNextCost = getCost(srcBlk, blockOrder[srcPos + 1]); dstPrevCost = getCost(blockOrder[dstPos - 1], dstBlk); cost = srcNextCost + dstPrevCost; - improvement = candidateEdge->getLikelyWeight() + getCost(blockOrder[lastHotIndex], blockOrder[srcPos + 1]); + improvement = candidateEdge->getLikelyWeight() + getCost(blockOrder[endPos], blockOrder[srcPos + 1]); - // Don't include branches to the cold section (S4) in the cost/improvement calculation. - // If the section is truly cold, branches into it should have negligible cost. + // Don't include branches S4 in the cost/improvement calculation, + // since we're only considering branches within this region. } else { assert(dstPos < srcPos); // Here is the proposed partition: - // S1: 0 ~ dstPos-1 + // S1: startPos ~ dstPos-1 // S2: dstPos ~ srcPos-1 // S3: srcPos - // S4: srcPos+1 ~ remaining code + // S4: srcPos+1 ~ endPos // // After the swap: - // S1: 0 ~ dstPos-1 + // S1: startPos ~ dstPos-1 // S3: srcPos // S2: dstPos ~ srcPos-1 - // S4: srcPos+1 ~ remaining code - part1Size = dstPos; + // S4: srcPos+1 ~ endPos + part1Size = dstPos - startPos; part2Size = srcPos - dstPos; part3Size = 1; @@ -5097,7 +5199,7 @@ void Compiler::fgSearchImprovedLayout() cost = srcPrevCost + dstPrevCost; improvement = candidateEdge->getLikelyWeight() + getCost(blockOrder[dstPos - 1], srcBlk); - if (srcPos != lastHotIndex) + if (srcPos != endPos) { srcNextCost = getCost(srcBlk, blockOrder[srcPos + 1]); cost += srcNextCost; @@ -5119,51 +5221,50 @@ void Compiler::fgSearchImprovedLayout() // consider all other edges out of 'srcBlk''s current fallthrough predecessor if (srcPrevCost != 0.0) { - addNonFallthroughSuccs(blockOrder[srcPos - 1], srcBlk); + AddNonFallthroughSuccs(blockOrder[srcPos - 1], srcBlk, startPos); } // If we're going to break up fallthrough out of 'srcBlk', // consider all other edges into 'srcBlk''s current fallthrough successor if (srcNextCost != 0.0) { - addNonFallthroughPreds(blockOrder[srcPos + 1], srcBlk); + AddNonFallthroughPreds(blockOrder[srcPos + 1], srcBlk, startPos); } // If we're going to break up fallthrough into 'dstBlk', // consider all other edges out of 'dstBlk''s current fallthrough predecessor if (dstPrevCost != 0.0) { - addNonFallthroughSuccs(blockOrder[dstPos - 1], dstBlk); + AddNonFallthroughSuccs(blockOrder[dstPos - 1], dstBlk, startPos); } // Swap the partitions - memcpy(tempOrder, blockOrder, sizeof(BasicBlock*) * part1Size); - memcpy(tempOrder + part1Size, blockOrder + part1Size + part2Size, sizeof(BasicBlock*) * part3Size); - memcpy(tempOrder + part1Size + part3Size, blockOrder + part1Size, sizeof(BasicBlock*) * part2Size); + BasicBlock** const regionStart = blockOrder + startPos; + BasicBlock** const tempStart = tempOrder + startPos; + const unsigned numBlocks = endPos - startPos + 1; + memcpy(tempStart, regionStart, sizeof(BasicBlock*) * part1Size); + memcpy(tempStart + part1Size, regionStart + part1Size + part2Size, sizeof(BasicBlock*) * part3Size); + memcpy(tempStart + part1Size + part3Size, regionStart + part1Size, sizeof(BasicBlock*) * part2Size); if (!isForwardJump) { - const unsigned remainingCodeSize = numHotBlocks - srcPos - 1; + const unsigned remainingCodeSize = endPos - srcPos; if (remainingCodeSize != 0) { - memcpy(tempOrder + srcPos + 1, blockOrder + srcPos + 1, sizeof(BasicBlock*) * remainingCodeSize); - } - else - { - assert(srcPos == lastHotIndex); + memcpy(tempStart + srcPos + 1, regionStart + srcPos + 1, sizeof(BasicBlock*) * remainingCodeSize); } } else { - assert((part1Size + part2Size + part3Size) == numHotBlocks); + assert((part1Size + part2Size + part3Size) == numBlocks); } std::swap(blockOrder, tempOrder); // Update the positions of the blocks we moved - for (unsigned i = part1Size; i < numHotBlocks; i++) + for (unsigned i = part1Size; i < numBlocks; i++) { - ordinals[blockOrder[i]->bbNum] = i; + ordinals[tempStart[i]->bbNum] = i; } // Ensure this move created fallthrough from srcBlk to dstBlk @@ -5171,19 +5272,37 @@ void Compiler::fgSearchImprovedLayout() modified = true; } - if (modified) + return modified; +} + +//----------------------------------------------------------------------------- +// fgSearchImprovedLayout: Try to improve upon RPO-based layout with the 3-opt method: +// - Identify a range of hot blocks to reorder within +// - Partition this set into three segments: S1 - S2 - S3 +// - Evaluate cost of swapped layout: S1 - S3 - S2 +// - If the cost improves, keep this layout +// +void Compiler::fgSearchImprovedLayout() +{ + if (compHndBBtabCount != 0) { - for (unsigned i = 1; i < numHotBlocks; i++) - { - BasicBlock* const block = blockOrder[i - 1]; - BasicBlock* const next = blockOrder[i]; - if (!block->NextIs(next)) - { - fgUnlinkBlock(next); - fgInsertBBafter(block, next); - } - } + // TODO: Reorder methods with EH regions + return; + } + +#ifdef DEBUG + if (verbose) + { + printf("*************** In fgSearchImprovedLayout()\n"); + + printf("\nInitial BasicBlocks"); + fgDispBasicBlocks(verboseTrees); + printf("\n"); } +#endif // DEBUG + + ThreeOptLayout layoutRunner(this); + layoutRunner.Run(); } //------------------------------------------------------------- From eea003fe9c8c1758872ac0e8cd68991d93ec0ca3 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Mon, 28 Oct 2024 17:23:58 -0400 Subject: [PATCH 25/40] Reorder EH regions --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/fgopt.cpp | 109 +++++++++++++++++++------------------ 2 files changed, 58 insertions(+), 53 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 52029ea404626..713be7831663c 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6266,7 +6266,7 @@ class Compiler void ConsiderEdge(FlowEdge* edge, unsigned startPos); void AddNonFallthroughSuccs(BasicBlock* block, BasicBlock* next, unsigned startPos); void AddNonFallthroughPreds(BasicBlock* block, BasicBlock* prev, unsigned startPos); - bool RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock); + bool RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock, unsigned numHotBlocks); public: ThreeOptLayout(Compiler* comp); diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 837e9fcc00f9a..706f6fcf7591b 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4900,15 +4900,24 @@ Compiler::ThreeOptLayout::ThreeOptLayout(Compiler* comp) // void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge, unsigned startPos) { - // Since adding to priority queues has logarithmic time complexity, - // try to avoid adding edges that we obviously won't consider when reordering. assert(edge != nullptr); BasicBlock* const srcBlk = edge->getSourceBlock(); BasicBlock* const dstBlk = edge->getDestinationBlock(); + BasicBlock* const startBlk = blockOrder[startPos]; // Ignore cross-region branches - if (!BasicBlock::sameEHRegion(srcBlk, dstBlk)) + if (!BasicBlock::sameEHRegion(srcBlk, startBlk) || !BasicBlock::sameEHRegion(dstBlk, startBlk)) + { + return; + } + + if (compiler->bbIsTryBeg(dstBlk)) + { + return; + } + + if (srcBlk->KindIs(BBJ_CALLFINALLYRET)) { return; } @@ -4924,9 +4933,11 @@ void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge, unsigned startPos) return; } - // Don't consider edges to blocks outside the hot range, + assert(srcPos >= startPos); + + // Don't consider edges to blocks outside the hot range (i.e. ordinal number isn't set), // or backedges to the first block in a region; we don't want to change the entry point. - if (dstPos == startPos) + if (dstPos <= startPos) { return; } @@ -4938,6 +4949,7 @@ void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge, unsigned startPos) } // Don't add an edge that we've already considered + // (For exceptionally branchy methods, we want to avoid exploding 'cutPoints' in size) if (!usedCandidates.Set(edge, true, usedCandidates.SetKind::Overwrite)) { cutPoints.Push(edge); @@ -4990,7 +5002,7 @@ void Compiler::ThreeOptLayout::AddNonFallthroughPreds(BasicBlock* block, BasicBl //----------------------------------------------------------------------------- // Compiler::ThreeOptLayout::Run: Runs 3-opt for each contiguous region of the block list // we're interested in reordering. -// Skips handler regions for now, as these are assumed to be cold. +// We skip reordering handler regions for now, as these are assumed to be cold. // void Compiler::ThreeOptLayout::Run() { @@ -5013,12 +5025,6 @@ void Compiler::ThreeOptLayout::Run() ordinals[block->bbNum] = numHotBlocks++; } - if (numHotBlocks < 3) - { - JITDUMP("Not enough hot blocks; skipping reordering\n"); - return; - } - assert(finalBlock != nullptr); blockOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numHotBlocks]; tempOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numHotBlocks]; @@ -5027,7 +5033,8 @@ void Compiler::ThreeOptLayout::Run() // Initialize current block order for (BasicBlock* const block : compiler->Blocks(compiler->fgFirstBB, finalBlock)) { - blockOrder[position++] = block; + blockOrder[position] = tempOrder[position] = block; + position++; EHblkDsc* const HBtab = compiler->ehGetBlockTryDsc(block); if (HBtab != nullptr) @@ -5036,22 +5043,25 @@ void Compiler::ThreeOptLayout::Run() } } - bool modified = false; - BasicBlock* lastTryBeg = nullptr; + bool modified = false; + unsigned XTnum = 0; for (EHblkDsc* const HBtab : EHClauses(compiler)) { BasicBlock* const tryBeg = HBtab->ebdTryBeg; - if ((tryBeg != lastTryBeg) && (ordinals[tryBeg->bbNum] != 0)) + if (tryBeg->getTryIndex() != XTnum++) { - modified |= RunThreeOptPass(tryBeg, HBtab->ebdTryLast); + continue; } - lastTryBeg = tryBeg; + if (ordinals[tryBeg->bbNum] != 0) + { + modified |= RunThreeOptPass(tryBeg, HBtab->ebdTryLast, numHotBlocks); + } } - if (compiler->fgFirstBB != lastTryBeg) + if (!compiler->fgFirstBB->hasTryIndex()) { - modified |= RunThreeOptPass(compiler->fgFirstBB, finalBlock); + modified |= RunThreeOptPass(compiler->fgFirstBB, blockOrder[numHotBlocks - 1], numHotBlocks); } if (modified) @@ -5076,18 +5086,17 @@ void Compiler::ThreeOptLayout::Run() // startBlock - The first block of the range to reorder // endBlock - The last block (inclusive) of the range to reorder // -bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock) +bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock, unsigned numHotBlocks) { assert(startBlock != nullptr); assert(endBlock != nullptr); - assert((ordinals[startBlock->bbNum] != 0) || startBlock->IsFirst()); - assert(ordinals[endBlock->bbNum] != 0); assert(cutPoints.Empty()); - bool modified = false; - const unsigned startPos = ordinals[startBlock->bbNum]; - const unsigned endPos = ordinals[endBlock->bbNum]; - const unsigned numCandidateBlocks = endPos - startPos + 1; + bool modified = false; + const unsigned startPos = ordinals[startBlock->bbNum]; + const unsigned endPos = ordinals[endBlock->bbNum]; + const unsigned numBlocks = (endPos - startPos + 1); + assert((startPos != 0) || startBlock->IsFirst()); assert(startPos <= endPos); // Initialize cutPoints with candidate branches in this section @@ -5100,9 +5109,9 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc AddNonFallthroughSuccs(blockOrder[endPos], nullptr, startPos); // For each candidate edge, determine if it's profitable to partition after the source block - // and before the destination block, and swapping the partitions to create fallthrough. + // and before the destination block, and swap the partitions to create fallthrough. // If it is, do the swap, and for the blocks before/after each cut point that lost fallthrough, - // consider adding their successors/predecessors to the worklist. + // consider adding their successors/predecessors to 'cutPoints'. while (!cutPoints.Empty()) { FlowEdge* const candidateEdge = cutPoints.Top(); @@ -5241,35 +5250,37 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc // Swap the partitions BasicBlock** const regionStart = blockOrder + startPos; BasicBlock** const tempStart = tempOrder + startPos; - const unsigned numBlocks = endPos - startPos + 1; memcpy(tempStart, regionStart, sizeof(BasicBlock*) * part1Size); memcpy(tempStart + part1Size, regionStart + part1Size + part2Size, sizeof(BasicBlock*) * part3Size); memcpy(tempStart + part1Size + part3Size, regionStart + part1Size, sizeof(BasicBlock*) * part2Size); - if (!isForwardJump) - { - const unsigned remainingCodeSize = endPos - srcPos; - if (remainingCodeSize != 0) - { - memcpy(tempStart + srcPos + 1, regionStart + srcPos + 1, sizeof(BasicBlock*) * remainingCodeSize); - } - } - else - { - assert((part1Size + part2Size + part3Size) == numBlocks); - } + const unsigned swappedSize = part1Size + part2Size + part3Size; + const unsigned remainingSize = numBlocks - swappedSize; + assert(numBlocks >= swappedSize); + assert((remainingSize == 0) || !isForwardJump); + memcpy(tempStart + swappedSize, regionStart + swappedSize, sizeof(BasicBlock*) * remainingSize); std::swap(blockOrder, tempOrder); - // Update the positions of the blocks we moved - for (unsigned i = part1Size; i < numBlocks; i++) + // Update the ordinals for the blocks we moved + for (unsigned i = startPos; i <= endPos; i++) { - ordinals[tempStart[i]->bbNum] = i; + ordinals[blockOrder[i]->bbNum] = i; } - // Ensure this move created fallthrough from srcBlk to dstBlk + // Ensure this move created fallthrough from 'srcBlk' to 'dstBlk' assert((ordinals[srcBlk->bbNum] + 1) == ordinals[dstBlk->bbNum]); modified = true; + + for (unsigned i = 0; i < numHotBlocks; i++) + { + assert(ordinals[blockOrder[i]->bbNum] == i); + } + } + + if (modified) + { + memcpy(tempOrder + startPos, blockOrder + startPos, sizeof(BasicBlock*) * numBlocks); } return modified; @@ -5284,12 +5295,6 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc // void Compiler::fgSearchImprovedLayout() { - if (compHndBBtabCount != 0) - { - // TODO: Reorder methods with EH regions - return; - } - #ifdef DEBUG if (verbose) { From 96ef576d50a9f43d73db10f70d40769e77c7273d Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Mon, 28 Oct 2024 17:50:17 -0400 Subject: [PATCH 26/40] Fix completely cold try regions (clean replay) --- src/coreclr/jit/jiteh.cpp | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/coreclr/jit/jiteh.cpp b/src/coreclr/jit/jiteh.cpp index e3d1714702404..15f4c04ee254b 100644 --- a/src/coreclr/jit/jiteh.cpp +++ b/src/coreclr/jit/jiteh.cpp @@ -1306,6 +1306,43 @@ void Compiler::fgRebuildEHRegions() } } + // Cold try regions may have been moved out of their parent regions. + // Find these instances, and move them to the ends of their parent regions. + for (EHblkDsc* const HBtab : EHClauses(this)) + { + // We can safely ignore try regions that aren't cold, are in handler regions, or aren't nested. + const unsigned enclosingTryIndex = HBtab->ebdEnclosingTryIndex; + if (!HBtab->ebdTryBeg->isBBWeightCold(this) || HBtab->ebdTryBeg->hasHndIndex() || + (enclosingTryIndex == EHblkDsc::NO_ENCLOSING_INDEX)) + { + continue; + } + + // If the child region is already contained by its parent, we don't need to move the child region. + EHblkDsc* enclosingTab = ehGetDsc(enclosingTryIndex); + BasicBlock* const insertionPoint = enclosingTab->ebdTryLast; + if (HBtab->ebdTryLast == insertionPoint) + { + continue; + } + + // Move the child to the end of its parent. + fgUnlinkRange(HBtab->ebdTryBeg, HBtab->ebdTryLast); + fgMoveBlocksAfter(HBtab->ebdTryBeg, HBtab->ebdTryLast, insertionPoint); + enclosingTab->ebdTryLast = HBtab->ebdTryLast; + + // Update each parent try region's end pointer. + for (unsigned XTnum = ehGetEnclosingTryIndex(enclosingTryIndex); XTnum != EHblkDsc::NO_ENCLOSING_INDEX; + XTnum = ehGetEnclosingTryIndex(XTnum)) + { + enclosingTab = ehGetDsc(XTnum); + if (enclosingTab->ebdTryLast == insertionPoint) + { + enclosingTab->ebdTryLast = HBtab->ebdTryLast; + } + } + } + // The above logic rebuilt the try regions in the main method body. // Now, resolve the regions in the funclet section, if there is one. assert((unsetTryEnds == 0) || (fgFirstFuncletBB != nullptr)); From 8d95f4919ec8d57641e7a756c2a8fee2ca02a12c Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Mon, 28 Oct 2024 22:16:10 -0400 Subject: [PATCH 27/40] Small cleanup --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/fgopt.cpp | 13 ++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 713be7831663c..52029ea404626 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6266,7 +6266,7 @@ class Compiler void ConsiderEdge(FlowEdge* edge, unsigned startPos); void AddNonFallthroughSuccs(BasicBlock* block, BasicBlock* next, unsigned startPos); void AddNonFallthroughPreds(BasicBlock* block, BasicBlock* prev, unsigned startPos); - bool RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock, unsigned numHotBlocks); + bool RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock); public: ThreeOptLayout(Compiler* comp); diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 706f6fcf7591b..bb193e40c181c 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5055,13 +5055,13 @@ void Compiler::ThreeOptLayout::Run() if (ordinals[tryBeg->bbNum] != 0) { - modified |= RunThreeOptPass(tryBeg, HBtab->ebdTryLast, numHotBlocks); + modified |= RunThreeOptPass(tryBeg, HBtab->ebdTryLast); } } if (!compiler->fgFirstBB->hasTryIndex()) { - modified |= RunThreeOptPass(compiler->fgFirstBB, blockOrder[numHotBlocks - 1], numHotBlocks); + modified |= RunThreeOptPass(compiler->fgFirstBB, blockOrder[numHotBlocks - 1]); } if (modified) @@ -5086,7 +5086,7 @@ void Compiler::ThreeOptLayout::Run() // startBlock - The first block of the range to reorder // endBlock - The last block (inclusive) of the range to reorder // -bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock, unsigned numHotBlocks) +bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock) { assert(startBlock != nullptr); assert(endBlock != nullptr); @@ -5263,7 +5263,7 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc std::swap(blockOrder, tempOrder); // Update the ordinals for the blocks we moved - for (unsigned i = startPos; i <= endPos; i++) + for (unsigned i = (startPos + part1Size); i <= endPos; i++) { ordinals[blockOrder[i]->bbNum] = i; } @@ -5271,11 +5271,6 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc // Ensure this move created fallthrough from 'srcBlk' to 'dstBlk' assert((ordinals[srcBlk->bbNum] + 1) == ordinals[dstBlk->bbNum]); modified = true; - - for (unsigned i = 0; i < numHotBlocks; i++) - { - assert(ordinals[blockOrder[i]->bbNum] == i); - } } if (modified) From 5388e9eb6468831a1f040816bd5ea25d1aa17302 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Mon, 28 Oct 2024 23:33:02 -0400 Subject: [PATCH 28/40] Add currEHRegion member --- src/coreclr/jit/compiler.h | 7 +++--- src/coreclr/jit/fgopt.cpp | 47 ++++++++++++++------------------------ 2 files changed, 21 insertions(+), 33 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 52029ea404626..1bb059cc75cc2 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6262,10 +6262,11 @@ class Compiler unsigned* ordinals; BasicBlock** blockOrder; BasicBlock** tempOrder; + unsigned currEHRegion; - void ConsiderEdge(FlowEdge* edge, unsigned startPos); - void AddNonFallthroughSuccs(BasicBlock* block, BasicBlock* next, unsigned startPos); - void AddNonFallthroughPreds(BasicBlock* block, BasicBlock* prev, unsigned startPos); + void ConsiderEdge(FlowEdge* edge); + void AddNonFallthroughSuccs(BasicBlock* block, BasicBlock* next); + void AddNonFallthroughPreds(BasicBlock* block, BasicBlock* prev); bool RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock); public: diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index bb193e40c181c..0aae0232a3e54 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4885,6 +4885,7 @@ Compiler::ThreeOptLayout::ThreeOptLayout(Compiler* comp) , ordinals(new(comp, CMK_Generic) unsigned[comp->fgBBNumMax + 1]{}) , blockOrder(nullptr) , tempOrder(nullptr) + , currEHRegion(0) { } @@ -4896,23 +4897,16 @@ Compiler::ThreeOptLayout::ThreeOptLayout(Compiler* comp) // // Parameters: // edge - The branch to consider creating fallthrough for -// startPos - The start index of the region currently being reordered // -void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge, unsigned startPos) +void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge) { assert(edge != nullptr); BasicBlock* const srcBlk = edge->getSourceBlock(); BasicBlock* const dstBlk = edge->getDestinationBlock(); - BasicBlock* const startBlk = blockOrder[startPos]; // Ignore cross-region branches - if (!BasicBlock::sameEHRegion(srcBlk, startBlk) || !BasicBlock::sameEHRegion(dstBlk, startBlk)) - { - return; - } - - if (compiler->bbIsTryBeg(dstBlk)) + if ((srcBlk->bbTryIndex != currEHRegion) || (dstBlk->bbTryIndex != currEHRegion)) { return; } @@ -4933,11 +4927,9 @@ void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge, unsigned startPos) return; } - assert(srcPos >= startPos); - // Don't consider edges to blocks outside the hot range (i.e. ordinal number isn't set), // or backedges to the first block in a region; we don't want to change the entry point. - if (dstPos <= startPos) + if ((dstPos == 0) || compiler->bbIsTryBeg(dstBlk)) { return; } @@ -4963,16 +4955,15 @@ void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge, unsigned startPos) // Parameters: // block - The source of edges to consider // next - The lexical successor to 'block' in 3-opt's current ordering -// startPos - The start index of the region currently being reordered // -void Compiler::ThreeOptLayout::AddNonFallthroughSuccs(BasicBlock* block, BasicBlock* next, unsigned startPos) +void Compiler::ThreeOptLayout::AddNonFallthroughSuccs(BasicBlock* block, BasicBlock* next) { assert(block != nullptr); for (FlowEdge* const succEdge : block->SuccEdges(compiler)) { if (succEdge->getDestinationBlock() != next) { - ConsiderEdge(succEdge, startPos); + ConsiderEdge(succEdge); } } } @@ -4984,9 +4975,8 @@ void Compiler::ThreeOptLayout::AddNonFallthroughSuccs(BasicBlock* block, BasicBl // Parameters: // block - The destination of edges to consider // prev - The lexical predecessor to 'block' in 3-opt's current ordering -// startPos - The start index of the region currently being reordered // -void Compiler::ThreeOptLayout::AddNonFallthroughPreds(BasicBlock* block, BasicBlock* prev, unsigned startPos) +void Compiler::ThreeOptLayout::AddNonFallthroughPreds(BasicBlock* block, BasicBlock* prev) { assert(block != nullptr); @@ -4994,7 +4984,7 @@ void Compiler::ThreeOptLayout::AddNonFallthroughPreds(BasicBlock* block, BasicBl { if (predEdge->getSourceBlock() != prev) { - ConsiderEdge(predEdge, startPos); + ConsiderEdge(predEdge); } } } @@ -5044,11 +5034,10 @@ void Compiler::ThreeOptLayout::Run() } bool modified = false; - unsigned XTnum = 0; for (EHblkDsc* const HBtab : EHClauses(compiler)) { BasicBlock* const tryBeg = HBtab->ebdTryBeg; - if (tryBeg->getTryIndex() != XTnum++) + if (tryBeg->getTryIndex() != currEHRegion++) { continue; } @@ -5059,10 +5048,8 @@ void Compiler::ThreeOptLayout::Run() } } - if (!compiler->fgFirstBB->hasTryIndex()) - { - modified |= RunThreeOptPass(compiler->fgFirstBB, blockOrder[numHotBlocks - 1]); - } + currEHRegion = 0; + modified |= RunThreeOptPass(compiler->fgFirstBB, blockOrder[numHotBlocks - 1]); if (modified) { @@ -5102,11 +5089,11 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc // Initialize cutPoints with candidate branches in this section for (unsigned position = startPos; position < endPos; position++) { - AddNonFallthroughSuccs(blockOrder[position], blockOrder[position + 1], startPos); + AddNonFallthroughSuccs(blockOrder[position], blockOrder[position + 1]); } // Also consider any backedges out of the last block in this region - AddNonFallthroughSuccs(blockOrder[endPos], nullptr, startPos); + AddNonFallthroughSuccs(blockOrder[endPos], nullptr); // For each candidate edge, determine if it's profitable to partition after the source block // and before the destination block, and swap the partitions to create fallthrough. @@ -5181,7 +5168,7 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc cost = srcNextCost + dstPrevCost; improvement = candidateEdge->getLikelyWeight() + getCost(blockOrder[endPos], blockOrder[srcPos + 1]); - // Don't include branches S4 in the cost/improvement calculation, + // Don't include branches into S4 in the cost/improvement calculation, // since we're only considering branches within this region. } else @@ -5230,21 +5217,21 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc // consider all other edges out of 'srcBlk''s current fallthrough predecessor if (srcPrevCost != 0.0) { - AddNonFallthroughSuccs(blockOrder[srcPos - 1], srcBlk, startPos); + AddNonFallthroughSuccs(blockOrder[srcPos - 1], srcBlk); } // If we're going to break up fallthrough out of 'srcBlk', // consider all other edges into 'srcBlk''s current fallthrough successor if (srcNextCost != 0.0) { - AddNonFallthroughPreds(blockOrder[srcPos + 1], srcBlk, startPos); + AddNonFallthroughPreds(blockOrder[srcPos + 1], srcBlk); } // If we're going to break up fallthrough into 'dstBlk', // consider all other edges out of 'dstBlk''s current fallthrough predecessor if (dstPrevCost != 0.0) { - AddNonFallthroughSuccs(blockOrder[dstPos - 1], dstBlk, startPos); + AddNonFallthroughSuccs(blockOrder[dstPos - 1], dstBlk); } // Swap the partitions From 5e90c899fe47dbe64e2fe9144e549f08ade18775 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Tue, 29 Oct 2024 15:13:47 -0400 Subject: [PATCH 29/40] Only move blocks within same region after 3-opt --- src/coreclr/jit/fgopt.cpp | 7 ++++++- src/coreclr/jit/jiteh.cpp | 37 ------------------------------------- 2 files changed, 6 insertions(+), 38 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 0aae0232a3e54..dd2a480c58402 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4911,6 +4911,11 @@ void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge) return; } + if (srcBlk->hasHndIndex() || dstBlk->hasHndIndex()) + { + return; + } + if (srcBlk->KindIs(BBJ_CALLFINALLYRET)) { return; @@ -5057,7 +5062,7 @@ void Compiler::ThreeOptLayout::Run() { BasicBlock* const block = blockOrder[i - 1]; BasicBlock* const next = blockOrder[i]; - if (!block->NextIs(next)) + if (!block->NextIs(next) && BasicBlock::sameEHRegion(block, next)) { compiler->fgUnlinkBlock(next); compiler->fgInsertBBafter(block, next); diff --git a/src/coreclr/jit/jiteh.cpp b/src/coreclr/jit/jiteh.cpp index 15f4c04ee254b..e3d1714702404 100644 --- a/src/coreclr/jit/jiteh.cpp +++ b/src/coreclr/jit/jiteh.cpp @@ -1306,43 +1306,6 @@ void Compiler::fgRebuildEHRegions() } } - // Cold try regions may have been moved out of their parent regions. - // Find these instances, and move them to the ends of their parent regions. - for (EHblkDsc* const HBtab : EHClauses(this)) - { - // We can safely ignore try regions that aren't cold, are in handler regions, or aren't nested. - const unsigned enclosingTryIndex = HBtab->ebdEnclosingTryIndex; - if (!HBtab->ebdTryBeg->isBBWeightCold(this) || HBtab->ebdTryBeg->hasHndIndex() || - (enclosingTryIndex == EHblkDsc::NO_ENCLOSING_INDEX)) - { - continue; - } - - // If the child region is already contained by its parent, we don't need to move the child region. - EHblkDsc* enclosingTab = ehGetDsc(enclosingTryIndex); - BasicBlock* const insertionPoint = enclosingTab->ebdTryLast; - if (HBtab->ebdTryLast == insertionPoint) - { - continue; - } - - // Move the child to the end of its parent. - fgUnlinkRange(HBtab->ebdTryBeg, HBtab->ebdTryLast); - fgMoveBlocksAfter(HBtab->ebdTryBeg, HBtab->ebdTryLast, insertionPoint); - enclosingTab->ebdTryLast = HBtab->ebdTryLast; - - // Update each parent try region's end pointer. - for (unsigned XTnum = ehGetEnclosingTryIndex(enclosingTryIndex); XTnum != EHblkDsc::NO_ENCLOSING_INDEX; - XTnum = ehGetEnclosingTryIndex(XTnum)) - { - enclosingTab = ehGetDsc(XTnum); - if (enclosingTab->ebdTryLast == insertionPoint) - { - enclosingTab->ebdTryLast = HBtab->ebdTryLast; - } - } - } - // The above logic rebuilt the try regions in the main method body. // Now, resolve the regions in the funclet section, if there is one. assert((unsetTryEnds == 0) || (fgFirstFuncletBB != nullptr)); From b7a0a0322786a59e912a6f534c8434190a72c9d8 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Tue, 29 Oct 2024 15:38:25 -0400 Subject: [PATCH 30/40] Cleanup --- src/coreclr/jit/fgopt.cpp | 26 +++++++++++++++++++++----- src/coreclr/jit/priorityqueue.h | 2 +- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 95aa0b4940f4d..c9a20aa5ebfe5 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5031,6 +5031,9 @@ void Compiler::ThreeOptLayout::Run() blockOrder[position] = tempOrder[position] = block; position++; + // While walking the span of blocks to reorder, + // remember where each try region ends within this span. + // We'll use this information to run 3-opt per region. EHblkDsc* const HBtab = compiler->ehGetBlockTryDsc(block); if (HBtab != nullptr) { @@ -5038,22 +5041,29 @@ void Compiler::ThreeOptLayout::Run() } } + // Reorder try regions first bool modified = false; for (EHblkDsc* const HBtab : EHClauses(compiler)) { + // If multiple region indices map to the same region, + // make sure we reorder its blocks only once BasicBlock* const tryBeg = HBtab->ebdTryBeg; if (tryBeg->getTryIndex() != currEHRegion++) { continue; } - if (ordinals[tryBeg->bbNum] != 0) + // Only reorder try regions within the candidate span of blocks. + if ((ordinals[tryBeg->bbNum] != 0) || tryBeg->IsFirst()) { + JITDUMP("Running 3-opt for try region #%d\n", (currEHRegion - 1)); modified |= RunThreeOptPass(tryBeg, HBtab->ebdTryLast); } } + // Finally, reorder the main method body currEHRegion = 0; + JITDUMP("Running 3-opt for main method body\n"); modified |= RunThreeOptPass(compiler->fgFirstBB, blockOrder[numHotBlocks - 1]); if (modified) @@ -5062,6 +5072,10 @@ void Compiler::ThreeOptLayout::Run() { BasicBlock* const block = blockOrder[i - 1]; BasicBlock* const next = blockOrder[i]; + + // Only reorder within EH regions to maintain contiguity. + // TODO: Allow moving blocks in different regions when 'next' is the region entry. + // This would allow us to move entire regions up/down because of the contiguity requirement. if (!block->NextIs(next) && BasicBlock::sameEHRegion(block, next)) { compiler->fgUnlinkBlock(next); @@ -5084,9 +5098,9 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc assert(endBlock != nullptr); assert(cutPoints.Empty()); - bool modified = false; - const unsigned startPos = ordinals[startBlock->bbNum]; - const unsigned endPos = ordinals[endBlock->bbNum]; + bool modified = false; + const unsigned startPos = ordinals[startBlock->bbNum]; + const unsigned endPos = ordinals[endBlock->bbNum]; const unsigned numBlocks = (endPos - startPos + 1); assert((startPos != 0) || startBlock->IsFirst()); assert(startPos <= endPos); @@ -5245,7 +5259,8 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc memcpy(tempStart + part1Size, regionStart + part1Size + part2Size, sizeof(BasicBlock*) * part3Size); memcpy(tempStart + part1Size + part3Size, regionStart + part1Size, sizeof(BasicBlock*) * part2Size); - const unsigned swappedSize = part1Size + part2Size + part3Size; + // For backward jumps, there might be additional blocks after 'srcBlk' that need to be copied over. + const unsigned swappedSize = part1Size + part2Size + part3Size; const unsigned remainingSize = numBlocks - swappedSize; assert(numBlocks >= swappedSize); assert((remainingSize == 0) || !isForwardJump); @@ -5264,6 +5279,7 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc modified = true; } + // Write back to 'tempOrder' so changes to this region aren't lost next time we swap 'tempOrder' and 'blockOrder' if (modified) { memcpy(tempOrder + startPos, blockOrder + startPos, sizeof(BasicBlock*) * numBlocks); diff --git a/src/coreclr/jit/priorityqueue.h b/src/coreclr/jit/priorityqueue.h index 298cca0a1e8c9..dc5c62877fc16 100644 --- a/src/coreclr/jit/priorityqueue.h +++ b/src/coreclr/jit/priorityqueue.h @@ -118,4 +118,4 @@ class PriorityQueue // assert(VerifyMaxHeap()); return root; } -}; \ No newline at end of file +}; From 6ad754123dfed27b3933f96b83d1a48a5b14bc30 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Tue, 29 Oct 2024 15:44:30 -0400 Subject: [PATCH 31/40] Comments --- src/coreclr/jit/fgopt.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index c9a20aa5ebfe5..01b62cc779fd8 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4911,11 +4911,15 @@ void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge) return; } + // Don't waste time reordering within handler regions if (srcBlk->hasHndIndex() || dstBlk->hasHndIndex()) { return; } + // For backward jumps, we will consider partitioning before 'srcBlk'. + // If 'srcBlk' is a BBJ_CALLFINALLYRET, this partition will split up a call-finally pair. + // Thus, don't consider edges out of BBJ_CALLFINALLYRET blocks. if (srcBlk->KindIs(BBJ_CALLFINALLYRET)) { return; From a26be54c73a2e959a738d2ba403a654cbc86b5dd Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Wed, 30 Oct 2024 14:38:32 -0400 Subject: [PATCH 32/40] EdgeCmp tie-breaker --- src/coreclr/jit/fgopt.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 01b62cc779fd8..0e3b6fa8882c1 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4869,7 +4869,25 @@ void Compiler::fgMoveColdBlocks() // /* static */ bool Compiler::ThreeOptLayout::EdgeCmp(const FlowEdge* left, const FlowEdge* right) { - return left->getLikelyWeight() < right->getLikelyWeight(); + assert(left != right); + const weight_t leftWeight = left->getLikelyWeight(); + const weight_t rightWeight = right->getLikelyWeight(); + + // Break ties by comparing the source blocks' bbIDs. + // If both edges are out of the same source block, use the target blocks' bbIDs. + if (leftWeight == rightWeight) + { + BasicBlock* const leftSrc = left->getSourceBlock(); + BasicBlock* const rightSrc = right->getSourceBlock(); + if (leftSrc == rightSrc) + { + return left->getDestinationBlock()->bbID < right->getDestinationBlock()->bbID; + } + + return leftSrc->bbID < rightSrc->bbID; + } + + return leftWeight < rightWeight; } //----------------------------------------------------------------------------- From a1b8661660c2265b3d7e8927f906bc7899ebea06 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Wed, 30 Oct 2024 15:17:41 -0400 Subject: [PATCH 33/40] Comment feedback; replace 'usedCandidates' set with FlowEdge flag --- src/coreclr/jit/block.h | 21 +++++++++++++++++++++ src/coreclr/jit/compiler.h | 1 - src/coreclr/jit/fgopt.cpp | 22 +++++++++++++--------- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index 569307a63eaae..fd59a8ea0db02 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -594,6 +594,9 @@ struct FlowEdge // The count of duplicate "edges" (used for switch stmts or degenerate branches) unsigned m_dupCount; + // Convenience flag for phases that need to track edge visitation + bool m_visited; + // True if likelihood has been set INDEBUG(bool m_likelihoodSet); @@ -604,6 +607,7 @@ struct FlowEdge , m_destBlock(destBlock) , m_likelihood(0) , m_dupCount(0) + , m_visited(false) #ifdef DEBUG , m_likelihoodSet(false) #endif // DEBUG @@ -688,6 +692,23 @@ struct FlowEdge assert(m_dupCount >= 1); m_dupCount--; } + + bool visited() const + { + return m_visited; + } + + void markVisited() + { + assert(!visited()); + m_visited = true; + } + + void markUnvisited() + { + assert(visited()); + m_visited = false; + } }; //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 31f77344d81ea..b5153d900edc2 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6260,7 +6260,6 @@ class Compiler Compiler* compiler; PriorityQueue cutPoints; - JitHashTable, bool> usedCandidates; unsigned* ordinals; BasicBlock** blockOrder; BasicBlock** tempOrder; diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 0e3b6fa8882c1..39b09b4d3c760 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4899,7 +4899,6 @@ void Compiler::fgMoveColdBlocks() Compiler::ThreeOptLayout::ThreeOptLayout(Compiler* comp) : compiler(comp) , cutPoints(comp->getAllocator(CMK_FlowEdge), &ThreeOptLayout::EdgeCmp) - , usedCandidates(comp->getAllocator(CMK_FlowEdge)) , ordinals(new(comp, CMK_Generic) unsigned[comp->fgBBNumMax + 1]{}) , blockOrder(nullptr) , tempOrder(nullptr) @@ -4920,6 +4919,14 @@ void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge) { assert(edge != nullptr); + // Don't add an edge that we've already considered + // (For exceptionally branchy methods, we want to avoid exploding 'cutPoints' in size) + if (edge->visited()) + { + return; + } + + edge->markVisited(); BasicBlock* const srcBlk = edge->getSourceBlock(); BasicBlock* const dstBlk = edge->getDestinationBlock(); @@ -4929,7 +4936,9 @@ void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge) return; } - // Don't waste time reordering within handler regions + // Don't waste time reordering within handler regions. + // Note that if a finally region is sufficiently hot, + // we should have cloned it into the main method body already. if (srcBlk->hasHndIndex() || dstBlk->hasHndIndex()) { return; @@ -4967,12 +4976,7 @@ void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge) return; } - // Don't add an edge that we've already considered - // (For exceptionally branchy methods, we want to avoid exploding 'cutPoints' in size) - if (!usedCandidates.Set(edge, true, usedCandidates.SetKind::Overwrite)) - { - cutPoints.Push(edge); - } + cutPoints.Push(edge); } //----------------------------------------------------------------------------- @@ -5143,7 +5147,7 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc while (!cutPoints.Empty()) { FlowEdge* const candidateEdge = cutPoints.Pop(); - assert(usedCandidates[candidateEdge]); + assert(candidateEdge->visited()); BasicBlock* const srcBlk = candidateEdge->getSourceBlock(); BasicBlock* const dstBlk = candidateEdge->getDestinationBlock(); From ebbb973ea64265d70c1e5d516cde171745acf319 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 31 Oct 2024 11:36:01 -0400 Subject: [PATCH 34/40] Reframe cost model as maximal score problem --- src/coreclr/jit/fgopt.cpp | 54 ++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 39b09b4d3c760..61d2b6bc0704f 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4877,7 +4877,7 @@ void Compiler::fgMoveColdBlocks() // If both edges are out of the same source block, use the target blocks' bbIDs. if (leftWeight == rightWeight) { - BasicBlock* const leftSrc = left->getSourceBlock(); + BasicBlock* const leftSrc = left->getSourceBlock(); BasicBlock* const rightSrc = right->getSourceBlock(); if (leftSrc == rightSrc) { @@ -5176,7 +5176,14 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc assert(blockOrder[srcPos] == srcBlk); assert(blockOrder[dstPos] == dstBlk); - auto getCost = [this](BasicBlock* srcBlk, BasicBlock* dstBlk) -> weight_t { + // To determine if it's worth creating fallthrough from 'srcBlk' into 'dstBlk', + // we first sum the weights of fallthrough edges at the proposed cut points + // (i.e. the "score" of the current partition order). + // Then, we do the same for the fallthrough edges that would be created by reordering partitions. + // If the new score exceeds the current score, then the proposed fallthrough gains + // justify losing the existing fallthrough behavior. + + auto getScore = [this](BasicBlock* srcBlk, BasicBlock* dstBlk) -> weight_t { assert(srcBlk != nullptr); assert(dstBlk != nullptr); FlowEdge* const edge = compiler->fgGetPredForBlock(dstBlk, srcBlk); @@ -5184,10 +5191,10 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc }; const bool isForwardJump = (srcPos < dstPos); - weight_t srcPrevCost = 0.0; - weight_t srcNextCost = 0.0; - weight_t dstPrevCost = 0.0; - weight_t cost, improvement; + weight_t srcPrevScore = 0.0; + weight_t srcNextScore = 0.0; + weight_t dstPrevScore = 0.0; + weight_t currScore, newScore; unsigned part1Size, part2Size, part3Size; if (isForwardJump) @@ -5207,10 +5214,10 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc part2Size = dstPos - srcPos - 1; part3Size = endPos - dstPos + 1; - srcNextCost = getCost(srcBlk, blockOrder[srcPos + 1]); - dstPrevCost = getCost(blockOrder[dstPos - 1], dstBlk); - cost = srcNextCost + dstPrevCost; - improvement = candidateEdge->getLikelyWeight() + getCost(blockOrder[endPos], blockOrder[srcPos + 1]); + srcNextScore = getScore(srcBlk, blockOrder[srcPos + 1]); + dstPrevScore = getScore(blockOrder[dstPos - 1], dstBlk); + currScore = srcNextScore + dstPrevScore; + newScore = candidateEdge->getLikelyWeight() + getScore(blockOrder[endPos], blockOrder[srcPos + 1]); // Don't include branches into S4 in the cost/improvement calculation, // since we're only considering branches within this region. @@ -5234,46 +5241,47 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc part2Size = srcPos - dstPos; part3Size = 1; - srcPrevCost = getCost(blockOrder[srcPos - 1], srcBlk); - dstPrevCost = getCost(blockOrder[dstPos - 1], dstBlk); - cost = srcPrevCost + dstPrevCost; - improvement = candidateEdge->getLikelyWeight() + getCost(blockOrder[dstPos - 1], srcBlk); + srcPrevScore = getScore(blockOrder[srcPos - 1], srcBlk); + dstPrevScore = getScore(blockOrder[dstPos - 1], dstBlk); + currScore = srcPrevScore + dstPrevScore; + newScore = candidateEdge->getLikelyWeight() + getScore(blockOrder[dstPos - 1], srcBlk); if (srcPos != endPos) { - srcNextCost = getCost(srcBlk, blockOrder[srcPos + 1]); - cost += srcNextCost; - improvement += getCost(blockOrder[srcPos - 1], blockOrder[srcPos + 1]); + srcNextScore = getScore(srcBlk, blockOrder[srcPos + 1]); + currScore += srcNextScore; + newScore += getScore(blockOrder[srcPos - 1], blockOrder[srcPos + 1]); } } // Continue evaluating candidates if this one isn't profitable - if ((improvement <= cost) || Compiler::fgProfileWeightsEqual(improvement, cost, 0.001)) + if ((newScore <= currScore) || Compiler::fgProfileWeightsEqual(newScore, currScore, 0.001)) { continue; } // We've found a profitable cut point. Continue with the swap. - JITDUMP("Creating fallthrough for " FMT_BB " -> " FMT_BB " (cost=%f, improvement=%f)\n", srcBlk->bbNum, - dstBlk->bbNum, cost, improvement); + JITDUMP("Creating fallthrough for " FMT_BB " -> " FMT_BB + " (current partition score = %f, new partition score = %f)\n", + srcBlk->bbNum, dstBlk->bbNum, currScore, newScore); // If we're going to break up fallthrough into 'srcBlk', // consider all other edges out of 'srcBlk''s current fallthrough predecessor - if (srcPrevCost != 0.0) + if (srcPrevScore != 0.0) { AddNonFallthroughSuccs(blockOrder[srcPos - 1], srcBlk); } // If we're going to break up fallthrough out of 'srcBlk', // consider all other edges into 'srcBlk''s current fallthrough successor - if (srcNextCost != 0.0) + if (srcNextScore != 0.0) { AddNonFallthroughPreds(blockOrder[srcPos + 1], srcBlk); } // If we're going to break up fallthrough into 'dstBlk', // consider all other edges out of 'dstBlk''s current fallthrough predecessor - if (dstPrevCost != 0.0) + if (dstPrevScore != 0.0) { AddNonFallthroughSuccs(blockOrder[dstPos - 1], dstBlk); } From 7d9734ff17630ddda782a55e2ef3453e58a099f0 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 31 Oct 2024 12:53:35 -0400 Subject: [PATCH 35/40] Small refactor --- src/coreclr/jit/compiler.h | 5 ++- src/coreclr/jit/fgopt.cpp | 86 +++++++++++++++++++------------------- 2 files changed, 46 insertions(+), 45 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index b5153d900edc2..844b6304ecc95 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6263,11 +6263,12 @@ class Compiler unsigned* ordinals; BasicBlock** blockOrder; BasicBlock** tempOrder; + unsigned numCandidateBlocks; unsigned currEHRegion; void ConsiderEdge(FlowEdge* edge); - void AddNonFallthroughSuccs(BasicBlock* block, BasicBlock* next); - void AddNonFallthroughPreds(BasicBlock* block, BasicBlock* prev); + void AddNonFallthroughSuccs(unsigned blockPos); + void AddNonFallthroughPreds(unsigned blockPos); bool RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock); public: diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 61d2b6bc0704f..1acea6da7637b 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -4902,6 +4902,7 @@ Compiler::ThreeOptLayout::ThreeOptLayout(Compiler* comp) , ordinals(new(comp, CMK_Generic) unsigned[comp->fgBBNumMax + 1]{}) , blockOrder(nullptr) , tempOrder(nullptr) + , numCandidateBlocks(0) , currEHRegion(0) { } @@ -4980,16 +4981,18 @@ void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge) } //----------------------------------------------------------------------------- -// Compiler::ThreeOptLayout::AddNonFallthroughSuccs: Considers every edge out of 'block' +// Compiler::ThreeOptLayout::AddNonFallthroughSuccs: Considers every edge out of a given block // that doesn't fall through as a future cut point. // // Parameters: -// block - The source of edges to consider -// next - The lexical successor to 'block' in 3-opt's current ordering +// blockPos - The index into 'blockOrder' of the source block // -void Compiler::ThreeOptLayout::AddNonFallthroughSuccs(BasicBlock* block, BasicBlock* next) +void Compiler::ThreeOptLayout::AddNonFallthroughSuccs(unsigned blockPos) { - assert(block != nullptr); + assert(blockPos < numCandidateBlocks); + BasicBlock* const block = blockOrder[blockPos]; + BasicBlock* const next = ((blockPos + 1) >= numCandidateBlocks) ? nullptr : blockOrder[blockPos + 1]; + for (FlowEdge* const succEdge : block->SuccEdges(compiler)) { if (succEdge->getDestinationBlock() != next) @@ -5000,16 +5003,17 @@ void Compiler::ThreeOptLayout::AddNonFallthroughSuccs(BasicBlock* block, BasicBl } //----------------------------------------------------------------------------- -// Compiler::ThreeOptLayout::AddNonFallthroughPreds: Considers every edge into 'block' +// Compiler::ThreeOptLayout::AddNonFallthroughPreds: Considers every edge into a given block // that doesn't fall through as a future cut point. // // Parameters: -// block - The destination of edges to consider -// prev - The lexical predecessor to 'block' in 3-opt's current ordering +// blockPos - The index into 'blockOrder' of the target block // -void Compiler::ThreeOptLayout::AddNonFallthroughPreds(BasicBlock* block, BasicBlock* prev) +void Compiler::ThreeOptLayout::AddNonFallthroughPreds(unsigned blockPos) { - assert(block != nullptr); + assert(blockPos < numCandidateBlocks); + BasicBlock* const block = blockOrder[blockPos]; + BasicBlock* const prev = (blockPos == 0) ? nullptr : blockOrder[blockPos - 1]; for (FlowEdge* const predEdge : block->PredEdges()) { @@ -5040,15 +5044,14 @@ void Compiler::ThreeOptLayout::Run() // Block reordering shouldn't change the method's entry point, // so if a block has an ordinal of zero and it's not 'fgFirstBB', // the block wasn't visited below, so it's not in the range of candidate blocks. - unsigned numHotBlocks = 0; for (BasicBlock* const block : compiler->Blocks(compiler->fgFirstBB, finalBlock)) { - ordinals[block->bbNum] = numHotBlocks++; + ordinals[block->bbNum] = numCandidateBlocks++; } - assert(finalBlock != nullptr); - blockOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numHotBlocks]; - tempOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numHotBlocks]; + assert(numCandidateBlocks != 0); + blockOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numCandidateBlocks]; + tempOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numCandidateBlocks]; unsigned position = 0; // Initialize current block order @@ -5090,11 +5093,11 @@ void Compiler::ThreeOptLayout::Run() // Finally, reorder the main method body currEHRegion = 0; JITDUMP("Running 3-opt for main method body\n"); - modified |= RunThreeOptPass(compiler->fgFirstBB, blockOrder[numHotBlocks - 1]); + modified |= RunThreeOptPass(compiler->fgFirstBB, blockOrder[numCandidateBlocks - 1]); if (modified) { - for (unsigned i = 1; i < numHotBlocks; i++) + for (unsigned i = 1; i < numCandidateBlocks; i++) { BasicBlock* const block = blockOrder[i - 1]; BasicBlock* const next = blockOrder[i]; @@ -5132,14 +5135,11 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc assert(startPos <= endPos); // Initialize cutPoints with candidate branches in this section - for (unsigned position = startPos; position < endPos; position++) + for (unsigned position = startPos; position <= endPos; position++) { - AddNonFallthroughSuccs(blockOrder[position], blockOrder[position + 1]); + AddNonFallthroughSuccs(position); } - // Also consider any backedges out of the last block in this region - AddNonFallthroughSuccs(blockOrder[endPos], nullptr); - // For each candidate edge, determine if it's profitable to partition after the source block // and before the destination block, and swap the partitions to create fallthrough. // If it is, do the swap, and for the blocks before/after each cut point that lost fallthrough, @@ -5265,27 +5265,6 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc " (current partition score = %f, new partition score = %f)\n", srcBlk->bbNum, dstBlk->bbNum, currScore, newScore); - // If we're going to break up fallthrough into 'srcBlk', - // consider all other edges out of 'srcBlk''s current fallthrough predecessor - if (srcPrevScore != 0.0) - { - AddNonFallthroughSuccs(blockOrder[srcPos - 1], srcBlk); - } - - // If we're going to break up fallthrough out of 'srcBlk', - // consider all other edges into 'srcBlk''s current fallthrough successor - if (srcNextScore != 0.0) - { - AddNonFallthroughPreds(blockOrder[srcPos + 1], srcBlk); - } - - // If we're going to break up fallthrough into 'dstBlk', - // consider all other edges out of 'dstBlk''s current fallthrough predecessor - if (dstPrevScore != 0.0) - { - AddNonFallthroughSuccs(blockOrder[dstPos - 1], dstBlk); - } - // Swap the partitions BasicBlock** const regionStart = blockOrder + startPos; BasicBlock** const tempStart = tempOrder + startPos; @@ -5311,6 +5290,27 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc // Ensure this move created fallthrough from 'srcBlk' to 'dstBlk' assert((ordinals[srcBlk->bbNum] + 1) == ordinals[dstBlk->bbNum]); modified = true; + + // If we broke up fallthrough into 'srcBlk', + // consider all other edges out of 'srcBlk''s current fallthrough predecessor + if (srcPrevScore != 0.0) + { + AddNonFallthroughSuccs(srcPos - 1); + } + + // If we broke up fallthrough out of 'srcBlk', + // consider all other edges into 'srcBlk''s current fallthrough successor + if (srcNextScore != 0.0) + { + AddNonFallthroughPreds(srcPos + 1); + } + + // If we broke up fallthrough into 'dstBlk', + // consider all other edges out of 'dstBlk''s current fallthrough predecessor + if (dstPrevScore != 0.0) + { + AddNonFallthroughSuccs(dstPos - 1); + } } // Write back to 'tempOrder' so changes to this region aren't lost next time we swap 'tempOrder' and 'blockOrder' From a2b3e1620ffa8e1414d56e0c098d28895eb79724 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Thu, 31 Oct 2024 13:18:16 -0400 Subject: [PATCH 36/40] Simplify adding new candidate edges --- src/coreclr/jit/fgopt.cpp | 46 ++++++++++++++------------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 1acea6da7637b..4b773f4cb731a 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5164,7 +5164,7 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc // 'srcBlk' and 'dstBlk' better be distinct assert(srcPos != dstPos); - // Previous moves might have inadvertently created fallthrough from srcBlk to dstBlk, + // Previous moves might have inadvertently created fallthrough from 'srcBlk' to 'dstBlk', // so there's nothing to do this round. if ((srcPos + 1) == dstPos) { @@ -5191,9 +5191,6 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc }; const bool isForwardJump = (srcPos < dstPos); - weight_t srcPrevScore = 0.0; - weight_t srcNextScore = 0.0; - weight_t dstPrevScore = 0.0; weight_t currScore, newScore; unsigned part1Size, part2Size, part3Size; @@ -5214,10 +5211,8 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc part2Size = dstPos - srcPos - 1; part3Size = endPos - dstPos + 1; - srcNextScore = getScore(srcBlk, blockOrder[srcPos + 1]); - dstPrevScore = getScore(blockOrder[dstPos - 1], dstBlk); - currScore = srcNextScore + dstPrevScore; - newScore = candidateEdge->getLikelyWeight() + getScore(blockOrder[endPos], blockOrder[srcPos + 1]); + currScore = getScore(srcBlk, blockOrder[srcPos + 1]) + getScore(blockOrder[dstPos - 1], dstBlk); + newScore = candidateEdge->getLikelyWeight() + getScore(blockOrder[endPos], blockOrder[srcPos + 1]); // Don't include branches into S4 in the cost/improvement calculation, // since we're only considering branches within this region. @@ -5241,15 +5236,12 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc part2Size = srcPos - dstPos; part3Size = 1; - srcPrevScore = getScore(blockOrder[srcPos - 1], srcBlk); - dstPrevScore = getScore(blockOrder[dstPos - 1], dstBlk); - currScore = srcPrevScore + dstPrevScore; - newScore = candidateEdge->getLikelyWeight() + getScore(blockOrder[dstPos - 1], srcBlk); + currScore = getScore(blockOrder[srcPos - 1], srcBlk) + getScore(blockOrder[dstPos - 1], dstBlk); + newScore = candidateEdge->getLikelyWeight() + getScore(blockOrder[dstPos - 1], srcBlk); if (srcPos != endPos) { - srcNextScore = getScore(srcBlk, blockOrder[srcPos + 1]); - currScore += srcNextScore; + currScore += getScore(srcBlk, blockOrder[srcPos + 1]); newScore += getScore(blockOrder[srcPos - 1], blockOrder[srcPos + 1]); } } @@ -5291,25 +5283,19 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc assert((ordinals[srcBlk->bbNum] + 1) == ordinals[dstBlk->bbNum]); modified = true; - // If we broke up fallthrough into 'srcBlk', - // consider all other edges out of 'srcBlk''s current fallthrough predecessor - if (srcPrevScore != 0.0) - { - AddNonFallthroughSuccs(srcPos - 1); - } + // At every cut point is an opportunity to consider more candidate edges. + // To the left of each cut point, consider successor edges that don't fall through. + // Ditto predecessor edges to the right of each cut point. + AddNonFallthroughSuccs(startPos + part1Size - 1); + AddNonFallthroughSuccs(startPos + part1Size + part2Size - 1); + AddNonFallthroughSuccs(startPos + part1Size + part2Size + part3Size - 1); - // If we broke up fallthrough out of 'srcBlk', - // consider all other edges into 'srcBlk''s current fallthrough successor - if (srcNextScore != 0.0) - { - AddNonFallthroughPreds(srcPos + 1); - } + AddNonFallthroughPreds(startPos + part1Size); + AddNonFallthroughPreds(startPos + part1Size + part2Size); - // If we broke up fallthrough into 'dstBlk', - // consider all other edges out of 'dstBlk''s current fallthrough predecessor - if (dstPrevScore != 0.0) + if (remainingSize != 0) { - AddNonFallthroughSuccs(dstPos - 1); + AddNonFallthroughPreds(startPos + part1Size + part2Size + part3Size); } } From 88d5389d29b59d075cddcd524af3e7545d9c4374 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Mon, 4 Nov 2024 10:16:06 -0500 Subject: [PATCH 37/40] Guess number of hot blocks when allocating arrays --- src/coreclr/jit/fgopt.cpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 4b773f4cb731a..4a47e57aa7d41 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5034,31 +5034,32 @@ void Compiler::ThreeOptLayout::Run() // Walk backwards through the main method body, looking for the last hot block. // Since we moved all cold blocks to the end of the method already, // we should have a span of hot blocks to consider reordering at the beginning of the method. + // While doing this, try to get as tight an upper bound for the number of hot blocks as possible. + // For methods without funclet regions, 'numBlocksUpperBound' is exact. + // Otherwise, it's off by the number of handler blocks. BasicBlock* finalBlock; + unsigned numBlocksUpperBound = compiler->fgBBcount; for (finalBlock = compiler->fgLastBBInMainFunction(); !finalBlock->IsFirst() && finalBlock->isBBWeightCold(compiler); finalBlock = finalBlock->Prev()) - ; + { + numBlocksUpperBound--; + } - // Initialize the ordinal numbers for the hot blocks. + assert(numBlocksUpperBound != 0); + blockOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numBlocksUpperBound]; + tempOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numBlocksUpperBound]; + + // Initialize the current block order. // Note that we default-initialized 'ordinals' with zeros. // Block reordering shouldn't change the method's entry point, // so if a block has an ordinal of zero and it's not 'fgFirstBB', // the block wasn't visited below, so it's not in the range of candidate blocks. for (BasicBlock* const block : compiler->Blocks(compiler->fgFirstBB, finalBlock)) { - ordinals[block->bbNum] = numCandidateBlocks++; - } - - assert(numCandidateBlocks != 0); - blockOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numCandidateBlocks]; - tempOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numCandidateBlocks]; - unsigned position = 0; - - // Initialize current block order - for (BasicBlock* const block : compiler->Blocks(compiler->fgFirstBB, finalBlock)) - { - blockOrder[position] = tempOrder[position] = block; - position++; + assert(numCandidateBlocks < numBlocksUpperBound); + ordinals[block->bbNum] = numCandidateBlocks; + blockOrder[numCandidateBlocks] = tempOrder[numCandidateBlocks] = block; + numCandidateBlocks++; // While walking the span of blocks to reorder, // remember where each try region ends within this span. From 4ab9f177d7dd1177c4031f56b9d10e58e8ff61a5 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Mon, 4 Nov 2024 10:23:56 -0500 Subject: [PATCH 38/40] Skip reordering if too few blocks --- src/coreclr/jit/fgopt.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 4a47e57aa7d41..8ee78031b7b61 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5038,7 +5038,7 @@ void Compiler::ThreeOptLayout::Run() // For methods without funclet regions, 'numBlocksUpperBound' is exact. // Otherwise, it's off by the number of handler blocks. BasicBlock* finalBlock; - unsigned numBlocksUpperBound = compiler->fgBBcount; + unsigned numBlocksUpperBound = compiler->fgBBcount; for (finalBlock = compiler->fgLastBBInMainFunction(); !finalBlock->IsFirst() && finalBlock->isBBWeightCold(compiler); finalBlock = finalBlock->Prev()) { @@ -5057,7 +5057,7 @@ void Compiler::ThreeOptLayout::Run() for (BasicBlock* const block : compiler->Blocks(compiler->fgFirstBB, finalBlock)) { assert(numCandidateBlocks < numBlocksUpperBound); - ordinals[block->bbNum] = numCandidateBlocks; + ordinals[block->bbNum] = numCandidateBlocks; blockOrder[numCandidateBlocks] = tempOrder[numCandidateBlocks] = block; numCandidateBlocks++; @@ -5135,6 +5135,12 @@ bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBloc assert((startPos != 0) || startBlock->IsFirst()); assert(startPos <= endPos); + if (numBlocks < 3) + { + JITDUMP("Not enough blocks to partition anything. Skipping reordering.\n"); + return false; + } + // Initialize cutPoints with candidate branches in this section for (unsigned position = startPos; position <= endPos; position++) { From ed3907494b4dc629d2e2cb799637ed76882ec2b1 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Mon, 4 Nov 2024 10:30:04 -0500 Subject: [PATCH 39/40] Add another check for too few blocks --- src/coreclr/jit/fgopt.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 8ee78031b7b61..669e27b3c0832 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5045,6 +5045,12 @@ void Compiler::ThreeOptLayout::Run() numBlocksUpperBound--; } + // For methods with fewer than three candidate blocks, we cannot partition anything + if (finalBlock->IsFirst() || finalBlock->Prev()->IsFirst()) + { + return; + } + assert(numBlocksUpperBound != 0); blockOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numBlocksUpperBound]; tempOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numBlocksUpperBound]; From f2eb773a463f58ed29f44892760ab4c1efb37b98 Mon Sep 17 00:00:00 2001 From: "Aman Khalid (from Dev Box)" Date: Mon, 4 Nov 2024 10:32:29 -0500 Subject: [PATCH 40/40] JITDUMP msg --- src/coreclr/jit/fgopt.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 669e27b3c0832..309fd8bfa73b2 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -5048,6 +5048,7 @@ void Compiler::ThreeOptLayout::Run() // For methods with fewer than three candidate blocks, we cannot partition anything if (finalBlock->IsFirst() || finalBlock->Prev()->IsFirst()) { + JITDUMP("Not enough blocks to partition anything. Skipping 3-opt.\n"); return; }