diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h
index 569307a63eaae..fd59a8ea0db02 100644
--- a/src/coreclr/jit/block.h
+++ b/src/coreclr/jit/block.h
@@ -594,6 +594,9 @@ struct FlowEdge
     // The count of duplicate "edges" (used for switch stmts or degenerate branches)
     unsigned m_dupCount;
 
+    // Convenience flag for phases that need to track edge visitation
+    bool m_visited;
+
     // True if likelihood has been set
     INDEBUG(bool m_likelihoodSet);
 
@@ -604,6 +607,7 @@ struct FlowEdge
         , m_destBlock(destBlock)
         , m_likelihood(0)
         , m_dupCount(0)
+        , m_visited(false)
 #ifdef DEBUG
         , m_likelihoodSet(false)
 #endif // DEBUG
@@ -688,6 +692,23 @@ struct FlowEdge
         assert(m_dupCount >= 1);
         m_dupCount--;
     }
+
+    bool visited() const
+    {
+        return m_visited;
+    }
+
+    void markVisited()
+    {
+        assert(!visited());
+        m_visited = true;
+    }
+
+    void markUnvisited()
+    {
+        assert(visited());
+        m_visited = false;
+    }
 };
 
 //------------------------------------------------------------------------
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index bdb8b686a176f..42637e015a746 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -5257,6 +5257,7 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
     auto lateLayoutPhase = [this] {
         fgDoReversePostOrderLayout();
         fgMoveColdBlocks();
+        fgSearchImprovedLayout();
 
         if (compHndBBtabCount != 0)
         {
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 56410dfd3035a..844b6304ecc95 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -38,6 +38,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #include "cycletimer.h"
 #include "blockset.h"
 #include "arraystack.h"
+#include "priorityqueue.h"
 #include "hashbv.h"
 #include "jitexpandarray.h"
 #include "valuenum.h"
@@ -6251,6 +6252,29 @@ class Compiler
     bool fgReorderBlocks(bool useProfile);
     void fgDoReversePostOrderLayout();
     void fgMoveColdBlocks();
+    void fgSearchImprovedLayout();
+
+    class ThreeOptLayout
+    {
+        static bool EdgeCmp(const FlowEdge* left, const FlowEdge* right);
+
+        Compiler* compiler;
+        PriorityQueue<FlowEdge*, decltype(&ThreeOptLayout::EdgeCmp)> cutPoints;
+        unsigned* ordinals;
+        BasicBlock** blockOrder;
+        BasicBlock** tempOrder;
+        unsigned numCandidateBlocks;
+        unsigned currEHRegion;
+
+        void ConsiderEdge(FlowEdge* edge);
+        void AddNonFallthroughSuccs(unsigned blockPos);
+        void AddNonFallthroughPreds(unsigned blockPos);
+        bool RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock);
+
+    public:
+        ThreeOptLayout(Compiler* comp);
+        void Run();
+    };
 
     template <bool hasEH>
     void fgMoveHotJumps();
diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp
index 742f58364a390..309fd8bfa73b2 100644
--- a/src/coreclr/jit/fgopt.cpp
+++ b/src/coreclr/jit/fgopt.cpp
@@ -4855,6 +4855,497 @@ void Compiler::fgMoveColdBlocks()
     }
 }
 
+//-----------------------------------------------------------------------------
+// Compiler::ThreeOptLayout::EdgeCmp: Comparator for the 'cutPoints' priority queue.
+// If 'left' has a bigger edge weight than 'right', 3-opt will consider it first.
+// Otherwise, 3-opt will consider 'right' first.
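+// In other words, 'cutPoints' acts as a max-heap: popping it yields the heaviest remaining candidate edge.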
+//
+// Parameters:
+//   left - One of the two edges to compare
+//   right - The other edge to compare
+//
+// Returns:
+//   True if 'right' should be considered before 'left', and false otherwise
+//
+/* static */ bool Compiler::ThreeOptLayout::EdgeCmp(const FlowEdge* left, const FlowEdge* right)
+{
+    assert(left != right);
+    const weight_t leftWeight = left->getLikelyWeight();
+    const weight_t rightWeight = right->getLikelyWeight();
+
+    // Break ties by comparing the source blocks' bbIDs.
+    // If both edges are out of the same source block, use the target blocks' bbIDs.
+    if (leftWeight == rightWeight)
+    {
+        BasicBlock* const leftSrc = left->getSourceBlock();
+        BasicBlock* const rightSrc = right->getSourceBlock();
+        if (leftSrc == rightSrc)
+        {
+            return left->getDestinationBlock()->bbID < right->getDestinationBlock()->bbID;
+        }
+
+        return leftSrc->bbID < rightSrc->bbID;
+    }
+
+    return leftWeight < rightWeight;
+}
+
+//-----------------------------------------------------------------------------
+// Compiler::ThreeOptLayout::ThreeOptLayout: Constructs a ThreeOptLayout instance.
+//
+// Parameters:
+//   comp - The Compiler instance
+//
+Compiler::ThreeOptLayout::ThreeOptLayout(Compiler* comp)
+    : compiler(comp)
+    , cutPoints(comp->getAllocator(CMK_FlowEdge), &ThreeOptLayout::EdgeCmp)
+    , ordinals(new (comp, CMK_Generic) unsigned[comp->fgBBNumMax + 1]{})
+    , blockOrder(nullptr)
+    , tempOrder(nullptr)
+    , numCandidateBlocks(0)
+    , currEHRegion(0)
+{
+}
+
+//-----------------------------------------------------------------------------
+// Compiler::ThreeOptLayout::ConsiderEdge: Adds 'edge' to 'cutPoints' for later consideration
+// if 'edge' looks promising, and it hasn't been considered already.
+// Since adding to 'cutPoints' has logarithmic time complexity and might cause a heap allocation,
+// avoid adding edges that 3-opt obviously won't consider later.
+//
+// Parameters:
+//   edge - The branch to consider creating fallthrough for
+//
+void Compiler::ThreeOptLayout::ConsiderEdge(FlowEdge* edge)
+{
+    assert(edge != nullptr);
+
+    // Don't add an edge that we've already considered
+    // (For exceptionally branchy methods, we want to avoid exploding 'cutPoints' in size)
+    if (edge->visited())
+    {
+        return;
+    }
+
+    edge->markVisited();
+
+    BasicBlock* const srcBlk = edge->getSourceBlock();
+    BasicBlock* const dstBlk = edge->getDestinationBlock();
+
+    // Ignore cross-region branches
+    if ((srcBlk->bbTryIndex != currEHRegion) || (dstBlk->bbTryIndex != currEHRegion))
+    {
+        return;
+    }
+
+    // Don't waste time reordering within handler regions.
+    // Note that if a finally region is sufficiently hot,
+    // we should have cloned it into the main method body already.
+    if (srcBlk->hasHndIndex() || dstBlk->hasHndIndex())
+    {
+        return;
+    }
+
+    // For backward jumps, we will consider partitioning before 'srcBlk'.
+    // If 'srcBlk' is a BBJ_CALLFINALLYRET, this partition will split up a call-finally pair.
+    // Thus, don't consider edges out of BBJ_CALLFINALLYRET blocks.
+    if (srcBlk->KindIs(BBJ_CALLFINALLYRET))
+    {
+        return;
+    }
+
+    const unsigned srcPos = ordinals[srcBlk->bbNum];
+    const unsigned dstPos = ordinals[dstBlk->bbNum];
+
+    // Don't consider edges from outside the hot range.
+    // If 'srcBlk' has an ordinal of zero and it isn't the first block,
+    // it's not tracked by 'ordinals', so it's not in the hot section.
+    if ((srcPos == 0) && !srcBlk->IsFirst())
+    {
+        return;
+    }
+
+    // Don't consider edges to blocks outside the hot range (i.e. ordinal number isn't set),
+    // or backedges to the first block in a region; we don't want to change the entry point.
+    if ((dstPos == 0) || compiler->bbIsTryBeg(dstBlk))
+    {
+        return;
+    }
+
+    // Don't consider backedges for single-block loops
+    if (srcPos == dstPos)
+    {
+        return;
+    }
+
+    cutPoints.Push(edge);
+}
+
+//-----------------------------------------------------------------------------
+// Compiler::ThreeOptLayout::AddNonFallthroughSuccs: Considers every edge out of a given block
+// that doesn't fall through as a future cut point.
+//
+// Parameters:
+//   blockPos - The index into 'blockOrder' of the source block
+//
+void Compiler::ThreeOptLayout::AddNonFallthroughSuccs(unsigned blockPos)
+{
+    assert(blockPos < numCandidateBlocks);
+    BasicBlock* const block = blockOrder[blockPos];
+    BasicBlock* const next = ((blockPos + 1) >= numCandidateBlocks) ? nullptr : blockOrder[blockPos + 1];
+
+    for (FlowEdge* const succEdge : block->SuccEdges(compiler))
+    {
+        if (succEdge->getDestinationBlock() != next)
+        {
+            ConsiderEdge(succEdge);
+        }
+    }
+}
+
+//-----------------------------------------------------------------------------
+// Compiler::ThreeOptLayout::AddNonFallthroughPreds: Considers every edge into a given block
+// that doesn't fall through as a future cut point.
+//
+// Parameters:
+//   blockPos - The index into 'blockOrder' of the target block
+//
+void Compiler::ThreeOptLayout::AddNonFallthroughPreds(unsigned blockPos)
+{
+    assert(blockPos < numCandidateBlocks);
+    BasicBlock* const block = blockOrder[blockPos];
+    BasicBlock* const prev = (blockPos == 0) ? nullptr : blockOrder[blockPos - 1];
+
+    for (FlowEdge* const predEdge : block->PredEdges())
+    {
+        if (predEdge->getSourceBlock() != prev)
+        {
+            ConsiderEdge(predEdge);
+        }
+    }
+}
+
+//-----------------------------------------------------------------------------
+// Compiler::ThreeOptLayout::Run: Runs 3-opt for each contiguous region of the block list
+// we're interested in reordering.
+// We skip reordering handler regions for now, as these are assumed to be cold.
+//
+void Compiler::ThreeOptLayout::Run()
+{
+    // Walk backwards through the main method body, looking for the last hot block.
+    // Since we moved all cold blocks to the end of the method already,
+    // we should have a span of hot blocks to consider reordering at the beginning of the method.
+    // While doing this, try to get as tight an upper bound for the number of hot blocks as possible.
+    // For methods without funclet regions, 'numBlocksUpperBound' is exact.
+    // Otherwise, it's off by the number of handler blocks.
+    BasicBlock* finalBlock;
+    unsigned numBlocksUpperBound = compiler->fgBBcount;
+    for (finalBlock = compiler->fgLastBBInMainFunction();
+         !finalBlock->IsFirst() && finalBlock->isBBWeightCold(compiler); finalBlock = finalBlock->Prev())
+    {
+        numBlocksUpperBound--;
+    }
+
+    // For methods with fewer than three candidate blocks, we cannot partition anything
+    if (finalBlock->IsFirst() || finalBlock->Prev()->IsFirst())
+    {
+        JITDUMP("Not enough blocks to partition anything. Skipping 3-opt.\n");
+        return;
+    }
+
+    assert(numBlocksUpperBound != 0);
+    blockOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numBlocksUpperBound];
+    tempOrder = new (compiler, CMK_BasicBlock) BasicBlock*[numBlocksUpperBound];
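+
+    // 'blockOrder' holds the layout currently under evaluation, while 'tempOrder' is scratch space
+    // for composing each proposed reordering; the two buffers are swapped whenever a proposal is accepted.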
+
+    // Initialize the current block order.
+    // Note that we default-initialized 'ordinals' with zeros.
+    // Block reordering shouldn't change the method's entry point,
+    // so if a block has an ordinal of zero and it's not 'fgFirstBB',
+    // the block wasn't visited below, so it's not in the range of candidate blocks.
+    for (BasicBlock* const block : compiler->Blocks(compiler->fgFirstBB, finalBlock))
+    {
+        assert(numCandidateBlocks < numBlocksUpperBound);
+        ordinals[block->bbNum] = numCandidateBlocks;
+        blockOrder[numCandidateBlocks] = tempOrder[numCandidateBlocks] = block;
+        numCandidateBlocks++;
+
+        // While walking the span of blocks to reorder,
+        // remember where each try region ends within this span.
+        // We'll use this information to run 3-opt per region.
+        EHblkDsc* const HBtab = compiler->ehGetBlockTryDsc(block);
+        if (HBtab != nullptr)
+        {
+            HBtab->ebdTryLast = block;
+        }
+    }
+
+    // Reorder try regions first
+    bool modified = false;
+    for (EHblkDsc* const HBtab : EHClauses(compiler))
+    {
+        // If multiple region indices map to the same region,
+        // make sure we reorder its blocks only once
+        BasicBlock* const tryBeg = HBtab->ebdTryBeg;
+        if (tryBeg->getTryIndex() != currEHRegion++)
+        {
+            continue;
+        }
+
+        // Only reorder try regions within the candidate span of blocks.
+        if ((ordinals[tryBeg->bbNum] != 0) || tryBeg->IsFirst())
+        {
+            JITDUMP("Running 3-opt for try region #%d\n", (currEHRegion - 1));
+            modified |= RunThreeOptPass(tryBeg, HBtab->ebdTryLast);
+        }
+    }
+
+    // Finally, reorder the main method body
+    currEHRegion = 0;
+    JITDUMP("Running 3-opt for main method body\n");
+    modified |= RunThreeOptPass(compiler->fgFirstBB, blockOrder[numCandidateBlocks - 1]);
+
+    if (modified)
+    {
+        for (unsigned i = 1; i < numCandidateBlocks; i++)
+        {
+            BasicBlock* const block = blockOrder[i - 1];
+            BasicBlock* const next = blockOrder[i];
+
+            // Only reorder within EH regions to maintain contiguity.
+            // TODO: Allow moving blocks in different regions when 'next' is the region entry.
+            // This would allow us to move entire regions up/down because of the contiguity requirement.
+            if (!block->NextIs(next) && BasicBlock::sameEHRegion(block, next))
+            {
+                compiler->fgUnlinkBlock(next);
+                compiler->fgInsertBBafter(block, next);
+            }
+        }
+    }
+}
+
+//-----------------------------------------------------------------------------
+// Compiler::ThreeOptLayout::RunThreeOptPass: Runs 3-opt for the given block range.
+//
+// Parameters:
+//   startBlock - The first block of the range to reorder
+//   endBlock - The last block (inclusive) of the range to reorder
+//
+// Returns:
+//   True if any blocks were reordered; false otherwise
+//
+bool Compiler::ThreeOptLayout::RunThreeOptPass(BasicBlock* startBlock, BasicBlock* endBlock)
+{
+    assert(startBlock != nullptr);
+    assert(endBlock != nullptr);
+    assert(cutPoints.Empty());
+
+    bool modified = false;
+    const unsigned startPos = ordinals[startBlock->bbNum];
+    const unsigned endPos = ordinals[endBlock->bbNum];
+    const unsigned numBlocks = (endPos - startPos + 1);
+    assert((startPos != 0) || startBlock->IsFirst());
+    assert(startPos <= endPos);
+
+    if (numBlocks < 3)
+    {
+        JITDUMP("Not enough blocks to partition anything. Skipping reordering.\n");
+        return false;
+    }
+
+    // Initialize cutPoints with candidate branches in this section
+    for (unsigned position = startPos; position <= endPos; position++)
+    {
+        AddNonFallthroughSuccs(position);
+    }
+
+    // For each candidate edge, determine if it's profitable to partition after the source block
+    // and before the destination block, and swap the partitions to create fallthrough.
+    // If it is, do the swap, and for the blocks before/after each cut point that lost fallthrough,
+    // consider adding their successors/predecessors to 'cutPoints'.
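+    // Candidates are considered greedily, in decreasing order of likely edge weight (see EdgeCmp).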
+    while (!cutPoints.Empty())
+    {
+        FlowEdge* const candidateEdge = cutPoints.Pop();
+        assert(candidateEdge->visited());
+
+        BasicBlock* const srcBlk = candidateEdge->getSourceBlock();
+        BasicBlock* const dstBlk = candidateEdge->getDestinationBlock();
+        const unsigned srcPos = ordinals[srcBlk->bbNum];
+        const unsigned dstPos = ordinals[dstBlk->bbNum];
+
+        // This edge better be between blocks in the current region
+        assert((srcPos >= startPos) && (srcPos <= endPos));
+        assert((dstPos >= startPos) && (dstPos <= endPos));
+
+        // 'dstBlk' better not be the region's entry point
+        assert(dstPos != startPos);
+
+        // 'srcBlk' and 'dstBlk' better be distinct
+        assert(srcPos != dstPos);
+
+        // Previous moves might have inadvertently created fallthrough from 'srcBlk' to 'dstBlk',
+        // so there's nothing to do this round.
+        if ((srcPos + 1) == dstPos)
+        {
+            assert(modified);
+            continue;
+        }
+
+        // Before getting any edges, make sure 'ordinals' is accurate
+        assert(blockOrder[srcPos] == srcBlk);
+        assert(blockOrder[dstPos] == dstBlk);
+
+        // To determine if it's worth creating fallthrough from 'srcBlk' into 'dstBlk',
+        // we first sum the weights of fallthrough edges at the proposed cut points
+        // (i.e. the "score" of the current partition order).
+        // Then, we do the same for the fallthrough edges that would be created by reordering partitions.
+        // If the new score exceeds the current score, then the proposed fallthrough gains
+        // justify losing the existing fallthrough behavior.
+
+        auto getScore = [this](BasicBlock* srcBlk, BasicBlock* dstBlk) -> weight_t {
+            assert(srcBlk != nullptr);
+            assert(dstBlk != nullptr);
+            FlowEdge* const edge = compiler->fgGetPredForBlock(dstBlk, srcBlk);
+            return (edge != nullptr) ? edge->getLikelyWeight() : 0.0;
+        };
+
+        const bool isForwardJump = (srcPos < dstPos);
+        weight_t currScore, newScore;
+        unsigned part1Size, part2Size, part3Size;
+
+        if (isForwardJump)
+        {
+            // Here is the proposed partition:
+            // S1: startPos ~ srcPos
+            // S2: srcPos+1 ~ dstPos-1
+            // S3: dstPos ~ endPos
+            // S4: remaining blocks
+            //
+            // After the swap:
+            // S1: startPos ~ srcPos
+            // S3: dstPos ~ endPos
+            // S2: srcPos+1 ~ dstPos-1
+            // S4: remaining blocks
+            part1Size = srcPos - startPos + 1;
+            part2Size = dstPos - srcPos - 1;
+            part3Size = endPos - dstPos + 1;
+
+            currScore = getScore(srcBlk, blockOrder[srcPos + 1]) + getScore(blockOrder[dstPos - 1], dstBlk);
+            newScore = candidateEdge->getLikelyWeight() + getScore(blockOrder[endPos], blockOrder[srcPos + 1]);
+
+            // Don't include branches into S4 in the cost/improvement calculation,
+            // since we're only considering branches within this region.
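+
+            // For example, if the current order is [S1: ...A][S2: B...][S3: C...] and the candidate edge is A->C,
+            // then currScore = weight(A->B) + weight(lastOf(S2)->C), while
+            // newScore = weight(A->C) + weight(lastOf(S3)->B).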
+        }
+        else
+        {
+            assert(dstPos < srcPos);
+
+            // Here is the proposed partition:
+            // S1: startPos ~ dstPos-1
+            // S2: dstPos ~ srcPos-1
+            // S3: srcPos
+            // S4: srcPos+1 ~ endPos
+            //
+            // After the swap:
+            // S1: startPos ~ dstPos-1
+            // S3: srcPos
+            // S2: dstPos ~ srcPos-1
+            // S4: srcPos+1 ~ endPos
+            part1Size = dstPos - startPos;
+            part2Size = srcPos - dstPos;
+            part3Size = 1;
+
+            currScore = getScore(blockOrder[srcPos - 1], srcBlk) + getScore(blockOrder[dstPos - 1], dstBlk);
+            newScore = candidateEdge->getLikelyWeight() + getScore(blockOrder[dstPos - 1], srcBlk);
+
+            if (srcPos != endPos)
+            {
+                currScore += getScore(srcBlk, blockOrder[srcPos + 1]);
+                newScore += getScore(blockOrder[srcPos - 1], blockOrder[srcPos + 1]);
+            }
+        }
+
+        // Continue evaluating candidates if this one isn't profitable
+        if ((newScore <= currScore) || Compiler::fgProfileWeightsEqual(newScore, currScore, 0.001))
+        {
+            continue;
+        }
+
+        // We've found a profitable cut point. Continue with the swap.
+        JITDUMP("Creating fallthrough for " FMT_BB " -> " FMT_BB
+                " (current partition score = %f, new partition score = %f)\n",
+                srcBlk->bbNum, dstBlk->bbNum, currScore, newScore);
+
+        // Swap the partitions
+        BasicBlock** const regionStart = blockOrder + startPos;
+        BasicBlock** const tempStart = tempOrder + startPos;
+        memcpy(tempStart, regionStart, sizeof(BasicBlock*) * part1Size);
+        memcpy(tempStart + part1Size, regionStart + part1Size + part2Size, sizeof(BasicBlock*) * part3Size);
+        memcpy(tempStart + part1Size + part3Size, regionStart + part1Size, sizeof(BasicBlock*) * part2Size);
+
+        // For backward jumps, there might be additional blocks after 'srcBlk' that need to be copied over.
+        const unsigned swappedSize = part1Size + part2Size + part3Size;
+        const unsigned remainingSize = numBlocks - swappedSize;
+        assert(numBlocks >= swappedSize);
+        assert((remainingSize == 0) || !isForwardJump);
+        memcpy(tempStart + swappedSize, regionStart + swappedSize, sizeof(BasicBlock*) * remainingSize);
+
+        std::swap(blockOrder, tempOrder);
+
+        // Update the ordinals for the blocks we moved
+        for (unsigned i = (startPos + part1Size); i <= endPos; i++)
+        {
+            ordinals[blockOrder[i]->bbNum] = i;
+        }
+
+        // Ensure this move created fallthrough from 'srcBlk' to 'dstBlk'
+        assert((ordinals[srcBlk->bbNum] + 1) == ordinals[dstBlk->bbNum]);
+        modified = true;
+
+        // At every cut point is an opportunity to consider more candidate edges.
+        // To the left of each cut point, consider successor edges that don't fall through.
+        // Ditto predecessor edges to the right of each cut point.
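+        // (These positions index into the updated 'blockOrder', which now reads S1 S3 S2.)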
+        AddNonFallthroughSuccs(startPos + part1Size - 1);
+        AddNonFallthroughSuccs(startPos + part1Size + part2Size - 1);
+        AddNonFallthroughSuccs(startPos + part1Size + part2Size + part3Size - 1);
+
+        AddNonFallthroughPreds(startPos + part1Size);
+        AddNonFallthroughPreds(startPos + part1Size + part2Size);
+
+        if (remainingSize != 0)
+        {
+            AddNonFallthroughPreds(startPos + part1Size + part2Size + part3Size);
+        }
+    }
+
+    // Write back to 'tempOrder' so changes to this region aren't lost next time we swap 'tempOrder' and 'blockOrder'
+    if (modified)
+    {
+        memcpy(tempOrder + startPos, blockOrder + startPos, sizeof(BasicBlock*) * numBlocks);
+    }
+
+    return modified;
+}
+
+//-----------------------------------------------------------------------------
+// fgSearchImprovedLayout: Try to improve upon RPO-based layout with the 3-opt method:
+//   - Identify a range of hot blocks to reorder within
+//   - Partition this range into three segments: S1 - S2 - S3
+//   - Evaluate the cost of the swapped layout: S1 - S3 - S2
+//   - If the cost improves (i.e. the total fallthrough weight increases), keep this layout
+//
+void Compiler::fgSearchImprovedLayout()
+{
+#ifdef DEBUG
+    if (verbose)
+    {
+        printf("*************** In fgSearchImprovedLayout()\n");
+
+        printf("\nInitial BasicBlocks");
+        fgDispBasicBlocks(verboseTrees);
+        printf("\n");
+    }
+#endif // DEBUG
+
+    ThreeOptLayout layoutRunner(this);
+    layoutRunner.Run();
+}
+
 //-------------------------------------------------------------
 // fgUpdateFlowGraphPhase: run flow graph optimization as a
 //    phase, with no tail duplication
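
Note for reviewers: if the scoring above is hard to follow in the abstract, here is a minimal standalone C++ sketch of the same idea (illustrative only, not part of the patch; the block numbering, edge weights, and toy 'getScore' helper are all made up). It performs the forward-jump score comparison from RunThreeOptPass on a four-block layout, then applies the S1 S2 S3 -> S1 S3 S2 swap when the new score wins:

#include <algorithm>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

// Toy model: blocks are ints, and 'weights' maps a branch (src, dst) to its likely weight.
using Edge = std::pair<int, int>;

static double getScore(const std::map<Edge, double>& weights, int src, int dst)
{
    // Mirrors the real getScore: a layout only profits from a branch if it can fall through.
    auto it = weights.find({src, dst});
    return (it != weights.end()) ? it->second : 0.0;
}

int main()
{
    // Current layout: S1 = [0], S2 = [1, 2], S3 = [3]; candidate forward edge 0 -> 3.
    std::vector<int> order = {0, 1, 2, 3};
    std::map<Edge, double> weights = {{{0, 3}, 100.0}, {{0, 1}, 30.0}, {{2, 3}, 20.0}, {{3, 1}, 25.0}};
    const int srcPos = 0, dstPos = 3, endPos = 3;

    // Fallthrough weight at the proposed cut points in the current order...
    const double currScore = getScore(weights, order[srcPos], order[srcPos + 1]) +
                             getScore(weights, order[dstPos - 1], order[dstPos]);

    // ...versus the fallthrough weight created by reordering to S1 S3 S2.
    const double newScore = getScore(weights, order[srcPos], order[dstPos]) +
                            getScore(weights, order[endPos], order[srcPos + 1]);

    printf("currScore = %.1f, newScore = %.1f\n", currScore, newScore); // 50.0 vs. 125.0

    if (newScore > currScore)
    {
        // Swap S2 and S3: [0, 1, 2, 3] becomes [0, 3, 1, 2].
        std::rotate(order.begin() + srcPos + 1, order.begin() + dstPos, order.end());
    }

    for (int block : order)
    {
        printf("BB%02d ", block); // BB00 BB03 BB01 BB02
    }
    printf("\n");
    return 0;
}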