From 3244072110c935af510a93d9c89d1ed39ff613ab Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 7 Oct 2021 17:28:19 -0700 Subject: [PATCH 01/24] Hide align behind a jmp fix the alignBytesRemoved Some fixes and working model Some fixes and redesign Some more fixes more fixes fix Add the check for fgFirstBB misc changes code cleanup + JitHideAlignBehindJmp switch validatePadding only if align are before the loop IG More cleanup, remove commented code jit format --- src/coreclr/jit/block.h | 36 ++++++ src/coreclr/jit/codegenlinear.cpp | 53 ++++++++- src/coreclr/jit/compiler.cpp | 6 + src/coreclr/jit/compiler.h | 11 +- src/coreclr/jit/emit.cpp | 180 ++++++++++++++++++++---------- src/coreclr/jit/emit.h | 40 +++++-- src/coreclr/jit/emitarm64.cpp | 14 ++- src/coreclr/jit/emitxarch.cpp | 38 +++++-- src/coreclr/jit/jitconfigvalues.h | 6 + src/coreclr/jit/lclvars.cpp | 32 +++++- src/coreclr/jit/lower.cpp | 2 +- 11 files changed, 329 insertions(+), 89 deletions(-) diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index ec7918d4aac95..8855333b9d327 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -552,6 +552,7 @@ enum BasicBlockFlags : unsigned __int64 BBF_PATCHPOINT = MAKE_BBFLAG(36), // Block is a patchpoint BBF_HAS_CLASS_PROFILE = MAKE_BBFLAG(37), // BB contains a call needing a class profile BBF_PARTIAL_COMPILATION_PATCHPOINT = MAKE_BBFLAG(38), // Block is a partial compilation patchpoint + BBF_LOOP_ALIGN_ADDED = MAKE_BBFLAG(39), // Alignment instruction is added for this block. // The following are sets of flags. @@ -653,6 +654,18 @@ struct BasicBlock : private LIR::Range { return ((bbFlags & BBF_LOOP_HEAD) != 0); } + + bool isLoopAlignAdded() const + { + return ((bbFlags & BBF_LOOP_ALIGN_ADDED) != 0); + } + + void setLoopAlignAdded() + { + assert(isLoopAlign()); + bbFlags |= BBF_LOOP_ALIGN_ADDED; + } + bool isLoopAlign() const { return ((bbFlags & BBF_LOOP_ALIGN) != 0); @@ -1869,6 +1882,29 @@ inline PredBlockList::iterator& PredBlockList::iterator::operator++() return *this; } +#if FEATURE_LOOP_ALIGN +// block list that needs alignment +// +struct alignBlocksList +{ +public: + alignBlocksList* next; + +private: + BasicBlock* m_block; + +public: + BasicBlock* getBlock() const + { + return m_block; + } + + alignBlocksList(BasicBlock* block) : next(nullptr), m_block(block) + { + } +}; +#endif // FEATURE_LOOP_ALIGN + // This enum represents a pre/post-visit action state to emulate a depth-first // spanning tree traversal of a tree or graph. enum DfsStackState diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 840ad2e8c6091..1057dfe08be0b 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -172,8 +172,13 @@ void CodeGen::genCodeForBBlist() BasicBlock* block; +#if FEATURE_LOOP_ALIGN + alignBlocksList* blockToAlign = compiler->alignBBLists; +#endif + for (block = compiler->fgFirstBB; block != nullptr; block = block->bbNext) { + #ifdef DEBUG if (compiler->verbose) { @@ -744,6 +749,17 @@ void CodeGen::genCodeForBBlist() case BBJ_ALWAYS: inst_JMP(EJ_jmp, block->bbJumpDest); + +#if FEATURE_LOOP_ALIGN + // Add 'align' instruction if the alignment for 'blockToAlign' was not + // already added. + if ((blockToAlign != nullptr) && !blockToAlign->getBlock()->isLoopAlignAdded()) + { + assert(ShouldAlignLoops()); + + GetEmitter()->emitLoopAlignment(blockToAlign->getBlock()); + } +#endif FALLTHROUGH; case BBJ_COND: @@ -784,9 +800,13 @@ void CodeGen::genCodeForBBlist() #if FEATURE_LOOP_ALIGN // If next block is the first block of a loop (identified by BBF_LOOP_ALIGN), - // then need to add align instruction in current "block". Also mark the - // corresponding IG with IGF_LOOP_ALIGN to know that there will be align - // instructions at the end of that IG. + // then need to add align instruction in current "block", provided there was + // no "align" instruction added after one of the previous "jmp" instructions. + // + // If the "align" instruction was added previously (isLoopAlignAdded() == true), + // update its idaTargetIG to point to the current IG. The current IG is the one + // that is just before the IG having loop start. Setting idaTargetIG, establish the + // connection of "align" instruction to the loop it actually wants to align. // // For non-adaptive alignment, add alignment instruction of size depending on the // compJitAlignLoopBoundary. @@ -794,9 +814,32 @@ void CodeGen::genCodeForBBlist() if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign())) { - assert(ShouldAlignLoops()); + if (compiler->opts.compJitHideAlignBehindJmp) + { + // blockToAlign should not be null because that's the one + // that we are trying to align. + assert(blockToAlign != nullptr); + assert(block->bbNext == blockToAlign->getBlock()); + + // If there was no unconditional jmp until now, just handle it before the loop + if (!blockToAlign->getBlock()->isLoopAlignAdded()) + { + assert(ShouldAlignLoops()); + + GetEmitter()->emitLoopAlignment(blockToAlign->getBlock()); + } + + GetEmitter()->emitConnectAlignInstrWithCurIG(); - GetEmitter()->emitLoopAlignment(); + // Move to the next block to be aligned + blockToAlign = blockToAlign->next; + } + else + { + assert(ShouldAlignLoops()); + + GetEmitter()->emitLoopAlignment(nullptr); + } } #endif diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 6d5745729ecaf..39fe14c921b7f 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -1947,6 +1947,10 @@ void Compiler::compInit(ArenaAllocator* pAlloc, #endif // FEATURE_SIMD compUsesThrowHelper = false; + +#if FEATURE_LOOP_ALIGN + alignBBLists = nullptr; +#endif } /***************************************************************************** @@ -2547,11 +2551,13 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.compJitAlignLoopForJcc = JitConfig.JitAlignLoopForJcc() == 1; opts.compJitAlignLoopMaxCodeSize = (unsigned short)JitConfig.JitAlignLoopMaxCodeSize(); + opts.compJitHideAlignBehindJmp = JitConfig.JitHideAlignBehindJmp() == 1; #else opts.compJitAlignLoopAdaptive = true; opts.compJitAlignLoopBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY; opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT; opts.compJitAlignLoopMaxCodeSize = DEFAULT_MAX_LOOPSIZE_FOR_ALIGN; + opts.compJitHideAlignBehindJmp = true; #endif #ifdef TARGET_XARCH diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 51e07b137527b..9d6d55523c80a 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3973,7 +3973,7 @@ class Compiler void lvaSortByRefCount(); void lvaMarkLocalVars(); // Local variable ref-counting - void lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers); + void lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers, bool calculateAlign = false); void lvaMarkLocalVars(BasicBlock* block, bool isRecompute); void lvaAllocOutgoingArgSpaceVar(); // Set up lvaOutgoingArgSpaceVar @@ -8333,6 +8333,12 @@ class Compiler weight_t refCntWtdStkDbl); #endif // DOUBLE_ALIGN +#if FEATURE_LOOP_ALIGN +public: + alignBlocksList* alignBBLists; + +#endif // FEATURE_LOOP_ALIGN + bool IsFullPtrRegMapRequired() { return codeGen->IsFullPtrRegMapRequired(); @@ -9687,6 +9693,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // If set, perform adaptive loop alignment that limits number of padding based on loop size. bool compJitAlignLoopAdaptive; + // If set, tries to hide alignment instructions behind unconditional jumps. + bool compJitHideAlignBehindJmp; + #ifdef LATE_DISASM bool doLateDisasm; // Run the late disassembler #endif // LATE_DISASM diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 54d57dd22b2f7..30e5d33a44c48 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -872,7 +872,7 @@ insGroup* emitter::emitSavIG(bool emitAdd) // Move align instructions to the global list, update their 'next' links do { - // Grab the jump and remove it from the list + // Grab the align and remove it from the list instrDescAlign* oa = emitCurIGAlignList; emitCurIGAlignList = oa->idaNext; @@ -913,6 +913,9 @@ insGroup* emitter::emitSavIG(bool emitAdd) } emitAlignLast = last; + // Point to the first instruction of most recent + // align instruction(s) added. + emitRecentFirstAlign = list; } #endif @@ -1071,8 +1074,8 @@ void emitter::emitBegFN(bool hasFramePtr #if FEATURE_LOOP_ALIGN /* We don't have any align instructions */ - emitAlignList = emitAlignLast = nullptr; - emitCurIGAlignList = nullptr; + emitAlignList = emitRecentFirstAlign = emitAlignLast = nullptr; + emitCurIGAlignList = nullptr; #endif /* We have not recorded any live sets */ @@ -1574,7 +1577,7 @@ void emitter::emitCheckIGoffsets() { if (tempIG->igOffs != currentOffset) { - printf("Block #%u has offset %08X, expected %08X\n", tempIG->igNum, tempIG->igOffs, currentOffset); + printf("IG%02u has offset %08X, expected %08X\n", tempIG->igNum, tempIG->igOffs, currentOffset); assert(!"bad block offset"); } @@ -3555,7 +3558,7 @@ void emitter::emitDispIGflags(unsigned flags) { printf(", extend"); } - if (flags & IGF_LOOP_ALIGN) + if (flags & IGF_HAS_ALIGN) { printf(", align"); } @@ -4818,10 +4821,10 @@ void emitter::emitCheckAlignFitInCurIG(unsigned short nAlignInstr) //----------------------------------------------------------------------------- // // The next instruction will be a loop head entry point -// So insert an alignment instruction here to ensure that -// we can properly align the code. +// So insert an alignment instruction of "paddingBytes" to ensure that +// the code is properly aligned. // -void emitter::emitLoopAlign(unsigned short paddingBytes) +void emitter::emitLoopAlign(unsigned short paddingBytes, bool isFirstAlign) { // Determine if 'align' instruction about to be generated will // fall in current IG or next. @@ -4859,6 +4862,18 @@ void emitter::emitLoopAlign(unsigned short paddingBytes) id->idaIG = emitCurIG; + if (isFirstAlign) + { + // For multiple align instructions, set the targetIG only for the + // first align instruction + id->idaTargetIG = emitCurIG; + emitRecentFirstAlign = id; + } + else + { + id->idaTargetIG = nullptr; + } + /* Append this instruction to this IG's alignment list */ id->idaNext = emitCurIGAlignList; @@ -4896,23 +4911,38 @@ void emitter::emitLongLoopAlign(unsigned short alignmentBoundary) /* Insert a pseudo-instruction to ensure that we align the next instruction properly */ + bool isFirstAlign = true; while (insAlignCount) { - emitLoopAlign(paddingBytes); + emitLoopAlign(paddingBytes, isFirstAlign); insAlignCount--; + isFirstAlign = false; } #if defined(TARGET_XARCH) - emitLoopAlign(lastInsAlignSize); + emitLoopAlign(lastInsAlignSize, isFirstAlign); #endif } +//----------------------------------------------------------------------------- +// emitConnectAlignInstrWithCurIG: If "align" instruction is not just before the loop start, +// setting targetIG lets us know the exact IG that the "align" +// instruction is trying to align. This is used during loop size +// calculation. +// +void emitter::emitConnectAlignInstrWithCurIG() +{ + // Since we never align overlapping instructions, it is always guaranteed that + // the emitRecentFirstAlign points to the loop that is in process of getting aligned. + emitRecentFirstAlign->idaTargetIG = emitCurIG; +} + //----------------------------------------------------------------------------- // emitLoopAlignment: Insert an align instruction at the end of emitCurIG and -// mark it as IGF_LOOP_ALIGN to indicate that next IG is a -// loop needing alignment. +// mark it as IGF_HAS_ALIGN to indicate that a next or a future +// IG is a loop that needs alignment. // -void emitter::emitLoopAlignment() +void emitter::emitLoopAlignment(BasicBlock* blockToAlign) { unsigned short paddingBytes; @@ -4928,7 +4958,7 @@ void emitter::emitLoopAlignment() { emitCheckAlignFitInCurIG(1); paddingBytes = MAX_ENCODED_SIZE; - emitLoopAlign(paddingBytes); + emitLoopAlign(paddingBytes, true); } #elif defined(TARGET_ARM64) // For Arm64, each align instruction is 4-bytes long because of fixed-length encoding. @@ -4944,6 +4974,18 @@ void emitter::emitLoopAlignment() emitLongLoopAlign(paddingBytes); #endif + assert(emitLastIns->idIns() == INS_align); + + + if (emitComp->opts.compJitHideAlignBehindJmp) + { + // Mark that the alignment for this block is handled. Useful when hiding align behind the jmp. + blockToAlign->setLoopAlignAdded(); + } + else + { + assert(blockToAlign == nullptr); + } JITDUMP("Adding 'align' instruction of %d bytes in %s.\n", paddingBytes, emitLabelString(emitCurIG)); #ifdef DEBUG @@ -4958,7 +5000,7 @@ void emitter::emitLoopAlignment() // bool emitter::emitEndsWithAlignInstr() { - return emitCurIG->isLoopAlign(); + return emitCurIG->endsWithAlignInstr(); } //----------------------------------------------------------------------------- @@ -4982,9 +5024,9 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG for (insGroup* igInLoop = igLoopHeader; igInLoop != nullptr; igInLoop = igInLoop->igNext) { loopSize += igInLoop->igSize; - if (igInLoop->isLoopAlign()) + if (igInLoop->endsWithAlignInstr()) { - // If igInLoop is marked as "IGF_LOOP_ALIGN", the basic block flow detected a loop start. + // If igInLoop is marked as "IGF_HAS_ALIGN", the basic block flow detected a loop start. // If the loop was formed because of forward jumps like the loop IG18 below, the backedge is not // set for them and such loops are not aligned. For such cases, the loop size threshold will never // be met and we would break as soon as loopSize > maxLoopSize. @@ -5149,25 +5191,32 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) bool markedCurrLoop = alignCurrentLoop; while ((alignInstr != nullptr)) { - // Find the IG before current loop and clear the IGF_LOOP_ALIGN flag - if (!alignCurrentLoop && (alignInstr->idaIG->igNext == dstIG)) + // Find the IG that has 'align' instruction to align the current loop + // and clear the IGF_HAS_ALIGN flag. + if (!alignCurrentLoop && (alignInstr->idaTargetIG->igNext == dstIG)) { assert(!markedCurrLoop); - alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN; + + // This IG should no longer contain alignment instruction + alignInstr->idaIG->removeAlignInstr(); + markedCurrLoop = true; JITDUMP("** Skip alignment for current loop IG%02u ~ IG%02u because it encloses an aligned loop " "IG%02u ~ IG%02u.\n", currLoopStart, currLoopEnd, emitLastLoopStart, emitLastLoopEnd); } - // Find the IG before the last loop and clear the IGF_LOOP_ALIGN flag - if (!alignLastLoop && (alignInstr->idaIG->igNext != nullptr) && - (alignInstr->idaIG->igNext->igNum == emitLastLoopStart)) + // Find the IG that has 'align' instruction to align the last loop + // and clear the IGF_HAS_ALIGN flag. + if (!alignLastLoop && (alignInstr->idaTargetIG->igNext != nullptr) && + (alignInstr->idaTargetIG->igNext->igNum == emitLastLoopStart)) { assert(!markedLastLoop); - assert(alignInstr->idaIG->isLoopAlign()); + assert(alignInstr->idaIG->endsWithAlignInstr()); + + // This IG should no longer contain alignment instruction + alignInstr->idaIG->removeAlignInstr(); - alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN; markedLastLoop = true; JITDUMP("** Skip alignment for aligned loop IG%02u ~ IG%02u because it encloses the current loop " "IG%02u ~ IG%02u.\n", @@ -5179,21 +5228,7 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) break; } -#if defined(TARGET_XARCH) - if (!emitComp->opts.compJitAlignLoopAdaptive) -#endif - { - // If there are multiple align instructions, skip the align instructions after - // the first align instruction and fast forward to the next IG - insGroup* alignIG = alignInstr->idaIG; - while ((alignInstr != nullptr) && (alignInstr->idaNext != nullptr) && - (alignInstr->idaNext->idaIG == alignIG)) - { - alignInstr = alignInstr->idaNext; - } - } - - alignInstr = alignInstr->idaNext; + alignInstr = emitAlignInNextIG(alignInstr); } assert(markedLastLoop && markedCurrLoop); @@ -5232,21 +5267,31 @@ void emitter::emitLoopAlignAdjustments() unsigned loopIGOffset = 0; instrDescAlign* alignInstr = emitAlignList; - for (; alignInstr != nullptr; alignInstr = alignInstr->idaNext) + for (; alignInstr != nullptr;) { assert(alignInstr->idIns() == INS_align); - insGroup* alignIG = alignInstr->idaIG; + insGroup* alignIG = alignInstr->idaTargetIG; + insGroup* containingIG = alignInstr->idaIG; + + JITDUMP(" Adjusting 'align' instruction in IG%02u that is targeted for IG%02u \n", containingIG->igNum, + alignIG->igNum); - loopIGOffset = alignIG->igOffs + alignIG->igSize; + // Since we only adjust the padding up to the next align instruction which is behind the jump, we make sure + // that we take into account all the alignBytes we removed until that point. Hence " - alignBytesRemoved" + + loopIGOffset = alignIG->igNext->igOffs - alignBytesRemoved; // igSize also includes INS_align instruction, take it off. loopIGOffset -= estimatedPaddingNeeded; // IG can be marked as not needing alignment if during setting igLoopBackEdge, it is detected // that the igLoopBackEdge encloses an IG that is marked for alignment. + unsigned actualPaddingNeeded = - alignIG->isLoopAlign() ? emitCalculatePaddingForLoopAlignment(alignIG, loopIGOffset DEBUG_ARG(false)) : 0; + containingIG->endsWithAlignInstr() + ? emitCalculatePaddingForLoopAlignment(alignIG, loopIGOffset DEBUG_ARG(false)) + : 0; assert(estimatedPaddingNeeded >= actualPaddingNeeded); @@ -5254,15 +5299,15 @@ void emitter::emitLoopAlignAdjustments() if (diff != 0) { - alignIG->igSize -= diff; + containingIG->igSize -= diff; alignBytesRemoved += diff; emitTotalCodeSize -= diff; // Update the flags - alignIG->igFlags |= IGF_UPD_ISZ; + containingIG->igFlags |= IGF_UPD_ISZ; if (actualPaddingNeeded == 0) { - alignIG->igFlags &= ~IGF_LOOP_ALIGN; + containingIG->removeAlignInstr(); } #ifdef TARGET_XARCH @@ -5312,21 +5357,19 @@ void emitter::emitLoopAlignAdjustments() } assert(paddingToAdj == 0); assert(instrAdjusted == 0); - - // fast forward the align instruction to next IG - alignInstr = prevAlignInstr; } JITDUMP("Adjusted alignment of %s from %u to %u.\n", emitLabelString(alignIG), estimatedPaddingNeeded, actualPaddingNeeded); - JITDUMP("Adjusted size of %s from %u to %u.\n", emitLabelString(alignIG), (alignIG->igSize + diff), - alignIG->igSize); + JITDUMP("Adjusted size of %s from %u to %u.\n", emitLabelString(containingIG), + (containingIG->igSize + diff), containingIG->igSize); } // Adjust the offset of all IGs starting from next IG until we reach the IG having the next // align instruction or the end of IG list. - insGroup* adjOffIG = alignIG->igNext; - insGroup* adjOffUptoIG = alignInstr->idaNext != nullptr ? alignInstr->idaNext->idaIG : emitIGlast; + insGroup* adjOffIG = containingIG->igNext; + instrDescAlign* nextAlign = emitAlignInNextIG(alignInstr); + insGroup* adjOffUptoIG = nextAlign != nullptr ? nextAlign->idaIG : emitIGlast; while ((adjOffIG != nullptr) && (adjOffIG->igNum <= adjOffUptoIG->igNum)) { JITDUMP("Adjusted offset of %s from %04X to %04X\n", emitLabelString(adjOffIG), adjOffIG->igOffs, @@ -5335,9 +5378,11 @@ void emitter::emitLoopAlignAdjustments() adjOffIG = adjOffIG->igNext; } + alignInstr = nextAlign; + if (actualPaddingNeeded > 0) { - // Record the last IG that has align instruction. No overestimation + // Record the last loop IG that will be aligned. No overestimation // adjustment will be done after emitLastAlignedIgNum. emitLastAlignedIgNum = alignIG->igNum; } @@ -5353,7 +5398,7 @@ void emitter::emitLoopAlignAdjustments() // end of 'ig' so the loop that starts after 'ig' is aligned. // // Arguments: -// ig - The IG having 'align' instruction in the end. +// ig - The IG before the IG that has the loop head that need to be aligned. // offset - The offset at which the IG that follows 'ig' starts. // isAlignAdjusted - Determine if adjustments are done to the align instructions or not. // During generating code, it is 'false' (because we haven't adjusted the size yet). @@ -5383,7 +5428,6 @@ void emitter::emitLoopAlignAdjustments() // unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool isAlignAdjusted)) { - assert(ig->isLoopAlign()); unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary; // No padding if loop is already aligned @@ -5525,6 +5569,28 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs return paddingToAdd; } +// emitAlignInNextIG: On xarch, for adaptive alignment, this will usually point to next instruction in +// 'emitAlignList'. But for arm64 or non-adaptive alignment on xarch, where multiple +// align instructions are emitted, this method will skip the 'align' instruction present +// in the same IG and return the first instruction that is present in next IG. +// +emitter::instrDescAlign* emitter::emitAlignInNextIG(instrDescAlign* alignInstr) +{ +#if defined(TARGET_XARCH) + if (!emitComp->opts.compJitAlignLoopAdaptive) +#endif + { + // If there are multiple align instructions, skip the align instructions after + // the first align instruction and fast forward to the next IG + insGroup* alignIG = alignInstr->idaIG; + while ((alignInstr != nullptr) && (alignInstr->idaNext != nullptr) && (alignInstr->idaNext->idaIG == alignIG)) + { + alignInstr = alignInstr->idaNext; + } + } + return alignInstr != nullptr ? alignInstr->idaNext : nullptr; +} + #endif // FEATURE_LOOP_ALIGN void emitter::emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 22e9cfd6c79e0..3803488bca087 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -275,8 +275,10 @@ struct insGroup #define IGF_PLACEHOLDER 0x0100 // this is a placeholder group, to be filled in later #define IGF_EXTEND 0x0200 // this block is conceptually an extension of the previous block // and the emitter should continue to track GC info as if there was no new block. -#define IGF_LOOP_ALIGN 0x0400 // this group contains alignment instruction(s) at the end; the next IG is the - // head of a loop that needs alignment. +#define IGF_HAS_ALIGN 0x0400 // this group contains an alignment instruction(s) in the end; + // the next IG may or may not be the head of a loop that is needing alignment + // This IG might end up with 'jmp' and hence 'align' instruction might be added + // in this IG, after the 'jmp' instruction. // Mask of IGF_* flags that should be propagated to new blocks when they are created. // This allows prologs and epilogs to be any number of IGs, but still be @@ -349,9 +351,14 @@ struct insGroup return *(unsigned*)ptr; } - bool isLoopAlign() + void removeAlignInstr() { - return (igFlags & IGF_LOOP_ALIGN) != 0; + igFlags &= ~IGF_HAS_ALIGN; + } + + bool endsWithAlignInstr() + { + return (igFlags & IGF_HAS_ALIGN) != 0; } }; // end of struct insGroup @@ -1383,14 +1390,13 @@ class emitter #if FEATURE_LOOP_ALIGN struct instrDescAlign : instrDesc { - instrDescAlign* idaNext; // next align in the group/method - insGroup* idaIG; // containing group + instrDescAlign* idaNext; // next align in the group/method + insGroup* idaIG; // containing group + insGroup* idaTargetIG; // The IG before the loop IG. + // If no 'jmp' instructions were found until targetIG, + // the 'align' will be placed in this IG. }; - void emitCheckAlignFitInCurIG(unsigned short nAlignInstr); - void emitLoopAlign(unsigned short paddingBytes); - void emitLongLoopAlign(unsigned short alignmentBoundary); - #endif // FEATURE_LOOP_ALIGN #if !defined(TARGET_ARM64) // This shouldn't be needed for ARM32, either, but I don't want to touch the ARM32 JIT. @@ -1780,15 +1786,25 @@ class emitter unsigned emitLastLoopStart; // Start IG of last inner loop unsigned emitLastLoopEnd; // End IG of last inner loop unsigned emitLastAlignedIgNum; // last IG that has align instruction - instrDescAlign* emitAlignList; // list of local align instructions in method + instrDescAlign* emitAlignList; // list of all align instructions in method instrDescAlign* emitAlignLast; // last align instruction in method + + // Points to the most recent added align instruction. If there are multiple align instructions like in arm64 or + // non-adpative alignment on xarch, this points to the first align instruction of the series of align instructions. + instrDescAlign* emitRecentFirstAlign; + unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG_ARG(bool isAlignAdjusted)); // Get the smallest loop size - void emitLoopAlignment(); + void emitLoopAlignment(BasicBlock* blockToAlign); bool emitEndsWithAlignInstr(); // Validate if newLabel is appropriate void emitSetLoopBackEdge(BasicBlock* loopTopBlock); void emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool isAlignAdjusted)); + + void emitLoopAlign(unsigned short paddingBytes, bool isFirstAlign); + void emitLongLoopAlign(unsigned short alignmentBoundary); + instrDescAlign* emitAlignInNextIG(instrDescAlign* alignInstr); + void emitConnectAlignInstrWithCurIG(); #endif void emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG); // Check for illegal branches between funclets diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 1ae7aa873a0ae..dbb464ea0010a 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -11439,13 +11439,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // IG can be marked as not needing alignment after emitting align instruction. // Alternatively, there are fewer align instructions needed than emitted. // If that is the case, skip outputting alignment. - if (!ig->isLoopAlign() || id->idIsEmptyAlign()) + if (!ig->endsWithAlignInstr() || id->idIsEmptyAlign()) { skipIns = true; } #ifdef DEBUG - if (!ig->isLoopAlign()) + if (!ig->endsWithAlignInstr()) { // Validate if the state is correctly updated assert(id->idIsEmptyAlign()); @@ -13330,7 +13330,15 @@ void emitter::emitDispIns( case IF_SN_0A: // SN_0A ................ ................ if (ins == INS_align) { - printf("[%d bytes]", id->idIsEmptyAlign() ? 0 : INSTR_ENCODED_SIZE); + instrDescAlign* alignInstrId = (instrDescAlign*)id; + printf("[%d bytes", id->idIsEmptyAlign() ? 0 : INSTR_ENCODED_SIZE); + + // targetIG is only set for 1st of the series of align instruction + if ((alignInstrId->idaTargetIG != nullptr) && (alignInstrId->idaTargetIG->igNext != nullptr)) + { + printf(" for IG%02u", alignInstrId->idaTargetIG->igNext->igNum); + } + printf("]"); } break; diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 00c403d08768d..e58262af4e648 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -9830,7 +9830,14 @@ void emitter::emitDispIns( #if FEATURE_LOOP_ALIGN if (ins == INS_align) { - printf("[%d bytes]", id->idCodeSize()); + instrDescAlign* alignInstrId = (instrDescAlign*)id; + printf("[%d bytes", alignInstrId->idCodeSize()); + // targetIG is only set for 1st of the series of align instruction + if ((alignInstrId->idaTargetIG != nullptr) && (alignInstrId->idaTargetIG->igNext != nullptr)) + { + printf(" for IG%02u", alignInstrId->idaTargetIG->igNext->igNum); + } + printf("]"); } #endif break; @@ -10018,25 +10025,38 @@ static BYTE* emitOutputNOP(BYTE* dstRW, size_t nBytes) // BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) { + instrDescAlign* alignInstr = (instrDescAlign*)id; + +#ifdef DEBUG + // For cases where 'align' was placed behing a 'jmp' in an IG that does not + // immediately preced the loop IG, we do not know in advance the offset of + // IG having loop. For such cases, skip the padding calculation validation. + bool validatePadding = alignInstr->idaIG == alignInstr->idaTargetIG; +#endif + // Candidate for loop alignment assert(codeGen->ShouldAlignLoops()); - assert(ig->isLoopAlign()); + assert(ig->endsWithAlignInstr()); unsigned paddingToAdd = id->idCodeSize(); // Either things are already aligned or align them here. - assert((paddingToAdd == 0) || (((size_t)dst & (emitComp->opts.compJitAlignLoopBoundary - 1)) != 0)); + assert(!validatePadding || (paddingToAdd == 0) || + (((size_t)dst & (emitComp->opts.compJitAlignLoopBoundary - 1)) != 0)); // Padding amount should not exceed the alignment boundary assert(0 <= paddingToAdd && paddingToAdd < emitComp->opts.compJitAlignLoopBoundary); #ifdef DEBUG - unsigned paddingNeeded = emitCalculatePaddingForLoopAlignment(ig, (size_t)dst, true); - - // For non-adaptive, padding size is spread in multiple instructions, so don't bother checking - if (emitComp->opts.compJitAlignLoopAdaptive) + if (validatePadding) { - assert(paddingToAdd == paddingNeeded); + unsigned paddingNeeded = emitCalculatePaddingForLoopAlignment(((instrDescAlign*)id)->idaIG, (size_t)dst, true); + + // For non-adaptive, padding size is spread in multiple instructions, so don't bother checking + if (emitComp->opts.compJitAlignLoopAdaptive) + { + assert(paddingToAdd == paddingNeeded); + } } emitComp->loopsAligned++; @@ -13373,7 +13393,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) sz = sizeof(instrDescAlign); // IG can be marked as not needing alignment after emitting align instruction // In such case, skip outputting alignment. - if (ig->isLoopAlign()) + if (ig->endsWithAlignInstr()) { dst = emitOutputAlign(ig, id, dst); } diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 658ba2a69186a..976e58fe62d08 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -67,6 +67,12 @@ CONFIG_INTEGER(JitAlignLoopAdaptive, W("JitAlignLoopAdaptive"), 1) // If set, perform adaptive loop alignment that limits number of padding based on loop size. +CONFIG_INTEGER(JitHideAlignBehindJmp, + W("JitHideAlignBehindJmp"), + 1) // If set, try to hide align instruction (if any) behind an unconditional jump instruction (if any) + // that + // is present before the loop start. + // Print the alignment boundaries in disassembly. CONFIG_INTEGER(JitDasmWithAlignmentBoundaries, W("JitDasmWithAlignmentBoundaries"), 0) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 0c865355afe1e..e94f0c60037c0 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4407,6 +4407,8 @@ void Compiler::lvaMarkLocalVars() // groundwork for assertion prop, check type consistency, etc. // See lvaMarkLclRefs for details on what else goes on. // setSlotNumbers -- true if local slot numbers should be assigned. +// calculateAlign -- Populate the list of alignBlocksList chaining all the +// loop head blocks that needs alignment. // // Notes: // Some implicit references are given actual counts or weight bumps here @@ -4419,7 +4421,7 @@ void Compiler::lvaMarkLocalVars() // When optimizing we also recompute lvaGenericsContextInUse based // on specially flagged LCL_VAR appearances. // -void Compiler::lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers) +void Compiler::lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers, bool calculateAlign) { JITDUMP("\n*** lvaComputeRefCounts ***\n"); unsigned lclNum = 0; @@ -4524,6 +4526,10 @@ void Compiler::lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers) const bool oldLvaGenericsContextInUse = lvaGenericsContextInUse; lvaGenericsContextInUse = false; +#if FEATURE_LOOP_ALIGN + alignBlocksList* alignBB = nullptr; +#endif + JITDUMP("\n*** lvaComputeRefCounts -- explicit counts ***\n"); // Second, account for all explicit local variable references @@ -4531,6 +4537,30 @@ void Compiler::lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers) { if (block->IsLIR()) { +#if FEATURE_LOOP_ALIGN + if (calculateAlign) + { + // Track the blocks that needs alignment + // except if it is first block, because currently adding padding + // in prolog is not supported + if (opts.compJitHideAlignBehindJmp && block->isLoopAlign() && (block != fgFirstBB)) + { + alignBlocksList* curr = new (this, CMK_FlowList) alignBlocksList(block); + + if (alignBBLists == nullptr) + { + alignBBLists = curr; + } + else + { + alignBB->next = curr; + } + + alignBB = curr; + } + } +#endif + assert(isRecompute); const weight_t weight = block->getBBWeight(this); diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index b2d92cf6be5a8..3eea01d9fe989 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -6124,7 +6124,7 @@ PhaseStatus Lowering::DoPhase() // Recompute local var ref counts again after liveness to reflect // impact of any dead code removal. Note this may leave us with // tracked vars that have zero refs. - comp->lvaComputeRefCounts(isRecompute, setSlotNumbers); + comp->lvaComputeRefCounts(isRecompute, setSlotNumbers, true); return PhaseStatus::MODIFIED_EVERYTHING; } From e7c07102f346714f4941ca4f48e2655aa1f42ba4 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 21 Oct 2021 18:16:34 -0700 Subject: [PATCH 02/24] Fix a problem where curIG==0 and loop might be emitted in curIG, adjust the targetIG to prevIG Add IGF_REMOVED_ALIGN flag for special scenarios --- src/coreclr/jit/emit.cpp | 46 +++++++++++++++++++++++++++-------- src/coreclr/jit/emit.h | 19 ++++++++++++--- src/coreclr/jit/emitxarch.cpp | 4 ++- 3 files changed, 55 insertions(+), 14 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 30e5d33a44c48..89b6db13a3b1e 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -4932,9 +4932,24 @@ void emitter::emitLongLoopAlign(unsigned short alignmentBoundary) // void emitter::emitConnectAlignInstrWithCurIG() { + JITDUMP("Mapping 'align' instruction in IG%02u to target IG%02u\n", emitRecentFirstAlign->idaIG->igNum, + emitCurIG->igNum); // Since we never align overlapping instructions, it is always guaranteed that // the emitRecentFirstAlign points to the loop that is in process of getting aligned. - emitRecentFirstAlign->idaTargetIG = emitCurIG; + + if (emitCurIGsize == 0) + { + // However, if current IG is still empty (e.g. codegen didn't generate moves if they were redundant) + // in that case, loop will start in emitCurIG itself. For such cases, make sure that targetIG is + // pointing to previous IG. + assert(emitPrevIG != emitCurIG); + + emitRecentFirstAlign->idaTargetIG = emitPrevIG; + } + else + { + emitRecentFirstAlign->idaTargetIG = emitCurIG; + } } //----------------------------------------------------------------------------- @@ -5024,9 +5039,14 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG for (insGroup* igInLoop = igLoopHeader; igInLoop != nullptr; igInLoop = igInLoop->igNext) { loopSize += igInLoop->igSize; - if (igInLoop->endsWithAlignInstr()) + if (igInLoop->endsWithAlignInstr() || igInLoop->isAlignInstrRemoved()) { - // If igInLoop is marked as "IGF_HAS_ALIGN", the basic block flow detected a loop start. + // If igInLoop can be in one of the following state: + // IGF_HAS_ALIGN - igInLoop contains align instruction at the end, for next IG or some future IG. + // IGF_REMOVED_ALIGN - igInLoop had align instruction at the end, but was removed. + // + // For both cases, remove the padding bytes from igInLoop's size so it is not included in loopSize. + // // If the loop was formed because of forward jumps like the loop IG18 below, the backedge is not // set for them and such loops are not aligned. For such cases, the loop size threshold will never // be met and we would break as soon as loopSize > maxLoopSize. @@ -5039,9 +5059,9 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG // ... // jne IG05 // - // If igInLoop is a legitimate loop, and igInLoop's next IG is also a loop that needs alignment, - // then igInLoop should be the last IG of the current loop and should have backedge to current - // loop header. + // If igInLoop is a legitimate loop, and igInLoop's end with another 'align' instruction for different IG + // representing a loop that needs alignment, then igInLoop should be the last IG of the current loop and + // should have backedge to current loop header. // // Below, IG05 is the last IG of loop IG04-IG05 and its backedge points to IG04. // @@ -5198,7 +5218,7 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) assert(!markedCurrLoop); // This IG should no longer contain alignment instruction - alignInstr->idaIG->removeAlignInstr(); + alignInstr->removeAlignFlags(); markedCurrLoop = true; JITDUMP("** Skip alignment for current loop IG%02u ~ IG%02u because it encloses an aligned loop " @@ -5215,7 +5235,7 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) assert(alignInstr->idaIG->endsWithAlignInstr()); // This IG should no longer contain alignment instruction - alignInstr->idaIG->removeAlignInstr(); + alignInstr->removeAlignFlags(); markedLastLoop = true; JITDUMP("** Skip alignment for aligned loop IG%02u ~ IG%02u because it encloses the current loop " @@ -5275,7 +5295,7 @@ void emitter::emitLoopAlignAdjustments() insGroup* containingIG = alignInstr->idaIG; JITDUMP(" Adjusting 'align' instruction in IG%02u that is targeted for IG%02u \n", containingIG->igNum, - alignIG->igNum); + alignIG->igNext->igNum); // Since we only adjust the padding up to the next align instruction which is behind the jump, we make sure // that we take into account all the alignBytes we removed until that point. Hence " - alignBytesRemoved" @@ -5307,7 +5327,7 @@ void emitter::emitLoopAlignAdjustments() containingIG->igFlags |= IGF_UPD_ISZ; if (actualPaddingNeeded == 0) { - containingIG->removeAlignInstr(); + alignInstr->removeAlignFlags(); } #ifdef TARGET_XARCH @@ -8415,6 +8435,12 @@ insGroup* emitter::emitAllocAndLinkIG() ig->igFlags |= (emitCurIG->igFlags & IGF_PROPAGATE_MASK); +#ifdef FEATURE_LOOP_ALIGN + /* Save the prev IG */ + + emitPrevIG = emitCurIG; +#endif + /* Set the new IG as the current IG */ emitCurIG = ig; diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 3803488bca087..befd161e810fe 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -275,7 +275,10 @@ struct insGroup #define IGF_PLACEHOLDER 0x0100 // this is a placeholder group, to be filled in later #define IGF_EXTEND 0x0200 // this block is conceptually an extension of the previous block // and the emitter should continue to track GC info as if there was no new block. -#define IGF_HAS_ALIGN 0x0400 // this group contains an alignment instruction(s) in the end; +#define IGF_REMOVED_ALIGN 0x0400 // this group had alignment instruction(s) at the end, but was removed; + // Useful in scenario where an IG is part of a loop also contains align instruction + // for a different loop and later, we decide to not align that different loop. +#define IGF_HAS_ALIGN 0x0800 // this group contains an alignment instruction(s) in the end; // the next IG may or may not be the head of a loop that is needing alignment // This IG might end up with 'jmp' and hence 'align' instruction might be added // in this IG, after the 'jmp' instruction. @@ -351,9 +354,9 @@ struct insGroup return *(unsigned*)ptr; } - void removeAlignInstr() + bool isAlignInstrRemoved() { - igFlags &= ~IGF_HAS_ALIGN; + return (igFlags & IGF_REMOVED_ALIGN) != 0; } bool endsWithAlignInstr() @@ -1395,6 +1398,12 @@ class emitter insGroup* idaTargetIG; // The IG before the loop IG. // If no 'jmp' instructions were found until targetIG, // the 'align' will be placed in this IG. + + void removeAlignFlags() + { + idaIG->igFlags |= IGF_REMOVED_ALIGN; + idaIG->igFlags &= ~IGF_HAS_ALIGN; + } }; void emitCheckAlignFitInCurIG(unsigned short nAlignInstr); #endif // FEATURE_LOOP_ALIGN @@ -1732,6 +1741,9 @@ class emitter bool emitChkAlign; // perform some alignment checks #endif +#ifdef FEATURE_LOOP_ALIGN + insGroup* emitPrevIG; +#endif insGroup* emitCurIG; void emitSetShortJump(instrDescJmp* id); @@ -1805,6 +1817,7 @@ class emitter void emitLongLoopAlign(unsigned short alignmentBoundary); instrDescAlign* emitAlignInNextIG(instrDescAlign* alignInstr); void emitConnectAlignInstrWithCurIG(); + #endif void emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG); // Check for illegal branches between funclets diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index e58262af4e648..4b21759ade2f0 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -10063,7 +10063,9 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) #endif BYTE* dstRW = dst + writeableOffset; - dstRW = emitOutputNOP(dstRW, paddingToAdd); + if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 1) && !validatePadding && paddingToAdd >= 1) + // if (emitComp->opts.disAsm) + dstRW = emitOutputNOP(dstRW, paddingToAdd); return dstRW - writeableOffset; } From bd922aa6f74f2b4bdd73ad3fac08779bd3d5bb58 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 21 Oct 2021 18:16:00 -0700 Subject: [PATCH 03/24] Add stress mode to emit int3 for xarch --- src/coreclr/jit/emitxarch.cpp | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 4b21759ade2f0..9664fbd58f539 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -10063,8 +10063,35 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) #endif BYTE* dstRW = dst + writeableOffset; - if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 1) && !validatePadding && paddingToAdd >= 1) - // if (emitComp->opts.disAsm) + +#ifdef DEBUG + // Under STRESS_EMITTER, if this is the 'align' before the 'jmp' instruction, + // then add "int3" instruction. Sinc int3 takes 1 byte, we would only add + // it if paddingToAdd >= 1 byte. + + if(emitComp->compStressCompile(Compiler::STRESS_EMITTER, 1) && + !validatePadding && paddingToAdd >= 1) + { + size_t int3Code = insCodeMR(INS_BREAKPOINT); + // There is no good way to squeeze in "int3" as well as display it + // in the disassembly because there is no corresponding instrDesc for + // it. As such, leave it as is, the "0xCC" bytecode will be seen next + // to the nop instruction in disasm + // e.g. CC align [1 bytes for IG29] + // + //if (emitComp->opts.disAsm) + //{ + // emitDispInsAddr(dstRW); + + // emitDispInsOffs(0, false); + + // printf(" %-9s ; stress-mode injected interrupt\n", "int3"); + //} + dstRW += emitOutputByte(dstRW, int3Code); + paddingToAdd -= 1; + } +#endif + dstRW = emitOutputNOP(dstRW, paddingToAdd); return dstRW - writeableOffset; } From 4d0f912ef185b7e8a2b84b711c9687fa2e8ae1c8 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Fri, 22 Oct 2021 13:57:01 -0700 Subject: [PATCH 04/24] Add stress mode to emit bkpt for arm64 --- src/coreclr/jit/emitarm64.cpp | 17 +++++++++++++++++ src/coreclr/jit/emitxarch.cpp | 4 ++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index dbb464ea0010a..989cb6accfbe9 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -11453,6 +11453,23 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) #endif sz = sizeof(instrDescAlign); ins = INS_nop; + +#ifdef DEBUG + // Under STRESS_EMITTER, if this is the 'align' before the 'jmp' instruction, + // then add "bkpt" instruction. + instrDescAlign* alignInstr = (instrDescAlign*)id; + + if (/*emitComp->compStressCompile(Compiler::STRESS_EMITTER, 50) &&*/ + (alignInstr->idaIG != alignInstr->idaTargetIG) && !skipIns) + { + // There is no good way to squeeze in "int3" as well as display it + // in the disassembly because there is no corresponding instrDesc for + // it. As such, leave it as is, the "0xD43E0000" bytecode will be seen + // next to the nop instruction in disasm. + // e.g. D43E0000 align [4 bytes for IG07] + ins = INS_BREAKPOINT; + } +#endif } #endif // FEATURE_LOOP_ALIGN diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 9664fbd58f539..e800ad69ffa41 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -10069,14 +10069,14 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) // then add "int3" instruction. Sinc int3 takes 1 byte, we would only add // it if paddingToAdd >= 1 byte. - if(emitComp->compStressCompile(Compiler::STRESS_EMITTER, 1) && + if(emitComp->compStressCompile(Compiler::STRESS_EMITTER, 50) && !validatePadding && paddingToAdd >= 1) { size_t int3Code = insCodeMR(INS_BREAKPOINT); // There is no good way to squeeze in "int3" as well as display it // in the disassembly because there is no corresponding instrDesc for // it. As such, leave it as is, the "0xCC" bytecode will be seen next - // to the nop instruction in disasm + // to the nop instruction in disasm. // e.g. CC align [1 bytes for IG29] // //if (emitComp->opts.disAsm) From 8d64351541fbcaae58c164f52dfe4e4dbaf1a5d4 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Fri, 29 Oct 2021 10:20:06 -0700 Subject: [PATCH 05/24] Add a loop align instruction placement phase --- src/coreclr/jit/block.h | 39 +++--------------- src/coreclr/jit/codegen.h | 4 ++ src/coreclr/jit/codegenlinear.cpp | 64 +++++++---------------------- src/coreclr/jit/compiler.cpp | 67 +++++++++++++++++++++++++++++-- src/coreclr/jit/compiler.h | 8 ++-- src/coreclr/jit/compphases.h | 1 + src/coreclr/jit/emit.cpp | 22 +++------- src/coreclr/jit/emit.h | 2 +- src/coreclr/jit/emitxarch.cpp | 5 +-- src/coreclr/jit/lclvars.cpp | 28 ------------- src/coreclr/jit/optimizer.cpp | 5 ++- 11 files changed, 105 insertions(+), 140 deletions(-) diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index 8855333b9d327..e96151e8818d8 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -552,7 +552,7 @@ enum BasicBlockFlags : unsigned __int64 BBF_PATCHPOINT = MAKE_BBFLAG(36), // Block is a patchpoint BBF_HAS_CLASS_PROFILE = MAKE_BBFLAG(37), // BB contains a call needing a class profile BBF_PARTIAL_COMPILATION_PATCHPOINT = MAKE_BBFLAG(38), // Block is a partial compilation patchpoint - BBF_LOOP_ALIGN_ADDED = MAKE_BBFLAG(39), // Alignment instruction is added for this block. + BBF_HAS_ALIGN = MAKE_BBFLAG(39), // BB ends with 'align' instruction // The following are sets of flags. @@ -655,20 +655,14 @@ struct BasicBlock : private LIR::Range return ((bbFlags & BBF_LOOP_HEAD) != 0); } - bool isLoopAlignAdded() const - { - return ((bbFlags & BBF_LOOP_ALIGN_ADDED) != 0); - } - - void setLoopAlignAdded() + bool isLoopAlign() const { - assert(isLoopAlign()); - bbFlags |= BBF_LOOP_ALIGN_ADDED; + return ((bbFlags & BBF_LOOP_ALIGN) != 0); } - bool isLoopAlign() const + bool hasAlign() const { - return ((bbFlags & BBF_LOOP_ALIGN) != 0); + return ((bbFlags & BBF_HAS_ALIGN) != 0); } #ifdef DEBUG @@ -1882,29 +1876,6 @@ inline PredBlockList::iterator& PredBlockList::iterator::operator++() return *this; } -#if FEATURE_LOOP_ALIGN -// block list that needs alignment -// -struct alignBlocksList -{ -public: - alignBlocksList* next; - -private: - BasicBlock* m_block; - -public: - BasicBlock* getBlock() const - { - return m_block; - } - - alignBlocksList(BasicBlock* block) : next(nullptr), m_block(block) - { - } -}; -#endif // FEATURE_LOOP_ALIGN - // This enum represents a pre/post-visit action state to emulate a depth-first // spanning tree traversal of a tree or graph. enum DfsStackState diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 14ebb0d27e450..48a5576e87364 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -210,6 +210,10 @@ class CodeGen final : public CodeGenInterface void genInitializeRegisterState(); +#if FEATURE_LOOP_ALIGN + void genMarkBlocksForLoopAlignment(); +#endif + void genCodeForBBlist(); public: diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 1057dfe08be0b..590b92cd8f03a 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -172,10 +172,6 @@ void CodeGen::genCodeForBBlist() BasicBlock* block; -#if FEATURE_LOOP_ALIGN - alignBlocksList* blockToAlign = compiler->alignBBLists; -#endif - for (block = compiler->fgFirstBB; block != nullptr; block = block->bbNext) { @@ -749,17 +745,6 @@ void CodeGen::genCodeForBBlist() case BBJ_ALWAYS: inst_JMP(EJ_jmp, block->bbJumpDest); - -#if FEATURE_LOOP_ALIGN - // Add 'align' instruction if the alignment for 'blockToAlign' was not - // already added. - if ((blockToAlign != nullptr) && !blockToAlign->getBlock()->isLoopAlignAdded()) - { - assert(ShouldAlignLoops()); - - GetEmitter()->emitLoopAlignment(blockToAlign->getBlock()); - } -#endif FALLTHROUGH; case BBJ_COND: @@ -798,47 +783,28 @@ void CodeGen::genCodeForBBlist() } #if FEATURE_LOOP_ALIGN + if (block->hasAlign()) + { + // If this block has 'align' instruction in the end (identified by BBF_HAS_ALIGN), + // then need to add align instruction in the current "block". + // + // For non-adaptive alignment, add alignment instruction of size depending on the + // compJitAlignLoopBoundary. + // For adaptive alignment, alignment instruction will always be of 15 bytes for xarch + // and 16 bytes for arm64. + assert(ShouldAlignLoops()); - // If next block is the first block of a loop (identified by BBF_LOOP_ALIGN), - // then need to add align instruction in current "block", provided there was - // no "align" instruction added after one of the previous "jmp" instructions. - // - // If the "align" instruction was added previously (isLoopAlignAdded() == true), - // update its idaTargetIG to point to the current IG. The current IG is the one - // that is just before the IG having loop start. Setting idaTargetIG, establish the - // connection of "align" instruction to the loop it actually wants to align. - // - // For non-adaptive alignment, add alignment instruction of size depending on the - // compJitAlignLoopBoundary. - // For adaptive alignment, alignment instruction will always be of 15 bytes. + GetEmitter()->emitLoopAlignment(); + } if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign())) { if (compiler->opts.compJitHideAlignBehindJmp) { - // blockToAlign should not be null because that's the one - // that we are trying to align. - assert(blockToAlign != nullptr); - assert(block->bbNext == blockToAlign->getBlock()); - - // If there was no unconditional jmp until now, just handle it before the loop - if (!blockToAlign->getBlock()->isLoopAlignAdded()) - { - assert(ShouldAlignLoops()); - - GetEmitter()->emitLoopAlignment(blockToAlign->getBlock()); - } - + // The current IG is the one that is just before the IG having loop start. + // Establish a connection of recent align instruction emitted to the loop + // it actually is aligning using 'idaTargetIG'. GetEmitter()->emitConnectAlignInstrWithCurIG(); - - // Move to the next block to be aligned - blockToAlign = blockToAlign->next; - } - else - { - assert(ShouldAlignLoops()); - - GetEmitter()->emitLoopAlignment(nullptr); } } #endif diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 39fe14c921b7f..f8eac9a64b2c9 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -1947,10 +1947,6 @@ void Compiler::compInit(ArenaAllocator* pAlloc, #endif // FEATURE_SIMD compUsesThrowHelper = false; - -#if FEATURE_LOOP_ALIGN - alignBBLists = nullptr; -#endif } /***************************************************************************** @@ -5158,6 +5154,14 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl fgDebugCheckLinks(); #endif +#if FEATURE_LOOP_ALIGN + if (needsLoopAlignment) + { + // Place loop alignment instructions + DoPhase(this, PHASE_ALIGN_LOOPS, &Compiler::bbPlaceLoopAlignInstructions); + } +#endif + // Generate code codeGen->genGenerateCode(methodCodePtr, methodCodeSize); @@ -5214,6 +5218,61 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl #endif // FUNC_INFO_LOGGING } +#if FEATURE_LOOP_ALIGN + +//------------------------------------------------------------------------ +// bbPlaceLoopAlignInstructions: Iterate over all the blocks and determine +// the best position to place the 'align' instruction. After 'jmp' are +// preferred over block before loop and in case there are multiple blocks +// having 'jmp', the one that has lower weight is preferred. +// If the block having 'jmp' is hot than the block before loop, the align +// will still be placed after 'jmp' because the processor should be smart +// enough to not fetch extra instruction beyond jmp. +// +void Compiler::bbPlaceLoopAlignInstructions() +{ + assert(needsLoopAlignment); + + // Add align only if there were any loops that needed alignment + weight_t minBlockSoFar = INFINITE; + BasicBlock* bbHavingAlign = nullptr; + for (BasicBlock* const block : Blocks()) + { + // If there is a unconditional jump + if (opts.compJitHideAlignBehindJmp && (block->bbJumpKind == BBJ_ALWAYS)) + { + if (block->bbWeight < minBlockSoFar) + { + minBlockSoFar = block->bbWeight; + bbHavingAlign = block; + JITDUMP(FMT_BB ", bbWeight= %f ends with unconditional 'jmp' \n", block->bbNum, block->bbWeight); + } + } + + if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign())) + { + // If jmp was not found, then block before the loop start is where align instruction will be added. + if (bbHavingAlign == nullptr) + { + bbHavingAlign = block; + JITDUMP("Marking " FMT_BB " before the loop with BBF_HAS_ALIGN for loop at " FMT_BB "\n", block->bbNum, + block->bbNext->bbNum); + } + else + { + JITDUMP("Marking " FMT_BB " that ends with uncondition jump with BBF_HAS_ALIGN for loop at " FMT_BB + "\n", + bbHavingAlign->bbNum, block->bbNext->bbNum); + } + + bbHavingAlign->bbFlags |= BBF_HAS_ALIGN; + minBlockSoFar = INFINITE; + bbHavingAlign = nullptr; + } + } +} +#endif + //------------------------------------------------------------------------ // generatePatchpointInfo: allocate and fill in patchpoint info data, // and report it to the VM diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 9d6d55523c80a..bfa6dfc2fe69f 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3666,6 +3666,7 @@ class Compiler #endif BasicBlock* bbNewBasicBlock(BBjumpKinds jumpKind); + void bbPlaceLoopAlignInstructions(); /* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX @@ -6871,8 +6872,9 @@ class Compiler bool fgHasLoops; // True if this method has any loops, set in fgComputeReachability public: - LoopDsc* optLoopTable; // loop descriptor table - unsigned char optLoopCount; // number of tracked loops + LoopDsc* optLoopTable; // loop descriptor table + unsigned char optLoopCount; // number of tracked loops + bool needsLoopAlignment; // true if at least one loop was identified as alignment candidate. #ifdef DEBUG unsigned char loopAlignCandidates; // number of loops identified for alignment @@ -8335,7 +8337,7 @@ class Compiler #if FEATURE_LOOP_ALIGN public: - alignBlocksList* alignBBLists; +// alignBlocksList* alignBBLists; #endif // FEATURE_LOOP_ALIGN diff --git a/src/coreclr/jit/compphases.h b/src/coreclr/jit/compphases.h index bb992440f8e3d..b8805fca75352 100644 --- a/src/coreclr/jit/compphases.h +++ b/src/coreclr/jit/compphases.h @@ -87,6 +87,7 @@ CompPhaseNameMacro(PHASE_INSERT_GC_POLLS, "Insert GC Polls", CompPhaseNameMacro(PHASE_DETERMINE_FIRST_COLD_BLOCK, "Determine first cold block", "COLD-BLK", false, -1, true) CompPhaseNameMacro(PHASE_RATIONALIZE, "Rationalize IR", "RAT", false, -1, false) CompPhaseNameMacro(PHASE_SIMPLE_LOWERING, "Do 'simple' lowering", "SMP-LWR", false, -1, false) +CompPhaseNameMacro(PHASE_ALIGN_LOOPS, "Place 'align' instructions", "LOOP-ALIGN", false, -1, false) CompPhaseNameMacro(PHASE_LCLVARLIVENESS, "Local var liveness", "LIVENESS", true, -1, false) CompPhaseNameMacro(PHASE_LCLVARLIVENESS_INIT, "Local var liveness init", "LIV-INIT", false, PHASE_LCLVARLIVENESS, false) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 89b6db13a3b1e..22463ff8d3d45 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -1404,7 +1404,7 @@ void* emitter::emitAllocAnyInstr(size_t sz, emitAttr opsz) // the prolog/epilog placeholder groups ARE generated in order, and are // re-used. But generating additional groups would not work. if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 1) && emitCurIGinsCnt && !emitIGisInProlog(emitCurIG) && - !emitIGisInEpilog(emitCurIG) && !emitCurIG->isLoopAlign() + !emitIGisInEpilog(emitCurIG) && !emitCurIG->endsWithAlignInstr() #if defined(FEATURE_EH_FUNCLETS) && !emitIGisInFuncletProlog(emitCurIG) && !emitIGisInFuncletEpilog(emitCurIG) #endif // FEATURE_EH_FUNCLETS @@ -4834,7 +4834,7 @@ void emitter::emitLoopAlign(unsigned short paddingBytes, bool isFirstAlign) { // If align fits in current IG, then mark that it contains alignment // instruction in the end. - emitCurIG->igFlags |= IGF_LOOP_ALIGN; + emitCurIG->igFlags |= IGF_HAS_ALIGN; } /* Insert a pseudo-instruction to ensure that we align @@ -4845,12 +4845,12 @@ void emitter::emitLoopAlign(unsigned short paddingBytes, bool isFirstAlign) { // Mark this IG has alignment in the end, so during emitter we can check the instruction count // heuristics of all IGs that follows this IG that participate in a loop. - emitCurIG->igFlags |= IGF_LOOP_ALIGN; + emitCurIG->igFlags |= IGF_HAS_ALIGN; } else { // Otherwise, make sure it was already marked such. - assert(emitCurIG->isLoopAlign()); + assert(emitCurIG->endsWithAlignInstr()); } #if defined(TARGET_XARCH) @@ -4957,7 +4957,7 @@ void emitter::emitConnectAlignInstrWithCurIG() // mark it as IGF_HAS_ALIGN to indicate that a next or a future // IG is a loop that needs alignment. // -void emitter::emitLoopAlignment(BasicBlock* blockToAlign) +void emitter::emitLoopAlignment() { unsigned short paddingBytes; @@ -4991,16 +4991,6 @@ void emitter::emitLoopAlignment(BasicBlock* blockToAlign) assert(emitLastIns->idIns() == INS_align); - - if (emitComp->opts.compJitHideAlignBehindJmp) - { - // Mark that the alignment for this block is handled. Useful when hiding align behind the jmp. - blockToAlign->setLoopAlignAdded(); - } - else - { - assert(blockToAlign == nullptr); - } JITDUMP("Adding 'align' instruction of %d bytes in %s.\n", paddingBytes, emitLabelString(emitCurIG)); #ifdef DEBUG @@ -5295,7 +5285,7 @@ void emitter::emitLoopAlignAdjustments() insGroup* containingIG = alignInstr->idaIG; JITDUMP(" Adjusting 'align' instruction in IG%02u that is targeted for IG%02u \n", containingIG->igNum, - alignIG->igNext->igNum); + alignIG->igNum); // Since we only adjust the padding up to the next align instruction which is behind the jump, we make sure // that we take into account all the alignBytes we removed until that point. Hence " - alignBytesRemoved" diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index befd161e810fe..23a7c7b11bac9 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1807,7 +1807,7 @@ class emitter unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG_ARG(bool isAlignAdjusted)); // Get the smallest loop size - void emitLoopAlignment(BasicBlock* blockToAlign); + void emitLoopAlignment(); bool emitEndsWithAlignInstr(); // Validate if newLabel is appropriate void emitSetLoopBackEdge(BasicBlock* loopTopBlock); void emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index e800ad69ffa41..9e8253520a3d8 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -10069,8 +10069,7 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) // then add "int3" instruction. Sinc int3 takes 1 byte, we would only add // it if paddingToAdd >= 1 byte. - if(emitComp->compStressCompile(Compiler::STRESS_EMITTER, 50) && - !validatePadding && paddingToAdd >= 1) + if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 50) && !validatePadding && paddingToAdd >= 1) { size_t int3Code = insCodeMR(INS_BREAKPOINT); // There is no good way to squeeze in "int3" as well as display it @@ -10079,7 +10078,7 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) // to the nop instruction in disasm. // e.g. CC align [1 bytes for IG29] // - //if (emitComp->opts.disAsm) + // if (emitComp->opts.disAsm) //{ // emitDispInsAddr(dstRW); diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index e94f0c60037c0..1108cd53cef96 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4526,10 +4526,6 @@ void Compiler::lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers, bool c const bool oldLvaGenericsContextInUse = lvaGenericsContextInUse; lvaGenericsContextInUse = false; -#if FEATURE_LOOP_ALIGN - alignBlocksList* alignBB = nullptr; -#endif - JITDUMP("\n*** lvaComputeRefCounts -- explicit counts ***\n"); // Second, account for all explicit local variable references @@ -4537,30 +4533,6 @@ void Compiler::lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers, bool c { if (block->IsLIR()) { -#if FEATURE_LOOP_ALIGN - if (calculateAlign) - { - // Track the blocks that needs alignment - // except if it is first block, because currently adding padding - // in prolog is not supported - if (opts.compJitHideAlignBehindJmp && block->isLoopAlign() && (block != fgFirstBB)) - { - alignBlocksList* curr = new (this, CMK_FlowList) alignBlocksList(block); - - if (alignBBLists == nullptr) - { - alignBBLists = curr; - } - else - { - alignBB->next = curr; - } - - alignBB = curr; - } - } -#endif - assert(isRecompute); const weight_t weight = block->getBBWeight(this); diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index e3f0218bd7ba8..afbd00bde863c 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -20,8 +20,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void Compiler::optInit() { - optLoopsMarked = false; - fgHasLoops = false; + optLoopsMarked = false; + fgHasLoops = false; + needsLoopAlignment = false; /* Initialize the # of tracked loops to 0 */ optLoopCount = 0; From 9b9b616f4dd18e72340e7b84ec4242e3a11849ff Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Fri, 29 Oct 2021 11:24:13 -0700 Subject: [PATCH 06/24] review comments --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/emit.cpp | 5 ++++- src/coreclr/jit/emit.h | 20 +++++++++----------- src/coreclr/jit/jitconfigvalues.h | 3 +-- src/coreclr/jit/lclvars.cpp | 4 +--- src/coreclr/jit/lower.cpp | 2 +- 6 files changed, 17 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index bfa6dfc2fe69f..748c169c34780 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3974,7 +3974,7 @@ class Compiler void lvaSortByRefCount(); void lvaMarkLocalVars(); // Local variable ref-counting - void lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers, bool calculateAlign = false); + void lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers); void lvaMarkLocalVars(BasicBlock* block, bool isRecompute); void lvaAllocOutgoingArgSpaceVar(); // Set up lvaOutgoingArgSpaceVar diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 22463ff8d3d45..db0ecb253a1b3 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -4892,6 +4892,9 @@ void emitter::emitLoopAlign(unsigned short paddingBytes, bool isFirstAlign) // This emits more than one `INS_align` instruction depending on the // alignmentBoundary parameter. // +// Arguments: +// alignmentBoundary - The boundary at which loop needs to be aligned. +// void emitter::emitLongLoopAlign(unsigned short alignmentBoundary) { #if defined(TARGET_XARCH) @@ -4926,7 +4929,7 @@ void emitter::emitLongLoopAlign(unsigned short alignmentBoundary) //----------------------------------------------------------------------------- // emitConnectAlignInstrWithCurIG: If "align" instruction is not just before the loop start, -// setting targetIG lets us know the exact IG that the "align" +// setting idaTargetIG lets us know the exact IG that the "align" // instruction is trying to align. This is used during loop size // calculation. // diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 23a7c7b11bac9..45acf5dd6be8a 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -275,13 +275,11 @@ struct insGroup #define IGF_PLACEHOLDER 0x0100 // this is a placeholder group, to be filled in later #define IGF_EXTEND 0x0200 // this block is conceptually an extension of the previous block // and the emitter should continue to track GC info as if there was no new block. -#define IGF_REMOVED_ALIGN 0x0400 // this group had alignment instruction(s) at the end, but was removed; - // Useful in scenario where an IG is part of a loop also contains align instruction +#define IGF_REMOVED_ALIGN 0x0400 // this group had alignment instruction(s) at the end, but they were marked unused. + // Useful in a scenario where an IG that is part of a loop also contains align instruction // for a different loop and later, we decide to not align that different loop. -#define IGF_HAS_ALIGN 0x0800 // this group contains an alignment instruction(s) in the end; - // the next IG may or may not be the head of a loop that is needing alignment - // This IG might end up with 'jmp' and hence 'align' instruction might be added - // in this IG, after the 'jmp' instruction. +#define IGF_HAS_ALIGN 0x0800 // this group contains an alignment instruction(s) at the end to align either the next + // IG, or, if this IG contains with an unconditional branch, some subsequent IG. // Mask of IGF_* flags that should be propagated to new blocks when they are created. // This allows prologs and epilogs to be any number of IGs, but still be @@ -354,12 +352,12 @@ struct insGroup return *(unsigned*)ptr; } - bool isAlignInstrRemoved() + bool isAlignInstrRemoved() const { return (igFlags & IGF_REMOVED_ALIGN) != 0; } - bool endsWithAlignInstr() + bool endsWithAlignInstr() const { return (igFlags & IGF_HAS_ALIGN) != 0; } @@ -1396,8 +1394,8 @@ class emitter instrDescAlign* idaNext; // next align in the group/method insGroup* idaIG; // containing group insGroup* idaTargetIG; // The IG before the loop IG. - // If no 'jmp' instructions were found until targetIG, - // the 'align' will be placed in this IG. + // If no 'jmp' instructions were found until idaTargetIG, + // then idaTargetIG == idaIG. void removeAlignFlags() { @@ -1802,7 +1800,7 @@ class emitter instrDescAlign* emitAlignLast; // last align instruction in method // Points to the most recent added align instruction. If there are multiple align instructions like in arm64 or - // non-adpative alignment on xarch, this points to the first align instruction of the series of align instructions. + // non-adaptive alignment on xarch, this points to the first align instruction of the series of align instructions. instrDescAlign* emitRecentFirstAlign; unsigned getLoopSize(insGroup* igLoopHeader, diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 976e58fe62d08..82a70f01e7a00 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -70,8 +70,7 @@ CONFIG_INTEGER(JitAlignLoopAdaptive, CONFIG_INTEGER(JitHideAlignBehindJmp, W("JitHideAlignBehindJmp"), 1) // If set, try to hide align instruction (if any) behind an unconditional jump instruction (if any) - // that - // is present before the loop start. + // that is present before the loop start. // Print the alignment boundaries in disassembly. CONFIG_INTEGER(JitDasmWithAlignmentBoundaries, W("JitDasmWithAlignmentBoundaries"), 0) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 1108cd53cef96..0c865355afe1e 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -4407,8 +4407,6 @@ void Compiler::lvaMarkLocalVars() // groundwork for assertion prop, check type consistency, etc. // See lvaMarkLclRefs for details on what else goes on. // setSlotNumbers -- true if local slot numbers should be assigned. -// calculateAlign -- Populate the list of alignBlocksList chaining all the -// loop head blocks that needs alignment. // // Notes: // Some implicit references are given actual counts or weight bumps here @@ -4421,7 +4419,7 @@ void Compiler::lvaMarkLocalVars() // When optimizing we also recompute lvaGenericsContextInUse based // on specially flagged LCL_VAR appearances. // -void Compiler::lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers, bool calculateAlign) +void Compiler::lvaComputeRefCounts(bool isRecompute, bool setSlotNumbers) { JITDUMP("\n*** lvaComputeRefCounts ***\n"); unsigned lclNum = 0; diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 3eea01d9fe989..b2d92cf6be5a8 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -6124,7 +6124,7 @@ PhaseStatus Lowering::DoPhase() // Recompute local var ref counts again after liveness to reflect // impact of any dead code removal. Note this may leave us with // tracked vars that have zero refs. - comp->lvaComputeRefCounts(isRecompute, setSlotNumbers, true); + comp->lvaComputeRefCounts(isRecompute, setSlotNumbers); return PhaseStatus::MODIFIED_EVERYTHING; } From 63029750326bc2b5f173e30f824de3b733e9b805 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Fri, 29 Oct 2021 11:29:21 -0700 Subject: [PATCH 07/24] Change from unsigned short to unsigned --- src/coreclr/jit/emit.cpp | 26 +++++++++++++------------- src/coreclr/jit/emit.h | 6 +++--- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index db0ecb253a1b3..95cb9456c4d36 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -4807,9 +4807,9 @@ void emitter::emitJumpDistBind() // Arguments: // nAlignInstr - Number of align instructions about to be added. // -void emitter::emitCheckAlignFitInCurIG(unsigned short nAlignInstr) +void emitter::emitCheckAlignFitInCurIG(unsigned nAlignInstr) { - unsigned short instrDescSize = nAlignInstr * sizeof(instrDescAlign); + unsigned instrDescSize = nAlignInstr * sizeof(instrDescAlign); // Ensure that all align instructions fall in same IG. if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp) @@ -4824,7 +4824,7 @@ void emitter::emitCheckAlignFitInCurIG(unsigned short nAlignInstr) // So insert an alignment instruction of "paddingBytes" to ensure that // the code is properly aligned. // -void emitter::emitLoopAlign(unsigned short paddingBytes, bool isFirstAlign) +void emitter::emitLoopAlign(unsigned paddingBytes, bool isFirstAlign) { // Determine if 'align' instruction about to be generated will // fall in current IG or next. @@ -4895,18 +4895,18 @@ void emitter::emitLoopAlign(unsigned short paddingBytes, bool isFirstAlign) // Arguments: // alignmentBoundary - The boundary at which loop needs to be aligned. // -void emitter::emitLongLoopAlign(unsigned short alignmentBoundary) +void emitter::emitLongLoopAlign(unsigned alignmentBoundary) { #if defined(TARGET_XARCH) - unsigned short nPaddingBytes = alignmentBoundary - 1; - unsigned short nAlignInstr = (nPaddingBytes + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE; - unsigned short insAlignCount = nPaddingBytes / MAX_ENCODED_SIZE; - unsigned short lastInsAlignSize = nPaddingBytes % MAX_ENCODED_SIZE; - unsigned short paddingBytes = MAX_ENCODED_SIZE; + unsigned nPaddingBytes = alignmentBoundary - 1; + unsigned nAlignInstr = (nPaddingBytes + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE; + unsigned insAlignCount = nPaddingBytes / MAX_ENCODED_SIZE; + unsigned lastInsAlignSize = nPaddingBytes % MAX_ENCODED_SIZE; + unsigned paddingBytes = MAX_ENCODED_SIZE; #elif defined(TARGET_ARM64) - unsigned short nAlignInstr = alignmentBoundary / INSTR_ENCODED_SIZE; - unsigned short insAlignCount = nAlignInstr; - unsigned short paddingBytes = INSTR_ENCODED_SIZE; + unsigned nAlignInstr = alignmentBoundary / INSTR_ENCODED_SIZE; + unsigned insAlignCount = nAlignInstr; + unsigned paddingBytes = INSTR_ENCODED_SIZE; #endif emitCheckAlignFitInCurIG(nAlignInstr); @@ -4962,7 +4962,7 @@ void emitter::emitConnectAlignInstrWithCurIG() // void emitter::emitLoopAlignment() { - unsigned short paddingBytes; + unsigned paddingBytes; #if defined(TARGET_XARCH) // For xarch, each align instruction can be maximum of MAX_ENCODED_SIZE bytes and if diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 45acf5dd6be8a..aab1d33f490ff 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1403,7 +1403,7 @@ class emitter idaIG->igFlags &= ~IGF_HAS_ALIGN; } }; - void emitCheckAlignFitInCurIG(unsigned short nAlignInstr); + void emitCheckAlignFitInCurIG(unsigned nAlignInstr); #endif // FEATURE_LOOP_ALIGN #if !defined(TARGET_ARM64) // This shouldn't be needed for ARM32, either, but I don't want to touch the ARM32 JIT. @@ -1811,8 +1811,8 @@ class emitter void emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool isAlignAdjusted)); - void emitLoopAlign(unsigned short paddingBytes, bool isFirstAlign); - void emitLongLoopAlign(unsigned short alignmentBoundary); + void emitLoopAlign(unsigned paddingBytes, bool isFirstAlign); + void emitLongLoopAlign(unsigned alignmentBoundary); instrDescAlign* emitAlignInNextIG(instrDescAlign* alignInstr); void emitConnectAlignInstrWithCurIG(); From d20da6d2de866c453d9e16569879b658dfdaaed3 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Wed, 10 Nov 2021 08:36:19 -0800 Subject: [PATCH 08/24] review comments around cleanup --- src/coreclr/jit/emit.cpp | 31 ++++++++++++++----------------- src/coreclr/jit/emit.h | 2 +- src/coreclr/jit/emitarm64.cpp | 4 ++-- src/coreclr/jit/emitxarch.cpp | 2 +- 4 files changed, 18 insertions(+), 21 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 95cb9456c4d36..f990e7eb56236 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -915,7 +915,7 @@ insGroup* emitter::emitSavIG(bool emitAdd) emitAlignLast = last; // Point to the first instruction of most recent // align instruction(s) added. - emitRecentFirstAlign = list; + emitAlignLastGroup = list; } #endif @@ -1074,7 +1074,7 @@ void emitter::emitBegFN(bool hasFramePtr #if FEATURE_LOOP_ALIGN /* We don't have any align instructions */ - emitAlignList = emitRecentFirstAlign = emitAlignLast = nullptr; + emitAlignList = emitAlignLastGroup = emitAlignLast = nullptr; emitCurIGAlignList = nullptr; #endif @@ -4867,7 +4867,7 @@ void emitter::emitLoopAlign(unsigned paddingBytes, bool isFirstAlign) // For multiple align instructions, set the targetIG only for the // first align instruction id->idaTargetIG = emitCurIG; - emitRecentFirstAlign = id; + emitAlignLastGroup = id; } else { @@ -4935,7 +4935,7 @@ void emitter::emitLongLoopAlign(unsigned alignmentBoundary) // void emitter::emitConnectAlignInstrWithCurIG() { - JITDUMP("Mapping 'align' instruction in IG%02u to target IG%02u\n", emitRecentFirstAlign->idaIG->igNum, + JITDUMP("Mapping 'align' instruction in IG%02u to target IG%02u\n", emitAlignLastGroup->idaIG->igNum, emitCurIG->igNum); // Since we never align overlapping instructions, it is always guaranteed that // the emitRecentFirstAlign points to the loop that is in process of getting aligned. @@ -4947,11 +4947,11 @@ void emitter::emitConnectAlignInstrWithCurIG() // pointing to previous IG. assert(emitPrevIG != emitCurIG); - emitRecentFirstAlign->idaTargetIG = emitPrevIG; + emitAlignLastGroup->idaTargetIG = emitPrevIG; } else { - emitRecentFirstAlign->idaTargetIG = emitCurIG; + emitAlignLastGroup->idaTargetIG = emitCurIG; } } @@ -5582,24 +5582,21 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs return paddingToAdd; } -// emitAlignInNextIG: On xarch, for adaptive alignment, this will usually point to next instruction in +// emitAlignInNextIG: On xarch, for adaptive alignment, this will usually return the next instruction in // 'emitAlignList'. But for arm64 or non-adaptive alignment on xarch, where multiple // align instructions are emitted, this method will skip the 'align' instruction present // in the same IG and return the first instruction that is present in next IG. +// Arguments: +// alignInstr - Current 'align' instruction for which next IG's first 'align' should be returned. // emitter::instrDescAlign* emitter::emitAlignInNextIG(instrDescAlign* alignInstr) { -#if defined(TARGET_XARCH) - if (!emitComp->opts.compJitAlignLoopAdaptive) -#endif + // If there are multiple align instructions, skip the align instructions after + // the first align instruction and fast forward to the next IG + insGroup* alignIG = alignInstr->idaIG; + while ((alignInstr != nullptr) && (alignInstr->idaNext != nullptr) && (alignInstr->idaNext->idaIG == alignIG)) { - // If there are multiple align instructions, skip the align instructions after - // the first align instruction and fast forward to the next IG - insGroup* alignIG = alignInstr->idaIG; - while ((alignInstr != nullptr) && (alignInstr->idaNext != nullptr) && (alignInstr->idaNext->idaIG == alignIG)) - { - alignInstr = alignInstr->idaNext; - } + alignInstr = alignInstr->idaNext; } return alignInstr != nullptr ? alignInstr->idaNext : nullptr; } diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index aab1d33f490ff..853e5e8b2431d 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1801,7 +1801,7 @@ class emitter // Points to the most recent added align instruction. If there are multiple align instructions like in arm64 or // non-adaptive alignment on xarch, this points to the first align instruction of the series of align instructions. - instrDescAlign* emitRecentFirstAlign; + instrDescAlign* emitAlignLastGroup; unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG_ARG(bool isAlignAdjusted)); // Get the smallest loop size diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 989cb6accfbe9..290fedfc2fc18 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -11459,10 +11459,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // then add "bkpt" instruction. instrDescAlign* alignInstr = (instrDescAlign*)id; - if (/*emitComp->compStressCompile(Compiler::STRESS_EMITTER, 50) &&*/ + if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 50) && (alignInstr->idaIG != alignInstr->idaTargetIG) && !skipIns) { - // There is no good way to squeeze in "int3" as well as display it + // There is no good way to squeeze in "bkpt" as well as display it // in the disassembly because there is no corresponding instrDesc for // it. As such, leave it as is, the "0xD43E0000" bytecode will be seen // next to the nop instruction in disasm. diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 9e8253520a3d8..054aefc4ccba5 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -10066,7 +10066,7 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) #ifdef DEBUG // Under STRESS_EMITTER, if this is the 'align' before the 'jmp' instruction, - // then add "int3" instruction. Sinc int3 takes 1 byte, we would only add + // then add "int3" instruction. Since int3 takes 1 byte, we would only add // it if paddingToAdd >= 1 byte. if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 50) && !validatePadding && paddingToAdd >= 1) From c6a2d7061f736a0bddaa7f66e9a0e9a5692c400a Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Wed, 10 Nov 2021 09:38:47 -0800 Subject: [PATCH 09/24] emitForceNewIG --- src/coreclr/jit/emit.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index f990e7eb56236..185e0d201e796 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -4809,13 +4809,14 @@ void emitter::emitJumpDistBind() // void emitter::emitCheckAlignFitInCurIG(unsigned nAlignInstr) { - unsigned instrDescSize = nAlignInstr * sizeof(instrDescAlign); + //unsigned instrDescSize = nAlignInstr * sizeof(instrDescAlign); - // Ensure that all align instructions fall in same IG. - if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp) - { - emitForceNewIG = true; - } + //// Ensure that all align instructions fall in same IG. + //if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp) + //{ + // emitForceNewIG = true; + //} + emitForceNewIG = true; } //----------------------------------------------------------------------------- @@ -5032,8 +5033,9 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG for (insGroup* igInLoop = igLoopHeader; igInLoop != nullptr; igInLoop = igInLoop->igNext) { loopSize += igInLoop->igSize; - if (igInLoop->endsWithAlignInstr() || igInLoop->isAlignInstrRemoved()) + if (igInLoop->endsWithAlignInstr() /*|| igInLoop->isAlignInstrRemoved()*/) { + //assert(false); // If igInLoop can be in one of the following state: // IGF_HAS_ALIGN - igInLoop contains align instruction at the end, for next IG or some future IG. // IGF_REMOVED_ALIGN - igInLoop had align instruction at the end, but was removed. From e9c5eec465a22a06c8bf0da75572f84d69b2e599 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Wed, 10 Nov 2021 14:13:44 -0800 Subject: [PATCH 10/24] Remove emitPrevIG --- src/coreclr/jit/emit.cpp | 29 ++++++----------------------- src/coreclr/jit/emit.h | 14 +------------- 2 files changed, 7 insertions(+), 36 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 185e0d201e796..45711988364b3 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -4941,19 +4941,10 @@ void emitter::emitConnectAlignInstrWithCurIG() // Since we never align overlapping instructions, it is always guaranteed that // the emitRecentFirstAlign points to the loop that is in process of getting aligned. - if (emitCurIGsize == 0) - { - // However, if current IG is still empty (e.g. codegen didn't generate moves if they were redundant) - // in that case, loop will start in emitCurIG itself. For such cases, make sure that targetIG is - // pointing to previous IG. - assert(emitPrevIG != emitCurIG); + emitAlignLastGroup->idaTargetIG = emitCurIG; - emitAlignLastGroup->idaTargetIG = emitPrevIG; - } - else - { - emitAlignLastGroup->idaTargetIG = emitCurIG; - } + // For a new IG to ensure that loop doesn't start from IG that idaTargetIG points to. + emitNxtIG(); } //----------------------------------------------------------------------------- @@ -5033,12 +5024,10 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG for (insGroup* igInLoop = igLoopHeader; igInLoop != nullptr; igInLoop = igInLoop->igNext) { loopSize += igInLoop->igSize; - if (igInLoop->endsWithAlignInstr() /*|| igInLoop->isAlignInstrRemoved()*/) + if (igInLoop->endsWithAlignInstr()) { - //assert(false); - // If igInLoop can be in one of the following state: - // IGF_HAS_ALIGN - igInLoop contains align instruction at the end, for next IG or some future IG. - // IGF_REMOVED_ALIGN - igInLoop had align instruction at the end, but was removed. + // If IGF_HAS_ALIGN is present, igInLoop contains align instruction at the end, + // for next IG or some future IG. // // For both cases, remove the padding bytes from igInLoop's size so it is not included in loopSize. // @@ -8427,12 +8416,6 @@ insGroup* emitter::emitAllocAndLinkIG() ig->igFlags |= (emitCurIG->igFlags & IGF_PROPAGATE_MASK); -#ifdef FEATURE_LOOP_ALIGN - /* Save the prev IG */ - - emitPrevIG = emitCurIG; -#endif - /* Set the new IG as the current IG */ emitCurIG = ig; diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 853e5e8b2431d..c77831099420e 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -275,10 +275,7 @@ struct insGroup #define IGF_PLACEHOLDER 0x0100 // this is a placeholder group, to be filled in later #define IGF_EXTEND 0x0200 // this block is conceptually an extension of the previous block // and the emitter should continue to track GC info as if there was no new block. -#define IGF_REMOVED_ALIGN 0x0400 // this group had alignment instruction(s) at the end, but they were marked unused. - // Useful in a scenario where an IG that is part of a loop also contains align instruction - // for a different loop and later, we decide to not align that different loop. -#define IGF_HAS_ALIGN 0x0800 // this group contains an alignment instruction(s) at the end to align either the next +#define IGF_HAS_ALIGN 0x0400 // this group contains an alignment instruction(s) at the end to align either the next // IG, or, if this IG contains with an unconditional branch, some subsequent IG. // Mask of IGF_* flags that should be propagated to new blocks when they are created. @@ -352,11 +349,6 @@ struct insGroup return *(unsigned*)ptr; } - bool isAlignInstrRemoved() const - { - return (igFlags & IGF_REMOVED_ALIGN) != 0; - } - bool endsWithAlignInstr() const { return (igFlags & IGF_HAS_ALIGN) != 0; @@ -1399,7 +1391,6 @@ class emitter void removeAlignFlags() { - idaIG->igFlags |= IGF_REMOVED_ALIGN; idaIG->igFlags &= ~IGF_HAS_ALIGN; } }; @@ -1739,9 +1730,6 @@ class emitter bool emitChkAlign; // perform some alignment checks #endif -#ifdef FEATURE_LOOP_ALIGN - insGroup* emitPrevIG; -#endif insGroup* emitCurIG; void emitSetShortJump(instrDescJmp* id); From c1c5db3f06beab2e57c9517b8ed15cbf9b2fbfc8 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Wed, 10 Nov 2021 14:27:49 -0800 Subject: [PATCH 11/24] Revert change to forceNewIG for align instruction --- src/coreclr/jit/emit.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 45711988364b3..88307bb5ea888 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -4809,14 +4809,13 @@ void emitter::emitJumpDistBind() // void emitter::emitCheckAlignFitInCurIG(unsigned nAlignInstr) { - //unsigned instrDescSize = nAlignInstr * sizeof(instrDescAlign); - - //// Ensure that all align instructions fall in same IG. - //if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp) - //{ - // emitForceNewIG = true; - //} - emitForceNewIG = true; + unsigned instrDescSize = nAlignInstr * sizeof(instrDescAlign); + + // Ensure that all align instructions fall in same IG. + if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp) + { + emitForceNewIG = true; + } } //----------------------------------------------------------------------------- From b8a9742f09b5c63e42e4601a4b3cee9235aee687 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Wed, 10 Nov 2021 23:14:05 -0800 Subject: [PATCH 12/24] Use loopAlignCandidates --- src/coreclr/jit/compiler.cpp | 4 ++-- src/coreclr/jit/compiler.h | 3 +-- src/coreclr/jit/emit.cpp | 4 ---- src/coreclr/jit/morph.cpp | 10 +++++++--- src/coreclr/jit/optimizer.cpp | 17 +++++++++++------ 5 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index f8eac9a64b2c9..7064474bb2623 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -5155,7 +5155,7 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl #endif #if FEATURE_LOOP_ALIGN - if (needsLoopAlignment) + if (loopAlignCandidates > 0) { // Place loop alignment instructions DoPhase(this, PHASE_ALIGN_LOOPS, &Compiler::bbPlaceLoopAlignInstructions); @@ -5231,7 +5231,7 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl // void Compiler::bbPlaceLoopAlignInstructions() { - assert(needsLoopAlignment); + assert(loopAlignCandidates > 0); // Add align only if there were any loops that needed alignment weight_t minBlockSoFar = INFINITE; diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 748c169c34780..f85a66be9d6ad 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6874,10 +6874,9 @@ class Compiler public: LoopDsc* optLoopTable; // loop descriptor table unsigned char optLoopCount; // number of tracked loops - bool needsLoopAlignment; // true if at least one loop was identified as alignment candidate. + unsigned char loopAlignCandidates; // number of loops identified for alignment #ifdef DEBUG - unsigned char loopAlignCandidates; // number of loops identified for alignment unsigned char loopsAligned; // number of loops actually aligned #endif // DEBUG diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 88307bb5ea888..4518d5dc31c51 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -4986,10 +4986,6 @@ void emitter::emitLoopAlignment() assert(emitLastIns->idIns() == INS_align); JITDUMP("Adding 'align' instruction of %d bytes in %s.\n", paddingBytes, emitLabelString(emitCurIG)); - -#ifdef DEBUG - emitComp->loopAlignCandidates++; -#endif // DEBUG } //----------------------------------------------------------------------------- diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 4c18c7f918822..b217c5cfbcd91 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -15321,9 +15321,13 @@ bool Compiler::fgFoldConditional(BasicBlock* block) optLoopTable[loopNum].lpFlags |= LPFLG_REMOVED; #if FEATURE_LOOP_ALIGN - optLoopTable[loopNum].lpTop->bbFlags &= ~BBF_LOOP_ALIGN; - JITDUMP("Removing LOOP_ALIGN flag from bogus loop in " FMT_BB "\n", - optLoopTable[loopNum].lpTop->bbNum); + if (optLoopTable[loopNum].lpTop->isLoopAlign()) + { + loopAlignCandidates--; + optLoopTable[loopNum].lpTop->bbFlags &= ~BBF_LOOP_ALIGN; + JITDUMP("Removing LOOP_ALIGN flag from bogus loop in " FMT_BB "\n", + optLoopTable[loopNum].lpTop->bbNum); + } #endif #ifdef DEBUG diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index afbd00bde863c..29df623acaffd 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -22,14 +22,13 @@ void Compiler::optInit() { optLoopsMarked = false; fgHasLoops = false; - needsLoopAlignment = false; + loopAlignCandidates = 0; /* Initialize the # of tracked loops to 0 */ optLoopCount = 0; optLoopTable = nullptr; #ifdef DEBUG - loopAlignCandidates = 0; loopsAligned = 0; #endif @@ -384,6 +383,7 @@ void Compiler::optUnmarkLoopBlocks(BasicBlock* begBlk, BasicBlock* endBlk) if (begBlk->isLoopAlign()) { // Clear the loop alignment bit on the head of a loop, since it's no longer a loop. + loopAlignCandidates--; begBlk->bbFlags &= ~BBF_LOOP_ALIGN; JITDUMP("Removing LOOP_ALIGN flag from removed loop in " FMT_BB "\n", begBlk->bbNum); } @@ -2517,7 +2517,7 @@ void Compiler::optIdentifyLoopsForAlignment() weight_t topWeight = top->getBBWeight(this); if (topWeight >= (opts.compJitAlignLoopMinBlockWeight * BB_UNITY_WEIGHT)) { - top->bbFlags |= BBF_LOOP_ALIGN; + loopAlignCandidates++; JITDUMP(FMT_LP " that starts at " FMT_BB " needs alignment, weight=" FMT_WT ".\n", loopInd, top->bbNum, topWeight); } @@ -3782,6 +3782,7 @@ PhaseStatus Compiler::optUnrollLoops() { if (block->isLoopAlign()) { + loopAlignCandidates--; block->bbFlags &= ~BBF_LOOP_ALIGN; JITDUMP("Removing LOOP_ALIGN flag from unrolled loop in " FMT_BB "\n", block->bbNum); } @@ -7618,9 +7619,13 @@ void Compiler::AddContainsCallAllContainingLoops(unsigned lnum) if (optLoopTable[lnum].lpChild == BasicBlock::NOT_IN_LOOP) { BasicBlock* top = optLoopTable[lnum].lpTop; - top->bbFlags &= ~BBF_LOOP_ALIGN; - JITDUMP("Removing LOOP_ALIGN flag for " FMT_LP " that starts at " FMT_BB " because loop has a call.\n", lnum, - top->bbNum); + if (top->isLoopAlign()) + { + loopAlignCandidates--; + top->bbFlags &= ~BBF_LOOP_ALIGN; + JITDUMP("Removing LOOP_ALIGN flag for " FMT_LP " that starts at " FMT_BB " because loop has a call.\n", + lnum, top->bbNum); + } } #endif From db98ec2f30e0d020855f2f8c8c0da496da28aeb3 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 11 Nov 2021 00:03:26 -0800 Subject: [PATCH 13/24] Use loopHeadIG reference --- src/coreclr/jit/codegenlinear.cpp | 2 +- src/coreclr/jit/emit.cpp | 61 +++++++++++++++++-------------- src/coreclr/jit/emit.h | 16 +++++--- src/coreclr/jit/emitarm64.cpp | 6 +-- src/coreclr/jit/emitxarch.cpp | 8 ++-- 5 files changed, 52 insertions(+), 41 deletions(-) diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 590b92cd8f03a..933d9c5f6de37 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -803,7 +803,7 @@ void CodeGen::genCodeForBBlist() { // The current IG is the one that is just before the IG having loop start. // Establish a connection of recent align instruction emitted to the loop - // it actually is aligning using 'idaTargetIG'. + // it actually is aligning using 'idaLoopHeadPredIG'. GetEmitter()->emitConnectAlignInstrWithCurIG(); } } diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 4518d5dc31c51..89b8866bbd455 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -4864,14 +4864,14 @@ void emitter::emitLoopAlign(unsigned paddingBytes, bool isFirstAlign) if (isFirstAlign) { - // For multiple align instructions, set the targetIG only for the + // For multiple align instructions, set the idaLoopHeadPredIG only for the // first align instruction - id->idaTargetIG = emitCurIG; + id->idaLoopHeadPredIG = emitCurIG; emitAlignLastGroup = id; } else { - id->idaTargetIG = nullptr; + id->idaLoopHeadPredIG = nullptr; } /* Append this instruction to this IG's alignment list */ @@ -4929,20 +4929,23 @@ void emitter::emitLongLoopAlign(unsigned alignmentBoundary) //----------------------------------------------------------------------------- // emitConnectAlignInstrWithCurIG: If "align" instruction is not just before the loop start, -// setting idaTargetIG lets us know the exact IG that the "align" -// instruction is trying to align. This is used during loop size -// calculation. +// setting idaLoopHeadPredIG lets us know the exact IG that the "align" +// instruction is trying to align. This is used to track the last IG that +// needs alignment after which VEX encoding optimization is enabled. +// +// TODO: Once over-estimation problem is solved, consider replacing +// idaLoopHeadPredIG with idaLoopHeadIG itself. // void emitter::emitConnectAlignInstrWithCurIG() { JITDUMP("Mapping 'align' instruction in IG%02u to target IG%02u\n", emitAlignLastGroup->idaIG->igNum, emitCurIG->igNum); // Since we never align overlapping instructions, it is always guaranteed that - // the emitRecentFirstAlign points to the loop that is in process of getting aligned. + // the emitAlignLastGroup points to the loop that is in process of getting aligned. - emitAlignLastGroup->idaTargetIG = emitCurIG; + emitAlignLastGroup->idaLoopHeadPredIG = emitCurIG; - // For a new IG to ensure that loop doesn't start from IG that idaTargetIG points to. + // For a new IG to ensure that loop doesn't start from IG that idaLoopHeadPredIG points to. emitNxtIG(); } @@ -5192,7 +5195,7 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) { // Find the IG that has 'align' instruction to align the current loop // and clear the IGF_HAS_ALIGN flag. - if (!alignCurrentLoop && (alignInstr->idaTargetIG->igNext == dstIG)) + if (!alignCurrentLoop && (alignInstr->loopHeadIG() == dstIG)) { assert(!markedCurrLoop); @@ -5207,8 +5210,8 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) // Find the IG that has 'align' instruction to align the last loop // and clear the IGF_HAS_ALIGN flag. - if (!alignLastLoop && (alignInstr->idaTargetIG->igNext != nullptr) && - (alignInstr->idaTargetIG->igNext->igNum == emitLastLoopStart)) + if (!alignLastLoop && (alignInstr->loopHeadIG() != nullptr) && + (alignInstr->loopHeadIG()->igNum == emitLastLoopStart)) { assert(!markedLastLoop); assert(alignInstr->idaIG->endsWithAlignInstr()); @@ -5270,16 +5273,17 @@ void emitter::emitLoopAlignAdjustments() { assert(alignInstr->idIns() == INS_align); - insGroup* alignIG = alignInstr->idaTargetIG; + insGroup* loopHeadPredIG = alignInstr->idaLoopHeadPredIG; + insGroup* loopHeadIG = alignInstr->loopHeadIG(); insGroup* containingIG = alignInstr->idaIG; JITDUMP(" Adjusting 'align' instruction in IG%02u that is targeted for IG%02u \n", containingIG->igNum, - alignIG->igNum); + loopHeadIG->igNum); // Since we only adjust the padding up to the next align instruction which is behind the jump, we make sure // that we take into account all the alignBytes we removed until that point. Hence " - alignBytesRemoved" - loopIGOffset = alignIG->igNext->igOffs - alignBytesRemoved; + loopIGOffset = loopHeadIG->igOffs - alignBytesRemoved; // igSize also includes INS_align instruction, take it off. loopIGOffset -= estimatedPaddingNeeded; @@ -5289,7 +5293,7 @@ void emitter::emitLoopAlignAdjustments() unsigned actualPaddingNeeded = containingIG->endsWithAlignInstr() - ? emitCalculatePaddingForLoopAlignment(alignIG, loopIGOffset DEBUG_ARG(false)) + ? emitCalculatePaddingForLoopAlignment(loopHeadIG, loopIGOffset DEBUG_ARG(false)) : 0; assert(estimatedPaddingNeeded >= actualPaddingNeeded); @@ -5358,7 +5362,7 @@ void emitter::emitLoopAlignAdjustments() assert(instrAdjusted == 0); } - JITDUMP("Adjusted alignment of %s from %u to %u.\n", emitLabelString(alignIG), estimatedPaddingNeeded, + JITDUMP("Adjusted alignment for %s from %u to %u.\n", emitLabelString(loopHeadIG), estimatedPaddingNeeded, actualPaddingNeeded); JITDUMP("Adjusted size of %s from %u to %u.\n", emitLabelString(containingIG), (containingIG->igSize + diff), containingIG->igSize); @@ -5383,7 +5387,8 @@ void emitter::emitLoopAlignAdjustments() { // Record the last loop IG that will be aligned. No overestimation // adjustment will be done after emitLastAlignedIgNum. - emitLastAlignedIgNum = alignIG->igNum; + JITDUMP("Recording last aligned IG: %s\n", emitLabelString(loopHeadPredIG)); + emitLastAlignedIgNum = loopHeadPredIG->igNum; } } @@ -5397,7 +5402,7 @@ void emitter::emitLoopAlignAdjustments() // end of 'ig' so the loop that starts after 'ig' is aligned. // // Arguments: -// ig - The IG before the IG that has the loop head that need to be aligned. +// loopHeadIG - The IG that has the loop head that need to be aligned. // offset - The offset at which the IG that follows 'ig' starts. // isAlignAdjusted - Determine if adjustments are done to the align instructions or not. // During generating code, it is 'false' (because we haven't adjusted the size yet). @@ -5425,14 +5430,14 @@ void emitter::emitLoopAlignAdjustments() // 3b. If the loop already fits in minimum alignmentBoundary blocks, then return 0. // already best aligned // 3c. return paddingNeeded. // -unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool isAlignAdjusted)) +unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* loopHeadIG, size_t offset DEBUG_ARG(bool isAlignAdjusted)) { unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary; // No padding if loop is already aligned if ((offset & (alignmentBoundary - 1)) == 0) { - JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %dB boundary.'\n", emitLabelString(ig->igNext), + JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %dB boundary.'\n", emitLabelString(loopHeadIG), alignmentBoundary); return 0; } @@ -5452,12 +5457,12 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs maxLoopSize = emitComp->opts.compJitAlignLoopMaxCodeSize; } - unsigned loopSize = getLoopSize(ig->igNext, maxLoopSize DEBUG_ARG(isAlignAdjusted)); + unsigned loopSize = getLoopSize(loopHeadIG, maxLoopSize DEBUG_ARG(isAlignAdjusted)); // No padding if loop is big if (loopSize > maxLoopSize) { - JITDUMP(";; Skip alignment: 'Loop at %s is big. LoopSize= %d, MaxLoopSize= %d.'\n", emitLabelString(ig->igNext), + JITDUMP(";; Skip alignment: 'Loop at %s is big. LoopSize= %d, MaxLoopSize= %d.'\n", emitLabelString(loopHeadIG), loopSize, maxLoopSize); return 0; } @@ -5494,7 +5499,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs { skipPadding = true; JITDUMP(";; Skip alignment: 'Loop at %s already aligned at %uB boundary.'\n", - emitLabelString(ig->igNext), alignmentBoundary); + emitLabelString(loopHeadIG), alignmentBoundary); } // Check if the alignment exceeds new maxPadding limit else if (nPaddingBytes > nMaxPaddingBytes) @@ -5502,7 +5507,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs skipPadding = true; JITDUMP(";; Skip alignment: 'Loop at %s PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, " "AlignmentBoundary= %dB.'\n", - emitLabelString(ig->igNext), nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary); + emitLabelString(loopHeadIG), nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary); } } @@ -5525,7 +5530,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs { // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. JITDUMP(";; Skip alignment: 'Loop at %s is aligned to fit in %d blocks of %d chunks.'\n", - emitLabelString(ig->igNext), minBlocksNeededForLoop, alignmentBoundary); + emitLabelString(loopHeadIG), minBlocksNeededForLoop, alignmentBoundary); } } } @@ -5554,12 +5559,12 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs { // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. JITDUMP(";; Skip alignment: 'Loop at %s is aligned to fit in %d blocks of %d chunks.'\n", - emitLabelString(ig->igNext), minBlocksNeededForLoop, alignmentBoundary); + emitLabelString(loopHeadIG), minBlocksNeededForLoop, alignmentBoundary); } } JITDUMP(";; Calculated padding to add %d bytes to align %s at %dB boundary.\n", paddingToAdd, - emitLabelString(ig->igNext), alignmentBoundary); + emitLabelString(loopHeadIG), alignmentBoundary); // Either no padding is added because it is too expensive or the offset gets aligned // to the alignment boundary diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index c77831099420e..2d806adb7a2c9 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1383,11 +1383,17 @@ class emitter #if FEATURE_LOOP_ALIGN struct instrDescAlign : instrDesc { - instrDescAlign* idaNext; // next align in the group/method - insGroup* idaIG; // containing group - insGroup* idaTargetIG; // The IG before the loop IG. - // If no 'jmp' instructions were found until idaTargetIG, - // then idaTargetIG == idaIG. + instrDescAlign* idaNext; // next align in the group/method + insGroup* idaIG; // containing group + insGroup* idaLoopHeadPredIG; // The IG before the loop IG. + // If no 'jmp' instructions were found until idaLoopHeadPredIG, + // then idaLoopHeadPredIG == idaIG. + + inline insGroup* loopHeadIG() + { + assert(idaLoopHeadPredIG); + return idaLoopHeadPredIG->igNext; + } void removeAlignFlags() { diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 290fedfc2fc18..c43abe1f14436 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -11460,7 +11460,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) instrDescAlign* alignInstr = (instrDescAlign*)id; if (emitComp->compStressCompile(Compiler::STRESS_EMITTER, 50) && - (alignInstr->idaIG != alignInstr->idaTargetIG) && !skipIns) + (alignInstr->idaIG != alignInstr->idaLoopHeadPredIG) && !skipIns) { // There is no good way to squeeze in "bkpt" as well as display it // in the disassembly because there is no corresponding instrDesc for @@ -13351,9 +13351,9 @@ void emitter::emitDispIns( printf("[%d bytes", id->idIsEmptyAlign() ? 0 : INSTR_ENCODED_SIZE); // targetIG is only set for 1st of the series of align instruction - if ((alignInstrId->idaTargetIG != nullptr) && (alignInstrId->idaTargetIG->igNext != nullptr)) + if ((alignInstrId->idaLoopHeadPredIG != nullptr) && (alignInstrId->loopHeadIG() != nullptr)) { - printf(" for IG%02u", alignInstrId->idaTargetIG->igNext->igNum); + printf(" for IG%02u", alignInstrId->loopHeadIG()->igNum); } printf("]"); } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 054aefc4ccba5..1a95d7bc10e3a 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -9833,9 +9833,9 @@ void emitter::emitDispIns( instrDescAlign* alignInstrId = (instrDescAlign*)id; printf("[%d bytes", alignInstrId->idCodeSize()); // targetIG is only set for 1st of the series of align instruction - if ((alignInstrId->idaTargetIG != nullptr) && (alignInstrId->idaTargetIG->igNext != nullptr)) + if ((alignInstrId->idaLoopHeadPredIG != nullptr) && (alignInstrId->loopHeadIG() != nullptr)) { - printf(" for IG%02u", alignInstrId->idaTargetIG->igNext->igNum); + printf(" for IG%02u", alignInstrId->loopHeadIG()->igNum); } printf("]"); } @@ -10031,7 +10031,7 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) // For cases where 'align' was placed behing a 'jmp' in an IG that does not // immediately preced the loop IG, we do not know in advance the offset of // IG having loop. For such cases, skip the padding calculation validation. - bool validatePadding = alignInstr->idaIG == alignInstr->idaTargetIG; + bool validatePadding = alignInstr->idaIG == alignInstr->idaLoopHeadPredIG; #endif // Candidate for loop alignment @@ -10050,7 +10050,7 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) #ifdef DEBUG if (validatePadding) { - unsigned paddingNeeded = emitCalculatePaddingForLoopAlignment(((instrDescAlign*)id)->idaIG, (size_t)dst, true); + unsigned paddingNeeded = emitCalculatePaddingForLoopAlignment(((instrDescAlign*)id)->idaIG->igNext, (size_t)dst, true); // For non-adaptive, padding size is spread in multiple instructions, so don't bother checking if (emitComp->opts.compJitAlignLoopAdaptive) From 5ab9edcfcc197078d10ce8c2c845d0604ecf86a4 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 11 Nov 2021 00:37:14 -0800 Subject: [PATCH 14/24] jit format --- src/coreclr/jit/compiler.h | 8 ++++---- src/coreclr/jit/emit.cpp | 11 ++++++----- src/coreclr/jit/emitxarch.cpp | 3 ++- src/coreclr/jit/optimizer.cpp | 6 +++--- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index f85a66be9d6ad..3c386e9fe1f20 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6872,13 +6872,13 @@ class Compiler bool fgHasLoops; // True if this method has any loops, set in fgComputeReachability public: - LoopDsc* optLoopTable; // loop descriptor table - unsigned char optLoopCount; // number of tracked loops + LoopDsc* optLoopTable; // loop descriptor table + unsigned char optLoopCount; // number of tracked loops unsigned char loopAlignCandidates; // number of loops identified for alignment #ifdef DEBUG - unsigned char loopsAligned; // number of loops actually aligned -#endif // DEBUG + unsigned char loopsAligned; // number of loops actually aligned +#endif // DEBUG bool optRecordLoop(BasicBlock* head, BasicBlock* top, diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 89b8866bbd455..307119bbd7002 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -1075,7 +1075,7 @@ void emitter::emitBegFN(bool hasFramePtr /* We don't have any align instructions */ emitAlignList = emitAlignLastGroup = emitAlignLast = nullptr; - emitCurIGAlignList = nullptr; + emitCurIGAlignList = nullptr; #endif /* We have not recorded any live sets */ @@ -4867,7 +4867,7 @@ void emitter::emitLoopAlign(unsigned paddingBytes, bool isFirstAlign) // For multiple align instructions, set the idaLoopHeadPredIG only for the // first align instruction id->idaLoopHeadPredIG = emitCurIG; - emitAlignLastGroup = id; + emitAlignLastGroup = id; } else { @@ -5275,7 +5275,7 @@ void emitter::emitLoopAlignAdjustments() insGroup* loopHeadPredIG = alignInstr->idaLoopHeadPredIG; insGroup* loopHeadIG = alignInstr->loopHeadIG(); - insGroup* containingIG = alignInstr->idaIG; + insGroup* containingIG = alignInstr->idaIG; JITDUMP(" Adjusting 'align' instruction in IG%02u that is targeted for IG%02u \n", containingIG->igNum, loopHeadIG->igNum); @@ -5430,7 +5430,8 @@ void emitter::emitLoopAlignAdjustments() // 3b. If the loop already fits in minimum alignmentBoundary blocks, then return 0. // already best aligned // 3c. return paddingNeeded. // -unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* loopHeadIG, size_t offset DEBUG_ARG(bool isAlignAdjusted)) +unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* loopHeadIG, + size_t offset DEBUG_ARG(bool isAlignAdjusted)) { unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary; @@ -5578,7 +5579,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* loopHeadIG, siz // align instructions are emitted, this method will skip the 'align' instruction present // in the same IG and return the first instruction that is present in next IG. // Arguments: -// alignInstr - Current 'align' instruction for which next IG's first 'align' should be returned. +// alignInstr - Current 'align' instruction for which next IG's first 'align' should be returned. // emitter::instrDescAlign* emitter::emitAlignInNextIG(instrDescAlign* alignInstr) { diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 1a95d7bc10e3a..6f922c75478c3 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -10050,7 +10050,8 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) #ifdef DEBUG if (validatePadding) { - unsigned paddingNeeded = emitCalculatePaddingForLoopAlignment(((instrDescAlign*)id)->idaIG->igNext, (size_t)dst, true); + unsigned paddingNeeded = + emitCalculatePaddingForLoopAlignment(((instrDescAlign*)id)->idaIG->igNext, (size_t)dst, true); // For non-adaptive, padding size is spread in multiple instructions, so don't bother checking if (emitComp->opts.compJitAlignLoopAdaptive) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 29df623acaffd..1760fa6547c58 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -20,8 +20,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void Compiler::optInit() { - optLoopsMarked = false; - fgHasLoops = false; + optLoopsMarked = false; + fgHasLoops = false; loopAlignCandidates = 0; /* Initialize the # of tracked loops to 0 */ @@ -29,7 +29,7 @@ void Compiler::optInit() optLoopTable = nullptr; #ifdef DEBUG - loopsAligned = 0; + loopsAligned = 0; #endif /* Keep track of the number of calls and indirect calls made by this method */ From c8a9e016e778b5f5b34e7a540aa038ea188c80fb Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 11 Nov 2021 00:39:54 -0800 Subject: [PATCH 15/24] Remove unneeded method --- src/coreclr/jit/codegen.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 48a5576e87364..14ebb0d27e450 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -210,10 +210,6 @@ class CodeGen final : public CodeGenInterface void genInitializeRegisterState(); -#if FEATURE_LOOP_ALIGN - void genMarkBlocksForLoopAlignment(); -#endif - void genCodeForBBlist(); public: From 5bb1563ddbbc0eb7812da47fe1cf0937b1f08ee3 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 11 Nov 2021 00:59:16 -0800 Subject: [PATCH 16/24] Misc changes --- src/coreclr/jit/compiler.h | 6 ------ src/coreclr/jit/emit.cpp | 8 +++++--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 3c386e9fe1f20..6b5eea79f0766 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -8334,12 +8334,6 @@ class Compiler weight_t refCntWtdStkDbl); #endif // DOUBLE_ALIGN -#if FEATURE_LOOP_ALIGN -public: -// alignBlocksList* alignBBLists; - -#endif // FEATURE_LOOP_ALIGN - bool IsFullPtrRegMapRequired() { return codeGen->IsFullPtrRegMapRequired(); diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 307119bbd7002..1367206423f8c 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -5193,9 +5193,11 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) bool markedCurrLoop = alignCurrentLoop; while ((alignInstr != nullptr)) { + insGroup* loopHeadIG = alignInstr->loopHeadIG(); + // Find the IG that has 'align' instruction to align the current loop // and clear the IGF_HAS_ALIGN flag. - if (!alignCurrentLoop && (alignInstr->loopHeadIG() == dstIG)) + if (!alignCurrentLoop && (loopHeadIG == dstIG)) { assert(!markedCurrLoop); @@ -5210,8 +5212,8 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) // Find the IG that has 'align' instruction to align the last loop // and clear the IGF_HAS_ALIGN flag. - if (!alignLastLoop && (alignInstr->loopHeadIG() != nullptr) && - (alignInstr->loopHeadIG()->igNum == emitLastLoopStart)) + if (!alignLastLoop && (loopHeadIG != nullptr) && + (loopHeadIG->igNum == emitLastLoopStart)) { assert(!markedLastLoop); assert(alignInstr->idaIG->endsWithAlignInstr()); From 2c6e81db581d291938fa47786b794a03fdcabc7b Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Fri, 12 Nov 2021 15:03:05 -0800 Subject: [PATCH 17/24] Review feedback --- src/coreclr/jit/compiler.cpp | 44 ++++++++++++++++++++++-------------- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/emit.cpp | 20 +++++++++++----- 3 files changed, 42 insertions(+), 24 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 7064474bb2623..f8a167ddd34f2 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -5155,11 +5155,8 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl #endif #if FEATURE_LOOP_ALIGN - if (loopAlignCandidates > 0) - { - // Place loop alignment instructions - DoPhase(this, PHASE_ALIGN_LOOPS, &Compiler::bbPlaceLoopAlignInstructions); - } + // Place loop alignment instructions + DoPhase(this, PHASE_ALIGN_LOOPS, &Compiler::placeLoopAlignInstructions); #endif // Generate code @@ -5221,20 +5218,26 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl #if FEATURE_LOOP_ALIGN //------------------------------------------------------------------------ -// bbPlaceLoopAlignInstructions: Iterate over all the blocks and determine -// the best position to place the 'align' instruction. After 'jmp' are -// preferred over block before loop and in case there are multiple blocks +// placeLoopAlignInstructions: Iterate over all the blocks and determine +// the best position to place the 'align' instruction. Inserting 'align' +// instructions after an unconditional branch is preferred over inserting +// in the block before the loop. In case there are multiple blocks // having 'jmp', the one that has lower weight is preferred. -// If the block having 'jmp' is hot than the block before loop, the align -// will still be placed after 'jmp' because the processor should be smart -// enough to not fetch extra instruction beyond jmp. +// If the block having 'jmp' is hotter than the block before the loop, +// the align will still be placed after 'jmp' because the processor should +// be smart enough to not fetch extra instruction beyond jmp. // -void Compiler::bbPlaceLoopAlignInstructions() +void Compiler::placeLoopAlignInstructions() { - assert(loopAlignCandidates > 0); + if (loopAlignCandidates == 0) + { + return; + } + + int loopsToProcess = loopAlignCandidates; // Add align only if there were any loops that needed alignment - weight_t minBlockSoFar = INFINITE; + weight_t minBlockSoFar = BB_MAX_WEIGHT; BasicBlock* bbHavingAlign = nullptr; for (BasicBlock* const block : Blocks()) { @@ -5245,7 +5248,7 @@ void Compiler::bbPlaceLoopAlignInstructions() { minBlockSoFar = block->bbWeight; bbHavingAlign = block; - JITDUMP(FMT_BB ", bbWeight= %f ends with unconditional 'jmp' \n", block->bbNum, block->bbWeight); + JITDUMP(FMT_BB ", bbWeight=" FMT_WT " ends with unconditional 'jmp' \n", block->bbNum, block->bbWeight); } } @@ -5260,16 +5263,23 @@ void Compiler::bbPlaceLoopAlignInstructions() } else { - JITDUMP("Marking " FMT_BB " that ends with uncondition jump with BBF_HAS_ALIGN for loop at " FMT_BB + JITDUMP("Marking " FMT_BB " that ends with unconditional jump with BBF_HAS_ALIGN for loop at " FMT_BB "\n", bbHavingAlign->bbNum, block->bbNext->bbNum); } bbHavingAlign->bbFlags |= BBF_HAS_ALIGN; - minBlockSoFar = INFINITE; + minBlockSoFar = BB_MAX_WEIGHT; bbHavingAlign = nullptr; } + + if (--loopsToProcess == 0) + { + break; + } } + + assert(loopsToProcess == 0); } #endif diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 6b5eea79f0766..d99d3affdf5d4 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3666,7 +3666,7 @@ class Compiler #endif BasicBlock* bbNewBasicBlock(BBjumpKinds jumpKind); - void bbPlaceLoopAlignInstructions(); + void placeLoopAlignInstructions(); /* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 1367206423f8c..ca6a10c45dc68 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -913,8 +913,13 @@ insGroup* emitter::emitSavIG(bool emitAdd) } emitAlignLast = last; + // Point to the first instruction of most recent // align instruction(s) added. + // + // Since emitCurIGAlignList is created in inverse of + // program order, the `list` reverses that in forms it + // in correct order. emitAlignLastGroup = list; } @@ -4820,9 +4825,13 @@ void emitter::emitCheckAlignFitInCurIG(unsigned nAlignInstr) //----------------------------------------------------------------------------- // -// The next instruction will be a loop head entry point -// So insert an alignment instruction of "paddingBytes" to ensure that -// the code is properly aligned. +// emitLoopAlign: The next instruction will be a loop head entry point +// So insert an alignment instruction of "paddingBytes" to ensure that +// the code is properly aligned. +// Arguments: +// paddingBytes - Number of padding bytes to insert. +// isFirstAlign - For multiple 'align' instructions case, if this is the first +// 'align' instruction of that group. // void emitter::emitLoopAlign(unsigned paddingBytes, bool isFirstAlign) { @@ -4885,7 +4894,7 @@ void emitter::emitLoopAlign(unsigned paddingBytes, bool isFirstAlign) //----------------------------------------------------------------------------- // -// The next instruction will be a loop head entry point +// emitLongLoopAlign: The next instruction will be a loop head entry point // So insert alignment instruction(s) here to ensure that // we can properly align the code. // @@ -5212,8 +5221,7 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) // Find the IG that has 'align' instruction to align the last loop // and clear the IGF_HAS_ALIGN flag. - if (!alignLastLoop && (loopHeadIG != nullptr) && - (loopHeadIG->igNum == emitLastLoopStart)) + if (!alignLastLoop && (loopHeadIG != nullptr) && (loopHeadIG->igNum == emitLastLoopStart)) { assert(!markedLastLoop); assert(alignInstr->idaIG->endsWithAlignInstr()); From bbc2ac51b9cd37a02955f7d343913a5b10ff6b96 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Fri, 12 Nov 2021 18:38:50 -0800 Subject: [PATCH 18/24] Do not include align behind Jmp in PerfScore calculation --- src/coreclr/jit/codegenlinear.cpp | 2 +- src/coreclr/jit/compiler.cpp | 9 +++++---- src/coreclr/jit/emit.cpp | 20 ++++++++++++-------- src/coreclr/jit/emit.h | 11 ++++++++--- src/coreclr/jit/emitarm64.cpp | 24 ++++++++++++++---------- src/coreclr/jit/emitxarch.cpp | 7 +++++-- 6 files changed, 45 insertions(+), 28 deletions(-) diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 933d9c5f6de37..4f9cfa00752d6 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -794,7 +794,7 @@ void CodeGen::genCodeForBBlist() // and 16 bytes for arm64. assert(ShouldAlignLoops()); - GetEmitter()->emitLoopAlignment(); + GetEmitter()->emitLoopAlignment(DEBUG_ARG1(block->bbJumpKind == BBJ_ALWAYS)); } if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign())) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index f8a167ddd34f2..e7d505b27e419 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -5271,11 +5271,12 @@ void Compiler::placeLoopAlignInstructions() bbHavingAlign->bbFlags |= BBF_HAS_ALIGN; minBlockSoFar = BB_MAX_WEIGHT; bbHavingAlign = nullptr; - } - if (--loopsToProcess == 0) - { - break; + + if (--loopsToProcess == 0) + { + break; + } } } diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index ca6a10c45dc68..f81de138b8bcb 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -4833,7 +4833,7 @@ void emitter::emitCheckAlignFitInCurIG(unsigned nAlignInstr) // isFirstAlign - For multiple 'align' instructions case, if this is the first // 'align' instruction of that group. // -void emitter::emitLoopAlign(unsigned paddingBytes, bool isFirstAlign) +void emitter::emitLoopAlign(unsigned paddingBytes, bool isFirstAlign DEBUG_ARG(bool isPlacedBehindJmp)) { // Determine if 'align' instruction about to be generated will // fall in current IG or next. @@ -4883,6 +4883,10 @@ void emitter::emitLoopAlign(unsigned paddingBytes, bool isFirstAlign) id->idaLoopHeadPredIG = nullptr; } +#ifdef DEBUG + id->isPlacedAfterJmp = isPlacedBehindJmp; +#endif + /* Append this instruction to this IG's alignment list */ id->idaNext = emitCurIGAlignList; @@ -4904,7 +4908,7 @@ void emitter::emitLoopAlign(unsigned paddingBytes, bool isFirstAlign) // Arguments: // alignmentBoundary - The boundary at which loop needs to be aligned. // -void emitter::emitLongLoopAlign(unsigned alignmentBoundary) +void emitter::emitLongLoopAlign(unsigned alignmentBoundary DEBUG_ARG(bool isPlacedBehindJmp)) { #if defined(TARGET_XARCH) unsigned nPaddingBytes = alignmentBoundary - 1; @@ -4926,13 +4930,13 @@ void emitter::emitLongLoopAlign(unsigned alignmentBoundary) bool isFirstAlign = true; while (insAlignCount) { - emitLoopAlign(paddingBytes, isFirstAlign); + emitLoopAlign(paddingBytes, isFirstAlign DEBUG_ARG(isPlacedBehindJmp)); insAlignCount--; isFirstAlign = false; } #if defined(TARGET_XARCH) - emitLoopAlign(lastInsAlignSize, isFirstAlign); + emitLoopAlign(lastInsAlignSize, isFirstAlign DEBUG_ARG(isPlacedBehindJmp)); #endif } @@ -4963,7 +4967,7 @@ void emitter::emitConnectAlignInstrWithCurIG() // mark it as IGF_HAS_ALIGN to indicate that a next or a future // IG is a loop that needs alignment. // -void emitter::emitLoopAlignment() +void emitter::emitLoopAlignment(DEBUG_ARG1(bool isPlacedBehindJmp)) { unsigned paddingBytes; @@ -4973,13 +4977,13 @@ void emitter::emitLoopAlignment() if ((emitComp->opts.compJitAlignLoopBoundary > 16) && (!emitComp->opts.compJitAlignLoopAdaptive)) { paddingBytes = emitComp->opts.compJitAlignLoopBoundary; - emitLongLoopAlign(paddingBytes); + emitLongLoopAlign(paddingBytes DEBUG_ARG(isPlacedBehindJmp)); } else { emitCheckAlignFitInCurIG(1); paddingBytes = MAX_ENCODED_SIZE; - emitLoopAlign(paddingBytes, true); + emitLoopAlign(paddingBytes, true DEBUG_ARG(isPlacedBehindJmp)); } #elif defined(TARGET_ARM64) // For Arm64, each align instruction is 4-bytes long because of fixed-length encoding. @@ -4992,7 +4996,7 @@ void emitter::emitLoopAlignment() { paddingBytes = emitComp->opts.compJitAlignLoopBoundary; } - emitLongLoopAlign(paddingBytes); + emitLongLoopAlign(paddingBytes DEBUG_ARG(isPlacedBehindJmp)); #endif assert(emitLastIns->idIns() == INS_align); diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 2d806adb7a2c9..875a80e0f4b9e 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1388,6 +1388,11 @@ class emitter insGroup* idaLoopHeadPredIG; // The IG before the loop IG. // If no 'jmp' instructions were found until idaLoopHeadPredIG, // then idaLoopHeadPredIG == idaIG. +#ifdef DEBUG + bool isPlacedAfterJmp; // Is the 'align' instruction placed after jmp. Used to decide + // if the instruction cost should be included in PerfScore + // calculation or not. +#endif inline insGroup* loopHeadIG() { @@ -1799,14 +1804,14 @@ class emitter unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG_ARG(bool isAlignAdjusted)); // Get the smallest loop size - void emitLoopAlignment(); + void emitLoopAlignment(DEBUG_ARG1(bool isPlacedBehindJmp)); bool emitEndsWithAlignInstr(); // Validate if newLabel is appropriate void emitSetLoopBackEdge(BasicBlock* loopTopBlock); void emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool isAlignAdjusted)); - void emitLoopAlign(unsigned paddingBytes, bool isFirstAlign); - void emitLongLoopAlign(unsigned alignmentBoundary); + void emitLoopAlign(unsigned paddingBytes, bool isFirstAlign DEBUG_ARG(bool isPlacedBehindJmp)); + void emitLongLoopAlign(unsigned alignmentBoundary DEBUG_ARG(bool isPlacedBehindJmp)); instrDescAlign* emitAlignInNextIG(instrDescAlign* alignInstr); void emitConnectAlignInstrWithCurIG(); diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index c43abe1f14436..c7c8c37b1d741 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -14607,23 +14607,27 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case IF_SN_0A: // bkpt, brk, nop - if (id->idIsEmptyAlign()) + + if (id->idIns() == INS_align) { - // We're not going to generate any instruction, so it doesn't count for PerfScore. - result.insThroughput = PERFSCORE_THROUGHPUT_ZERO; - result.insLatency = PERFSCORE_LATENCY_ZERO; - } + if ((id->idInsOpt() == INS_OPTS_NONE) || ((instrDescAlign*)id)->isPlacedAfterJmp) + { + // Either we're not going to generate 'align' instruction, or the 'align' + // instruction is placed immediately after unconditional jmp. + // In both cases, don't count for PerfScore. + + result.insThroughput = PERFSCORE_THROUGHPUT_ZERO; + result.insLatency = PERFSCORE_LATENCY_ZERO; + break; + } else if (ins == INS_yield) { // @ToDo - find out the actual latency, match x86/x64 for now result.insThroughput = PERFSCORE_THROUGHPUT_140C; result.insLatency = PERFSCORE_LATENCY_140C; } - else - { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency = PERFSCORE_LATENCY_ZERO; - } + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_ZERO; break; case IF_SI_0B: // dmb, dsb, isb diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 6f922c75478c3..8246f2de377fb 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -14872,9 +14872,12 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins { case INS_align: #if FEATURE_LOOP_ALIGN - if (id->idCodeSize() == 0) + if ((id->idCodeSize() == 0) || ((instrDescAlign*)id)->isPlacedAfterJmp) { - // We're not going to generate any instruction, so it doesn't count for PerfScore. + // Either we're not going to generate 'align' instruction, or the 'align' + // instruction is placed immediately after unconditional jmp. + // In both cases, don't count for PerfScore. + result.insThroughput = PERFSCORE_THROUGHPUT_ZERO; result.insLatency = PERFSCORE_LATENCY_ZERO; break; From 64bba410fd7fd3c1676dd4642734b8c569b44a6e Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Sun, 14 Nov 2021 21:17:46 -0800 Subject: [PATCH 19/24] jit format and fix a bug --- src/coreclr/jit/compiler.cpp | 1 - src/coreclr/jit/emit.h | 12 ++++++------ src/coreclr/jit/optimizer.cpp | 12 +++++++++--- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index e7d505b27e419..d282e904864d8 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -5272,7 +5272,6 @@ void Compiler::placeLoopAlignInstructions() minBlockSoFar = BB_MAX_WEIGHT; bbHavingAlign = nullptr; - if (--loopsToProcess == 0) { break; diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 875a80e0f4b9e..c851df9df80a1 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1389,9 +1389,9 @@ class emitter // If no 'jmp' instructions were found until idaLoopHeadPredIG, // then idaLoopHeadPredIG == idaIG. #ifdef DEBUG - bool isPlacedAfterJmp; // Is the 'align' instruction placed after jmp. Used to decide - // if the instruction cost should be included in PerfScore - // calculation or not. + bool isPlacedAfterJmp; // Is the 'align' instruction placed after jmp. Used to decide + // if the instruction cost should be included in PerfScore + // calculation or not. #endif inline insGroup* loopHeadIG() @@ -1804,14 +1804,14 @@ class emitter unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize DEBUG_ARG(bool isAlignAdjusted)); // Get the smallest loop size - void emitLoopAlignment(DEBUG_ARG1(bool isPlacedBehindJmp)); + void emitLoopAlignment(DEBUG_ARG1(bool isPlacedBehindJmp)); bool emitEndsWithAlignInstr(); // Validate if newLabel is appropriate void emitSetLoopBackEdge(BasicBlock* loopTopBlock); void emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool isAlignAdjusted)); - void emitLoopAlign(unsigned paddingBytes, bool isFirstAlign DEBUG_ARG(bool isPlacedBehindJmp)); - void emitLongLoopAlign(unsigned alignmentBoundary DEBUG_ARG(bool isPlacedBehindJmp)); + void emitLoopAlign(unsigned paddingBytes, bool isFirstAlign DEBUG_ARG(bool isPlacedBehindJmp)); + void emitLongLoopAlign(unsigned alignmentBoundary DEBUG_ARG(bool isPlacedBehindJmp)); instrDescAlign* emitAlignInNextIG(instrDescAlign* alignInstr); void emitConnectAlignInstrWithCurIG(); diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 1760fa6547c58..a58c2c25a89d2 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -2517,9 +2517,15 @@ void Compiler::optIdentifyLoopsForAlignment() weight_t topWeight = top->getBBWeight(this); if (topWeight >= (opts.compJitAlignLoopMinBlockWeight * BB_UNITY_WEIGHT)) { - loopAlignCandidates++; - JITDUMP(FMT_LP " that starts at " FMT_BB " needs alignment, weight=" FMT_WT ".\n", loopInd, - top->bbNum, topWeight); + // Sometimes with JitOptRepeat > 1, we might end up finding the loops twice. In such + // cases, make sure to count them just once. + if (!first->isLoopAlign()) + { + loopAlignCandidates++; + top->bbFlags |= BBF_LOOP_ALIGN; + JITDUMP(FMT_LP " that starts at " FMT_BB " needs alignment, weight=" FMT_WT ".\n", loopInd, + top->bbNum, top->getBBWeight(this)); + } } else { From 1e24fcbb3e46de3c86fcf4f047c24e03b6c5469a Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Mon, 15 Nov 2021 14:14:35 -0800 Subject: [PATCH 20/24] fix the loopCandidates == 0 scenario --- src/coreclr/jit/fgbasic.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/coreclr/jit/fgbasic.cpp b/src/coreclr/jit/fgbasic.cpp index 0f4631afd11ad..75f6bc2016a7c 100644 --- a/src/coreclr/jit/fgbasic.cpp +++ b/src/coreclr/jit/fgbasic.cpp @@ -4649,6 +4649,7 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable) if (block->isLoopAlign()) { + loopAlignCandidates++; succBlock->bbFlags |= BBF_LOOP_ALIGN; JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " for loop# %d.", block->bbNum, succBlock->bbNum, block->bbNatLoopNum); @@ -4665,6 +4666,12 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable) skipUnmarkLoop = true; } + // Make sure that we don't double count the loop align candidates. + if (block->isLoopAlign()) + { + loopAlignCandidates--; + } + // If this is the first Cold basic block update fgFirstColdBlock if (block == fgFirstColdBlock) { From b301fa5e4779517b2094f2303f127312cefb823e Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Tue, 16 Nov 2021 11:46:43 -0800 Subject: [PATCH 21/24] Add unmarkLoopAlign(), add check for fgFirstBB --- src/coreclr/jit/block.cpp | 19 +++++++++++++++++++ src/coreclr/jit/block.h | 2 ++ src/coreclr/jit/compiler.cpp | 8 ++++++++ src/coreclr/jit/fgbasic.cpp | 9 +++------ src/coreclr/jit/morph.cpp | 11 ++--------- src/coreclr/jit/optimizer.cpp | 26 ++++---------------------- 6 files changed, 38 insertions(+), 37 deletions(-) diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp index 4104bd7d0cab3..5b588a65c8db2 100644 --- a/src/coreclr/jit/block.cpp +++ b/src/coreclr/jit/block.cpp @@ -1662,3 +1662,22 @@ BBswtDesc::BBswtDesc(Compiler* comp, const BBswtDesc* other) bbsDstTab[i] = other->bbsDstTab[i]; } } + +//------------------------------------------------------------------------ +// unmarkLoopAlign: Unmarks the LOOP_ALIGN flag from the block and reduce the +// loop alignment count. +// +// Arguments: +// compiler - Compiler instance +// reason - Reason to print in JITDUMP +// +void BasicBlock::unmarkLoopAlign(Compiler* compiler DEBUG_ARG(const char* reason)) +{ + // Make sure we unmark just and count just once. + if (isLoopAlign()) + { + compiler->loopAlignCandidates--; + bbFlags &= ~BBF_LOOP_ALIGN; + JITDUMP("Unmarking LOOP_ALIGN from " FMT_BB ". Reason= %s.", bbNum, reason); + } +} diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index e96151e8818d8..3b8ecf7f0dacf 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -660,6 +660,8 @@ struct BasicBlock : private LIR::Range return ((bbFlags & BBF_LOOP_ALIGN) != 0); } + void unmarkLoopAlign(Compiler* comp DEBUG_ARG(const char* reason)); + bool hasAlign() const { return ((bbFlags & BBF_HAS_ALIGN) != 0); diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index d282e904864d8..01ff6587b4c15 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -5241,6 +5241,14 @@ void Compiler::placeLoopAlignInstructions() BasicBlock* bbHavingAlign = nullptr; for (BasicBlock* const block : Blocks()) { + if ((block == fgFirstBB) && block->isLoopAlign()) + { + // Adding align instruction in prolog is not supported + // hence skip the align block if it is the first block. + loopsToProcess--; + continue; + } + // If there is a unconditional jump if (opts.compJitHideAlignBehindJmp && (block->bbJumpKind == BBJ_ALWAYS)) { diff --git a/src/coreclr/jit/fgbasic.cpp b/src/coreclr/jit/fgbasic.cpp index 75f6bc2016a7c..04cf8cdf3cec9 100644 --- a/src/coreclr/jit/fgbasic.cpp +++ b/src/coreclr/jit/fgbasic.cpp @@ -4666,12 +4666,6 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable) skipUnmarkLoop = true; } - // Make sure that we don't double count the loop align candidates. - if (block->isLoopAlign()) - { - loopAlignCandidates--; - } - // If this is the first Cold basic block update fgFirstColdBlock if (block == fgFirstColdBlock) { @@ -4808,6 +4802,9 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable) block->bbFlags |= BBF_REMOVED; } + // If this was marked for alignment, remove it + block->unmarkLoopAlign(this DEBUG_ARG("Removed block")); + if (bPrev != nullptr) { switch (bPrev->bbJumpKind) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index b217c5cfbcd91..3840aec920826 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -15320,15 +15320,8 @@ bool Compiler::fgFoldConditional(BasicBlock* block) * Remove the loop from the table */ optLoopTable[loopNum].lpFlags |= LPFLG_REMOVED; -#if FEATURE_LOOP_ALIGN - if (optLoopTable[loopNum].lpTop->isLoopAlign()) - { - loopAlignCandidates--; - optLoopTable[loopNum].lpTop->bbFlags &= ~BBF_LOOP_ALIGN; - JITDUMP("Removing LOOP_ALIGN flag from bogus loop in " FMT_BB "\n", - optLoopTable[loopNum].lpTop->bbNum); - } -#endif + + optLoopTable[loopNum].lpTop->unmarkLoopAlign(this DEBUG_ARG("Bogus loop")); #ifdef DEBUG if (verbose) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index a58c2c25a89d2..4e935a5f64b3f 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -379,15 +379,7 @@ void Compiler::optUnmarkLoopBlocks(BasicBlock* begBlk, BasicBlock* endBlk) JITDUMP("\n"); -#if FEATURE_LOOP_ALIGN - if (begBlk->isLoopAlign()) - { - // Clear the loop alignment bit on the head of a loop, since it's no longer a loop. - loopAlignCandidates--; - begBlk->bbFlags &= ~BBF_LOOP_ALIGN; - JITDUMP("Removing LOOP_ALIGN flag from removed loop in " FMT_BB "\n", begBlk->bbNum); - } -#endif + begBlk->unmarkLoopAlign(this DEBUG_ARG("Removed loop")); } /***************************************************************************************************** @@ -3786,12 +3778,7 @@ PhaseStatus Compiler::optUnrollLoops() #if FEATURE_LOOP_ALIGN for (block = head->bbNext;; block = block->bbNext) { - if (block->isLoopAlign()) - { - loopAlignCandidates--; - block->bbFlags &= ~BBF_LOOP_ALIGN; - JITDUMP("Removing LOOP_ALIGN flag from unrolled loop in " FMT_BB "\n", block->bbNum); - } + block->unmarkLoopAlign(this DEBUG_ARG("Unrolled loop")); if (block == bottom) { @@ -7625,13 +7612,8 @@ void Compiler::AddContainsCallAllContainingLoops(unsigned lnum) if (optLoopTable[lnum].lpChild == BasicBlock::NOT_IN_LOOP) { BasicBlock* top = optLoopTable[lnum].lpTop; - if (top->isLoopAlign()) - { - loopAlignCandidates--; - top->bbFlags &= ~BBF_LOOP_ALIGN; - JITDUMP("Removing LOOP_ALIGN flag for " FMT_LP " that starts at " FMT_BB " because loop has a call.\n", - lnum, top->bbNum); - } + + top->unmarkLoopAlign(this DEBUG_ARG("Loop with call")); } #endif From 57759d0bf5e909b180962370335299085d0034f7 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Tue, 16 Nov 2021 12:04:12 -0800 Subject: [PATCH 22/24] merge conflict fix --- src/coreclr/jit/optimizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 4e935a5f64b3f..7378561146d56 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -2511,7 +2511,7 @@ void Compiler::optIdentifyLoopsForAlignment() { // Sometimes with JitOptRepeat > 1, we might end up finding the loops twice. In such // cases, make sure to count them just once. - if (!first->isLoopAlign()) + if (!top->isLoopAlign()) { loopAlignCandidates++; top->bbFlags |= BBF_LOOP_ALIGN; From ef0e8592e405d9764d0be0a7f0537f5d00e9d687 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Tue, 16 Nov 2021 14:21:07 -0800 Subject: [PATCH 23/24] Add missing } --- src/coreclr/jit/emitarm64.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index c7c8c37b1d741..3de8faafc13c6 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -14620,6 +14620,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insLatency = PERFSCORE_LATENCY_ZERO; break; } + } else if (ins == INS_yield) { // @ToDo - find out the actual latency, match x86/x64 for now From 976b253a9d2625889cf1f3273ba0a666973a6df2 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Wed, 17 Nov 2021 18:13:39 -0800 Subject: [PATCH 24/24] Grammar nit Co-authored-by: Bruce Forstall --- src/coreclr/jit/block.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp index 5b588a65c8db2..62add7f24dd3f 100644 --- a/src/coreclr/jit/block.cpp +++ b/src/coreclr/jit/block.cpp @@ -1673,7 +1673,7 @@ BBswtDesc::BBswtDesc(Compiler* comp, const BBswtDesc* other) // void BasicBlock::unmarkLoopAlign(Compiler* compiler DEBUG_ARG(const char* reason)) { - // Make sure we unmark just and count just once. + // Make sure we unmark and count just once. if (isLoopAlign()) { compiler->loopAlignCandidates--;