From 2ab4d0963fbb485b5198552cab10be138c08009c Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Sun, 1 Sep 2024 19:40:34 +0200 Subject: [PATCH 01/16] WIP: Generate compact unwinding friendly frames on Apple platforms --- src/coreclr/jit/codegen.h | 2 + src/coreclr/jit/codegenarm64.cpp | 60 ++++++- src/coreclr/jit/codegencommon.cpp | 21 +++ src/coreclr/jit/codegeninterface.h | 1 + src/coreclr/jit/lclvars.cpp | 14 +- src/coreclr/jit/optcse.h | 1 + .../Compiler/ObjectWriter/MachNative.cs | 13 ++ .../Compiler/ObjectWriter/MachObjectWriter.cs | 166 +++++++++++++++--- 8 files changed, 243 insertions(+), 35 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 1e53616c2d8e7..1211bffad758b 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -654,8 +654,10 @@ class CodeGen final : public CodeGenInterface #ifdef TARGET_ARM64 virtual void SetSaveFpLrWithAllCalleeSavedRegisters(bool value); virtual bool IsSaveFpLrWithAllCalleeSavedRegisters() const; + virtual void SetReverseAndPairCalleeSavedRegisters(bool value); bool genSaveFpLrWithAllCalleeSavedRegisters; bool genForceFuncletFrameType5; + bool genReverseAndPairCalleeSavedRegisters; #endif // TARGET_ARM64 //------------------------------------------------------------------------- diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index ac28ebd30a19b..563b24fddfbdd 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -845,12 +845,20 @@ void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, i for (int i = 0; i < regStack.Height(); ++i) { - RegPair regPair = regStack.Bottom(i); + RegPair regPair = genReverseAndPairCalleeSavedRegisters ? regStack.Top(i) : regStack.Bottom(i); if (regPair.reg2 != REG_NA) { // We can use a STP instruction. - genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, REG_IP0, - nullptr); + if (genReverseAndPairCalleeSavedRegisters) + { + genPrologSaveRegPair(regPair.reg2, regPair.reg1, spOffset, spDelta, false, + REG_IP0, nullptr); + } + else + { + genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, + REG_IP0, nullptr); + } spOffset += 2 * slotSize; } @@ -926,8 +934,9 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe // Save integer registers at higher addresses than floating-point registers. + regMaskTP maskSaveRegsFrame = regsToSaveMask & (RBM_FP | RBM_LR); regMaskTP maskSaveRegsFloat = regsToSaveMask & RBM_ALLFLOAT; - regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat; + regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat & ~maskSaveRegsFrame; if (maskSaveRegsFloat != RBM_NONE) { @@ -939,6 +948,14 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe if (maskSaveRegsInt != RBM_NONE) { genSaveCalleeSavedRegisterGroup(maskSaveRegsInt, spDelta, lowestCalleeSavedOffset); + spDelta = 0; + lowestCalleeSavedOffset += genCountBits(maskSaveRegsInt) * FPSAVE_REGSIZE_BYTES; + } + + if (maskSaveRegsFrame != RBM_NONE) + { + genPrologSaveRegPair(REG_FP, REG_LR, lowestCalleeSavedOffset, spDelta, false, REG_IP0, + nullptr); // No need to update spDelta, lowestCalleeSavedOffset since they're not used after this. } } @@ -970,13 +987,21 @@ void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta stackDelta = spDelta; } - RegPair regPair = regStack.Top(i); + RegPair regPair = genReverseAndPairCalleeSavedRegisters ? regStack.Bottom(i) : regStack.Top(i); if (regPair.reg2 != REG_NA) { spOffset -= 2 * slotSize; - genEpilogRestoreRegPair(regPair.reg1, regPair.reg2, spOffset, stackDelta, regPair.useSaveNextPair, REG_IP1, - nullptr); + if (genReverseAndPairCalleeSavedRegisters) + { + genEpilogRestoreRegPair(regPair.reg2, regPair.reg1, spOffset, stackDelta, false, + REG_IP1, nullptr); + } + else + { + genEpilogRestoreRegPair(regPair.reg1, regPair.reg2, spOffset, stackDelta, regPair.useSaveNextPair, + REG_IP1, nullptr); + } } else { @@ -1043,11 +1068,19 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in // Save integer registers at higher addresses than floating-point registers. + regMaskTP maskRestoreRegsFrame = regsToRestoreMask & (RBM_FP | RBM_LR); regMaskTP maskRestoreRegsFloat = regsToRestoreMask & RBM_ALLFLOAT; - regMaskTP maskRestoreRegsInt = regsToRestoreMask & ~maskRestoreRegsFloat; + regMaskTP maskRestoreRegsInt = regsToRestoreMask & ~maskRestoreRegsFloat & ~maskRestoreRegsFrame; // Restore in the opposite order of saving. + if (maskRestoreRegsFrame != RBM_NONE) + { + int spFrameDelta = (maskRestoreRegsFloat != RBM_NONE || maskRestoreRegsInt != RBM_NONE) ? 0 : spDelta; + spOffset -= 2 * REGSIZE_BYTES; + genEpilogRestoreRegPair(REG_FP, REG_LR, spOffset, spFrameDelta, false, REG_IP1, nullptr); + } + if (maskRestoreRegsInt != RBM_NONE) { int spIntDelta = (maskRestoreRegsFloat != RBM_NONE) ? 0 : spDelta; // should we delay the SP adjustment? @@ -5142,6 +5175,17 @@ bool CodeGen::IsSaveFpLrWithAllCalleeSavedRegisters() const return genSaveFpLrWithAllCalleeSavedRegisters; } +//--------------------------------------------------------------------- +// SetReverseAndPairCalleeSavedRegisters - Set the variable that indicates if X19-X28 registers +// callee-saved registers are stored in reverse order and we always store them as pairs even if +// one register in the pair is unused. +// +void CodeGen::SetReverseAndPairCalleeSavedRegisters(bool value) +{ + JITDUMP("Setting genReverseAndPairCalleeSavedRegisters to %s\n", dspBool(value)); + genReverseAndPairCalleeSavedRegisters = value; +} + /***************************************************************************** * Emit a call to a helper function. * diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 9920f9846d273..b88fd731d1f0b 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -255,6 +255,7 @@ CodeGen::CodeGen(Compiler* theCompiler) #ifdef TARGET_ARM64 genSaveFpLrWithAllCalleeSavedRegisters = false; genForceFuncletFrameType5 = false; + genReverseAndPairCalleeSavedRegisters = false; #endif // TARGET_ARM64 } @@ -4846,6 +4847,26 @@ void CodeGen::genFinalizeFrame() } #endif // TARGET_ARM +#ifdef TARGET_ARM64 + if (genReverseAndPairCalleeSavedRegisters) + { + // Make sure we push the registers in pairs if possible. If we only allocate a contiguous + // block of registers this should add at most one integer and at most one floating point + // register to the list. The stack has to be 16-byte aligned, so in worst case it results + // in allocating 16 bytes more space on stack if odd number of integer and odd number of + // FP registers were occupied. Same number of instructions will be generated, just the + // STR instructions are replaced with STP (store pair). + regMaskTP maskModifiedRegs = regSet.rsGetModifiedRegsMask(); + regMaskTP maskPairRegs = + ((maskModifiedRegs & (RBM_V8 | RBM_V10 | RBM_V12 | RBM_V14)).getLow() << 1) | + ((maskModifiedRegs & (RBM_R19 | RBM_R21 | RBM_R23 | RBM_R25 | RBM_R27)).getLow() << 1); + if (maskPairRegs != RBM_NONE) + { + regSet.rsSetRegsModified(maskPairRegs); + } + } +#endif + #ifdef DEBUG if (verbose) { diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h index 608c72c22d48d..e5fe01f0e040b 100644 --- a/src/coreclr/jit/codegeninterface.h +++ b/src/coreclr/jit/codegeninterface.h @@ -232,6 +232,7 @@ class CodeGenInterface #ifdef TARGET_ARM64 virtual void SetSaveFpLrWithAllCalleeSavedRegisters(bool value) = 0; virtual bool IsSaveFpLrWithAllCalleeSavedRegisters() const = 0; + virtual void SetReverseAndPairCalleeSavedRegisters(bool value) = 0; #endif // TARGET_ARM64 regNumber genGetThisArgReg(GenTreeCall* call) const; diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index d4adbd6b9907a..9dac693fdfda5 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -6060,9 +6060,17 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0) { - // Default configuration - codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) || - opts.compDbgEnC || compStressCompile(STRESS_GENERIC_VARN, 20)); + if (IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform) + { + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); + codeGen->SetReverseAndPairCalleeSavedRegisters(true); + } + else + { + // Default configuration + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) || + opts.compDbgEnC || compStressCompile(STRESS_GENERIC_VARN, 20)); + } } else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 1) { diff --git a/src/coreclr/jit/optcse.h b/src/coreclr/jit/optcse.h index 48fba4e50e13a..bed240752b1b9 100644 --- a/src/coreclr/jit/optcse.h +++ b/src/coreclr/jit/optcse.h @@ -317,6 +317,7 @@ class CSE_Heuristic : public CSE_HeuristicCommon bool PromotionCheck(CSE_Candidate* candidate); void AdjustHeuristic(CSE_Candidate* candidate); bool ConsiderTree(GenTree* tree, bool isReturn); + bool IsLargeFrame() { return largeFrame; } const char* Name() const { diff --git a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs index 14db96a935474..8b3af9392a7b1 100644 --- a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs +++ b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs @@ -120,5 +120,18 @@ internal static class MachNative public const uint PLATFORM_TVOSSIMULATOR = 8; public const uint PLATFORM_WATCHOSSIMULATOR = 9; public const uint PLATFORM_DRIVERKIT = 10; + + public const uint UNWIND_ARM64_MODE_FRAMELESS = 0x02000000; + public const uint UNWIND_ARM64_MODE_DWARF = 0x03000000; + public const uint UNWIND_ARM64_MODE_FRAME = 0x04000000; + public const uint UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001; + public const uint UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002; + public const uint UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004; + public const uint UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008; + public const uint UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010; + public const uint UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100; + public const uint UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200; + public const uint UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400; + public const uint UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800; } } diff --git a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachObjectWriter.cs b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachObjectWriter.cs index 2424ec434126c..3be7fe14b89c2 100644 --- a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachObjectWriter.cs +++ b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachObjectWriter.cs @@ -752,26 +752,148 @@ void EmitCompactUnwindSymbol(string symbolName) private protected override string ExternCName(string name) => "_" + name; - // This represents the following DWARF code: - // DW_CFA_advance_loc: 4 - // DW_CFA_def_cfa_offset: +16 - // DW_CFA_offset: W29 -16 - // DW_CFA_offset: W30 -8 - // DW_CFA_advance_loc: 4 - // DW_CFA_def_cfa_register: W29 - // which is generated for the following frame prolog/epilog: - // stp fp, lr, [sp, #-10]! - // mov fp, sp - // ... - // ldp fp, lr, [sp], #0x10 - // ret - private static ReadOnlySpan DwarfArm64EmptyFrame => new byte[] + private static uint GetArm64CompactUnwindCode(byte[] blobData) { - 0x04, 0x00, 0xFF, 0xFF, 0x10, 0x00, 0x00, 0x00, - 0x04, 0x02, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x04, 0x02, 0x1E, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x08, 0x01, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00 - }; + if (blobData == null || blobData.Length == 0) + { + return UNWIND_ARM64_MODE_FRAMELESS; + } + + Debug.Assert(blobData.Length % 8 == 0); + + short spReg = -1; + + int codeOffset = 0; + short cfaRegister = spReg; + int cfaOffset = 0; + int spOffset = 0; + + const int REG_DWARF_X19 = 19; + const int REG_DWARF_X30 = 30; + const int REG_DWARF_FP = 29; + const int REG_DWARF_D8 = 72; + const int REG_DWARF_D15 = 79; + const int REG_IDX_X19 = 0; + const int REG_IDX_X28 = 9; + const int REG_IDX_FP = 10; + const int REG_IDX_LR = 11; + const int REG_IDX_D8 = 12; + const int REG_IDX_D15 = 19; + Span registerOffset = stackalloc int[20]; + + registerOffset.Fill(int.MinValue); + + int offset = 0; + while (offset < blobData.Length) + { + codeOffset = Math.Max(codeOffset, blobData[offset++]); + CFI_OPCODE opcode = (CFI_OPCODE)blobData[offset++]; + short dwarfReg = BinaryPrimitives.ReadInt16LittleEndian(blobData.AsSpan(offset)); + offset += sizeof(short); + int cfiOffset = BinaryPrimitives.ReadInt32LittleEndian(blobData.AsSpan(offset)); + offset += sizeof(int); + + switch (opcode) + { + case CFI_OPCODE.CFI_DEF_CFA_REGISTER: + cfaRegister = dwarfReg; + + if (spOffset != 0) + { + for (int i = 0; i < registerOffset.Length; i++) + if (registerOffset[i] != int.MinValue) + registerOffset[i] -= spOffset; + + cfaOffset += spOffset; + spOffset = 0; + } + + break; + + case CFI_OPCODE.CFI_REL_OFFSET: + Debug.Assert(cfaRegister == spReg); + if (dwarfReg >= REG_DWARF_X19 && dwarfReg <= REG_DWARF_X30) // X19 - X30 + registerOffset[dwarfReg - REG_DWARF_X19 + REG_IDX_X19] = cfiOffset; + else if (dwarfReg >= REG_DWARF_D8 && dwarfReg <= REG_DWARF_D15) // D8 - D15 + registerOffset[dwarfReg - REG_DWARF_D8 + REG_IDX_D8] = cfiOffset; + else + return UNWIND_ARM64_MODE_DWARF; + break; + + case CFI_OPCODE.CFI_ADJUST_CFA_OFFSET: + if (cfaRegister != spReg) + { + cfaOffset += cfiOffset; + } + else + { + spOffset += cfiOffset; + + for (int i = 0; i < registerOffset.Length; i++) + if (registerOffset[i] != int.MinValue) + registerOffset[i] += cfiOffset; + } + break; + } + } + + uint unwindCode; + int nextOffset; + + if (cfaRegister == REG_DWARF_FP && + cfaOffset == 16 && + registerOffset[REG_IDX_FP] == -16 && + registerOffset[REG_IDX_LR] == -8) + { + unwindCode = UNWIND_ARM64_MODE_FRAME; + nextOffset = -24; + } + else if (cfaRegister == -1 && spOffset <= 65520 && + registerOffset[REG_IDX_FP] == int.MinValue && registerOffset[REG_IDX_LR] == int.MinValue) + { + uint encodedSpOffset = (uint)(spOffset / 16) << 12; + unwindCode = UNWIND_ARM64_MODE_FRAMELESS | encodedSpOffset; + nextOffset = spOffset - 8; + } + else + { + return UNWIND_ARM64_MODE_DWARF; + } + + for (int i = REG_IDX_X19; i < REG_IDX_X28; i += 2) + { + if (registerOffset[i] == int.MinValue) + { + if (registerOffset[i + 1] != int.MinValue) + return UNWIND_ARM64_MODE_DWARF; + } + else if (registerOffset[i] == nextOffset) + { + if (registerOffset[i + 1] != nextOffset - 8) + return UNWIND_ARM64_MODE_DWARF; + nextOffset -= 16; + unwindCode |= UNWIND_ARM64_FRAME_X19_X20_PAIR << (i >> 1); + } + } + + for (int i = REG_IDX_D8; i < REG_IDX_D15; i += 2) + { + if (registerOffset[i] == int.MinValue) + { + if (registerOffset[i + 1] != int.MinValue) + return UNWIND_ARM64_MODE_DWARF; + } + else if (registerOffset[i] == nextOffset) + { + if (registerOffset[i + 1] != nextOffset - 8) + return UNWIND_ARM64_MODE_DWARF; + nextOffset -= 16; + unwindCode |= UNWIND_ARM64_FRAME_D8_D9_PAIR << (i >> 1); + } + } + + return unwindCode; + } private protected override bool EmitCompactUnwinding(string startSymbolName, ulong length, string lsdaSymbolName, byte[] blob) { @@ -779,11 +901,7 @@ private protected override bool EmitCompactUnwinding(string startSymbolName, ulo if (_cpuType == CPU_TYPE_ARM64) { - if (blob.AsSpan().SequenceEqual(DwarfArm64EmptyFrame)) - { - // Frame-based encoding, no saved registers - encoding = 0x04000000; - } + encoding = GetArm64CompactUnwindCode(blob); } _compactUnwindCodes.Add(new CompactUnwindCode( From d0c07645111c098cec1aab018fd445b0748079a8 Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Tue, 3 Sep 2024 22:58:27 +0200 Subject: [PATCH 02/16] WIP: Use SP-based addresses in lvaFrameAddress --- src/coreclr/jit/compiler.h | 2 ++ src/coreclr/jit/compiler.hpp | 22 ++++++++++++++++++++++ src/coreclr/jit/lclvars.cpp | 3 ++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index a23a650a805f2..20a377836e6ea 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -4207,6 +4207,8 @@ class Compiler #ifdef TARGET_ARM int lvaFrameAddress(int varNum, bool mustBeFPBased, regNumber* pBaseReg, int addrModeOffset, bool isFloatUsage); +#elif defined(TARGET_ARM64) + int lvaFrameAddress(int varNum, bool* pFPbased, bool mustBeFPBased = false); #else int lvaFrameAddress(int varNum, bool* pFPbased); #endif diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp index 4f70a01a8a79e..6e383056ec25d 100644 --- a/src/coreclr/jit/compiler.hpp +++ b/src/coreclr/jit/compiler.hpp @@ -2638,6 +2638,9 @@ inline int Compiler::lvaFrameAddress( int varNum, bool mustBeFPBased, regNumber* pBaseReg, int addrModeOffset, bool isFloatUsage) +#elif defined(TARGET_ARM64) + int + Compiler::lvaFrameAddress(int varNum, bool* pFPbased, bool mustBeFPBased) #else int Compiler::lvaFrameAddress(int varNum, bool* pFPbased) @@ -2807,6 +2810,25 @@ inline { *pBaseReg = REG_SPBASE; } +#elif defined(TARGET_ARM64) + if (FPbased && !mustBeFPBased && //varNum < 0 && + //!compLocallocUsed && !ehAnyFunclets() && //funCurrentFunc()->funKind == FUNC_ROOT && + !codeGen->isFramePointerRequired() && + varOffset < 0 && + //varOffset < -0xFF && + lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT && + codeGen->IsSaveFpLrWithAllCalleeSavedRegisters()) + { + int spVarOffset = varOffset + codeGen->genSPtoFPdelta(); + FPbased = false; + varOffset = spVarOffset; + } + else + { + JITDUMP("skipping lvaFrameAddress opt skipped - compLocallocUsed: %s lvaDoneFrameLayout: %d saveFpLr: %s funCurrentFunc()->funKind: %d\n", dspBool(compLocallocUsed), lvaDoneFrameLayout, dspBool(codeGen->IsSaveFpLrWithAllCalleeSavedRegisters()), funCurrentFunc()->funKind); + } + JITDUMP("lvaFrameAddress %d FPbased: %s varOffset: %d\n", varNum, dspBool(FPbased), varOffset); + *pFPbased = FPbased; #else *pFPbased = FPbased; #endif diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 9dac693fdfda5..892502e7337d0 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -6060,7 +6060,8 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0) { - if (IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform) + if (IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform && + !codeGen->isFramePointerRequired()) { codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); codeGen->SetReverseAndPairCalleeSavedRegisters(true); From 472b73c2f2290a5667d9ec8b515a3178161533db Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Tue, 10 Sep 2024 16:16:50 +0200 Subject: [PATCH 03/16] WIP --- src/coreclr/jit/codegen.h | 1 - src/coreclr/jit/codegenarm64.cpp | 11 ------ src/coreclr/jit/codegencommon.cpp | 43 +++++++++++++++++++++++- src/coreclr/jit/codegeninterface.h | 1 - src/coreclr/jit/compiler.h | 2 -- src/coreclr/jit/compiler.hpp | 7 +--- src/coreclr/jit/lclvars.cpp | 54 +++++++++--------------------- 7 files changed, 59 insertions(+), 60 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 1211bffad758b..73ecd14e03c0d 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -654,7 +654,6 @@ class CodeGen final : public CodeGenInterface #ifdef TARGET_ARM64 virtual void SetSaveFpLrWithAllCalleeSavedRegisters(bool value); virtual bool IsSaveFpLrWithAllCalleeSavedRegisters() const; - virtual void SetReverseAndPairCalleeSavedRegisters(bool value); bool genSaveFpLrWithAllCalleeSavedRegisters; bool genForceFuncletFrameType5; bool genReverseAndPairCalleeSavedRegisters; diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 563b24fddfbdd..c9b648d451411 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -5175,17 +5175,6 @@ bool CodeGen::IsSaveFpLrWithAllCalleeSavedRegisters() const return genSaveFpLrWithAllCalleeSavedRegisters; } -//--------------------------------------------------------------------- -// SetReverseAndPairCalleeSavedRegisters - Set the variable that indicates if X19-X28 registers -// callee-saved registers are stored in reverse order and we always store them as pairs even if -// one register in the pair is unused. -// -void CodeGen::SetReverseAndPairCalleeSavedRegisters(bool value) -{ - JITDUMP("Setting genReverseAndPairCalleeSavedRegisters to %s\n", dspBool(value)); - genReverseAndPairCalleeSavedRegisters = value; -} - /***************************************************************************** * Emit a call to a helper function. * diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index b88fd731d1f0b..9cfafc4e4d438 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -4848,8 +4848,12 @@ void CodeGen::genFinalizeFrame() #endif // TARGET_ARM #ifdef TARGET_ARM64 - if (genReverseAndPairCalleeSavedRegisters) + if (compiler->IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform) { + JITDUMP("Setting genReverseAndPairCalleeSavedRegisters = true"); + + genReverseAndPairCalleeSavedRegisters = true; + // Make sure we push the registers in pairs if possible. If we only allocate a contiguous // block of registers this should add at most one integer and at most one floating point // register to the list. The stack has to be 16-byte aligned, so in worst case it results @@ -5040,6 +5044,43 @@ void CodeGen::genFinalizeFrame() compiler->lvaAssignFrameOffsets(Compiler::FINAL_FRAME_LAYOUT); +#ifdef TARGET_ARM64 + // Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is + // a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we + // need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value, + // and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the + // frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress. + // (It should be legal to use these frame types for every frame). + + if (compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 0) + { + // For Apple NativeAOT ABI we try to save the FP/LR registers on top to get canonical frame layout + // that can be represented with compact unwinding information. In order to maintain code quality we + // only do it when we can use SP-based addressing (!isFramePointerRequired) through lvaFrameAddress + // optimization, or if the whole frame is small enough that the negative FP-based addressing can + // address the whole frame. + if (compiler->IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform && + (!isFramePointerRequired() || compiler->compLclFrameSize + (compiler->compCalleeRegsPushed * REGSIZE_BYTES) < 0x100)) + { + SetSaveFpLrWithAllCalleeSavedRegisters(true); + } + else + { + // Default configuration + SetSaveFpLrWithAllCalleeSavedRegisters((compiler->getNeedsGSSecurityCookie() && compiler->compLocallocUsed) || + compiler->opts.compDbgEnC || compiler->compStressCompile(Compiler::STRESS_GENERIC_VARN, 20)); + } + } + else if (compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 1) + { + SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames + } + else if ((compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || (compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 3)) + { + SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames + } +#endif // TARGET_ARM64 + #ifdef DEBUG if (compiler->opts.dspCode || compiler->opts.disAsm || compiler->opts.disAsm2 || verbose) { diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h index e5fe01f0e040b..608c72c22d48d 100644 --- a/src/coreclr/jit/codegeninterface.h +++ b/src/coreclr/jit/codegeninterface.h @@ -232,7 +232,6 @@ class CodeGenInterface #ifdef TARGET_ARM64 virtual void SetSaveFpLrWithAllCalleeSavedRegisters(bool value) = 0; virtual bool IsSaveFpLrWithAllCalleeSavedRegisters() const = 0; - virtual void SetReverseAndPairCalleeSavedRegisters(bool value) = 0; #endif // TARGET_ARM64 regNumber genGetThisArgReg(GenTreeCall* call) const; diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 20a377836e6ea..a23a650a805f2 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -4207,8 +4207,6 @@ class Compiler #ifdef TARGET_ARM int lvaFrameAddress(int varNum, bool mustBeFPBased, regNumber* pBaseReg, int addrModeOffset, bool isFloatUsage); -#elif defined(TARGET_ARM64) - int lvaFrameAddress(int varNum, bool* pFPbased, bool mustBeFPBased = false); #else int lvaFrameAddress(int varNum, bool* pFPbased); #endif diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp index 6e383056ec25d..3a90337abcd64 100644 --- a/src/coreclr/jit/compiler.hpp +++ b/src/coreclr/jit/compiler.hpp @@ -2638,9 +2638,6 @@ inline int Compiler::lvaFrameAddress( int varNum, bool mustBeFPBased, regNumber* pBaseReg, int addrModeOffset, bool isFloatUsage) -#elif defined(TARGET_ARM64) - int - Compiler::lvaFrameAddress(int varNum, bool* pFPbased, bool mustBeFPBased) #else int Compiler::lvaFrameAddress(int varNum, bool* pFPbased) @@ -2811,11 +2808,9 @@ inline *pBaseReg = REG_SPBASE; } #elif defined(TARGET_ARM64) - if (FPbased && !mustBeFPBased && //varNum < 0 && - //!compLocallocUsed && !ehAnyFunclets() && //funCurrentFunc()->funKind == FUNC_ROOT && + if (FPbased && !codeGen->isFramePointerRequired() && varOffset < 0 && - //varOffset < -0xFF && lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT && codeGen->IsSaveFpLrWithAllCalleeSavedRegisters()) { diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 892502e7337d0..42cd969f863f5 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5673,7 +5673,20 @@ void Compiler::lvaFixVirtualFrameOffsets() // We set FP to be after LR, FP delta += 2 * REGSIZE_BYTES; } -#elif defined(TARGET_AMD64) || defined(TARGET_ARM64) +#elif defined(TARGET_ARM64) + else + { + // FP is used. + delta += codeGen->genTotalFrameSize() - codeGen->genSPtoFPdelta(); + if (codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() && isFramePointerUsed()) // Note that currently we always have + // a frame pointer + { + // We set FP to be after LR, FP + delta += 2 * REGSIZE_BYTES; + } + JITDUMP("--- delta bump %d for FP frame\n", delta); + } +#elif defined(TARGET_AMD64) else { // FP is used. @@ -6050,39 +6063,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() codeGen->setFramePointerUsed(codeGen->isFramePointerRequired()); } -#ifdef TARGET_ARM64 - // Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is - // a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we - // need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value, - // and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the - // frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress. - // (It should be legal to use these frame types for every frame). - - if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0) - { - if (IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform && - !codeGen->isFramePointerRequired()) - { - codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); - codeGen->SetReverseAndPairCalleeSavedRegisters(true); - } - else - { - // Default configuration - codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) || - opts.compDbgEnC || compStressCompile(STRESS_GENERIC_VARN, 20)); - } - } - else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 1) - { - codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames - } - else if ((opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || (opts.compJitSaveFpLrWithCalleeSavedRegisters == 3)) - { - codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames - } -#endif // TARGET_ARM64 - #ifdef TARGET_XARCH // On x86/amd64, the return address has already been pushed by the call instruction in the caller. stkOffs -= TARGET_POINTER_SIZE; // return address; @@ -6144,8 +6124,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() stkOffs -= initialStkOffs; } - if (codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() || !isFramePointerUsed()) // Note that currently we always have - // a frame pointer + if (!isFramePointerUsed()) // Note that currently we always have a frame pointer { stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES; } @@ -6825,8 +6804,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() #endif // TARGET_AMD64 #ifdef TARGET_ARM64 - if (!codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() && isFramePointerUsed()) // Note that currently we always have - // a frame pointer + if (isFramePointerUsed()) // Note that currently we always have a frame pointer { // Create space for saving FP and LR. stkOffs -= 2 * REGSIZE_BYTES; From f9570887a060ed656c739136c90f15287b28f78e Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Tue, 10 Sep 2024 16:56:11 +0200 Subject: [PATCH 04/16] Remove unneeded method --- src/coreclr/jit/optcse.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/coreclr/jit/optcse.h b/src/coreclr/jit/optcse.h index bed240752b1b9..48fba4e50e13a 100644 --- a/src/coreclr/jit/optcse.h +++ b/src/coreclr/jit/optcse.h @@ -317,7 +317,6 @@ class CSE_Heuristic : public CSE_HeuristicCommon bool PromotionCheck(CSE_Candidate* candidate); void AdjustHeuristic(CSE_Candidate* candidate); bool ConsiderTree(GenTree* tree, bool isReturn); - bool IsLargeFrame() { return largeFrame; } const char* Name() const { From a45efd3d0ed3ca6cb0eaef0b32d5b14ddfec363b Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Tue, 10 Sep 2024 17:08:04 +0200 Subject: [PATCH 05/16] JIT format --- src/coreclr/jit/codegenarm64.cpp | 13 +++++-------- src/coreclr/jit/codegencommon.cpp | 16 +++++++++------- src/coreclr/jit/compiler.hpp | 16 ++++++++-------- src/coreclr/jit/lclvars.cpp | 4 ++-- 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index c9b648d451411..d56a92ed0d5ba 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -851,13 +851,12 @@ void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, i // We can use a STP instruction. if (genReverseAndPairCalleeSavedRegisters) { - genPrologSaveRegPair(regPair.reg2, regPair.reg1, spOffset, spDelta, false, - REG_IP0, nullptr); + genPrologSaveRegPair(regPair.reg2, regPair.reg1, spOffset, spDelta, false, REG_IP0, nullptr); } else { - genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, - REG_IP0, nullptr); + genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, REG_IP0, + nullptr); } spOffset += 2 * slotSize; @@ -954,8 +953,7 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe if (maskSaveRegsFrame != RBM_NONE) { - genPrologSaveRegPair(REG_FP, REG_LR, lowestCalleeSavedOffset, spDelta, false, REG_IP0, - nullptr); + genPrologSaveRegPair(REG_FP, REG_LR, lowestCalleeSavedOffset, spDelta, false, REG_IP0, nullptr); // No need to update spDelta, lowestCalleeSavedOffset since they're not used after this. } } @@ -994,8 +992,7 @@ void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta if (genReverseAndPairCalleeSavedRegisters) { - genEpilogRestoreRegPair(regPair.reg2, regPair.reg1, spOffset, stackDelta, false, - REG_IP1, nullptr); + genEpilogRestoreRegPair(regPair.reg2, regPair.reg1, spOffset, stackDelta, false, REG_IP1, nullptr); } else { diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 9cfafc4e4d438..91b769d8f2743 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -4861,9 +4861,8 @@ void CodeGen::genFinalizeFrame() // FP registers were occupied. Same number of instructions will be generated, just the // STR instructions are replaced with STP (store pair). regMaskTP maskModifiedRegs = regSet.rsGetModifiedRegsMask(); - regMaskTP maskPairRegs = - ((maskModifiedRegs & (RBM_V8 | RBM_V10 | RBM_V12 | RBM_V14)).getLow() << 1) | - ((maskModifiedRegs & (RBM_R19 | RBM_R21 | RBM_R23 | RBM_R25 | RBM_R27)).getLow() << 1); + regMaskTP maskPairRegs = ((maskModifiedRegs & (RBM_V8 | RBM_V10 | RBM_V12 | RBM_V14)).getLow() << 1) | + ((maskModifiedRegs & (RBM_R19 | RBM_R21 | RBM_R23 | RBM_R25 | RBM_R27)).getLow() << 1); if (maskPairRegs != RBM_NONE) { regSet.rsSetRegsModified(maskPairRegs); @@ -5060,22 +5059,25 @@ void CodeGen::genFinalizeFrame() // optimization, or if the whole frame is small enough that the negative FP-based addressing can // address the whole frame. if (compiler->IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform && - (!isFramePointerRequired() || compiler->compLclFrameSize + (compiler->compCalleeRegsPushed * REGSIZE_BYTES) < 0x100)) + (!isFramePointerRequired() || + compiler->compLclFrameSize + (compiler->compCalleeRegsPushed * REGSIZE_BYTES) < 0x100)) { SetSaveFpLrWithAllCalleeSavedRegisters(true); } else { // Default configuration - SetSaveFpLrWithAllCalleeSavedRegisters((compiler->getNeedsGSSecurityCookie() && compiler->compLocallocUsed) || - compiler->opts.compDbgEnC || compiler->compStressCompile(Compiler::STRESS_GENERIC_VARN, 20)); + SetSaveFpLrWithAllCalleeSavedRegisters( + (compiler->getNeedsGSSecurityCookie() && compiler->compLocallocUsed) || compiler->opts.compDbgEnC || + compiler->compStressCompile(Compiler::STRESS_GENERIC_VARN, 20)); } } else if (compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 1) { SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames } - else if ((compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || (compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 3)) + else if ((compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || + (compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 3)) { SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames } diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp index 3a90337abcd64..4e536adc1e612 100644 --- a/src/coreclr/jit/compiler.hpp +++ b/src/coreclr/jit/compiler.hpp @@ -2808,19 +2808,19 @@ inline *pBaseReg = REG_SPBASE; } #elif defined(TARGET_ARM64) - if (FPbased && - !codeGen->isFramePointerRequired() && - varOffset < 0 && - lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT && - codeGen->IsSaveFpLrWithAllCalleeSavedRegisters()) + if (FPbased && !codeGen->isFramePointerRequired() && varOffset < 0 && + lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT && codeGen->IsSaveFpLrWithAllCalleeSavedRegisters()) { int spVarOffset = varOffset + codeGen->genSPtoFPdelta(); - FPbased = false; - varOffset = spVarOffset; + FPbased = false; + varOffset = spVarOffset; } else { - JITDUMP("skipping lvaFrameAddress opt skipped - compLocallocUsed: %s lvaDoneFrameLayout: %d saveFpLr: %s funCurrentFunc()->funKind: %d\n", dspBool(compLocallocUsed), lvaDoneFrameLayout, dspBool(codeGen->IsSaveFpLrWithAllCalleeSavedRegisters()), funCurrentFunc()->funKind); + JITDUMP( + "skipping lvaFrameAddress opt skipped - compLocallocUsed: %s lvaDoneFrameLayout: %d saveFpLr: %s funCurrentFunc()->funKind: %d\n", + dspBool(compLocallocUsed), lvaDoneFrameLayout, dspBool(codeGen->IsSaveFpLrWithAllCalleeSavedRegisters()), + funCurrentFunc()->funKind); } JITDUMP("lvaFrameAddress %d FPbased: %s varOffset: %d\n", varNum, dspBool(FPbased), varOffset); *pFPbased = FPbased; diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 42cd969f863f5..5060fcc4a75b7 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5678,8 +5678,8 @@ void Compiler::lvaFixVirtualFrameOffsets() { // FP is used. delta += codeGen->genTotalFrameSize() - codeGen->genSPtoFPdelta(); - if (codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() && isFramePointerUsed()) // Note that currently we always have - // a frame pointer + if (codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() && isFramePointerUsed()) // Note that currently we always + // have a frame pointer { // We set FP to be after LR, FP delta += 2 * REGSIZE_BYTES; From c655ee8f302e33a84edb7cbbcd9cacda2abe08d0 Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Tue, 10 Sep 2024 22:29:25 +0200 Subject: [PATCH 06/16] Fix up order of operations --- src/coreclr/jit/lclvars.cpp | 61 +++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 5060fcc4a75b7..09af4e80575ef 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5678,8 +5678,8 @@ void Compiler::lvaFixVirtualFrameOffsets() { // FP is used. delta += codeGen->genTotalFrameSize() - codeGen->genSPtoFPdelta(); - if (codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() && isFramePointerUsed()) // Note that currently we always - // have a frame pointer + if (!codeGen->IsSaveFpLrWithAllCalleeSavedRegisters()) // Note that currently we always + // have a frame pointer { // We set FP to be after LR, FP delta += 2 * REGSIZE_BYTES; @@ -6124,16 +6124,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() stkOffs -= initialStkOffs; } - if (!isFramePointerUsed()) // Note that currently we always have a frame pointer - { - stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES; - } - else - { - // Subtract off FP and LR. - assert(compCalleeRegsPushed >= 2); - stkOffs -= (compCalleeRegsPushed - 2) * REGSIZE_BYTES; - } + stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES; #elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) @@ -6803,14 +6794,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() } #endif // TARGET_AMD64 -#ifdef TARGET_ARM64 - if (isFramePointerUsed()) // Note that currently we always have a frame pointer - { - // Create space for saving FP and LR. - stkOffs -= 2 * REGSIZE_BYTES; - } -#endif // TARGET_ARM64 - #if FEATURE_FIXED_OUT_ARGS if (lvaOutgoingArgSpaceSize > 0) { @@ -6848,6 +6831,44 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() noway_assert(compLclFrameSize + originalFrameSize == (unsigned)-(stkOffs + (pushedCount * (int)TARGET_POINTER_SIZE))); + +#ifdef TARGET_ARM64 + // Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is + // a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we + // need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value, + // and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the + // frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress. + // (It should be legal to use these frame types for every frame). + + if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0) + { + // For Apple NativeAOT ABI we try to save the FP/LR registers on top to get canonical frame layout + // that can be represented with compact unwinding information. In order to maintain code quality we + // only do it when we can use SP-based addressing (!isFramePointerRequired) through lvaFrameAddress + // optimization, or if the whole frame is small enough that the negative FP-based addressing can + // address the whole frame. + if (IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform && + (!codeGen->isFramePointerRequired() || codeGen->genTotalFrameSize() < 0x100)) + { + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); + } + else + { + // Default configuration + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) || + opts.compDbgEnC || + compStressCompile(Compiler::STRESS_GENERIC_VARN, 20)); + } + } + else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 1) + { + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames + } + else if ((opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || (opts.compJitSaveFpLrWithCalleeSavedRegisters == 3)) + { + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames + } +#endif // TARGET_ARM64 } //------------------------------------------------------------------------ From 0336a6aa95ca10d2278315e2579c1a78556069d9 Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Wed, 11 Sep 2024 08:57:39 +0200 Subject: [PATCH 07/16] Commit forgotten change --- src/coreclr/jit/codegencommon.cpp | 40 ------------------------------- 1 file changed, 40 deletions(-) diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 91b769d8f2743..881b4ff0cc566 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -5043,46 +5043,6 @@ void CodeGen::genFinalizeFrame() compiler->lvaAssignFrameOffsets(Compiler::FINAL_FRAME_LAYOUT); -#ifdef TARGET_ARM64 - // Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is - // a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we - // need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value, - // and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the - // frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress. - // (It should be legal to use these frame types for every frame). - - if (compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 0) - { - // For Apple NativeAOT ABI we try to save the FP/LR registers on top to get canonical frame layout - // that can be represented with compact unwinding information. In order to maintain code quality we - // only do it when we can use SP-based addressing (!isFramePointerRequired) through lvaFrameAddress - // optimization, or if the whole frame is small enough that the negative FP-based addressing can - // address the whole frame. - if (compiler->IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform && - (!isFramePointerRequired() || - compiler->compLclFrameSize + (compiler->compCalleeRegsPushed * REGSIZE_BYTES) < 0x100)) - { - SetSaveFpLrWithAllCalleeSavedRegisters(true); - } - else - { - // Default configuration - SetSaveFpLrWithAllCalleeSavedRegisters( - (compiler->getNeedsGSSecurityCookie() && compiler->compLocallocUsed) || compiler->opts.compDbgEnC || - compiler->compStressCompile(Compiler::STRESS_GENERIC_VARN, 20)); - } - } - else if (compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 1) - { - SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames - } - else if ((compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || - (compiler->opts.compJitSaveFpLrWithCalleeSavedRegisters == 3)) - { - SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames - } -#endif // TARGET_ARM64 - #ifdef DEBUG if (compiler->opts.dspCode || compiler->opts.disAsm || compiler->opts.disAsm2 || verbose) { From bab12cf96a4210e64196607cbe8784be5fcade26 Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Wed, 11 Sep 2024 16:25:37 +0200 Subject: [PATCH 08/16] Fix the logic for applying fp/lr delta in lvaFixVirtualFrameOffsets --- src/coreclr/jit/lclvars.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 09af4e80575ef..2a6f2190ad908 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5649,6 +5649,9 @@ void Compiler::lvaFixVirtualFrameOffsets() // The delta to be added to virtual offset to adjust it relative to frame pointer or SP int delta = 0; +#if defined(TARGET_ARM64) + int fpLrDelta = 0; +#endif #ifdef TARGET_XARCH delta += REGSIZE_BYTES; // pushed PC (return address) for x86/x64 @@ -5682,7 +5685,8 @@ void Compiler::lvaFixVirtualFrameOffsets() // have a frame pointer { // We set FP to be after LR, FP - delta += 2 * REGSIZE_BYTES; + fpLrDelta += 2 * REGSIZE_BYTES; + delta += fpLrDelta; } JITDUMP("--- delta bump %d for FP frame\n", delta); } @@ -5774,15 +5778,23 @@ void Compiler::lvaFixVirtualFrameOffsets() if (doAssignStkOffs) { - JITDUMP("-- V%02u was %d, now %d\n", lclNum, varDsc->GetStackOffset(), varDsc->GetStackOffset() + delta); - varDsc->SetStackOffset(varDsc->GetStackOffset() + delta); + int localDelta = delta; + +#if defined(TARGET_ARM64) + if (varDsc->GetStackOffset() >= 0) + localDelta -= fpLrDelta; +#endif + + JITDUMP("-- V%02u was %d, now %d\n", lclNum, varDsc->GetStackOffset(), + varDsc->GetStackOffset() + localDelta); + varDsc->SetStackOffset(varDsc->GetStackOffset() + localDelta); #if DOUBLE_ALIGN if (genDoubleAlign() && !codeGen->isFramePointerUsed()) { if (varDsc->lvFramePointerBased) { - varDsc->SetStackOffset(varDsc->GetStackOffset() - delta); + varDsc->SetStackOffset(varDsc->GetStackOffset() - localDelta); // We need to re-adjust the offsets of the parameters so they are EBP // relative rather than stack/frame pointer relative From 71adb58ca830447450a6fe927c21ab2528e2c7fd Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Wed, 11 Sep 2024 16:26:47 +0200 Subject: [PATCH 09/16] Fix few SP-based instruction encoding issues --- src/coreclr/jit/emitarm64.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 4153f4ec5080c..7b862a704ccfb 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -7869,6 +7869,8 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va { emitAttr size = EA_SIZE(attr); insFormat fmt = IF_NONE; + insOpts opt = INS_OPTS_NONE; + regNumber reg3 = REG_NA; unsigned scale = 0; bool isLdrStr = false; bool isSimple = true; @@ -7934,7 +7936,17 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va { regNumber rsvdReg = codeGen->rsGetRsvdReg(); codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm); - fmt = IF_DR_3A; // add reg1,reg2,rsvdReg + imm = 0; + if (encodingZRtoSP(reg2) == REG_SP) + { + fmt = IF_DR_3C; // add reg1,sp,rsvdReg + opt = INS_OPTS_LSL; + reg3 = rsvdReg; + } + else + { + fmt = IF_DR_3A; // add reg1,reg2,rsvdReg + } } break; @@ -8043,10 +8055,11 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va id->idIns(ins); id->idInsFmt(fmt); - id->idInsOpt(INS_OPTS_NONE); + id->idInsOpt(opt); id->idReg1(reg1); id->idReg2(reg2); + id->idReg3(reg3); id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); id->idSetIsLclVar(); @@ -8085,7 +8098,6 @@ void emitter::emitIns_R_R_S_S( // TODO-ARM64-CQ: with compLocallocUsed, should we use REG_SAVED_LOCALLOC_SP instead? regNumber reg3 = FPbased ? REG_FPBASE : REG_SPBASE; - reg3 = encodingSPtoZR(reg3); bool useRegForAdr = true; ssize_t imm = disp; @@ -8117,6 +8129,8 @@ void emitter::emitIns_R_R_S_S( imm = 0; } + reg3 = encodingSPtoZR(reg3); + assert(fmt != IF_NONE); instrDesc* id = emitNewInstrCns(attr1, imm); From 9bd0ab6d2d09369a1b95bfe95d42d3a1571e976a Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Wed, 11 Sep 2024 17:58:50 +0200 Subject: [PATCH 10/16] OSR test --- src/coreclr/jit/lclvars.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 2a6f2190ad908..63c9dcaf5f4ab 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5781,7 +5781,7 @@ void Compiler::lvaFixVirtualFrameOffsets() int localDelta = delta; #if defined(TARGET_ARM64) - if (varDsc->GetStackOffset() >= 0) + if (varDsc->GetStackOffset() >= 0 || lvaIsOSRLocal(lclNum)) localDelta -= fpLrDelta; #endif From bbf7e229ce7a43543211b6d727aa2a2530af6bc0 Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Wed, 11 Sep 2024 19:43:12 +0200 Subject: [PATCH 11/16] Mark lvaMonAcquired as lvIsOSRLocal when doing OSR --- src/coreclr/jit/flowgraph.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp index 17544e160b8d9..3a63e74b679d0 100644 --- a/src/coreclr/jit/flowgraph.cpp +++ b/src/coreclr/jit/flowgraph.cpp @@ -1522,6 +1522,10 @@ void Compiler::fgAddSyncMethodEnterExit() } #endif } + else + { + lvaTable[lvaMonAcquired].lvIsOSRLocal = true; + } // Make a copy of the 'this' pointer to be used in the handler so it does // not inhibit enregistration of all uses of the variable. We cannot do From d369f25eafd92aba965b2de90d7d6380004a8272 Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Wed, 11 Sep 2024 21:03:01 +0200 Subject: [PATCH 12/16] Update asserts --- src/coreclr/jit/compiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 2b9425e6fe502..8e0e6a6a1290a 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -10655,7 +10655,7 @@ bool Compiler::lvaIsOSRLocal(unsigned varNum) { // Sanity check for promoted fields of OSR locals. // - if (varNum >= info.compLocalsCount) + if (varNum >= info.compLocalsCount && varNum != lvaMonAcquired) { assert(varDsc->lvIsStructField); assert(varDsc->lvParentLcl < info.compLocalsCount); From 4db5e57783bf97f91e1ef1993a11acb2ee7b5202 Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Wed, 11 Sep 2024 22:29:40 +0200 Subject: [PATCH 13/16] One more take at lvaMonAcquired --- src/coreclr/jit/compiler.cpp | 2 +- src/coreclr/jit/flowgraph.cpp | 4 ---- src/coreclr/jit/lclvars.cpp | 6 ++++++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 8e0e6a6a1290a..2b9425e6fe502 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -10655,7 +10655,7 @@ bool Compiler::lvaIsOSRLocal(unsigned varNum) { // Sanity check for promoted fields of OSR locals. // - if (varNum >= info.compLocalsCount && varNum != lvaMonAcquired) + if (varNum >= info.compLocalsCount) { assert(varDsc->lvIsStructField); assert(varDsc->lvParentLcl < info.compLocalsCount); diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp index 3a63e74b679d0..17544e160b8d9 100644 --- a/src/coreclr/jit/flowgraph.cpp +++ b/src/coreclr/jit/flowgraph.cpp @@ -1522,10 +1522,6 @@ void Compiler::fgAddSyncMethodEnterExit() } #endif } - else - { - lvaTable[lvaMonAcquired].lvIsOSRLocal = true; - } // Make a copy of the 'this' pointer to be used in the handler so it does // not inhibit enregistration of all uses of the variable. We cannot do diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 63c9dcaf5f4ab..f1abc238b1098 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5687,6 +5687,12 @@ void Compiler::lvaFixVirtualFrameOffsets() // We set FP to be after LR, FP fpLrDelta += 2 * REGSIZE_BYTES; delta += fpLrDelta; + + if ((lvaMonAcquired != BAD_VAR_NUM) && opts.IsOSR()) + { + int offset = lvaTable[lvaMonAcquired].GetStackOffset() - fpLrDelta; + lvaTable[lvaMonAcquired].SetStackOffset(offset); + } } JITDUMP("--- delta bump %d for FP frame\n", delta); } From 1ffc69ddcbbbb26381de3bd8f6ab1c5da5d1f58e Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Thu, 12 Sep 2024 09:11:13 +0200 Subject: [PATCH 14/16] More comprehensive approach for OSR --- src/coreclr/jit/lclvars.cpp | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index f1abc238b1098..b79055ebfde0d 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5649,9 +5649,8 @@ void Compiler::lvaFixVirtualFrameOffsets() // The delta to be added to virtual offset to adjust it relative to frame pointer or SP int delta = 0; -#if defined(TARGET_ARM64) - int fpLrDelta = 0; -#endif + int frameLocalsDelta = 0; + int frameBoundary = 0; #ifdef TARGET_XARCH delta += REGSIZE_BYTES; // pushed PC (return address) for x86/x64 @@ -5685,16 +5684,10 @@ void Compiler::lvaFixVirtualFrameOffsets() // have a frame pointer { // We set FP to be after LR, FP - fpLrDelta += 2 * REGSIZE_BYTES; - delta += fpLrDelta; - - if ((lvaMonAcquired != BAD_VAR_NUM) && opts.IsOSR()) - { - int offset = lvaTable[lvaMonAcquired].GetStackOffset() - fpLrDelta; - lvaTable[lvaMonAcquired].SetStackOffset(offset); - } + frameLocalsDelta = 2 * REGSIZE_BYTES; + frameBoundary = opts.IsOSR() ? -info.compPatchpointInfo->TotalFrameSize() : 0; } - JITDUMP("--- delta bump %d for FP frame\n", delta); + JITDUMP("--- delta bump %d for FP frame, %d inside frame\n", delta, frameLocalsDelta); } #elif defined(TARGET_AMD64) else @@ -5785,11 +5778,8 @@ void Compiler::lvaFixVirtualFrameOffsets() if (doAssignStkOffs) { int localDelta = delta; - -#if defined(TARGET_ARM64) - if (varDsc->GetStackOffset() >= 0 || lvaIsOSRLocal(lclNum)) - localDelta -= fpLrDelta; -#endif + if (frameLocalsDelta != 0 && varDsc->GetStackOffset() < frameBoundary) + localDelta += frameLocalsDelta; JITDUMP("-- V%02u was %d, now %d\n", lclNum, varDsc->GetStackOffset(), varDsc->GetStackOffset() + localDelta); @@ -5822,10 +5812,10 @@ void Compiler::lvaFixVirtualFrameOffsets() assert(codeGen->regSet.tmpAllFree()); for (TempDsc* temp = codeGen->regSet.tmpListBeg(); temp != nullptr; temp = codeGen->regSet.tmpListNxt(temp)) { - temp->tdAdjustTempOffs(delta); + temp->tdAdjustTempOffs(delta + frameLocalsDelta); } - lvaCachedGenericContextArgOffs += delta; + lvaCachedGenericContextArgOffs += delta + frameLocalsDelta; #if FEATURE_FIXED_OUT_ARGS From 2c9ff2412e8d9f0bf14c413af40707f0f2bd98c3 Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Thu, 12 Sep 2024 13:07:10 +0200 Subject: [PATCH 15/16] VarArgs fix, this OSR fix --- src/coreclr/jit/lclvars.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index b79055ebfde0d..6fa626f512137 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5648,9 +5648,9 @@ void Compiler::lvaFixVirtualFrameOffsets() #endif // The delta to be added to virtual offset to adjust it relative to frame pointer or SP - int delta = 0; + int delta = 0; int frameLocalsDelta = 0; - int frameBoundary = 0; + int frameBoundary = 0; #ifdef TARGET_XARCH delta += REGSIZE_BYTES; // pushed PC (return address) for x86/x64 @@ -5685,7 +5685,9 @@ void Compiler::lvaFixVirtualFrameOffsets() { // We set FP to be after LR, FP frameLocalsDelta = 2 * REGSIZE_BYTES; - frameBoundary = opts.IsOSR() ? -info.compPatchpointInfo->TotalFrameSize() : 0; + frameBoundary = opts.IsOSR() ? -info.compPatchpointInfo->TotalFrameSize() : 0; + if (info.compIsVarArgs) + frameBoundary -= MAX_REG_ARG * REGSIZE_BYTES; } JITDUMP("--- delta bump %d for FP frame, %d inside frame\n", delta, frameLocalsDelta); } @@ -5815,7 +5817,9 @@ void Compiler::lvaFixVirtualFrameOffsets() temp->tdAdjustTempOffs(delta + frameLocalsDelta); } - lvaCachedGenericContextArgOffs += delta + frameLocalsDelta; + lvaCachedGenericContextArgOffs += delta; + if (lvaCachedGenericContextArgOffs < frameBoundary) + lvaCachedGenericContextArgOffs += frameLocalsDelta; #if FEATURE_FIXED_OUT_ARGS From 04902b92cc5c0db948ad624b8d08909a24bd98ad Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Thu, 12 Sep 2024 13:38:50 +0200 Subject: [PATCH 16/16] Fix last commit --- src/coreclr/jit/lclvars.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 6fa626f512137..27cbcd19a783c 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5817,9 +5817,9 @@ void Compiler::lvaFixVirtualFrameOffsets() temp->tdAdjustTempOffs(delta + frameLocalsDelta); } - lvaCachedGenericContextArgOffs += delta; if (lvaCachedGenericContextArgOffs < frameBoundary) lvaCachedGenericContextArgOffs += frameLocalsDelta; + lvaCachedGenericContextArgOffs += delta; #if FEATURE_FIXED_OUT_ARGS