From 671c81a9e44345c6f6300b756a1a4eb1db4e6045 Mon Sep 17 00:00:00 2001 From: Aman Khalid Date: Fri, 27 May 2022 14:11:31 -0700 Subject: [PATCH 1/7] Add hot/cold splitting test job to jit-runtime-experimental --- eng/pipelines/common/templates/runtimes/run-test-job.yml | 1 + src/tests/Common/testenvironment.proj | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/eng/pipelines/common/templates/runtimes/run-test-job.yml b/eng/pipelines/common/templates/runtimes/run-test-job.yml index f7f310dfb9742..fc7c2e991220f 100644 --- a/eng/pipelines/common/templates/runtimes/run-test-job.yml +++ b/eng/pipelines/common/templates/runtimes/run-test-job.yml @@ -572,6 +572,7 @@ jobs: - jitosr_stress - jitosr_pgo - jitosr_stress_random + - jitosr_stress_split - jitpartialcompilation - jitpartialcompilation_osr - jitpartialcompilation_osr_pgo diff --git a/src/tests/Common/testenvironment.proj b/src/tests/Common/testenvironment.proj index fcc3e3926febc..3a1b4b129bc73 100644 --- a/src/tests/Common/testenvironment.proj +++ b/src/tests/Common/testenvironment.proj @@ -36,13 +36,16 @@ COMPlus_EnableSSE42; COMPlus_EnableSSSE3; COMPlus_ForceRelocs; + COMPlus_GCgen0size; COMPlus_GCStress; COMPlus_GCName; COMPlus_gcServer; COMPlus_HeapVerify; COMPlus_JITMinOpts; COMPlus_JitELTHookEnabled; + COMPlus_JitFakeProcedureSplitting; COMPlus_JitStress; + COMPlus_JitStressProcedureSplitting; COMPlus_JitStressRegs; COMPlus_TailcallStress; COMPlus_ReadyToRun; @@ -188,6 +191,7 @@ + From 60ad81619943d1220f832381172a6eeb76d218bf Mon Sep 17 00:00:00 2001 From: Aman Khalid Date: Wed, 1 Jun 2022 16:50:32 -0700 Subject: [PATCH 2/7] Implement branching between hot/cold sections on ARM64. - Remove NYIs/control flow preventing code splitting on ARM64. - Update emitter::emitIns_J() to keep jumps between hot/cold sections long. - Update emitter::emitOutputLJ() to emit long jumps for both conditional and unconditional branches between hot/cold sections, and report relocations to the runtime. - Update long ldr pseudoinstruction to instead use ld1 instruction when loading 16-byte constants into vector registers; ldr implementation temporarily loads the constant into a general integer register, which does not support 16-byte values. --- src/coreclr/jit/compiler.cpp | 5 - src/coreclr/jit/emit.cpp | 7 - src/coreclr/jit/emit.h | 2 +- src/coreclr/jit/emitarm64.cpp | 264 +++++++++++++++++++++++----------- src/coreclr/jit/emitarm64.h | 2 + 5 files changed, 180 insertions(+), 100 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 00dfb10b8072b..4cebf9c489c74 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3199,11 +3199,6 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.compProcedureSplitting = jitFlags->IsSet(JitFlags::JIT_FLAG_PROCSPLIT) || enableFakeSplitting; -#ifdef TARGET_ARM64 - // TODO-ARM64-NYI: enable hot/cold splitting - opts.compProcedureSplitting = false; -#endif // TARGET_ARM64 - #ifdef DEBUG opts.compProcedureSplittingEH = opts.compProcedureSplitting; #endif // DEBUG diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 10c5c096f8934..ed81c1348fe79 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -4280,7 +4280,6 @@ void emitter::emitJumpDistBind() else if (emitIsUncondJump(jmp)) { // Nothing to do; we don't shrink these. 
- assert(jmp->idjShort); ssz = JMP_SIZE_SMALL; } else if (emitIsLoadLabel(jmp)) @@ -6045,12 +6044,6 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, #ifdef TARGET_ARM64 // For arm64, we want to allocate JIT data always adjacent to code similar to what native compiler does. // This way allows us to use a single `ldr` to access such data like float constant/jmp table. - if (emitTotalColdCodeSize > 0) - { - // JIT data might be far away from the cold code. - NYI_ARM64("Need to handle fix-up to data from cold code."); - } - UNATIVE_OFFSET roDataAlignmentDelta = 0; if (emitConsDsc.dsdOffs && (emitConsDsc.alignment == TARGET_POINTER_SIZE)) { diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index b93fded455f32..ef4f92b420e8c 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -996,7 +996,7 @@ class emitter case IF_LARGELDC: if (isVectorRegister(idReg1())) { - // adrp + ldr + fmov + // (adrp + ldr + fmov) or (adrp + add + ld1) size = 12; } else diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 864ba862edddd..ac906cc1730c6 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -8421,10 +8421,12 @@ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount) switch (ins) { case INS_bl_local: + idjShort = true; + // Fall through. case INS_b: // Unconditional jump is a single form. - idjShort = true; - fmt = IF_BI_0A; + // Assume is long in case we cross hot/cold sections. + fmt = IF_BI_0A; break; case INS_beq: @@ -8469,7 +8471,6 @@ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount) id->idAddr()->iiaBBlabel = dst; // Skip unconditional jump that has a single form. - // TODO-ARM64-NYI: enable hot/cold splittingNYI. // The target needs to be relocated. if (!idjShort) { @@ -9799,38 +9800,67 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) { // Update addrReg with the reserved integer register // since we cannot use dstReg (vector) to load constant directly from memory. - addrReg = id->idReg2(); + + // If loading a 16-byte value, we will need to load directly into dstReg. + // Thus, encode addrReg for the ld1 instruction. + if (opSize == EA_16BYTE) + { + addrReg = encodingSPtoZR(id->idReg2()); + } + else + { + addrReg = id->idReg2(); + } + assert(isGeneralRegister(addrReg)); } + ins = INS_adrp; fmt = IF_DI_1E; dst = emitOutputShortAddress(dst, ins, fmt, relPageAddr, addrReg); - // ldr x, [x, page offs] -- load constant from page address + page offset into integer register. ssize_t imm12 = (ssize_t)dstAddr & 0xFFF; // 12 bits assert(isValidUimm12(imm12)); - ins = INS_ldr; - fmt = IF_LS_2B; - dst = emitOutputShortConstant(dst, ins, fmt, imm12, addrReg, opSize); - // fmov v, d -- copy constant in integer register to vector register. - // This is needed only for vector constant. - if (addrReg != dstReg) + // Special case: emit add + ld1 instructions for loading 16-byte data into vector register. 
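+ // (ld1 can fill all 16 bytes of a vector register directly from memory,
+ // whereas the ldr path below stages the constant in a general-purpose
+ // register, which holds at most 8 bytes.)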
+ if (isVectorRegister(dstReg) && (opSize == EA_16BYTE)) { - // fmov Vd,Rn DV_2I X00111100X100111 000000nnnnnddddd 1E27 0000 Vd,Rn - // (scalar, from general) - assert(isVectorRegister(dstReg) && isGeneralRegister(addrReg)); - ins = INS_fmov; - fmt = IF_DV_2I; - code_t code = emitInsCode(ins, fmt); + const emitAttr elemSize = EA_1BYTE; + const insOpts opt = optMakeArrangement(opSize, elemSize); - code |= insEncodeReg_Vd(dstReg); // ddddd - code |= insEncodeReg_Rn(addrReg); // nnnnn - if (id->idOpSize() == EA_8BYTE) + assert(isGeneralRegisterOrSP(addrReg)); + assert(isValidVectorElemsize(elemSize)); + assert(isValidArrangement(opSize, opt)); + + // Calculate page addr + page offs, then emit ld1 instruction. + dst = emitOutputVectorConstant(dst, imm12, dstReg, addrReg, opSize, elemSize); + } + else + { + // ldr x, [x, 0] -- load constant from address into integer register. + ins = INS_ldr; + fmt = IF_LS_2B; + dst = emitOutputShortConstant(dst, ins, fmt, imm12, addrReg, opSize); + + // fmov v, d -- copy constant in integer register to vector register. + // This is needed only for vector constant. + if (addrReg != dstReg) { - code |= 0x80400000; // X ... X + // fmov Vd,Rn DV_2I X00111100X100111 000000nnnnnddddd 1E27 0000 Vd,Rn + // (scalar, from general) + assert(isVectorRegister(dstReg) && isGeneralRegister(addrReg)); + ins = INS_fmov; + fmt = IF_DV_2I; + code_t code = emitInsCode(ins, fmt); + + code |= insEncodeReg_Vd(dstReg); // ddddd + code |= insEncodeReg_Rn(addrReg); // nnnnn + if (id->idOpSize() == EA_8BYTE) + { + code |= 0x80400000; // X ... X + } + dst += emitOutput_Instr(dst, code); } - dst += emitOutput_Instr(dst, code); } } } @@ -9933,12 +9963,6 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) /* For forward jumps, record the address of the distance value */ id->idjTemp.idjAddr = (distVal > 0) ? dst : NULL; - if (emitJumpCrossHotColdBoundary(srcOffs, dstOffs)) - { - assert(!id->idjShort); - NYI_ARM64("Relocation Support for long address"); - } - assert(insOptsNone(id->idInsOpt())); if (isJump) @@ -9949,75 +9973,114 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) assert(!id->idjKeepLong); assert(emitJumpCrossHotColdBoundary(srcOffs, dstOffs) == false); assert((fmt == IF_BI_0A) || (fmt == IF_BI_0B) || (fmt == IF_BI_1A) || (fmt == IF_BI_1B)); + dst = emitOutputShortBranch(dst, ins, fmt, distVal, id); } else { - // Long conditional jump - assert(fmt == IF_LARGEJMP); - // This is a pseudo-instruction format representing a large conditional branch, to allow - // us to get a greater branch target range than we can get by using a straightforward conditional - // branch. It is encoded as a short conditional branch that branches around a long unconditional - // branch. - // - // Conceptually, we have: - // - // b L_target - // - // The code we emit is: - // - // b L_not // 4 bytes. Note that we reverse the condition. - // b L_target // 4 bytes - // L_not: - // - // Note that we don't actually insert any blocks: we simply encode "b L_not" as a branch with - // the correct offset. Note also that this works for both integer and floating-point conditions, because - // the condition inversion takes ordered/unordered into account, preserving NaN behavior. For example, - // "GT" (greater than) is inverted to "LE" (less than, equal, or unordered). 
+ // Long conditional/unconditional jump - instruction reverseIns; - insFormat reverseFmt; + if (fmt == IF_LARGEJMP) + { + // This is a pseudo-instruction format representing a large conditional branch, to allow + // us to get a greater branch target range than we can get by using a straightforward conditional + // branch. It is encoded as a short conditional branch that branches around a long unconditional + // branch. + // + // Conceptually, we have: + // + // b L_target + // + // The code we emit is: + // + // b L_not // 4 bytes. Note that we reverse the condition. + // b L_target // 4 bytes + // L_not: + // + // Note that we don't actually insert any blocks: we simply encode "b L_not" as a branch with + // the correct offset. Note also that this works for both integer and floating-point conditions, because + // the condition inversion takes ordered/unordered into account, preserving NaN behavior. For example, + // "GT" (greater than) is inverted to "LE" (less than, equal, or unordered). - switch (ins) + instruction reverseIns; + insFormat reverseFmt; + + switch (ins) + { + case INS_cbz: + reverseIns = INS_cbnz; + reverseFmt = IF_BI_1A; + break; + case INS_cbnz: + reverseIns = INS_cbz; + reverseFmt = IF_BI_1A; + break; + case INS_tbz: + reverseIns = INS_tbnz; + reverseFmt = IF_BI_1B; + break; + case INS_tbnz: + reverseIns = INS_tbz; + reverseFmt = IF_BI_1B; + break; + default: + reverseIns = emitJumpKindToIns(emitReverseJumpKind(emitInsToJumpKind(ins))); + reverseFmt = IF_BI_0B; + } + + dst = emitOutputShortBranch(dst, + reverseIns, // reverse the conditional instruction + reverseFmt, 8, /* 8 bytes from start of this large conditional + pseudo-instruction to L_not. */ + id); + + // Now, pretend we've got a normal unconditional branch, and fall through to the code to emit that. + ins = INS_b; + fmt = IF_BI_0A; + + // The distVal was computed based on the beginning of the pseudo-instruction, + // So subtract the size of the conditional branch so that it is relative to the + // unconditional branch. + distVal -= 4; + } + + assert(fmt == IF_BI_0A); + assert((distVal & 1) == 0); + code_t code = emitInsCode(ins, fmt); + const bool recordRelocation = emitComp->opts.compReloc && emitJumpCrossHotColdBoundary(srcOffs, dstOffs); + + if (recordRelocation) { - case INS_cbz: - reverseIns = INS_cbnz; - reverseFmt = IF_BI_1A; - break; - case INS_cbnz: - reverseIns = INS_cbz; - reverseFmt = IF_BI_1A; - break; - case INS_tbz: - reverseIns = INS_tbnz; - reverseFmt = IF_BI_1B; - break; - case INS_tbnz: - reverseIns = INS_tbz; - reverseFmt = IF_BI_1B; - break; - default: - reverseIns = emitJumpKindToIns(emitReverseJumpKind(emitInsToJumpKind(ins))); - reverseFmt = IF_BI_0B; + // dst isn't an actual final target location, just some intermediate + // location. Thus we cannot make any guarantees about distVal (not + // even the direction/sign). Instead we don't encode any offset and + // rely on the relocation to do all the work } + else + { + // Branch offset encodings are scaled by 4. + noway_assert((distVal & 3) == 0); + distVal >>= 2; + noway_assert(isValidSimm26(distVal)); - dst = - emitOutputShortBranch(dst, - reverseIns, // reverse the conditional instruction - reverseFmt, - 8, /* 8 bytes from start of this large conditional pseudo-instruction to L_not. */ - id); + // Insert offset into unconditional branch instruction + distVal &= 0x3FFFFFFLL; + code |= distVal; + } - // Now, pretend we've got a normal unconditional branch, and fall through to the code to emit that. 
- ins = INS_b; - fmt = IF_BI_0A; + const unsigned instrSize = emitOutput_Instr(dst, code); - // The distVal was computed based on the beginning of the pseudo-instruction, - // So subtract the size of the conditional branch so that it is relative to the - // unconditional branch. - distVal -= 4; - } + if (recordRelocation) + { + assert(id->idjKeepLong); + if (emitComp->info.compMatchedVM) + { + void* target = emitOffsetToPtr(dstOffs); + emitRecordRelocation((void*)dst, target, IMAGE_REL_ARM64_BRANCH26); + } + } - dst = emitOutputShortBranch(dst, ins, fmt, distVal, id); + dst += instrSize; + } } else if (loadLabel) { @@ -10138,7 +10201,7 @@ BYTE* emitter::emitOutputShortConstant( ssize_t loBits = (imm & 3); noway_assert(loBits == 0); - ssize_t distVal = imm >>= 2; // load offset encodings are scaled by 4. + ssize_t distVal = imm >> 2; // load offset encodings are scaled by 4. noway_assert(isValidSimm19(distVal)); @@ -10206,6 +10269,33 @@ BYTE* emitter::emitOutputShortConstant( return dst; } + +/***************************************************************************** + * + * Output instructions to load a constant into a vector register. + */ +BYTE* emitter::emitOutputVectorConstant( + BYTE* dst, ssize_t imm, regNumber dstReg, regNumber addrReg, emitAttr opSize, emitAttr elemSize) +{ + // add addrReg, addrReg, page offs -- compute address = page addr + page offs. + code_t code = emitInsCode(INS_add, IF_DI_2A); // DI_2A X0010001shiiiiii iiiiiinnnnnddddd 1100 0000 imm(i12, sh) + code |= insEncodeDatasize(EA_8BYTE); // X - use EA_8BYTE, as we are calculating 64-bit address + code |= ((code_t)imm << 10); // iiiiiiiiiiii + code |= insEncodeReg_Rd(addrReg); // ddddd + code |= insEncodeReg_Rn(addrReg); // nnnnn + dst += emitOutput_Instr(dst, code); + + // ld1 dstReg, addrReg -- load constant at address in addrReg into dstReg. + code = emitInsCode(INS_ld1, IF_LS_2D); // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn + code |= insEncodeVectorsize(opSize); // Q + code |= insEncodeVLSElemsize(elemSize); // ss + code |= insEncodeReg_Rn(addrReg); // nnnnn + code |= insEncodeReg_Vt(dstReg); // ttttt + dst += emitOutput_Instr(dst, code); + + return dst; +} + /***************************************************************************** * * Output a call instruction. diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h index 0279b3360e75d..a2ebb460e014b 100644 --- a/src/coreclr/jit/emitarm64.h +++ b/src/coreclr/jit/emitarm64.h @@ -861,6 +861,8 @@ BYTE* emitOutputShortBranch(BYTE* dst, instruction ins, insFormat fmt, ssize_t d BYTE* emitOutputShortAddress(BYTE* dst, instruction ins, insFormat fmt, ssize_t distVal, regNumber reg); BYTE* emitOutputShortConstant( BYTE* dst, instruction ins, insFormat fmt, ssize_t distVal, regNumber reg, emitAttr opSize); +BYTE* emitOutputVectorConstant( + BYTE* dst, ssize_t distVal, regNumber dstReg, regNumber addrReg, emitAttr opSize, emitAttr elemSize); /***************************************************************************** * From d2bbed8a05d7c14420b437a179330e5f405a6148 Mon Sep 17 00:00:00 2001 From: Aman Khalid Date: Mon, 13 Jun 2022 17:20:18 -0700 Subject: [PATCH 3/7] Enable fake hot/cold splitting on ARM64 This commit contains fixes for various bugs exposed by enabling fake hot/cold splitting on ARM64: - Branches between hot/cold sections are now always long. 
- The pseudoinstruction for loading a constant from the cold section did not support loading 16-byte data into vector registers, as it temporarily loaded the constant into an 8-byte integer register. Now, 16-byte constants are loaded directly into vector registers via an `ld1` instruction. - Tests involving loading 16-byte constants exposed the data section is not always aligned to its largest constant. Now, the data section is always aligned to `emitConsDsc.alignment` when calling `eeAllocMem`. - Asserts/NYIs blocking hot/cold splitting on ARM64 have been removed. Fake hot/cold splitting requires we fake unwind info by treating each split function as one hot section. A more architecture-agnostic approach for this has been applied. --- src/coreclr/jit/compiler.h | 4 -- src/coreclr/jit/emit.cpp | 9 +--- src/coreclr/jit/emitarm64.cpp | 6 ++- src/coreclr/jit/unwind.cpp | 18 ++++++- src/coreclr/jit/unwindamd64.cpp | 67 ++++++++++----------------- src/coreclr/jit/unwindarm.cpp | 45 +++++++++++++----- src/coreclr/jit/unwindx86.cpp | 63 ++++++++++--------------- src/tests/Common/testenvironment.proj | 1 - 8 files changed, 108 insertions(+), 105 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 13a7791b18d6e..7779d45b9ba5e 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -8000,10 +8000,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void unwindReserveFuncHelper(FuncInfoDsc* func, bool isHotCode); void unwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode, void* pColdCode, bool isHotCode); -#ifdef DEBUG - void fakeUnwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode); -#endif // DEBUG - #endif // TARGET_AMD64 || (TARGET_X86 && FEATURE_EH_FUNCLETS) UNATIVE_OFFSET unwindGetCurrentOffset(FuncInfoDsc* func); diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 54ff4fa9985f4..fe3b6c9a34ce7 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -6045,13 +6045,8 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, // For arm64/LoongArch64, we want to allocate JIT data always adjacent to code similar to what native compiler does. // This way allows us to use a single `ldr` to access such data like float constant/jmp table. - UNATIVE_OFFSET roDataAlignmentDelta = 0; - if (emitConsDsc.dsdOffs && (emitConsDsc.alignment == TARGET_POINTER_SIZE)) - { - UNATIVE_OFFSET roDataAlignment = TARGET_POINTER_SIZE; // 8 Byte align by default. - roDataAlignmentDelta = (UNATIVE_OFFSET)ALIGN_UP(emitTotalHotCodeSize, roDataAlignment) - emitTotalHotCodeSize; - assert((roDataAlignmentDelta == 0) || (roDataAlignmentDelta == 4)); - } + const UNATIVE_OFFSET roDataAlignmentDelta = + (UNATIVE_OFFSET)ALIGN_UP(emitTotalHotCodeSize, emitConsDsc.alignment) - emitTotalHotCodeSize; args.hotCodeSize = emitTotalHotCodeSize + roDataAlignmentDelta + emitConsDsc.dsdOffs; args.coldCodeSize = emitTotalColdCodeSize; diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index ac906cc1730c6..bd2f5fa519192 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -8422,7 +8422,8 @@ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount) { case INS_bl_local: idjShort = true; - // Fall through. + fmt = IF_BI_0A; + break; case INS_b: // Unconditional jump is a single form. // Assume is long in case we cross hot/cold sections. 
@@ -9825,6 +9826,9 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) // Special case: emit add + ld1 instructions for loading 16-byte data into vector register. if (isVectorRegister(dstReg) && (opSize == EA_16BYTE)) { + // Low 4 bits should be 0 -- 16-byte JIT data should be aligned on 16 bytes. + assert((imm12 & 15) == 0); + const emitAttr elemSize = EA_1BYTE; const insOpts opt = optMakeArrangement(opSize, elemSize); diff --git a/src/coreclr/jit/unwind.cpp b/src/coreclr/jit/unwind.cpp index 6ad60a064f35c..63c4ed716cf39 100644 --- a/src/coreclr/jit/unwind.cpp +++ b/src/coreclr/jit/unwind.cpp @@ -69,7 +69,16 @@ void Compiler::unwindGetFuncLocations(FuncInfoDsc* func, // The hot section only goes up to the cold section assert(fgFirstFuncletBB == nullptr); - *ppEndLoc = new (this, CMK_UnwindInfo) emitLocation(ehEmitCookie(fgFirstColdBlock)); +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting()) + { + *ppEndLoc = nullptr; // If fake-splitting, "trick" VM by pretending entire function is hot. + } + else +#endif // DEBUG + { + *ppEndLoc = new (this, CMK_UnwindInfo) emitLocation(ehEmitCookie(fgFirstColdBlock)); + } } else { @@ -259,6 +268,13 @@ void Compiler::unwindEmitFuncCFI(FuncInfoDsc* func, void* pHotCode, void* pColdC DWORD unwindCodeBytes = 0; BYTE* pUnwindBlock = nullptr; +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting()) + { + pColdCode = nullptr; + } +#endif // DEBUG + if (func->startLoc == nullptr) { startOffset = 0; diff --git a/src/coreclr/jit/unwindamd64.cpp b/src/coreclr/jit/unwindamd64.cpp index 2c8e90fa5a944..88cefbe31ed5e 100644 --- a/src/coreclr/jit/unwindamd64.cpp +++ b/src/coreclr/jit/unwindamd64.cpp @@ -656,18 +656,17 @@ void Compiler::unwindReserve() // void Compiler::unwindReserveFunc(FuncInfoDsc* func) { -#ifdef DEBUG - if (JitConfig.JitFakeProcedureSplitting() && (fgFirstColdBlock != nullptr)) + unwindReserveFuncHelper(func, true); + + if (fgFirstColdBlock != nullptr) { - assert(func->funKind == FUNC_ROOT); // No fake-splitting of funclets. - unwindReserveFuncHelper(func, true); - } - else +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting()) + { + assert(func->funKind == FUNC_ROOT); // No splitting of funclets. 
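+ // Under fake splitting, the entire function was reserved above as a
+ // single hot section, so there is no cold-section unwind info to reserve.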
+ } + else #endif // DEBUG - { - unwindReserveFuncHelper(func, true); - - if (fgFirstColdBlock != nullptr) { unwindReserveFuncHelper(func, false); } @@ -859,7 +858,17 @@ void Compiler::unwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode, void* pCo if (isHotCode) { - assert(endOffset <= info.compTotalHotCodeSize); +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting() && (fgFirstColdBlock != nullptr)) + { + assert(endOffset <= info.compNativeCodeSize); + } + else +#endif // DEBUG + { + assert(endOffset <= info.compTotalHotCodeSize); + } + pColdCode = nullptr; } else @@ -890,43 +899,17 @@ void Compiler::unwindEmitFunc(FuncInfoDsc* func, void* pHotCode, void* pColdCode static_assert_no_msg(FUNC_HANDLER == (FuncKind)CORJIT_FUNC_HANDLER); static_assert_no_msg(FUNC_FILTER == (FuncKind)CORJIT_FUNC_FILTER); -#ifdef DEBUG - if (JitConfig.JitFakeProcedureSplitting() && (pColdCode != nullptr)) + unwindEmitFuncHelper(func, pHotCode, pColdCode, true); + + if (pColdCode != nullptr) { - fakeUnwindEmitFuncHelper(func, pHotCode); - } - else +#ifdef DEBUG + if (!JitConfig.JitFakeProcedureSplitting()) #endif // DEBUG - { - unwindEmitFuncHelper(func, pHotCode, pColdCode, true); - - if (pColdCode != nullptr) { unwindEmitFuncHelper(func, pHotCode, pColdCode, false); } } } -#ifdef DEBUG -void Compiler::fakeUnwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode) -{ - assert(fgFirstColdBlock != nullptr); - assert(func->funKind == FUNC_ROOT); // No fake-splitting of funclets. - - const UNATIVE_OFFSET startOffset = 0; - const UNATIVE_OFFSET endOffset = info.compNativeCodeSize; - const DWORD unwindCodeBytes = sizeof(func->unwindCodes) - func->unwindCodeSlot; - BYTE* pUnwindBlock = &func->unwindCodes[func->unwindCodeSlot]; - - if (opts.dspUnwind) - { - DumpUnwindInfo(true, startOffset, endOffset, (const UNWIND_INFO* const)pUnwindBlock); - } - - // Pass pColdCode = nullptr; VM allocs unwind info for combined hot/cold section - eeAllocUnwindInfo((BYTE*)pHotCode, nullptr, startOffset, endOffset, unwindCodeBytes, pUnwindBlock, - (CorJitFuncKind)func->funKind); -} -#endif // DEBUG - #endif // TARGET_AMD64 diff --git a/src/coreclr/jit/unwindarm.cpp b/src/coreclr/jit/unwindarm.cpp index 1eb7456250cbb..8a14c6edbb832 100644 --- a/src/coreclr/jit/unwindarm.cpp +++ b/src/coreclr/jit/unwindarm.cpp @@ -563,13 +563,20 @@ void Compiler::unwindReserve() void Compiler::unwindReserveFunc(FuncInfoDsc* func) { BOOL isFunclet = (func->funKind == FUNC_ROOT) ? FALSE : TRUE; - bool funcHasColdSection = false; + bool funcHasColdSection = (fgFirstColdBlock != nullptr); + +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting() && funcHasColdSection) + { + funcHasColdSection = false; // "Trick" the VM into thinking we don't have a cold section. + } +#endif // DEBUG #if defined(FEATURE_CFI_SUPPORT) if (generateCFIUnwindCodes()) { DWORD unwindCodeBytes = 0; - if (fgFirstColdBlock != nullptr) + if (funcHasColdSection) { eeReserveUnwindInfo(isFunclet, true /*isColdCode*/, unwindCodeBytes); } @@ -584,7 +591,7 @@ void Compiler::unwindReserveFunc(FuncInfoDsc* func) // cold section. This needs to be done before we split into fragments, as each // of the hot and cold sections can have multiple fragments. 
- if (fgFirstColdBlock != NULL) + if (funcHasColdSection) { assert(!isFunclet); // TODO-CQ: support hot/cold splitting with EH @@ -595,8 +602,6 @@ void Compiler::unwindReserveFunc(FuncInfoDsc* func) func->uwiCold = new (this, CMK_UnwindInfo) UnwindInfo(); func->uwiCold->InitUnwindInfo(this, startLoc, endLoc); func->uwiCold->HotColdSplitCodes(&func->uwi); - - funcHasColdSection = true; } // First we need to split the function or funclet into fragments that are no larger @@ -1604,11 +1609,19 @@ void UnwindFragmentInfo::Allocate( UNATIVE_OFFSET endOffset; UNATIVE_OFFSET codeSize; - // We don't support hot/cold splitting with EH, so if there is cold code, this - // better not be a funclet! - // TODO-CQ: support funclets in cold code - - noway_assert(isHotCode || funKind == CORJIT_FUNC_ROOT); +// We don't support hot/cold splitting with EH, so if there is cold code, this +// better not be a funclet! +// TODO-CQ: support funclets in cold code +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting() && (pColdCode != NULL)) + { + noway_assert(isHotCode && (funKind == CORJIT_FUNC_ROOT)); + } + else +#endif // DEBUG + { + noway_assert(isHotCode || (funKind == CORJIT_FUNC_ROOT)); + } // Compute the final size, and start and end offsets of the fragment @@ -1656,7 +1669,17 @@ void UnwindFragmentInfo::Allocate( if (isHotCode) { - assert(endOffset <= uwiComp->info.compTotalHotCodeSize); +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting() && (pColdCode != NULL)) + { + assert(endOffset <= uwiComp->info.compNativeCodeSize); + } + else +#endif // DEBUG + { + assert(endOffset <= uwiComp->info.compTotalHotCodeSize); + } + pColdCode = NULL; } else diff --git a/src/coreclr/jit/unwindx86.cpp b/src/coreclr/jit/unwindx86.cpp index bd27e46cbef49..32d077429af6a 100644 --- a/src/coreclr/jit/unwindx86.cpp +++ b/src/coreclr/jit/unwindx86.cpp @@ -113,18 +113,17 @@ void Compiler::unwindEmit(void* pHotCode, void* pColdCode) // void Compiler::unwindReserveFunc(FuncInfoDsc* func) { -#ifdef DEBUG - if (JitConfig.JitFakeProcedureSplitting() && (fgFirstColdBlock != nullptr)) + unwindReserveFuncHelper(func, true); + + if (fgFirstColdBlock != nullptr) { - assert(func->funKind == FUNC_ROOT); // No fake-splitting of funclets. - unwindReserveFuncHelper(func, true); - } - else +#ifdef DEBUG + if (JitConfig.JitFakeProcedureSplitting()) + { + assert(func->funKind == FUNC_ROOT); // No splitting of funclets. 
+ }
+ else
#endif // DEBUG
- {
- unwindReserveFuncHelper(func, true);
-
- if (fgFirstColdBlock != nullptr)
{
unwindReserveFuncHelper(func, false);
}
@@ -164,17 +163,13 @@ void Compiler::unwindEmitFunc(FuncInfoDsc* func, void* pHotCode, void* pColdCode
static_assert_no_msg(FUNC_HANDLER == (FuncKind)CORJIT_FUNC_HANDLER);
static_assert_no_msg(FUNC_FILTER == (FuncKind)CORJIT_FUNC_FILTER);
-#ifdef DEBUG
- if (JitConfig.JitFakeProcedureSplitting() && (pColdCode != nullptr))
+ unwindEmitFuncHelper(func, pHotCode, pColdCode, true);
+
+ if (pColdCode != nullptr)
{
- fakeUnwindEmitFuncHelper(func, pHotCode);
- }
- else
+#ifdef DEBUG
+ if (!JitConfig.JitFakeProcedureSplitting())
#endif // DEBUG
{
unwindEmitFuncHelper(func, pHotCode, pColdCode, false);
}
}
}
@@ -258,7 +253,17 @@ void Compiler::unwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode, void* pCo
if (isHotCode)
{
- assert(endOffset <= info.compTotalHotCodeSize);
+#ifdef DEBUG
+ if (JitConfig.JitFakeProcedureSplitting() && (fgFirstColdBlock != nullptr))
+ {
+ assert(endOffset <= info.compNativeCodeSize);
+ }
+ else
+#endif // DEBUG
+ {
+ assert(endOffset <= info.compTotalHotCodeSize);
+ }
+
pColdCode = nullptr;
}
else
@@ -276,22 +281,4 @@ void Compiler::unwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode, void* pCo
(BYTE*)&unwindInfo, (CorJitFuncKind)func->funKind);
}
-#ifdef DEBUG
-void Compiler::fakeUnwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode)
-{
- assert(fgFirstColdBlock != nullptr);
- assert(func->funKind == FUNC_ROOT); // No fake-splitting of funclets.
-
- const UNATIVE_OFFSET startOffset = 0;
- const UNATIVE_OFFSET endOffset = info.compNativeCodeSize;
-
- UNWIND_INFO unwindInfo;
- unwindInfo.FunctionLength = (ULONG)(endOffset);
-
- // Pass pColdCode = nullptr; VM allocs unwind info for combined hot/cold section
- eeAllocUnwindInfo((BYTE*)pHotCode, nullptr, startOffset, endOffset, sizeof(UNWIND_INFO), (BYTE*)&unwindInfo,
- (CorJitFuncKind)func->funKind);
-}
-#endif // DEBUG
-
#endif // FEATURE_EH_FUNCLETS
diff --git a/src/tests/Common/testenvironment.proj b/src/tests/Common/testenvironment.proj
index 3a42df9c5a0d2..4cc62162bc2f2 100644
--- a/src/tests/Common/testenvironment.proj
+++ b/src/tests/Common/testenvironment.proj
@@ -36,7 +36,6 @@
COMPlus_EnableSSE42;
COMPlus_EnableSSSE3;
COMPlus_ForceRelocs;
- COMPlus_GCgen0size;
COMPlus_GCStress;
COMPlus_GCName;
COMPlus_gcServer;
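For illustration, the data-section alignment computed by the emit.cpp change above reduces to the following self-contained sketch (a minimal sketch with made-up sizes; AlignUp stands in for the JIT's ALIGN_UP macro):

#include <cassert>
#include <cstdint>

// Round size up to a multiple of alignment (a power of two) -- stand-in
// for the JIT's ALIGN_UP macro.
static uint32_t AlignUp(uint32_t size, uint32_t alignment)
{
    assert((alignment & (alignment - 1)) == 0); // power of two
    return (size + alignment - 1) & ~(alignment - 1);
}

int main()
{
    // Suppose hot code ends at offset 0x1234 and the largest constant needs
    // 16-byte alignment: the read-only data section starts after a small pad.
    const uint32_t hotCodeSize = 0x1234;
    const uint32_t alignment   = 16; // emitConsDsc.alignment in the JIT

    const uint32_t pad          = AlignUp(hotCodeSize, alignment) - hotCodeSize;
    const uint32_t roDataOffset = hotCodeSize + pad;

    assert(pad == 0xC);             // 0x1234 rounds up to 0x1240
    assert(roDataOffset % 16 == 0); // data section is now 16-byte aligned
    return 0;
}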
From b61a94a8fc91811899b04d8afd5ede71bd0ebff4 Mon Sep 17 00:00:00 2001
From: Aman Khalid
Date: Wed, 15 Jun 2022 13:35:53 -0700
Subject: [PATCH 5/7] Fix regression in runtime-jit-experimental

The newly-introduced `emitRemoveJumpToNextInst` optimization caused a regression when hot/cold-splitting, where jumps from the last hot instruction to the first cold instruction were erroneously removed. This is fixed by disabling the `isRemovableJmpCandidate` flag for branches between hot/cold sections. On an unrelated note, a JIT dump message has been added to indicate stress-splitting is occurring.
---
 src/coreclr/jit/codegenlinear.cpp | 33 +++++++++++++++++--------------
 src/coreclr/jit/flowgraph.cpp | 1 +
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 6a44f1db324de..ff5a2dd2397fc 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -753,24 +753,27 @@ void CodeGen::genCodeForBBlist()
break;

case BBJ_ALWAYS:
- inst_JMP(EJ_jmp, block->bbJumpDest
+#ifdef TARGET_XARCH
+ {
+ // If a block was selected to place an alignment instruction because it ended
+ // with a jump, do not remove jumps from such blocks.
+ // Do not remove a jump between hot and cold regions.
+ bool isRemovableJmpCandidate =
+ !block->hasAlign() && !compiler->fgInDifferentRegions(block, block->bbJumpDest);
+
#ifdef TARGET_AMD64
- // AMD64 requires an instruction after a call instruction for unwinding
- // inside an EH region so if the last instruction generated was a call instruction
- // do not allow this jump to be marked for possible later removal.
- //
- // If a block was selected to place an alignment instruction because it ended
- // with a jump, do not remove jumps from such blocks.
- ,
- /* isRemovableJmpCandidate */ !GetEmitter()->emitIsLastInsCall() && !block->hasAlign()
+ // AMD64 requires an instruction after a call instruction for unwinding
+ // inside an EH region so if the last instruction generated was a call instruction
+ // do not allow this jump to be marked for possible later removal.
+ isRemovableJmpCandidate = isRemovableJmpCandidate && !GetEmitter()->emitIsLastInsCall(); +#endif // TARGET_AMD64 + + inst_JMP(EJ_jmp, block->bbJumpDest, isRemovableJmpCandidate); + } #else -#ifdef TARGET_XARCH - , - /* isRemovableJmpCandidate */ !block->hasAlign() -#endif + inst_JMP(EJ_jmp, block->bbJumpDest); +#endif // TARGET_XARCH -#endif - ); FALLTHROUGH; case BBJ_COND: diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp index 52399aa6c1692..ac240be9a2d27 100644 --- a/src/coreclr/jit/flowgraph.cpp +++ b/src/coreclr/jit/flowgraph.cpp @@ -3421,6 +3421,7 @@ PhaseStatus Compiler::fgDetermineFirstColdBlock() { firstColdBlock = fgFirstBB->bbNext; prevToFirstColdBlock = fgFirstBB; + JITDUMP("JitStressProcedureSplitting is enabled: Splitting after the first basic block\n"); } else { From 8ed9046e38fcad718feab82dbedcbfe625f903ce Mon Sep 17 00:00:00 2001 From: Aman Khalid Date: Mon, 20 Jun 2022 11:01:36 -0700 Subject: [PATCH 6/7] Update fake-splitting implementation on ARM64 To facilitate generating unwind info, fake-splitting now places the read-only data section after the cold section. This allows the hot/cold code sections to be truly contiguous. --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/ee_il_dll.cpp | 62 ++++++++++++++++++++++++++--------- src/coreclr/jit/emit.cpp | 29 +--------------- src/coreclr/jit/emitarm64.cpp | 6 +--- 4 files changed, 49 insertions(+), 50 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index fee93a619876f..29c6361cfcace 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7646,7 +7646,7 @@ class Compiler // ICorJitInfo wrappers - void eeAllocMem(AllocMemArgs* args); + void eeAllocMem(AllocMemArgs* args, const UNATIVE_OFFSET roDataSectionAlignment); void eeReserveUnwindInfo(bool isFunclet, bool isColdCode, ULONG unwindSize); diff --git a/src/coreclr/jit/ee_il_dll.cpp b/src/coreclr/jit/ee_il_dll.cpp index f8c437e326694..d09bffa0a5e9a 100644 --- a/src/coreclr/jit/ee_il_dll.cpp +++ b/src/coreclr/jit/ee_il_dll.cpp @@ -1122,34 +1122,64 @@ void Compiler::eeDispLineInfos() * (e.g., host AMD64, target ARM64), then VM will get confused anyway. */ -void Compiler::eeAllocMem(AllocMemArgs* args) +void Compiler::eeAllocMem(AllocMemArgs* args, const UNATIVE_OFFSET roDataSectionAlignment) { #ifdef DEBUG - const UNATIVE_OFFSET hotSizeRequest = args->hotCodeSize; - const UNATIVE_OFFSET coldSizeRequest = args->coldCodeSize; - // Fake splitting implementation: place hot/cold code in contiguous section - if (JitConfig.JitFakeProcedureSplitting() && (coldSizeRequest > 0)) + // Fake splitting implementation: place hot/cold code in contiguous section. + UNATIVE_OFFSET coldCodeOffset = 0; + if (JitConfig.JitFakeProcedureSplitting() && (args->coldCodeSize > 0)) { - args->hotCodeSize = hotSizeRequest + coldSizeRequest; + coldCodeOffset = args->hotCodeSize; + assert(coldCodeOffset > 0); + args->hotCodeSize += args->coldCodeSize; args->coldCodeSize = 0; } -#endif + +#endif // DEBUG + +#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) + + // For arm64/LoongArch64, we want to allocate JIT data always adjacent to code similar to what native compiler does. + // This way allows us to use a single `ldr` to access such data like float constant/jmp table. + // For LoongArch64 using `pcaddi + ld` to access such data. 
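+ // Note: under fake splitting, args->hotCodeSize was already grown above to
+ // include the cold code, so the data section computed below lands after the
+ // cold section, keeping the hot and cold code contiguous.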
+ + UNATIVE_OFFSET roDataAlignmentDelta = 0; + if (args->roDataSize > 0) + { + roDataAlignmentDelta = AlignmentPad(args->hotCodeSize, roDataSectionAlignment); + } + + const UNATIVE_OFFSET roDataOffset = args->hotCodeSize + roDataAlignmentDelta; + args->hotCodeSize = roDataOffset + args->roDataSize; + args->roDataSize = 0; + +#endif // defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) info.compCompHnd->allocMem(args); #ifdef DEBUG - if (JitConfig.JitFakeProcedureSplitting() && (coldSizeRequest > 0)) - { - // Fix up hot/cold code pointers - args->coldCodeBlock = ((BYTE*)args->hotCodeBlock) + hotSizeRequest; - args->coldCodeBlockRW = ((BYTE*)args->hotCodeBlockRW) + hotSizeRequest; - // Reset args' hot/cold code sizes in case caller reads them later - args->hotCodeSize = hotSizeRequest; - args->coldCodeSize = coldSizeRequest; + if (JitConfig.JitFakeProcedureSplitting() && (coldCodeOffset > 0)) + { + // Fix up cold code pointers. Cold section is adjacent to hot section. + assert(args->coldCodeBlock == nullptr); + assert(args->coldCodeBlockRW == nullptr); + args->coldCodeBlock = ((BYTE*)args->hotCodeBlock) + coldCodeOffset; + args->coldCodeBlockRW = ((BYTE*)args->hotCodeBlockRW) + coldCodeOffset; } -#endif + +#endif // DEBUG + +#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) + + // Fix up data section pointers. + assert(args->roDataBlock == nullptr); + assert(args->roDataBlockRW == nullptr); + args->roDataBlock = ((BYTE*)args->hotCodeBlock) + roDataOffset; + args->roDataBlockRW = ((BYTE*)args->hotCodeBlockRW) + roDataOffset; + +#endif // defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) } void Compiler::eeReserveUnwindInfo(bool isFunclet, bool isColdCode, ULONG unwindSize) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 10464c6c08ef7..6ecc7a32fcab0 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -6304,38 +6304,13 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, AllocMemArgs args; memset(&args, 0, sizeof(args)); -#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) - // For arm64/LoongArch64, we want to allocate JIT data always adjacent to code similar to what native compiler does. - // This way allows us to use a single `ldr` to access such data like float constant/jmp table. 
- - const UNATIVE_OFFSET roDataAlignmentDelta = - (UNATIVE_OFFSET)ALIGN_UP(emitTotalHotCodeSize, emitConsDsc.alignment) - emitTotalHotCodeSize; - - args.hotCodeSize = emitTotalHotCodeSize + roDataAlignmentDelta + emitConsDsc.dsdOffs; - args.coldCodeSize = emitTotalColdCodeSize; - args.roDataSize = 0; - args.xcptnsCount = xcptnsCount; - args.flag = allocMemFlag; - - emitComp->eeAllocMem(&args); - - codeBlock = (BYTE*)args.hotCodeBlock; - codeBlockRW = (BYTE*)args.hotCodeBlockRW; - coldCodeBlock = (BYTE*)args.coldCodeBlock; - coldCodeBlockRW = (BYTE*)args.coldCodeBlockRW; - - consBlock = codeBlock + emitTotalHotCodeSize + roDataAlignmentDelta; - consBlockRW = codeBlockRW + emitTotalHotCodeSize + roDataAlignmentDelta; - -#else - args.hotCodeSize = emitTotalHotCodeSize; args.coldCodeSize = emitTotalColdCodeSize; args.roDataSize = emitConsDsc.dsdOffs; args.xcptnsCount = xcptnsCount; args.flag = allocMemFlag; - emitComp->eeAllocMem(&args); + emitComp->eeAllocMem(&args, emitConsDsc.alignment); codeBlock = (BYTE*)args.hotCodeBlock; codeBlockRW = (BYTE*)args.hotCodeBlockRW; @@ -6344,8 +6319,6 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, consBlock = (BYTE*)args.roDataBlock; consBlockRW = (BYTE*)args.roDataBlockRW; -#endif - #ifdef DEBUG if ((allocMemFlag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0) { diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 3068c4dc74edf..7b55449e5aef2 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -8437,8 +8437,7 @@ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount) { case INS_bl_local: idjShort = true; - fmt = IF_BI_0A; - break; + FALLTHROUGH; case INS_b: // Unconditional jump is a single form. // Assume is long in case we cross hot/cold sections. @@ -9841,9 +9840,6 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) // Special case: emit add + ld1 instructions for loading 16-byte data into vector register. if (isVectorRegister(dstReg) && (opSize == EA_16BYTE)) { - // Low 4 bits should be 0 -- 16-byte JIT data should be aligned on 16 bytes. - assert((imm12 & 15) == 0); - const emitAttr elemSize = EA_1BYTE; const insOpts opt = optMakeArrangement(opSize, elemSize); From 548b9a5e1d0459853741ffdb31c611223ebd1161 Mon Sep 17 00:00:00 2001 From: Aman Khalid Date: Tue, 21 Jun 2022 17:26:15 -0700 Subject: [PATCH 7/7] Disable hot/cold splitting on LoongArch64 --- src/coreclr/jit/compiler.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 6c09a5c6f5402..460412d84597e 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3199,6 +3199,11 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.compProcedureSplitting = jitFlags->IsSet(JitFlags::JIT_FLAG_PROCSPLIT) || enableFakeSplitting; +#ifdef TARGET_LOONGARCH64 + // Hot/cold splitting is not being tested on LoongArch64. + opts.compProcedureSplitting = false; +#endif // TARGET_LOONGARCH64 + #ifdef DEBUG opts.compProcedureSplittingEH = opts.compProcedureSplitting; #endif // DEBUG
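For reference, the hot/cold branch handling introduced in patch 2 boils down to the following self-contained sketch of the A64 `B` encoding (a minimal sketch; EncodeB and its arguments are illustrative names, not JIT code):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Encode "B <label>": opcode 0b000101, signed 26-bit immediate scaled by 4.
// When the branch crosses the hot/cold boundary, the immediate is left as
// zero and a BRANCH26-style relocation patches it at allocation time.
static uint32_t EncodeB(int64_t byteOffset, bool crossesHotColdBoundary)
{
    uint32_t code = 0x14000000; // B with imm26 == 0

    if (!crossesHotColdBoundary)
    {
        assert((byteOffset & 3) == 0); // branch offsets are 4-byte scaled
        const int64_t imm26 = byteOffset >> 2;
        assert((imm26 >= -(1LL << 25)) && (imm26 < (1LL << 25))); // simm26 range
        code |= (uint32_t)(imm26 & 0x03FFFFFF);
    }

    return code;
}

int main()
{
    printf("0x%08X\n", (unsigned)EncodeB(8, false));  // B .+8 -> 0x14000002
    printf("0x%08X\n", (unsigned)EncodeB(-4, false)); // B .-4 -> 0x17FFFFFF
    printf("0x%08X\n", (unsigned)EncodeB(0, true));   // left for the relocation
    return 0;
}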