Skip to content

Commit

Permalink
Enable fake hot/cold splitting on ARM64 (#70708)
Browse files Browse the repository at this point in the history
This commit contains fixes for various bugs exposed by enabling fake
hot/cold splitting on ARM64:
- Branches between hot/cold sections are now always long.
- The pseudoinstruction for loading a constant from the cold section
did not support loading 16-byte data into vector registers, as it
temporarily loaded the constant into an 8-byte integer register. Now,
16-byte constants are loaded directly into vector registers via an
`ld1` instruction.
- Asserts/NYIs blocking hot/cold splitting on ARM64 have been removed.

Fake hot/cold splitting requires that we fake unwind info by treating each
split function as a single hot section. This is now done with a more
architecture-agnostic approach. To facilitate this approach, the
fake-splitting implementation has been revised to place the hot
and cold sections contiguously in memory (immediately followed
by the read-only data section on ARM64).
  • Loading branch information
Aman Khalid authored Jun 22, 2022
1 parent f310367 commit bc1a872
Show file tree
Hide file tree
Showing 10 changed files with 330 additions and 242 deletions.
6 changes: 3 additions & 3 deletions src/coreclr/jit/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3199,10 +3199,10 @@ void Compiler::compInitOptions(JitFlags* jitFlags)

opts.compProcedureSplitting = jitFlags->IsSet(JitFlags::JIT_FLAG_PROCSPLIT) || enableFakeSplitting;

#ifdef TARGET_ARM64
// TODO-ARM64-NYI: enable hot/cold splitting
#ifdef TARGET_LOONGARCH64
// Hot/cold splitting is not being tested on LoongArch64.
opts.compProcedureSplitting = false;
#endif // TARGET_ARM64
#endif // TARGET_LOONGARCH64

#ifdef DEBUG
opts.compProcedureSplittingEH = opts.compProcedureSplitting;
Expand Down
6 changes: 1 addition & 5 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -7660,7 +7660,7 @@ class Compiler

// ICorJitInfo wrappers

void eeAllocMem(AllocMemArgs* args);
void eeAllocMem(AllocMemArgs* args, const UNATIVE_OFFSET roDataSectionAlignment);

void eeReserveUnwindInfo(bool isFunclet, bool isColdCode, ULONG unwindSize);

Expand Down Expand Up @@ -8017,10 +8017,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
void unwindReserveFuncHelper(FuncInfoDsc* func, bool isHotCode);
void unwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode, void* pColdCode, bool isHotCode);

#ifdef DEBUG
void fakeUnwindEmitFuncHelper(FuncInfoDsc* func, void* pHotCode);
#endif // DEBUG

#endif // TARGET_AMD64 || (TARGET_X86 && FEATURE_EH_FUNCLETS)

UNATIVE_OFFSET unwindGetCurrentOffset(FuncInfoDsc* func);
Expand Down
62 changes: 46 additions & 16 deletions src/coreclr/jit/ee_il_dll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1122,34 +1122,64 @@ void Compiler::eeDispLineInfos()
* (e.g., host AMD64, target ARM64), then VM will get confused anyway.
*/

void Compiler::eeAllocMem(AllocMemArgs* args)
void Compiler::eeAllocMem(AllocMemArgs* args, const UNATIVE_OFFSET roDataSectionAlignment)
{
#ifdef DEBUG
const UNATIVE_OFFSET hotSizeRequest = args->hotCodeSize;
const UNATIVE_OFFSET coldSizeRequest = args->coldCodeSize;

// Fake splitting implementation: place hot/cold code in contiguous section
if (JitConfig.JitFakeProcedureSplitting() && (coldSizeRequest > 0))
// Fake splitting implementation: place hot/cold code in contiguous section.
UNATIVE_OFFSET coldCodeOffset = 0;
if (JitConfig.JitFakeProcedureSplitting() && (args->coldCodeSize > 0))
{
args->hotCodeSize = hotSizeRequest + coldSizeRequest;
coldCodeOffset = args->hotCodeSize;
assert(coldCodeOffset > 0);
args->hotCodeSize += args->coldCodeSize;
args->coldCodeSize = 0;
}
#endif

#endif // DEBUG

#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64)

// For arm64/LoongArch64, we always want to allocate JIT data adjacent to the code, similar to what a native compiler does.
// This allows us to use a single `ldr` to access such data (e.g., float constants or jump tables).
// LoongArch64 uses `pcaddi + ld` to access such data.

UNATIVE_OFFSET roDataAlignmentDelta = 0;
if (args->roDataSize > 0)
{
roDataAlignmentDelta = AlignmentPad(args->hotCodeSize, roDataSectionAlignment);
}

const UNATIVE_OFFSET roDataOffset = args->hotCodeSize + roDataAlignmentDelta;
args->hotCodeSize = roDataOffset + args->roDataSize;
args->roDataSize = 0;

#endif // defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64)

info.compCompHnd->allocMem(args);

#ifdef DEBUG
if (JitConfig.JitFakeProcedureSplitting() && (coldSizeRequest > 0))
{
// Fix up hot/cold code pointers
args->coldCodeBlock = ((BYTE*)args->hotCodeBlock) + hotSizeRequest;
args->coldCodeBlockRW = ((BYTE*)args->hotCodeBlockRW) + hotSizeRequest;

// Reset args' hot/cold code sizes in case caller reads them later
args->hotCodeSize = hotSizeRequest;
args->coldCodeSize = coldSizeRequest;
if (JitConfig.JitFakeProcedureSplitting() && (coldCodeOffset > 0))
{
// Fix up cold code pointers. Cold section is adjacent to hot section.
assert(args->coldCodeBlock == nullptr);
assert(args->coldCodeBlockRW == nullptr);
args->coldCodeBlock = ((BYTE*)args->hotCodeBlock) + coldCodeOffset;
args->coldCodeBlockRW = ((BYTE*)args->hotCodeBlockRW) + coldCodeOffset;
}
#endif

#endif // DEBUG

#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64)

// Fix up data section pointers.
assert(args->roDataBlock == nullptr);
assert(args->roDataBlockRW == nullptr);
args->roDataBlock = ((BYTE*)args->hotCodeBlock) + roDataOffset;
args->roDataBlockRW = ((BYTE*)args->hotCodeBlockRW) + roDataOffset;

#endif // defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64)
}

void Compiler::eeReserveUnwindInfo(bool isFunclet, bool isColdCode, ULONG unwindSize)
Expand Down
39 changes: 1 addition & 38 deletions src/coreclr/jit/emit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4561,7 +4561,6 @@ void emitter::emitJumpDistBind()
else if (emitIsUncondJump(jmp))
{
// Nothing to do; we don't shrink these.
assert(jmp->idjShort);
ssz = JMP_SIZE_SMALL;
}
else if (emitIsLoadLabel(jmp))
Expand Down Expand Up @@ -6350,47 +6349,13 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
AllocMemArgs args;
memset(&args, 0, sizeof(args));

#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64)
// For arm64/LoongArch64, we always want to allocate JIT data adjacent to the code, similar to what a native compiler does.
// This allows us to use a single `ldr` to access such data (e.g., float constants or jump tables).
// LoongArch64 uses `pcaddi + ld` to access such data.
if (emitTotalColdCodeSize > 0)
{
// JIT data might be far away from the cold code.
NYI("Need to handle fix-up to data from cold code.");
}

UNATIVE_OFFSET roDataAlignmentDelta = 0;
if (emitConsDsc.dsdOffs > 0)
{
roDataAlignmentDelta = AlignmentPad(emitTotalHotCodeSize, dataAlignment);
}

args.hotCodeSize = emitTotalHotCodeSize + roDataAlignmentDelta + emitConsDsc.dsdOffs;
args.coldCodeSize = emitTotalColdCodeSize;
args.roDataSize = 0;
args.xcptnsCount = xcptnsCount;
args.flag = allocMemFlag;

emitComp->eeAllocMem(&args);

codeBlock = (BYTE*)args.hotCodeBlock;
codeBlockRW = (BYTE*)args.hotCodeBlockRW;
coldCodeBlock = (BYTE*)args.coldCodeBlock;
coldCodeBlockRW = (BYTE*)args.coldCodeBlockRW;

consBlock = codeBlock + emitTotalHotCodeSize + roDataAlignmentDelta;
consBlockRW = codeBlockRW + emitTotalHotCodeSize + roDataAlignmentDelta;

#else

args.hotCodeSize = emitTotalHotCodeSize;
args.coldCodeSize = emitTotalColdCodeSize;
args.roDataSize = emitConsDsc.dsdOffs;
args.xcptnsCount = xcptnsCount;
args.flag = allocMemFlag;

emitComp->eeAllocMem(&args);
emitComp->eeAllocMem(&args, emitConsDsc.alignment);

codeBlock = (BYTE*)args.hotCodeBlock;
codeBlockRW = (BYTE*)args.hotCodeBlockRW;
Expand All @@ -6399,8 +6364,6 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
consBlock = (BYTE*)args.roDataBlock;
consBlockRW = (BYTE*)args.roDataBlockRW;

#endif

#ifdef DEBUG
if ((allocMemFlag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0)
{
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -997,7 +997,7 @@ class emitter
case IF_LARGELDC:
if (isVectorRegister(idReg1()))
{
// adrp + ldr + fmov
// (adrp + ldr + fmov) or (adrp + add + ld1)
size = 12;
}
else
Expand Down
Loading

0 comments on commit bc1a872

Please sign in to comment.