From 13852ff25c2054052761541fa5e54caedfb691d4 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Mon, 20 Jun 2022 22:23:49 -0700 Subject: [PATCH 1/3] Align arm64 data section as requested Currently, the data section alignment request is ignored unless it is 8. Since the minimum is 4, this effectively means that 16-byte SIMD16 data alignment requests are ignored. This is likely because this code was written before arm64 supported SIMD, and was never revised. Cases of SIMD loads of constant data lead to larger alignment padding of the data section. This is somewhat mitigated by https://github.com/dotnet/runtime/pull/71043 which fixes a bug with overallocation and overalignment of SIMD8 data loads. --- src/coreclr/jit/emit.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index f68ce116bb8f4..0a8a6f8ed7d41 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -6330,11 +6330,9 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, } UNATIVE_OFFSET roDataAlignmentDelta = 0; - if (emitConsDsc.dsdOffs && (emitConsDsc.alignment == TARGET_POINTER_SIZE)) + if (emitConsDsc.dsdOffs > 0) { - UNATIVE_OFFSET roDataAlignment = TARGET_POINTER_SIZE; // 8 Byte align by default. - roDataAlignmentDelta = (UNATIVE_OFFSET)ALIGN_UP(emitTotalHotCodeSize, roDataAlignment) - emitTotalHotCodeSize; - assert((roDataAlignmentDelta == 0) || (roDataAlignmentDelta == 4)); + roDataAlignmentDelta = AlignmentPad(emitTotalHotCodeSize, emitConsDsc.alignment); } args.hotCodeSize = emitTotalHotCodeSize + roDataAlignmentDelta + emitConsDsc.dsdOffs; From 59f580d31ee6cfc5e172c172be68aa8d3108a5e9 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Tue, 21 Jun 2022 09:55:56 -0700 Subject: [PATCH 2/3] Additional fixes 1. On arm64/LA64, if asking for a data alignment greater than code aligment, we need to increase the requested code alignment since the code section is where this data will live. This isn't viewable in SPMI diffs, but it does increase the alignment of some functions from 8 to 16 byte code alignment. 2. Assert that the data section is at least 4 bytes aligned (this is the default in our code, and alignment only increases). 3. Simplify the code setting the alignment flags for allocMem. --- src/coreclr/jit/emit.cpp | 62 ++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 0a8a6f8ed7d41..323c7c46e4c3a 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -6260,7 +6260,12 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, coldCodeBlock = nullptr; - CorJitAllocMemFlag allocMemFlag = CORJIT_ALLOCMEM_DEFAULT_CODE_ALIGN; + // This restricts the data alignment to: 4, 8, 16, or 32 bytes + // Alignments greater than 32 would require VM support in ICorJitInfo::allocMem + uint32_t dataAlignment = emitConsDsc.alignment; + assert((dataSection::MIN_DATA_ALIGN <= dataAlignment) && (dataAlignment <= dataSection::MAX_DATA_ALIGN) && isPow2(dataAlignment)); + + uint32_t codeAlignment = TARGET_POINTER_SIZE; #ifdef TARGET_X86 // @@ -6280,14 +6285,14 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, const weight_t scenarioHotWeight = 256.0; if (emitComp->fgCalledCount > (scenarioHotWeight * emitComp->fgProfileRunsCount())) { - allocMemFlag = CORJIT_ALLOCMEM_FLG_16BYTE_ALIGN; + codeAlignment = 16; } } else { if (emitTotalHotCodeSize <= 16) { - allocMemFlag = CORJIT_ALLOCMEM_FLG_16BYTE_ALIGN; + codeAlignment = 16; } } #endif @@ -6299,23 +6304,44 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, if (emitComp->opts.OptimizationEnabled() && !emitComp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT) && (emitTotalHotCodeSize > 16) && emitComp->fgHasLoops) { - allocMemFlag = CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN; + codeAlignment = 32; } #endif - // This restricts the emitConsDsc.alignment to: 1, 2, 4, 8, 16, or 32 bytes - // Alignments greater than 32 would require VM support in ICorJitInfo::allocMem - assert(isPow2(emitConsDsc.alignment) && (emitConsDsc.alignment <= 32)); +#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) + // For arm64/LoongArch64, we're going to put the data in the code section. So make sure the code section has + // adequate alignment. + if (emitConsDsc.dsdOffs > 0) + { + codeAlignment = max(codeAlignment, dataAlignment); + } +#endif + + // Note that we don't support forcing code alignment of 8 bytes on 32-bit platforms; an omission? + assert((TARGET_POINTER_SIZE <= codeAlignment) && (codeAlignment <= 32) && isPow2(codeAlignment)); - if (emitConsDsc.alignment == 16) + CorJitAllocMemFlag allocMemFlagCodeAlign = CORJIT_ALLOCMEM_DEFAULT_CODE_ALIGN; + if (codeAlignment == 32) { - allocMemFlag = static_cast(allocMemFlag | CORJIT_ALLOCMEM_FLG_RODATA_16BYTE_ALIGN); + allocMemFlagCodeAlign = CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN; } - else if (emitConsDsc.alignment == 32) + else if (codeAlignment == 16) { - allocMemFlag = static_cast(allocMemFlag | CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN); + allocMemFlagCodeAlign = CORJIT_ALLOCMEM_FLG_16BYTE_ALIGN; } + CorJitAllocMemFlag allocMemFlagDataAlign = static_cast(0); + if (dataAlignment == 16) + { + allocMemFlagDataAlign = CORJIT_ALLOCMEM_FLG_RODATA_16BYTE_ALIGN; + } + else if (dataAlignment == 32) + { + allocMemFlagDataAlign = CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN; + } + + CorJitAllocMemFlag allocMemFlag = static_cast(allocMemFlagCodeAlign | allocMemFlagDataAlign); + AllocMemArgs args; memset(&args, 0, sizeof(args)); @@ -6332,7 +6358,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, UNATIVE_OFFSET roDataAlignmentDelta = 0; if (emitConsDsc.dsdOffs > 0) { - roDataAlignmentDelta = AlignmentPad(emitTotalHotCodeSize, emitConsDsc.alignment); + roDataAlignmentDelta = AlignmentPad(emitTotalHotCodeSize, dataAlignment); } args.hotCodeSize = emitTotalHotCodeSize + roDataAlignmentDelta + emitConsDsc.dsdOffs; @@ -6375,6 +6401,18 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, { assert(((size_t)codeBlock & 31) == 0); } + if ((allocMemFlag & CORJIT_ALLOCMEM_FLG_16BYTE_ALIGN) != 0) + { + assert(((size_t)codeBlock & 15) == 0); + } + if ((allocMemFlag & CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN) != 0) + { + assert(((size_t)consBlock & 31) == 0); + } + if ((allocMemFlag & CORJIT_ALLOCMEM_FLG_RODATA_16BYTE_ALIGN) != 0) + { + assert(((size_t)consBlock & 15) == 0); + } #endif // if (emitConsDsc.dsdOffs) From 8b824dbff70f7cd9335654d52c3aab0e97a87060 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Tue, 21 Jun 2022 11:17:46 -0700 Subject: [PATCH 3/3] Formatting + disable alignment asserts It looks like the buffer pointer passed back from crossgen2 doesn't meet the alignment request. Perhaps it does in the final image, but not in the buffer the JIT fills in? Maybe the asserts could be used for JIT-time but not AOT (when the buffer address is the final location of the code/data)? --- src/coreclr/jit/emit.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 323c7c46e4c3a..22cadb594e75b 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -6263,7 +6263,8 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, // This restricts the data alignment to: 4, 8, 16, or 32 bytes // Alignments greater than 32 would require VM support in ICorJitInfo::allocMem uint32_t dataAlignment = emitConsDsc.alignment; - assert((dataSection::MIN_DATA_ALIGN <= dataAlignment) && (dataAlignment <= dataSection::MAX_DATA_ALIGN) && isPow2(dataAlignment)); + assert((dataSection::MIN_DATA_ALIGN <= dataAlignment) && (dataAlignment <= dataSection::MAX_DATA_ALIGN) && + isPow2(dataAlignment)); uint32_t codeAlignment = TARGET_POINTER_SIZE; @@ -6401,6 +6402,9 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, { assert(((size_t)codeBlock & 31) == 0); } +#if 0 + // TODO: we should be able to assert the following, but it appears crossgen2 doesn't respect them, + // or maybe it respects them in the written image but not in the buffer pointer given to the JIT. if ((allocMemFlag & CORJIT_ALLOCMEM_FLG_16BYTE_ALIGN) != 0) { assert(((size_t)codeBlock & 15) == 0); @@ -6413,6 +6417,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, { assert(((size_t)consBlock & 15) == 0); } +#endif // 0 #endif // if (emitConsDsc.dsdOffs)