From 138893c965952974da1d78e9bc433bf61adc64ba Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Wed, 10 Nov 2021 18:54:11 -0800 Subject: [PATCH 1/8] Update how OSR and PGO interact When both OSR and PGO are enabled: * Enable instrumenting OSR methods, so that the combined profile data from Tier0 plus any OSR variants provide a full picture for subsequent Tier1 optimization. * Use block profiles for both Tier0 methods that are likely to have patchpoints and OSR methods. The updates on the runtime side are to pass BBINSTR to OSR methods, and to handle the (typical) case where the OSR method instrumentation schema is a subset of the Tier0 method schema. We are still allowing OSR methods to read the profile data. So they are both profile instrumented and profile optimized. Not clear if this is going to work well as the Tier0 data will be incomplete and optimization quality may be poor. Something to revisit down the road. --- .../templates/runtimes/run-test-job.yml | 3 + src/coreclr/inc/pgo_formatprocessing.h | 38 +++++++++ src/coreclr/jit/fgprofile.cpp | 81 +++++++++++++------ src/coreclr/jit/importer.cpp | 14 +++- src/coreclr/vm/jitinterface.cpp | 3 +- src/coreclr/vm/pgo.cpp | 4 +- src/tests/Common/testenvironment.proj | 4 + 7 files changed, 115 insertions(+), 32 deletions(-) diff --git a/eng/pipelines/common/templates/runtimes/run-test-job.yml b/eng/pipelines/common/templates/runtimes/run-test-job.yml index 7439f6ff48ecd..cc22ce9f1c677 100644 --- a/eng/pipelines/common/templates/runtimes/run-test-job.yml +++ b/eng/pipelines/common/templates/runtimes/run-test-job.yml @@ -554,7 +554,10 @@ jobs: scenarios: - jitosr - jitosr_stress + - jitosr_pgo - jitpartialcompilation + - jitpartialcompilation_osr + - jitpartialcompilation_osr_pgo - jitobjectstackallocation ${{ if in(parameters.testGroup, 'ilasm') }}: scenarios: diff --git a/src/coreclr/inc/pgo_formatprocessing.h b/src/coreclr/inc/pgo_formatprocessing.h index 8d111bf2c8949..25cac1a94d21c 100644 --- 
a/src/coreclr/inc/pgo_formatprocessing.h +++ b/src/coreclr/inc/pgo_formatprocessing.h @@ -361,6 +361,44 @@ bool ReadInstrumentationSchemaWithLayout(const uint8_t *pByte, size_t cbDataMax, }); } + +// Return true if schemaTable entries are a subset of the schema described by pByte, with matching entries in the same order. +// Also updates offset of the matching entries in schemaTable to those of the pByte schema. +// +inline bool ComparePgoSchemaCompatible(const uint8_t *pByte, size_t cbDataMax, ICorJitInfo::PgoInstrumentationSchema* schemaTable, size_t cSchemas) +{ + size_t iSchema = 0; + size_t nSchema = 0; + size_t nMatched = 0; + size_t nUnmatched = 0; + size_t initialOffset = cbDataMax; + + ReadInstrumentationSchemaWithLayout(pByte, cbDataMax, initialOffset, + [schemaTable, cSchemas, &iSchema, &nSchema, &nMatched, &nUnmatched](const ICorJitInfo::PgoInstrumentationSchema& schema) + { + const size_t iSchemaAdj = nSchema - nUnmatched; + + if ((schema.InstrumentationKind != schemaTable[iSchemaAdj].InstrumentationKind) + || (schema.ILOffset != schemaTable[iSchemaAdj].ILOffset) + || (schema.Count != schemaTable[iSchemaAdj].Count) + || (schema.Other != schemaTable[iSchemaAdj].Other)) + { + nUnmatched++; + } + else + { + schemaTable[iSchemaAdj].Offset = schema.Offset; + nMatched++; + } + + nSchema++; + + return true; + }); + + return (nMatched == cSchemas); +} + inline bool ReadInstrumentationSchemaWithLayoutIntoSArray(const uint8_t *pByte, size_t cbDataMax, size_t initialOffset, SArray* pSchemas) { auto lambda = [pSchemas](const ICorJitInfo::PgoInstrumentationSchema &schema) diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index d301166966fbf..745edae0981ef 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -300,10 +300,10 @@ class Instrumentor virtual void BuildSchemaElements(BasicBlock* block, Schema& schema) { } - virtual void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) + virtual void 
Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) { } - virtual void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory) + virtual void InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory) { } virtual void SuppressProbes() @@ -349,8 +349,8 @@ class BlockCountInstrumentor : public Instrumentor } void Prepare(bool isPreImport) override; void BuildSchemaElements(BasicBlock* block, Schema& schema) override; - void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override; - void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory) override; + void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override; + void InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory) override; }; //------------------------------------------------------------------------ @@ -428,7 +428,7 @@ void BlockCountInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& sche // schema -- instrumentation schema // profileMemory -- profile data slab // -void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) +void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) { const ICorJitInfo::PgoInstrumentationSchema& entry = schema[block->bbCountSchemaIndex]; @@ -464,7 +464,7 @@ void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* // Notes: // When prejitting, add the method entry callback node // -void BlockCountInstrumentor::InstrumentMethodEntry(Schema& schema, BYTE* profileMemory) +void BlockCountInstrumentor::InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory) { Compiler::Options& opts = m_comp->opts; Compiler::Info& info = m_comp->info; @@ -982,7 +982,7 @@ class EfficientEdgeCountInstrumentor : public Instrumentor, public SpanningTreeV return ((block->bbFlags & BBF_IMPORTED) == BBF_IMPORTED); } void BuildSchemaElements(BasicBlock* block, Schema& schema) override; - void 
Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override; + void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override; void Badcode() override { @@ -1132,7 +1132,7 @@ void EfficientEdgeCountInstrumentor::BuildSchemaElements(BasicBlock* block, Sche // schema -- instrumentation schema // profileMemory -- profile data slab // -void EfficientEdgeCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) +void EfficientEdgeCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) { // Inlinee compilers build their blocks in the root compiler's // graph. So for NumSucc, we use the root compiler instance. @@ -1307,12 +1307,12 @@ class BuildClassProbeSchemaGen class ClassProbeInserter { Schema& m_schema; - BYTE* m_profileMemory; + uint8_t* m_profileMemory; int* m_currentSchemaIndex; unsigned& m_instrCount; public: - ClassProbeInserter(Schema& schema, BYTE* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount) + ClassProbeInserter(Schema& schema, uint8_t* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount) : m_schema(schema) , m_profileMemory(profileMemory) , m_currentSchemaIndex(pCurrentSchemaIndex) @@ -1349,7 +1349,7 @@ class ClassProbeInserter // Figure out where the table is located. 
// - BYTE* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory; + uint8_t* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory; *m_currentSchemaIndex += 2; // There are 2 schema entries per class probe // Grab a temp to hold the 'this' object as it will be used three times @@ -1426,7 +1426,7 @@ class ClassProbeInstrumentor : public Instrumentor } void Prepare(bool isPreImport) override; void BuildSchemaElements(BasicBlock* block, Schema& schema) override; - void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override; + void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override; void SuppressProbes() override; }; @@ -1490,7 +1490,7 @@ void ClassProbeInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& sche // schema -- instrumentation schema // profileMemory -- profile data slab // -void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) +void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) { if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0) { @@ -1563,21 +1563,43 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod() // Choose instrumentation technology. // // We enable edge profiling by default, except when: + // // * disabled by option // * we are prejitting - // * we are jitting osr methods + // * we are jitting tier0 methods with patchpoints + // * we are jitting an OSR method // - // Currently, OSR is incompatible with edge profiling. So if OSR is enabled, - // always do block profiling. + // OSR is incompatible with edge profiling. Only portions of the Tier0 + // method will be executed, and the bail-outs at patchpoints won't be obvious + // exit points from the method. So for OSR we always do block profiling. // // Note this incompatibility only exists for methods that actually have - // patchpoints, but we won't know that until we import. + // patchpoints. 
Currently we will only place patchpoints in methods with + backwards jumps. + // + // And because we want the Tier1 method to see the full set of profile data, + // when OSR is enabled, both Tier0 and any OSR methods need to contribute to + // the same profile data set. Since Tier0 has laid down a dense block-based + // schema, the OSR methods must use this schema as well. + // + // Note that OSR methods may also inline. We currently won't instrument + // any inlinee contributions (which would also need to carefully "share" + // the profile data segment with any Tier0 version and/or any other equivalent + // inlinee), so we'll lose a bit of their profile data. We can support this + // eventually if it turns out to matter. + // + // Similar issues arise with partially jitted methods. Because we currently + // only defer jitting for throw blocks, we currently ignore the impact of partial + // jitting on PGO. If we ever implement a broader pattern of deferral -- say deferring + // based on static PGO -- we will need to reconsider. // CLANG_FORMAT_COMMENT_ANCHOR; - const bool prejit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT); - const bool osr = (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) && (JitConfig.TC_OnStackReplacement() > 0)); - const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !osr; + const bool prejit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT); + const bool tier0WithPatchpoints = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) && + (JitConfig.TC_OnStackReplacement() > 0) && compHasBackwardJump; + const bool osrMethod = opts.IsOSR(); + const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !tier0WithPatchpoints && !osrMethod; if (useEdgeProfiles) { @@ -1586,7 +1608,9 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod() else { JITDUMP("Using block profiling, because %s\n", - (JitConfig.JitEdgeProfiling() > 0) ? "edge profiles disabled" : prejit ? 
"prejitting" : "OSR"); + (JitConfig.JitEdgeProfiling() > 0) + ? "edge profiles disabled" + : prejit ? "prejitting" : osrMethod ? "OSR" : "tier0 with patchpoints"); fgCountInstrumentor = new (this, CMK_Pgo) BlockCountInstrumentor(this); } @@ -1636,7 +1660,7 @@ PhaseStatus Compiler::fgInstrumentMethod() { noway_assert(!compIsForInlining()); - // Make post-importpreparations. + // Make post-import preparations. // const bool isPreImport = false; fgCountInstrumentor->Prepare(isPreImport); @@ -1694,11 +1718,16 @@ PhaseStatus Compiler::fgInstrumentMethod() assert(schema.size() > 0); - // Allocate the profile buffer + // Allocate/retrieve the profile buffer. // - BYTE* profileMemory; - - HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(), + // If this is an OSR method, we should use the same buffer that the Tier0 method used. + // + // This is supported by allocPgoInstrumentationBySchema, which will verify the schema + // we provide here matches the one from Tier0, and will fill in the data offsets in + // our schema properly. + // + uint8_t* profileMemory; + HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(), (UINT32)schema.size(), &profileMemory); // Deal with allocation failures. diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index 98c5354269824..ef1c2bd0803b2 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -21244,10 +21244,19 @@ void Compiler::impDevirtualizeCall(GenTreeCall* call, // assert(call->IsVirtual()); - // Possibly instrument, if not optimizing. + // Possibly instrument. Note for OSR+PGO we will instrument when + // optimizing and (currently) won't devirtualize. We may want + // to revisit -- if we can devirtualize we should be able to + // suppress the probe. // - if (opts.OptimizationDisabled()) + // We strip BBINSTR from inlinees currently, so we'll only + // do this for the root method calls. 
+ // + if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_BBINSTR)) { + assert(opts.OptimizationDisabled() || opts.IsOSR()); + assert(!compIsForInlining()); + // During importation, optionally flag this block as one that // contains calls requiring class profiling. Ideally perhaps // we'd just keep track of the calls themselves, so we don't @@ -21277,7 +21286,6 @@ void Compiler::impDevirtualizeCall(GenTreeCall* call, // compCurBB->bbFlags |= BBF_HAS_CLASS_PROFILE; } - return; } diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp index 43c4fc17a597f..50e5f858fe69d 100644 --- a/src/coreclr/vm/jitinterface.cpp +++ b/src/coreclr/vm/jitinterface.cpp @@ -12532,6 +12532,7 @@ CORJIT_FLAGS GetCompileFlags(MethodDesc * ftn, CORJIT_FLAGS flags, CORINFO_METHO // // * We're writing pgo data and we're jitting at Tier0. // * Tiered PGO is enabled and we're jitting at Tier0. + // * Tiered PGO is enabled and we are jitting an OSR method. // if ((CLRConfig::GetConfigValue(CLRConfig::INTERNAL_WritePGOData) > 0) && flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_TIER0)) @@ -12539,7 +12540,7 @@ CORJIT_FLAGS GetCompileFlags(MethodDesc * ftn, CORJIT_FLAGS flags, CORINFO_METHO flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR); } else if ((CLRConfig::GetConfigValue(CLRConfig::INTERNAL_TieredPGO) > 0) - && flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_TIER0)) + && (flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_TIER0) || flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_OSR))) { flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR); } diff --git a/src/coreclr/vm/pgo.cpp b/src/coreclr/vm/pgo.cpp index 189487c34750b..37ca221f31f1d 100644 --- a/src/coreclr/vm/pgo.cpp +++ b/src/coreclr/vm/pgo.cpp @@ -653,7 +653,7 @@ HRESULT PgoManager::allocPgoInstrumentationBySchemaInstance(MethodDesc* pMD, HeaderList *currentHeaderList = m_pgoHeaders; if (currentHeaderList != NULL) { - if (!ComparePgoSchemaEquals(currentHeaderList->header.GetData(), currentHeaderList->header.countsOffset, pSchema, countSchemaItems)) + if 
(!ComparePgoSchemaCompatible(currentHeaderList->header.GetData(), currentHeaderList->header.countsOffset, pSchema, countSchemaItems)) { return E_NOTIMPL; } @@ -683,7 +683,7 @@ HRESULT PgoManager::allocPgoInstrumentationBySchemaInstance(MethodDesc* pMD, HeaderList* existingData = laPgoManagerThis->m_pgoDataLookup.Lookup(pMD); if (existingData != NULL) { - if (!ComparePgoSchemaEquals(existingData->header.GetData(), existingData->header.countsOffset, pSchema, countSchemaItems)) + if (!ComparePgoSchemaCompatible(existingData->header.GetData(), existingData->header.countsOffset, pSchema, countSchemaItems)) { return E_NOTIMPL; } diff --git a/src/tests/Common/testenvironment.proj b/src/tests/Common/testenvironment.proj index 359db9e2b6d13..c8d6e0d31e59e 100644 --- a/src/tests/Common/testenvironment.proj +++ b/src/tests/Common/testenvironment.proj @@ -187,7 +187,11 @@ + + + + From 1f8800b5c938826c4fe283198f3882204a9fd052 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 11 Nov 2021 09:21:32 -0800 Subject: [PATCH 2/8] fix linux build issues --- src/coreclr/inc/pgo_formatprocessing.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/coreclr/inc/pgo_formatprocessing.h b/src/coreclr/inc/pgo_formatprocessing.h index 25cac1a94d21c..a94192f7c6603 100644 --- a/src/coreclr/inc/pgo_formatprocessing.h +++ b/src/coreclr/inc/pgo_formatprocessing.h @@ -367,14 +367,13 @@ bool ReadInstrumentationSchemaWithLayout(const uint8_t *pByte, size_t cbDataMax, // inline bool ComparePgoSchemaCompatible(const uint8_t *pByte, size_t cbDataMax, ICorJitInfo::PgoInstrumentationSchema* schemaTable, size_t cSchemas) { - size_t iSchema = 0; size_t nSchema = 0; size_t nMatched = 0; size_t nUnmatched = 0; size_t initialOffset = cbDataMax; ReadInstrumentationSchemaWithLayout(pByte, cbDataMax, initialOffset, - [schemaTable, cSchemas, &iSchema, &nSchema, &nMatched, &nUnmatched](const ICorJitInfo::PgoInstrumentationSchema& schema) + [schemaTable, &nSchema, &nMatched, &nUnmatched](const 
ICorJitInfo::PgoInstrumentationSchema& schema) { const size_t iSchemaAdj = nSchema - nUnmatched; From 44ed72b1155df7bf2d40841ddf62fe36496a6080 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 11 Nov 2021 12:07:16 -0800 Subject: [PATCH 3/8] really fix linux build --- src/coreclr/inc/pgo_formatprocessing.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/coreclr/inc/pgo_formatprocessing.h b/src/coreclr/inc/pgo_formatprocessing.h index a94192f7c6603..f2d84e3646477 100644 --- a/src/coreclr/inc/pgo_formatprocessing.h +++ b/src/coreclr/inc/pgo_formatprocessing.h @@ -372,8 +372,7 @@ inline bool ComparePgoSchemaCompatible(const uint8_t *pByte, size_t cbDataMax, I size_t nUnmatched = 0; size_t initialOffset = cbDataMax; - ReadInstrumentationSchemaWithLayout(pByte, cbDataMax, initialOffset, - [schemaTable, &nSchema, &nMatched, &nUnmatched](const ICorJitInfo::PgoInstrumentationSchema& schema) + auto handler = [schemaTable, &nSchema, &nMatched, &nUnmatched](const ICorJitInfo::PgoInstrumentationSchema& schema) { const size_t iSchemaAdj = nSchema - nUnmatched; @@ -393,7 +392,9 @@ inline bool ComparePgoSchemaCompatible(const uint8_t *pByte, size_t cbDataMax, I nSchema++; return true; - }); + }; + + ReadInstrumentationSchemaWithLayout(pByte, cbDataMax, initialOffset, handler); return (nMatched == cSchemas); } From e8dadc8f8d8a1f633a11733409516197241d9019 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 11 Nov 2021 18:28:33 -0800 Subject: [PATCH 4/8] review feedback --- src/coreclr/inc/pgo_formatprocessing.h | 14 ++++++-------- src/coreclr/vm/pgo.cpp | 4 ++-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/coreclr/inc/pgo_formatprocessing.h b/src/coreclr/inc/pgo_formatprocessing.h index f2d84e3646477..cfed6408ddc0a 100644 --- a/src/coreclr/inc/pgo_formatprocessing.h +++ b/src/coreclr/inc/pgo_formatprocessing.h @@ -365,7 +365,7 @@ bool ReadInstrumentationSchemaWithLayout(const uint8_t *pByte, size_t cbDataMax, // Return true if 
schemaTable entries are a subset of the schema described by pByte, with matching entries in the same order. // Also updates offset of the matching entries in schemaTable to those of the pByte schema. // -inline bool ComparePgoSchemaCompatible(const uint8_t *pByte, size_t cbDataMax, ICorJitInfo::PgoInstrumentationSchema* schemaTable, size_t cSchemas) +inline bool CheckIfPgoSchemaIsCompatibleAndSetOffsets(const uint8_t *pByte, size_t cbDataMax, ICorJitInfo::PgoInstrumentationSchema* schemaTable, size_t cSchemas) { size_t nSchema = 0; size_t nMatched = 0; @@ -374,18 +374,16 @@ inline bool ComparePgoSchemaCompatible(const uint8_t *pByte, size_t cbDataMax, I auto handler = [schemaTable, &nSchema, &nMatched, &nUnmatched](const ICorJitInfo::PgoInstrumentationSchema& schema) { - const size_t iSchemaAdj = nSchema - nUnmatched; - - if ((schema.InstrumentationKind != schemaTable[iSchemaAdj].InstrumentationKind) - || (schema.ILOffset != schemaTable[iSchemaAdj].ILOffset) - || (schema.Count != schemaTable[iSchemaAdj].Count) - || (schema.Other != schemaTable[iSchemaAdj].Other)) + if ((schema.InstrumentationKind != schemaTable[nMatched].InstrumentationKind) + || (schema.ILOffset != schemaTable[nMatched].ILOffset) + || (schema.Count != schemaTable[nMatched].Count) + || (schema.Other != schemaTable[nMatched].Other)) { nUnmatched++; } else { - schemaTable[iSchemaAdj].Offset = schema.Offset; + schemaTable[nMatched].Offset = schema.Offset; nMatched++; } diff --git a/src/coreclr/vm/pgo.cpp b/src/coreclr/vm/pgo.cpp index 37ca221f31f1d..c3f37a3226b98 100644 --- a/src/coreclr/vm/pgo.cpp +++ b/src/coreclr/vm/pgo.cpp @@ -653,7 +653,7 @@ HRESULT PgoManager::allocPgoInstrumentationBySchemaInstance(MethodDesc* pMD, HeaderList *currentHeaderList = m_pgoHeaders; if (currentHeaderList != NULL) { - if (!ComparePgoSchemaCompatible(currentHeaderList->header.GetData(), currentHeaderList->header.countsOffset, pSchema, countSchemaItems)) + if 
(!CheckIfPgoSchemaIsCompatibleAndSetOffsets(currentHeaderList->header.GetData(), currentHeaderList->header.countsOffset, pSchema, countSchemaItems)) { return E_NOTIMPL; } @@ -683,7 +683,7 @@ HRESULT PgoManager::allocPgoInstrumentationBySchemaInstance(MethodDesc* pMD, HeaderList* existingData = laPgoManagerThis->m_pgoDataLookup.Lookup(pMD); if (existingData != NULL) { - if (!ComparePgoSchemaCompatible(existingData->header.GetData(), existingData->header.countsOffset, pSchema, countSchemaItems)) + if (!CheckIfPgoSchemaIsCompatibleAndSetOffsets(existingData->header.GetData(), existingData->header.countsOffset, pSchema, countSchemaItems)) { return E_NOTIMPL; } From 432ca0f0641d35940069ba6b3a0140f387ee5ee8 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 12 Nov 2021 08:37:38 -0800 Subject: [PATCH 5/8] simplify compatibility checking --- src/coreclr/inc/pgo_formatprocessing.h | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/coreclr/inc/pgo_formatprocessing.h b/src/coreclr/inc/pgo_formatprocessing.h index cfed6408ddc0a..18fe5601c0d28 100644 --- a/src/coreclr/inc/pgo_formatprocessing.h +++ b/src/coreclr/inc/pgo_formatprocessing.h @@ -367,28 +367,21 @@ bool ReadInstrumentationSchemaWithLayout(const uint8_t *pByte, size_t cbDataMax, // inline bool CheckIfPgoSchemaIsCompatibleAndSetOffsets(const uint8_t *pByte, size_t cbDataMax, ICorJitInfo::PgoInstrumentationSchema* schemaTable, size_t cSchemas) { - size_t nSchema = 0; size_t nMatched = 0; - size_t nUnmatched = 0; size_t initialOffset = cbDataMax; - auto handler = [schemaTable, &nSchema, &nMatched, &nUnmatched](const ICorJitInfo::PgoInstrumentationSchema& schema) { - if ((schema.InstrumentationKind != schemaTable[nMatched].InstrumentationKind) - || (schema.ILOffset != schemaTable[nMatched].ILOffset) - || (schema.Count != schemaTable[nMatched].Count) - || (schema.Other != 
schemaTable[nMatched].Other)) - { - nUnmatched++; - } - else + if ((nMatched < cSchemas) + && (schema.InstrumentationKind == schemaTable[nMatched].InstrumentationKind) + && (schema.ILOffset == schemaTable[nMatched].ILOffset) + && (schema.Count == schemaTable[nMatched].Count) + && (schema.Other == schemaTable[nMatched].Other)) { schemaTable[nMatched].Offset = schema.Offset; nMatched++; } - nSchema++; - return true; }; From cf0d1b44d0e4b812bf88b797f70cb0e87de89298 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 12 Nov 2021 08:52:40 -0800 Subject: [PATCH 6/8] Fix phase ordering so partially jitted methods don't lose probes. --- src/coreclr/jit/compiler.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index d4f616892c927..f46ba737b7a0b 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -4571,6 +4571,10 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl // DoPhase(this, PHASE_IMPORTATION, &Compiler::fgImport); + // Expand any patchpoints + // + DoPhase(this, PHASE_PATCHPOINTS, &Compiler::fgTransformPatchpoints); + // If instrumenting, add block and class probes. // if (compileFlags->IsSet(JitFlags::JIT_FLAG_BBINSTR)) @@ -4582,10 +4586,6 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl // DoPhase(this, PHASE_INDXCALL, &Compiler::fgTransformIndirectCalls); - // Expand any patchpoints - // - DoPhase(this, PHASE_PATCHPOINTS, &Compiler::fgTransformPatchpoints); - // PostImportPhase: cleanup inlinees // auto postImportPhase = [this]() { From bc3a3b9abdeeb90be2daf17fac0c080260238796 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 12 Nov 2021 10:24:48 -0800 Subject: [PATCH 7/8] Fix issues with PGO and OSR methods with complex entry flow. 
--- src/coreclr/jit/fgopt.cpp | 14 +++++++++++++- src/coreclr/jit/fgprofile.cpp | 20 +++++++++++++++++--- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 59f326b0fc13a..939f835efe7c4 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -1553,7 +1553,18 @@ void Compiler::fgPostImportationCleanup() // auto addConditionalFlow = [this, entryStateVar, &entryJumpTarget](BasicBlock* fromBlock, BasicBlock* toBlock) { - fgSplitBlockAtBeginning(fromBlock); + + // We may have previously thought this try entry was unreachable, but now we're going to + // step through it on the way to the OSR entry. So ensure it has plausible profile weight. + // + if (fgHaveProfileData() && !fromBlock->hasProfileWeight()) + { + JITDUMP("Updating block weight for now-reachable try entry " FMT_BB " via " FMT_BB "\n", + fromBlock->bbNum, fgFirstBB->bbNum); + fromBlock->inheritWeight(fgFirstBB); + } + + BasicBlock* const newBlock = fgSplitBlockAtBeginning(fromBlock); fromBlock->bbFlags |= BBF_INTERNAL; GenTree* const entryStateLcl = gtNewLclvNode(entryStateVar, TYP_INT); @@ -1565,6 +1576,7 @@ void Compiler::fgPostImportationCleanup() fromBlock->bbJumpKind = BBJ_COND; fromBlock->bbJumpDest = toBlock; fgAddRefPred(toBlock, fromBlock); + newBlock->inheritWeight(fromBlock); entryJumpTarget = fromBlock; }; diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index 745edae0981ef..4d25d4112c0a5 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -1949,6 +1949,14 @@ void Compiler::fgIncorporateBlockCounts() fgSetProfileWeight(block, profileWeight); } } + + // For OSR, give the method entry (which will be a scratch BB) + // the same weight as the OSR Entry. 
+ // + if (opts.IsOSR()) + { + fgFirstBB->inheritWeight(fgOSREntryBB); + } } //------------------------------------------------------------------------ @@ -3314,11 +3322,17 @@ void Compiler::fgComputeCalledCount(weight_t returnWeight) BasicBlock* firstILBlock = fgFirstBB; // The first block for IL code (i.e. for the IL code at offset 0) - // Skip past any/all BBF_INTERNAL blocks that may have been added before the first real IL block. + // OSR methods can have complex entry flow, and so + // for OSR we ensure fgFirstBB has plausible profile data. // - while (firstILBlock->bbFlags & BBF_INTERNAL) + if (!opts.IsOSR()) { - firstILBlock = firstILBlock->bbNext; + // Skip past any/all BBF_INTERNAL blocks that may have been added before the first real IL block. + // + while (firstILBlock->bbFlags & BBF_INTERNAL) + { + firstILBlock = firstILBlock->bbNext; + } } // The 'firstILBlock' is now expected to have a profile-derived weight From 70f10ce44a158bdc26b6a96d1d15e1fd9188b757 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 12 Nov 2021 15:01:52 -0800 Subject: [PATCH 8/8] A few more fixes for partial compilation, because the number of things we think we might instrument and the number of things we end up instrumenting can differ. Also improve the DumpJittedMethod output for OSR, and allow selective dumping of a particular OSR variant by specifying its IL offset. 
--- src/coreclr/jit/compiler.cpp | 26 +++++++++++++++++++++++++- src/coreclr/jit/fgprofile.cpp | 12 +++++++++++- src/coreclr/jit/jitconfigvalues.h | 13 +++++++------ src/coreclr/jit/patchpoint.cpp | 9 +++++++++ 4 files changed, 52 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index f46ba737b7a0b..66067ca36ad41 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2899,6 +2899,18 @@ void Compiler::compInitOptions(JitFlags* jitFlags) verboseDump = (JitConfig.JitDumpTier0() > 0); } + // Optionally suppress dumping some OSR jit requests. + // + if (verboseDump && jitFlags->IsSet(JitFlags::JIT_FLAG_OSR)) + { + const int desiredOffset = JitConfig.JitDumpAtOSROffset(); + + if (desiredOffset != -1) + { + verboseDump = (((IL_OFFSET)desiredOffset) == info.compILEntry); + } + } + if (verboseDump) { verbose = true; @@ -6500,9 +6512,21 @@ int Compiler::compCompileHelper(CORINFO_MODULE_HANDLE classPtr, #ifdef DEBUG if ((JitConfig.DumpJittedMethods() == 1) && !compIsForInlining()) { + enum + { + BUFSIZE = 20 + }; + char osrBuffer[BUFSIZE] = {0}; + if (opts.IsOSR()) + { + // Tiering name already includes "OSR", we just want the IL offset + // + sprintf_s(osrBuffer, BUFSIZE, " @0x%x", info.compILEntry); + } + printf("Compiling %4d %s::%s, IL size = %u, hash=0x%08x %s%s%s\n", Compiler::jitTotalMethodCompiled, info.compClassName, info.compMethodName, info.compILCodeSize, info.compMethodHash(), - compGetTieringName(), opts.IsOSR() ? " OSR" : "", compGetStressMessage()); + compGetTieringName(), osrBuffer, compGetStressMessage()); } if (compIsForInlining()) { diff --git a/src/coreclr/jit/fgprofile.cpp b/src/coreclr/jit/fgprofile.cpp index 4d25d4112c0a5..abb745ddba930 100644 --- a/src/coreclr/jit/fgprofile.cpp +++ b/src/coreclr/jit/fgprofile.cpp @@ -1685,7 +1685,17 @@ PhaseStatus Compiler::fgInstrumentMethod() // Verify we created schema for the calls needing class probes. 
// (we counted those when importing) // - assert(fgClassInstrumentor->SchemaCount() == info.compClassProbeCount); + // This is not true when we do partial compilation; it can/will erase class probes, + // and there's no easy way to figure out how many should be left. + // + if (doesMethodHavePartialCompilationPatchpoints()) + { + assert(fgClassInstrumentor->SchemaCount() <= info.compClassProbeCount); + } + else + { + assert(fgClassInstrumentor->SchemaCount() == info.compClassProbeCount); + } // Optionally, when jitting, if there were no class probes and only one count probe, // suppress instrumentation. diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 13873e28f89e9..658ba2a69186a 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -171,12 +171,13 @@ CONFIG_INTEGER(TreesBeforeAfterMorph, W("JitDumpBeforeAfterMorph"), 0) // If 1, CONFIG_METHODSET(JitBreak, W("JitBreak")) // Stops in the importer when compiling a specified method CONFIG_METHODSET(JitDebugBreak, W("JitDebugBreak")) -CONFIG_METHODSET(JitDisasm, W("JitDisasm")) // Dumps disassembly for specified method -CONFIG_STRING(JitDisasmAssemblies, W("JitDisasmAssemblies")) // Only show JitDisasm and related info for methods - // from this semicolon-delimited list of assemblies. -CONFIG_INTEGER(JitDisasmWithGC, W("JitDisasmWithGC"), 0) // Dump interleaved GC Info for any method disassembled. -CONFIG_METHODSET(JitDump, W("JitDump")) // Dumps trees for specified method -CONFIG_INTEGER(JitDumpTier0, W("JitDumpTier0"), 1) // Dump tier0 requests +CONFIG_METHODSET(JitDisasm, W("JitDisasm")) // Dumps disassembly for specified method +CONFIG_STRING(JitDisasmAssemblies, W("JitDisasmAssemblies")) // Only show JitDisasm and related info for methods + // from this semicolon-delimited list of assemblies. +CONFIG_INTEGER(JitDisasmWithGC, W("JitDisasmWithGC"), 0) // Dump interleaved GC Info for any method disassembled. 
+CONFIG_METHODSET(JitDump, W("JitDump")) // Dumps trees for specified method +CONFIG_INTEGER(JitDumpTier0, W("JitDumpTier0"), 1) // Dump tier0 requests +CONFIG_INTEGER(JitDumpAtOSROffset, W("JitDumpAtOSROffset"), -1) // Only dump OSR requests for this offset CONFIG_INTEGER(JitDumpInlinePhases, W("JitDumpInlinePhases"), 1) // Dump inline compiler phases CONFIG_METHODSET(JitEHDump, W("JitEHDump")) // Dump the EH table for the method, as reported to the VM CONFIG_METHODSET(JitExclude, W("JitExclude")) diff --git a/src/coreclr/jit/patchpoint.cpp b/src/coreclr/jit/patchpoint.cpp index 60308075db780..48d78babce1df 100644 --- a/src/coreclr/jit/patchpoint.cpp +++ b/src/coreclr/jit/patchpoint.cpp @@ -248,6 +248,15 @@ class PatchpointTransformer compiler->gtNewHelperCallNode(CORINFO_HELP_PARTIAL_COMPILATION_PATCHPOINT, TYP_VOID, helperArgs); compiler->fgNewStmtAtEnd(block, helperCall); + + // This block will no longer have class probes. + // (They will appear in the OSR variant). + // + if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) != 0) + { + JITDUMP("No longer adding class probes to " FMT_BB "\n", block->bbNum); + block->bbFlags &= ~BBF_HAS_CLASS_PROFILE; + } } };