Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update how OSR and PGO interact #61453

Merged
merged 8 commits into from
Nov 13, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions eng/pipelines/common/templates/runtimes/run-test-job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,10 @@ jobs:
scenarios:
- jitosr
- jitosr_stress
- jitosr_pgo
- jitpartialcompilation
- jitpartialcompilation_osr
- jitpartialcompilation_osr_pgo
- jitobjectstackallocation
${{ if in(parameters.testGroup, 'ilasm') }}:
scenarios:
Expand Down
38 changes: 38 additions & 0 deletions src/coreclr/inc/pgo_formatprocessing.h
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,44 @@ bool ReadInstrumentationSchemaWithLayout(const uint8_t *pByte, size_t cbDataMax,
});
}


// Return true if schemaTable entries are a subset of the schema described by pByte, with matching entries in the same order.
// Also updates offset of the matching entries in schemaTable to those of the pByte schema.
//
inline bool ComparePgoSchemaCompatible(const uint8_t *pByte, size_t cbDataMax, ICorJitInfo::PgoInstrumentationSchema* schemaTable, size_t cSchemas)
Copy link
Member

@jakobbotsch jakobbotsch Nov 11, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was a little surprised that this mutates the schema. Maybe rename it to something like SetOffsetsForSchemaSubsequence or similar?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about CheckIfPgoSchemaIsCompatibleAndSetOffsets?

{
size_t nSchema = 0;
size_t nMatched = 0;
size_t nUnmatched = 0;
size_t initialOffset = cbDataMax;

auto handler = [schemaTable, &nSchema, &nMatched, &nUnmatched](const ICorJitInfo::PgoInstrumentationSchema& schema)
{
const size_t iSchemaAdj = nSchema - nUnmatched;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this the same as nMatched?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And thinking a little bit more about it, that means this might index off the end of schemaTable.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is, yeah.


if ((schema.InstrumentationKind != schemaTable[iSchemaAdj].InstrumentationKind)
|| (schema.ILOffset != schemaTable[iSchemaAdj].ILOffset)
|| (schema.Count != schemaTable[iSchemaAdj].Count)
|| (schema.Other != schemaTable[iSchemaAdj].Other))
{
nUnmatched++;
}
else
{
schemaTable[iSchemaAdj].Offset = schema.Offset;
nMatched++;
}

nSchema++;

return true;
};

ReadInstrumentationSchemaWithLayout(pByte, cbDataMax, initialOffset, handler);

return (nMatched == cSchemas);
}

inline bool ReadInstrumentationSchemaWithLayoutIntoSArray(const uint8_t *pByte, size_t cbDataMax, size_t initialOffset, SArray<ICorJitInfo::PgoInstrumentationSchema>* pSchemas)
{
auto lambda = [pSchemas](const ICorJitInfo::PgoInstrumentationSchema &schema)
Expand Down
81 changes: 55 additions & 26 deletions src/coreclr/jit/fgprofile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,10 +300,10 @@ class Instrumentor
virtual void BuildSchemaElements(BasicBlock* block, Schema& schema)
{
}
virtual void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
virtual void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
{
}
virtual void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory)
virtual void InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory)
{
}
virtual void SuppressProbes()
Expand Down Expand Up @@ -349,8 +349,8 @@ class BlockCountInstrumentor : public Instrumentor
}
void Prepare(bool isPreImport) override;
void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory) override;
void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override;
void InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory) override;
};

//------------------------------------------------------------------------
Expand Down Expand Up @@ -428,7 +428,7 @@ void BlockCountInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& sche
// schema -- instrumentation schema
// profileMemory -- profile data slab
//
void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
{
const ICorJitInfo::PgoInstrumentationSchema& entry = schema[block->bbCountSchemaIndex];

Expand Down Expand Up @@ -464,7 +464,7 @@ void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE*
// Notes:
// When prejitting, add the method entry callback node
//
void BlockCountInstrumentor::InstrumentMethodEntry(Schema& schema, BYTE* profileMemory)
void BlockCountInstrumentor::InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory)
{
Compiler::Options& opts = m_comp->opts;
Compiler::Info& info = m_comp->info;
Expand Down Expand Up @@ -982,7 +982,7 @@ class EfficientEdgeCountInstrumentor : public Instrumentor, public SpanningTreeV
return ((block->bbFlags & BBF_IMPORTED) == BBF_IMPORTED);
}
void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override;

void Badcode() override
{
Expand Down Expand Up @@ -1132,7 +1132,7 @@ void EfficientEdgeCountInstrumentor::BuildSchemaElements(BasicBlock* block, Sche
// schema -- instrumentation schema
// profileMemory -- profile data slab
//
void EfficientEdgeCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
void EfficientEdgeCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
{
// Inlinee compilers build their blocks in the root compiler's
// graph. So for NumSucc, we use the root compiler instance.
Expand Down Expand Up @@ -1307,12 +1307,12 @@ class BuildClassProbeSchemaGen
class ClassProbeInserter
{
Schema& m_schema;
BYTE* m_profileMemory;
uint8_t* m_profileMemory;
int* m_currentSchemaIndex;
unsigned& m_instrCount;

public:
ClassProbeInserter(Schema& schema, BYTE* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount)
ClassProbeInserter(Schema& schema, uint8_t* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount)
: m_schema(schema)
, m_profileMemory(profileMemory)
, m_currentSchemaIndex(pCurrentSchemaIndex)
Expand Down Expand Up @@ -1349,7 +1349,7 @@ class ClassProbeInserter

// Figure out where the table is located.
//
BYTE* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory;
uint8_t* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory;
*m_currentSchemaIndex += 2; // There are 2 schema entries per class probe

// Grab a temp to hold the 'this' object as it will be used three times
Expand Down Expand Up @@ -1426,7 +1426,7 @@ class ClassProbeInstrumentor : public Instrumentor
}
void Prepare(bool isPreImport) override;
void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override;
void SuppressProbes() override;
};

Expand Down Expand Up @@ -1490,7 +1490,7 @@ void ClassProbeInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& sche
// schema -- instrumentation schema
// profileMemory -- profile data slab
//
void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
{
if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
{
Expand Down Expand Up @@ -1563,21 +1563,43 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod()
// Choose instrumentation technology.
//
// We enable edge profiling by default, except when:
//
// * disabled by option
// * we are prejitting
// * we are jitting osr methods
// * we are jitting tier0 methods with patchpoints
// * we are jitting an OSR method
//
// Currently, OSR is incompatible with edge profiling. So if OSR is enabled,
// always do block profiling.
// OSR is incompatible with edge profiling. Only portions of the Tier0
// method will be executed, and the bail-outs at patchpoints won't be obvious
// exit points from the method. So for OSR we always do block profiling.
//
// Note this incompatibility only exists for methods that actually have
// patchpoints, but we won't know that until we import.
// patchpoints. Currently we will only place patchpoints in methods with
// backwards jumps.
//
// And because we want the Tier1 method to see the full set of profile data,
// when OSR is enabled, both Tier0 and any OSR methods need to contribute to
// the same profile data set. Since Tier0 has laid down a dense block-based
// schema, the OSR methods must use this schema as well.
//
// Note that OSR methods may also inline. We currently won't instrument
// any inlinee contributions (which would also need to carefully "share"
// the profile data segment with any Tier0 version and/or any other equivalent
// inlinee), so we'll lose a bit of their profile data. We can support this
// eventually if it turns out to matter.
//
// Similar issues arise with partially jitted methods. Because we currently
// only defer jitting for throw blocks, we currently ignore the impact of partial
// jitting on PGO. If we ever implement a broader pattern of deferral -- say deferring
// based on static PGO -- we will need to reconsider.
//
CLANG_FORMAT_COMMENT_ANCHOR;

const bool prejit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT);
const bool osr = (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) && (JitConfig.TC_OnStackReplacement() > 0));
const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !osr;
const bool prejit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT);
const bool tier0WithPatchpoints = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) &&
(JitConfig.TC_OnStackReplacement() > 0) && compHasBackwardJump;
const bool osrMethod = opts.IsOSR();
const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !tier0WithPatchpoints && !osrMethod;

if (useEdgeProfiles)
{
Expand All @@ -1586,7 +1608,9 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod()
else
{
JITDUMP("Using block profiling, because %s\n",
(JitConfig.JitEdgeProfiling() > 0) ? "edge profiles disabled" : prejit ? "prejitting" : "OSR");
(JitConfig.JitEdgeProfiling() > 0)
? "edge profiles disabled"
: prejit ? "prejitting" : osrMethod ? "OSR" : "tier0 with patchpoints");

fgCountInstrumentor = new (this, CMK_Pgo) BlockCountInstrumentor(this);
}
Expand Down Expand Up @@ -1636,7 +1660,7 @@ PhaseStatus Compiler::fgInstrumentMethod()
{
noway_assert(!compIsForInlining());

// Make post-importpreparations.
// Make post-import preparations.
//
const bool isPreImport = false;
fgCountInstrumentor->Prepare(isPreImport);
Expand Down Expand Up @@ -1694,11 +1718,16 @@ PhaseStatus Compiler::fgInstrumentMethod()

assert(schema.size() > 0);

// Allocate the profile buffer
// Allocate/retrieve the profile buffer.
//
BYTE* profileMemory;

HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(),
// If this is an OSR method, we should use the same buffer that the Tier0 method used.
//
// This is supported by allocPgoInstrumentationBySchema, which will verify the schema
// we provide here matches the one from Tier0, and will fill in the data offsets in
// our schema properly.
//
uint8_t* profileMemory;
HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(),
(UINT32)schema.size(), &profileMemory);

// Deal with allocation failures.
Expand Down
14 changes: 11 additions & 3 deletions src/coreclr/jit/importer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21244,10 +21244,19 @@ void Compiler::impDevirtualizeCall(GenTreeCall* call,
//
assert(call->IsVirtual());

// Possibly instrument, if not optimizing.
// Possibly instrument. Note for OSR+PGO we will instrument when
// optimizing and (currently) won't devirtualize. We may want
// to revisit -- if we can devirtualize we should be able to
// suppress the probe.
//
if (opts.OptimizationDisabled())
// We strip BBINSTR from inlinees currently, so we'll only
// do this for the root method calls.
//
if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_BBINSTR))
{
assert(opts.OptimizationDisabled() || opts.IsOSR());
assert(!compIsForInlining());

// During importation, optionally flag this block as one that
// contains calls requiring class profiling. Ideally perhaps
// we'd just keep track of the calls themselves, so we don't
Expand Down Expand Up @@ -21277,7 +21286,6 @@ void Compiler::impDevirtualizeCall(GenTreeCall* call,
//
compCurBB->bbFlags |= BBF_HAS_CLASS_PROFILE;
}

return;
}

Expand Down
3 changes: 2 additions & 1 deletion src/coreclr/vm/jitinterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12532,14 +12532,15 @@ CORJIT_FLAGS GetCompileFlags(MethodDesc * ftn, CORJIT_FLAGS flags, CORINFO_METHO
//
// * We're writing pgo data and we're jitting at Tier0.
// * Tiered PGO is enabled and we're jitting at Tier0.
// * Tiered PGO is enabled and we are jitting an OSR method.
//
if ((CLRConfig::GetConfigValue(CLRConfig::INTERNAL_WritePGOData) > 0)
&& flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_TIER0))
{
flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
}
else if ((CLRConfig::GetConfigValue(CLRConfig::INTERNAL_TieredPGO) > 0)
&& flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_TIER0))
&& (flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_TIER0) || flags.IsSet(CORJIT_FLAGS::CORJIT_FLAG_OSR)))
{
flags.Set(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR);
}
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/vm/pgo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -653,7 +653,7 @@ HRESULT PgoManager::allocPgoInstrumentationBySchemaInstance(MethodDesc* pMD,
HeaderList *currentHeaderList = m_pgoHeaders;
if (currentHeaderList != NULL)
{
if (!ComparePgoSchemaEquals(currentHeaderList->header.GetData(), currentHeaderList->header.countsOffset, pSchema, countSchemaItems))
if (!ComparePgoSchemaCompatible(currentHeaderList->header.GetData(), currentHeaderList->header.countsOffset, pSchema, countSchemaItems))
{
return E_NOTIMPL;
}
Expand Down Expand Up @@ -683,7 +683,7 @@ HRESULT PgoManager::allocPgoInstrumentationBySchemaInstance(MethodDesc* pMD,
HeaderList* existingData = laPgoManagerThis->m_pgoDataLookup.Lookup(pMD);
if (existingData != NULL)
{
if (!ComparePgoSchemaEquals(existingData->header.GetData(), existingData->header.countsOffset, pSchema, countSchemaItems))
if (!ComparePgoSchemaCompatible(existingData->header.GetData(), existingData->header.countsOffset, pSchema, countSchemaItems))
{
return E_NOTIMPL;
}
Expand Down
4 changes: 4 additions & 0 deletions src/tests/Common/testenvironment.proj
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,11 @@
<TestEnvironment Include="gcstress0xc_jitminopts_heapverify1" GCStress="0xC" JITMinOpts="1" HeapVerify="1" />
<TestEnvironment Include="jitosr" TC_OnStackReplacement="1" TC_QuickJitForLoops="1" TieredCompilation="1" />
<TestEnvironment Include="jitosr_stress" TC_OnStackReplacement="1" TC_QuickJitForLoops="1" TC_OnStackReplacement_InitialCounter="1" OSR_HitLimit="1" TieredCompilation="1" />
<TestEnvironment Include="jitosr_pgo" TC_OnStackReplacement="1" TC_QuickJitForLoops="1" TieredCompilation="1" TieredPGO="1" />
<TestEnvironment Include="jitpartialcompilation" TC_PartialCompilation="1" TC_QuickJitForLoops="1" TieredCompilation="1" />
<TestEnvironment Include="jitpartialcompilation_pgo" TC_PartialCompilation="1" TC_QuickJitForLoops="1" TieredCompilation="1" TieredPGO="1" />
<TestEnvironment Include="jitpartialcompilation_osr" TC_OnStackReplacement="1" TC_PartialCompilation="1" TC_QuickJitForLoops="1" TieredCompilation="1" />
<TestEnvironment Include="jitpartialcompilation_osr_pgo" TC_OnStackReplacement="1" TC_PartialCompilation="1" TC_QuickJitForLoops="1" TieredCompilation="1" TieredPGO="1" />
<TestEnvironment Include="jitobjectstackallocation" JitObjectStackAllocation="1" TieredCompilation="0" />
<TestEnvironment Include="ilasmroundtrip" RunningIlasmRoundTrip="1" />
<TestEnvironment Include="clrinterpreter" TieredCompilation="1" />
Expand Down