Skip to content

Commit

Permalink
Update spin-wait pause/yield normalization (#55295)
Browse files Browse the repository at this point in the history
Update spin-wait pause/yield normalization

- Modified the measurement to use much less time and to remeasure periodically to reduce CPU usage during startup
- Each measurement does a low-microsecond-level measurement of pause/yield times
- A small history of recent measurements is retained and, for now, the lowest measurement in that history is used for normalization
- Measurements are done lazily, and at most every few seconds another measurement is taken
- Added a profiling event that includes info about a measurement and the established value from recent measurements that is used for normalization
  • Loading branch information
kouvel committed Jul 13, 2021
1 parent b2a670b commit 4ff3762
Show file tree
Hide file tree
Showing 15 changed files with 412 additions and 138 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -322,33 +322,14 @@ public void DisableComObjectEagerCleanup()
[MethodImpl(MethodImplOptions.InternalCall)]
public extern bool Join(int millisecondsTimeout);

private static int s_optimalMaxSpinWaitsPerSpinIteration;

[DllImport(RuntimeHelpers.QCall)]
private static extern int GetOptimalMaxSpinWaitsPerSpinIterationInternal();

/// <summary>
/// Max value to be passed into <see cref="SpinWait(int)"/> for optimal delaying. This value is normalized to be
/// appropriate for the processor.
/// </summary>
internal static int OptimalMaxSpinWaitsPerSpinIteration
{
get
{
int optimalMaxSpinWaitsPerSpinIteration = s_optimalMaxSpinWaitsPerSpinIteration;
return optimalMaxSpinWaitsPerSpinIteration != 0 ? optimalMaxSpinWaitsPerSpinIteration : CalculateOptimalMaxSpinWaitsPerSpinIteration();
}
}

[MethodImpl(MethodImplOptions.NoInlining)]
private static int CalculateOptimalMaxSpinWaitsPerSpinIteration()
{
// This is done lazily because the first call to the function below in the process triggers a measurement that
// takes a nontrivial amount of time if the measurement has not already been done in the background.
// See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
s_optimalMaxSpinWaitsPerSpinIteration = GetOptimalMaxSpinWaitsPerSpinIterationInternal();
Debug.Assert(s_optimalMaxSpinWaitsPerSpinIteration > 0);
return s_optimalMaxSpinWaitsPerSpinIteration;
[MethodImpl(MethodImplOptions.InternalCall)]
get;
}

[MethodImpl(MethodImplOptions.InternalCall)]
Expand Down
120 changes: 97 additions & 23 deletions src/coreclr/inc/yieldprocessornormalized.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,59 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
#endif
#define YieldProcessor Dont_Use_YieldProcessor

const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
#define DISABLE_COPY(T) \
T(const T &) = delete; \
T &operator =(const T &) = delete

extern unsigned int g_yieldsPerNormalizedYield;
extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
#define DISABLE_CONSTRUCT_COPY(T) \
T() = delete; \
DISABLE_COPY(T)

void InitializeYieldProcessorNormalizedCrst();
void EnsureYieldProcessorNormalizedInitialized();
// Static-only holder for the state and constants used to normalize YieldProcessor() spin-waits. The class cannot be
// instantiated or copied (see DISABLE_CONSTRUCT_COPY below); all members are static.
class YieldProcessorNormalization
{
public:
// Targets used for normalization. The "Ns" in the names suggests the values are in nanoseconds - the units are not
// otherwise established in this header.
static const unsigned int TargetNsPerNormalizedYield = 37;
static const unsigned int TargetMaxNsPerSpinIteration = 272;

// These are maximums for the computed values for normalization, based on their calculation.
// MaxYieldsPerNormalizedYield (370) is used by the YieldProcessorNormalized*() functions as the divisor of UINT_MAX to
// clamp the requested count before multiplying, so that the multiply cannot overflow when SIZE_T is small.
static const unsigned int MaxYieldsPerNormalizedYield = TargetNsPerNormalizedYield * 10;
// With integer arithmetic this evaluates to 816 / 74 + 1 == 12. YieldProcessorWithBackOffNormalized() static-asserts its
// MaxShift against this value, so that shift must be revisited if either target constant changes.
static const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
TargetMaxNsPerSpinIteration * 3 / (TargetNsPerNormalizedYield * 2) + 1;

private:
// Exposed read-only through IsMeasurementScheduled(). Where it is set and cleared is not visible in this header.
static bool s_isMeasurementScheduled;

// Current normalization values. They are read directly by the friends declared at the bottom of the class, and are
// defined (with their pre-measurement defaults) in utilcode/yieldprocessornormalized.cpp.
static unsigned int s_yieldsPerNormalizedYield;
static unsigned int s_optimalMaxNormalizedYieldsPerSpinIteration;

public:
// Returns the current value of s_isMeasurementScheduled.
static bool IsMeasurementScheduled()
{
return s_isMeasurementScheduled;
}

// Declared here, defined elsewhere. NOTE(review): presumably takes one pause/yield measurement and updates the values
// above - confirm against the definition.
static void PerformMeasurement();

private:
// Private, but invoked from the YieldProcessorNormalizationInfo constructor (a friend), so constructing a
// YieldProcessorNormalizationInfo is what gives this function a chance to run.
static void ScheduleMeasurementIfNecessary();

public:
// Returns the current value of s_optimalMaxNormalizedYieldsPerSpinIteration. This is the value handed to managed code by
// ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration.
static unsigned int GetOptimalMaxNormalizedYieldsPerSpinIteration()
{
return s_optimalMaxNormalizedYieldsPerSpinIteration;
}

// Called from EtwCallbackCommon (when the EE is started and not shut down) to emit the YieldProcessor measured values at
// the beginning of a trace.
static void FireMeasurementEvents();

private:
// Helpers for reading and writing a double through a pointer. Only the declarations are visible here; the mechanism
// that makes the access atomic is in the definitions.
static double AtomicLoad(double *valueRef);
static void AtomicStore(double *valueRef, double value);

// Deletes the default constructor, copy constructor, and copy assignment operator
DISABLE_CONSTRUCT_COPY(YieldProcessorNormalization);

// YieldProcessorNormalizationInfo snapshots the private static values in its constructor and calls
// ScheduleMeasurementIfNecessary(). The single-argument YieldProcessorNormalizedForPreSkylakeCount() reads
// s_yieldsPerNormalizedYield directly instead of going through YieldProcessorNormalizationInfo.
friend class YieldProcessorNormalizationInfo;
friend void YieldProcessorNormalizedForPreSkylakeCount(unsigned int);
};

class YieldProcessorNormalizationInfo
{
Expand All @@ -30,12 +75,15 @@ class YieldProcessorNormalizationInfo

public:
YieldProcessorNormalizationInfo()
: yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
: yieldsPerNormalizedYield(YieldProcessorNormalization::s_yieldsPerNormalizedYield),
optimalMaxNormalizedYieldsPerSpinIteration(YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIteration),
optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
{
YieldProcessorNormalization::ScheduleMeasurementIfNecessary();
}

DISABLE_COPY(YieldProcessorNormalizationInfo);

friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
Expand Down Expand Up @@ -98,9 +146,8 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo

if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
// is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (count > MaxCount)
{
count = MaxCount;
Expand Down Expand Up @@ -144,9 +191,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(

if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
// is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
preSkylakeCount = MaxCount;
Expand Down Expand Up @@ -175,7 +221,35 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
// }
FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
{
YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount);
// This function does not forward to the one above because it is used by some code under utilcode, where
// YieldProcessorNormalizationInfo cannot be used since normalization does not happen in some of its consumers. So this
// version uses the fields in YieldProcessorNormalization directly.

_ASSERTE(preSkylakeCount != 0);

if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
preSkylakeCount = MaxCount;
}
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
SIZE_T n =
(SIZE_T)preSkylakeCount *
YieldProcessorNormalization::s_yieldsPerNormalizedYield /
PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
{
n = 1;
}
do
{
System_YieldProcessor();
} while (--n != 0);
}

// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
Expand All @@ -193,15 +267,12 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
const YieldProcessorNormalizationInfo &normalizationInfo,
unsigned int spinIteration)
{
// normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
// InitializeYieldProcessorNormalized()
const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
_ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);

// This shift value should be adjusted based on the asserted condition below
// This shift value should be adjusted based on the asserted conditions below
const UINT8 MaxShift = 3;
static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
static_assert_no_msg(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
static_assert_no_msg(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);

unsigned int n;
if (spinIteration <= MaxShift &&
Expand All @@ -219,3 +290,6 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
System_YieldProcessor();
} while (--n != 0);
}

#undef DISABLE_CONSTRUCT_COPY
#undef DISABLE_COPY
16 changes: 12 additions & 4 deletions src/coreclr/utilcode/yieldprocessornormalized.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,16 @@
// The .NET Foundation licenses this file to you under the MIT license.

#include "stdafx.h"
#include "yieldprocessornormalized.h"

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
bool YieldProcessorNormalization::s_isMeasurementScheduled;

// Defaults are for when normalization has not yet been done
unsigned int YieldProcessorNormalization::s_yieldsPerNormalizedYield = 1;
unsigned int YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIteration =
(unsigned int)
(
(double)YieldProcessorNormalization::TargetMaxNsPerSpinIteration /
YieldProcessorNormalization::TargetNsPerNormalizedYield +
0.5
);
2 changes: 1 addition & 1 deletion src/coreclr/vm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ set(VM_SOURCES_DAC_AND_WKS_COMMON
versionresilienthashcode.cpp
virtualcallstub.cpp
win32threadpool.cpp
yieldprocessornormalized.cpp
zapsig.cpp
)

Expand Down Expand Up @@ -389,6 +388,7 @@ set(VM_SOURCES_WKS
threadsuspend.cpp
typeparse.cpp
weakreferencenative.cpp
yieldprocessornormalized.cpp
${VM_SOURCES_GDBJIT}
)

Expand Down
27 changes: 26 additions & 1 deletion src/coreclr/vm/ClrEtwAll.man
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,13 @@
<opcode name="Profiler" message="$(string.RuntimePublisher.ProfilerOpcodeMessage)" symbol="CLR_PROFILER_OPCODE" value="11"/>
</opcodes>
</task>
<!--Next available ID is 37-->
<task name="YieldProcessorMeasurement" symbol="CLR_YIELD_PROCESSOR_MEASUREMENT_TASK"
value="37" eventGUID="{B4AFC324-DECE-4B02-86DC-AAB8F22BC1B1}"
message="$(string.RuntimePublisher.YieldProcessorMeasurementTaskMessage)">
<opcodes>
</opcodes>
</task>
<!--Next available ID is 38-->
</tasks>
<!--Maps-->
<maps>
Expand Down Expand Up @@ -2916,6 +2922,19 @@
</Settings>
</UserData>
</template>

<template tid="YieldProcessorMeasurement">
<data name="ClrInstanceID" inType="win:UInt16"/>
<data name="NsPerYield" inType="win:Double"/>
<data name="EstablishedNsPerYield" inType="win:Double"/>
<UserData>
<Settings xmlns="myNs">
<ClrInstanceID> %1 </ClrInstanceID>
<NsPerYield> %2 </NsPerYield>
<EstablishedNsPerYield> %3 </EstablishedNsPerYield>
</Settings>
</UserData>
</template>
</templates>

<events>
Expand Down Expand Up @@ -3313,6 +3332,10 @@
keywords ="ThreadingKeyword" opcode="Wait"
task="ThreadPoolWorkerThread"
symbol="ThreadPoolWorkerThreadWait" message="$(string.RuntimePublisher.ThreadPoolWorkerThreadEventMessage)"/>

<event value="58" version="0" level="win:Informational" template="YieldProcessorMeasurement"
keywords="ThreadingKeyword" task="YieldProcessorMeasurement" opcode="win:Info"
symbol="YieldProcessorMeasurement" message="$(string.RuntimePublisher.YieldProcessorMeasurementEventMessage)"/>

<!-- CLR private ThreadPool events -->
<event value="60" version="0" level="win:Verbose" template="ThreadPoolWorkingThreadCount"
Expand Down Expand Up @@ -8049,6 +8072,7 @@
<string id="RuntimePublisher.WorkerThreadRetirementRetireThreadEventMessage" value="WorkerThreadCount=%1;%nRetiredWorkerThreads=%2" />
<string id="RuntimePublisher.WorkerThreadRetirementUnretireThreadEventMessage" value="WorkerThreadCount=%1;%nRetiredWorkerThreads=%2" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadEventMessage" value="WorkerThreadCount=%1;%nRetiredWorkerThreadCount=%2;%nClrInstanceID=%3" />
<string id="RuntimePublisher.YieldProcessorMeasurementEventMessage" value="ClrInstanceID=%1;%nNsPerYield=%2;%nEstablishedNsPerYield=%3" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadAdjustmentSampleEventMessage" value="Throughput=%1;%nClrInstanceID=%2" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadAdjustmentAdjustmentEventMessage" value="AverageThroughput=%1;%nNewWorkerThreadCount=%2;%nReason=%3;%nClrInstanceID=%4" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadAdjustmentStatsEventMessage" value="Duration=%1;%nThroughput=%2;%nThreadWave=%3;%nThroughputWave=%4;%nThroughputErrorEstimate=%5;%nAverageThroughputErrorEstimate=%6;%nThroughputRatio=%7;%nConfidence=%8;%nNewControlSetting=%9;%nNewThreadWaveMagnitude=%10;%nClrInstanceID=%11" />
Expand Down Expand Up @@ -8334,6 +8358,7 @@
<string id="RuntimePublisher.JitInstrumentationDataTaskMessage" value="JitInstrumentationData" />
<string id="RuntimePublisher.ExecutionCheckpointTaskMessage" value="ExecutionCheckpoint" />
<string id="RuntimePublisher.ProfilerTaskMessage" value="Profiler" />
<string id="RuntimePublisher.YieldProcessorMeasurementTaskMessage" value="YieldProcessorMeasurement" />

<string id="RundownPublisher.EEStartupTaskMessage" value="Runtime" />
<string id="RundownPublisher.MethodTaskMessage" value="Method" />
Expand Down
8 changes: 5 additions & 3 deletions src/coreclr/vm/ClrEtwAllMeta.lst
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,9 @@ nomac:GarbageCollection:::GCJoin_V2

nostack:Type:::BulkType

###################
# Threadpool events
###################
#################################
# Threading and Threadpool events
#################################
nomac:WorkerThreadCreation:::WorkerThreadCreate
noclrinstanceid:WorkerThreadCreation:::WorkerThreadCreate
nomac:WorkerThreadCreation:::WorkerThreadTerminate
Expand Down Expand Up @@ -170,6 +170,8 @@ nomac:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentSample
nostack:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentSample
nomac:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentAdjustment
nostack:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentAdjustment
nomac:YieldProcessorMeasurement:::YieldProcessorMeasurement
nostack:YieldProcessorMeasurement:::YieldProcessorMeasurement

##################
# Exception events
Expand Down
17 changes: 4 additions & 13 deletions src/coreclr/vm/comsynchronizable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1089,22 +1089,13 @@ FCIMPL1(void, ThreadNative::SetIsThreadpoolThread, ThreadBaseObject* thread)
}
FCIMPLEND

INT32 QCALLTYPE ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration()
FCIMPL0(INT32, ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
{
QCALL_CONTRACT;

INT32 optimalMaxNormalizedYieldsPerSpinIteration;

BEGIN_QCALL;

// RuntimeThread calls this function only once lazily and caches the result, so ensure initialization
EnsureYieldProcessorNormalizedInitialized();
optimalMaxNormalizedYieldsPerSpinIteration = g_optimalMaxNormalizedYieldsPerSpinIteration;

END_QCALL;
FCALL_CONTRACT;

return optimalMaxNormalizedYieldsPerSpinIteration;
return (INT32)YieldProcessorNormalization::GetOptimalMaxNormalizedYieldsPerSpinIteration();
}
FCIMPLEND

FCIMPL1(void, ThreadNative::SpinWait, int iterations)
{
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/vm/comsynchronizable.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ friend class ThreadBaseObject;
UINT64 QCALLTYPE GetProcessDefaultStackSize();

static FCDECL1(INT32, GetManagedThreadId, ThreadBaseObject* th);
static INT32 QCALLTYPE GetOptimalMaxSpinWaitsPerSpinIteration();
static FCDECL0(INT32, GetOptimalMaxSpinWaitsPerSpinIteration);
static FCDECL1(void, SpinWait, int iterations);
static BOOL QCALLTYPE YieldThread();
static FCDECL0(Object*, GetCurrentThread);
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/vm/ecalllist.h
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,7 @@ FCFuncStart(gThreadFuncs)
#endif // FEATURE_COMINTEROP
FCFuncElement("Interrupt", ThreadNative::Interrupt)
FCFuncElement("Join", ThreadNative::Join)
QCFuncElement("GetOptimalMaxSpinWaitsPerSpinIterationInternal", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
FCFuncElement("get_OptimalMaxSpinWaitsPerSpinIteration", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
FCFuncElement("GetCurrentProcessorNumber", ThreadNative::GetCurrentProcessorNumber)
FCFuncEnd()

Expand Down
6 changes: 6 additions & 0 deletions src/coreclr/vm/eventtrace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4417,6 +4417,12 @@ VOID EtwCallbackCommon(
{
ETW::TypeSystemLog::OnKeywordsChanged();
}

if (g_fEEStarted && !g_fEEShutDown)
{
// Emit the YieldProcessor measured values at the beginning of the trace
YieldProcessorNormalization::FireMeasurementEvents();
}
}

// Individual callbacks for each EventPipe provider.
Expand Down
5 changes: 0 additions & 5 deletions src/coreclr/vm/finalizerthread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -379,11 +379,6 @@ DWORD WINAPI FinalizerThread::FinalizerThreadStart(void *args)
{
GetFinalizerThread()->SetBackground(TRUE);

{
GCX_PREEMP();
EnsureYieldProcessorNormalizedInitialized();
}

while (!fQuitFinalizer)
{
// This will apply any policy for swallowing exceptions during normal
Expand Down
Loading

0 comments on commit 4ff3762

Please sign in to comment.