Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update spin-wait pause/yield normalization #55295

Merged
merged 5 commits into from
Jul 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -322,33 +322,14 @@ public void DisableComObjectEagerCleanup()
[MethodImpl(MethodImplOptions.InternalCall)]
public extern bool Join(int millisecondsTimeout);

private static int s_optimalMaxSpinWaitsPerSpinIteration;

[DllImport(RuntimeHelpers.QCall)]
private static extern int GetOptimalMaxSpinWaitsPerSpinIterationInternal();

/// <summary>
/// Max value to be passed into <see cref="SpinWait(int)"/> for optimal delaying. This value is normalized to be
/// appropriate for the processor.
/// </summary>
internal static int OptimalMaxSpinWaitsPerSpinIteration
{
get
{
int optimalMaxSpinWaitsPerSpinIteration = s_optimalMaxSpinWaitsPerSpinIteration;
return optimalMaxSpinWaitsPerSpinIteration != 0 ? optimalMaxSpinWaitsPerSpinIteration : CalculateOptimalMaxSpinWaitsPerSpinIteration();
}
}

[MethodImpl(MethodImplOptions.NoInlining)]
private static int CalculateOptimalMaxSpinWaitsPerSpinIteration()
{
// This is done lazily because the first call to the function below in the process triggers a measurement that
// takes a nontrivial amount of time if the measurement has not already been done in the background.
// See Thread::InitializeYieldProcessorNormalized(), which describes and calculates this value.
s_optimalMaxSpinWaitsPerSpinIteration = GetOptimalMaxSpinWaitsPerSpinIterationInternal();
Debug.Assert(s_optimalMaxSpinWaitsPerSpinIteration > 0);
return s_optimalMaxSpinWaitsPerSpinIteration;
[MethodImpl(MethodImplOptions.InternalCall)]
get;
}

[MethodImpl(MethodImplOptions.InternalCall)]
Expand Down
120 changes: 97 additions & 23 deletions src/coreclr/inc/yieldprocessornormalized.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,59 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
#endif
#define YieldProcessor Dont_Use_YieldProcessor

const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
// Declares the copy constructor and copy-assignment operator of T as deleted. Intended to be expanded inside the
// definition of class T. The expansion has no trailing semicolon; the use site supplies it.
#define DISABLE_COPY(T) \
T(const T &) = delete; \
T &operator =(const T &) = delete

extern unsigned int g_yieldsPerNormalizedYield;
extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
// Declares the default constructor of T as deleted, and additionally deletes the copy constructor and copy-assignment
// operator through DISABLE_COPY(T). Intended to be expanded inside the definition of class T; the use site supplies the
// trailing semicolon.
#define DISABLE_CONSTRUCT_COPY(T) \
T() = delete; \
DISABLE_COPY(T)

void InitializeYieldProcessorNormalizedCrst();
void EnsureYieldProcessorNormalizedInitialized();
// Holds the constants and static state used to normalize YieldProcessor-based spin-waiting. The class has only static
// members and cannot be default-constructed or copied (see DISABLE_CONSTRUCT_COPY at the bottom of the class).
class YieldProcessorNormalization
{
public:
// Targets that the normalization aims for; per the names, both are durations in nanoseconds
static const unsigned int TargetNsPerNormalizedYield = 37;
static const unsigned int TargetMaxNsPerSpinIteration = 272;

// These are maximums for the computed values for normalization, based on their calculation. With the targets above,
// MaxYieldsPerNormalizedYield evaluates to 370 and MaxOptimalMaxNormalizedYieldsPerSpinIteration evaluates to 12 (the
// division is an integer division).
static const unsigned int MaxYieldsPerNormalizedYield = TargetNsPerNormalizedYield * 10;
static const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
TargetMaxNsPerSpinIteration * 3 / (TargetNsPerNormalizedYield * 2) + 1;

private:
// Flag reported by IsMeasurementScheduled()
static bool s_isMeasurementScheduled;

// Current normalization values. They are read by YieldProcessorNormalizationInfo and by
// YieldProcessorNormalizedForPreSkylakeCount(unsigned int), which are friends of this class for that purpose.
static unsigned int s_yieldsPerNormalizedYield;
static unsigned int s_optimalMaxNormalizedYieldsPerSpinIteration;

public:
// Returns the current value of the measurement-scheduled flag
static bool IsMeasurementScheduled()
{
return s_isMeasurementScheduled;
}

// NOTE(review): defined outside this header; presumably performs the measurement that updates the normalization
// values above - confirm against the definition
static void PerformMeasurement();

private:
// Invoked from the YieldProcessorNormalizationInfo constructor. Defined outside this header.
static void ScheduleMeasurementIfNecessary();

public:
// Returns the current value of s_optimalMaxNormalizedYieldsPerSpinIteration
static unsigned int GetOptimalMaxNormalizedYieldsPerSpinIteration()
{
return s_optimalMaxNormalizedYieldsPerSpinIteration;
}

// Defined outside this header. NOTE(review): presumably emits the YieldProcessorMeasurement events - confirm against
// the definition
static void FireMeasurementEvents();

private:
// Helpers for reading and writing a double through a pointer. NOTE(review): atomicity is implied by the names only; the
// implementations are not visible in this header
static double AtomicLoad(double *valueRef);
static void AtomicStore(double *valueRef, double value);

// Static-only class: no default construction, copy construction, or copy assignment
DISABLE_CONSTRUCT_COPY(YieldProcessorNormalization);

// These friends access the private static normalization values directly
friend class YieldProcessorNormalizationInfo;
friend void YieldProcessorNormalizedForPreSkylakeCount(unsigned int);
};

class YieldProcessorNormalizationInfo
{
Expand All @@ -30,12 +75,15 @@ class YieldProcessorNormalizationInfo

public:
YieldProcessorNormalizationInfo()
: yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
: yieldsPerNormalizedYield(YieldProcessorNormalization::s_yieldsPerNormalizedYield),
optimalMaxNormalizedYieldsPerSpinIteration(YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIteration),
optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
{
YieldProcessorNormalization::ScheduleMeasurementIfNecessary();
}

DISABLE_COPY(YieldProcessorNormalizationInfo);

friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
Expand Down Expand Up @@ -98,9 +146,8 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo

if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
// is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (count > MaxCount)
{
count = MaxCount;
Expand Down Expand Up @@ -144,9 +191,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(

if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
// is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
preSkylakeCount = MaxCount;
Expand Down Expand Up @@ -175,7 +221,35 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
// }
FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
{
YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount);
// This function does not forward to the one above because it is used by some code under utilcode, where
// YieldProcessorNormalizationInfo cannot be used since normalization does not happen in some of its consumers. So this
// version uses the fields in YieldProcessorNormalization directly.

_ASSERTE(preSkylakeCount != 0);

if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
mangod9 marked this conversation as resolved.
Show resolved Hide resolved
if (preSkylakeCount > MaxCount)
{
preSkylakeCount = MaxCount;
}
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
SIZE_T n =
(SIZE_T)preSkylakeCount *
YieldProcessorNormalization::s_yieldsPerNormalizedYield /
PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
{
n = 1;
}
do
{
System_YieldProcessor();
} while (--n != 0);
}

// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
Expand All @@ -193,15 +267,12 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
const YieldProcessorNormalizationInfo &normalizationInfo,
unsigned int spinIteration)
{
// normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
// InitializeYieldProcessorNormalized()
const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
_ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);

// This shift value should be adjusted based on the asserted condition below
// This shift value should be adjusted based on the asserted conditions below
const UINT8 MaxShift = 3;
static_assert_no_msg(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
static_assert_no_msg(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
static_assert_no_msg(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);

unsigned int n;
if (spinIteration <= MaxShift &&
Expand All @@ -219,3 +290,6 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
System_YieldProcessor();
} while (--n != 0);
}

#undef DISABLE_CONSTRUCT_COPY
#undef DISABLE_COPY
16 changes: 12 additions & 4 deletions src/coreclr/utilcode/yieldprocessornormalized.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,16 @@
// The .NET Foundation licenses this file to you under the MIT license.

#include "stdafx.h"
#include "yieldprocessornormalized.h"

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
// No explicit initializer: as an object with static storage duration this is zero-initialized, i.e. it starts as false
bool YieldProcessorNormalization::s_isMeasurementScheduled;

// Defaults are for when normalization has not yet been done
unsigned int YieldProcessorNormalization::s_yieldsPerNormalizedYield = 1;
// Default is TargetMaxNsPerSpinIteration / TargetNsPerNormalizedYield rounded to the nearest integer: the division is done
// in double, and adding 0.5 before the truncating cast to unsigned int rounds the positive quotient to nearest
unsigned int YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIteration =
(unsigned int)
(
(double)YieldProcessorNormalization::TargetMaxNsPerSpinIteration /
YieldProcessorNormalization::TargetNsPerNormalizedYield +
0.5
);
2 changes: 1 addition & 1 deletion src/coreclr/vm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ set(VM_SOURCES_DAC_AND_WKS_COMMON
versionresilienthashcode.cpp
virtualcallstub.cpp
win32threadpool.cpp
yieldprocessornormalized.cpp
zapsig.cpp
)

Expand Down Expand Up @@ -389,6 +388,7 @@ set(VM_SOURCES_WKS
threadsuspend.cpp
typeparse.cpp
weakreferencenative.cpp
yieldprocessornormalized.cpp
${VM_SOURCES_GDBJIT}
)

Expand Down
27 changes: 26 additions & 1 deletion src/coreclr/vm/ClrEtwAll.man
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,13 @@
<opcode name="Profiler" message="$(string.RuntimePublisher.ProfilerOpcodeMessage)" symbol="CLR_PROFILER_OPCODE" value="11"/>
</opcodes>
</task>
<!--Next available ID is 37-->
<task name="YieldProcessorMeasurement" symbol="CLR_YIELD_PROCESSOR_MEASUREMENT_TASK"
value="37" eventGUID="{B4AFC324-DECE-4B02-86DC-AAB8F22BC1B1}"
message="$(string.RuntimePublisher.YieldProcessorMeasurementTaskMessage)">
<opcodes>
</opcodes>
</task>
<!--Next available ID is 38-->
</tasks>
<!--Maps-->
<maps>
Expand Down Expand Up @@ -2916,6 +2922,19 @@
</Settings>
</UserData>
</template>

<template tid="YieldProcessorMeasurement">
<data name="ClrInstanceID" inType="win:UInt16"/>
<data name="NsPerYield" inType="win:Double"/>
<data name="EstablishedNsPerYield" inType="win:Double"/>
<UserData>
<Settings xmlns="myNs">
<ClrInstanceID> %1 </ClrInstanceID>
<NsPerYield> %2 </NsPerYield>
<EstablishedNsPerYield> %3 </EstablishedNsPerYield>
</Settings>
</UserData>
</template>
</templates>

<events>
Expand Down Expand Up @@ -3313,6 +3332,10 @@
keywords ="ThreadingKeyword" opcode="Wait"
task="ThreadPoolWorkerThread"
symbol="ThreadPoolWorkerThreadWait" message="$(string.RuntimePublisher.ThreadPoolWorkerThreadEventMessage)"/>

<event value="58" version="0" level="win:Informational" template="YieldProcessorMeasurement"
kouvel marked this conversation as resolved.
Show resolved Hide resolved
keywords="ThreadingKeyword" task="YieldProcessorMeasurement" opcode="win:Info"
symbol="YieldProcessorMeasurement" message="$(string.RuntimePublisher.YieldProcessorMeasurementEventMessage)"/>

<!-- CLR private ThreadPool events -->
<event value="60" version="0" level="win:Verbose" template="ThreadPoolWorkingThreadCount"
Expand Down Expand Up @@ -7125,6 +7148,7 @@
<string id="RuntimePublisher.WorkerThreadRetirementRetireThreadEventMessage" value="WorkerThreadCount=%1;%nRetiredWorkerThreads=%2" />
<string id="RuntimePublisher.WorkerThreadRetirementUnretireThreadEventMessage" value="WorkerThreadCount=%1;%nRetiredWorkerThreads=%2" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadEventMessage" value="WorkerThreadCount=%1;%nRetiredWorkerThreadCount=%2;%nClrInstanceID=%3" />
<string id="RuntimePublisher.YieldProcessorMeasurementEventMessage" value="ClrInstanceID=%1;%nNsPerYield=%2;%nEstablishedNsPerYield=%3" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadAdjustmentSampleEventMessage" value="Throughput=%1;%nClrInstanceID=%2" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadAdjustmentAdjustmentEventMessage" value="AverageThroughput=%1;%nNewWorkerThreadCount=%2;%nReason=%3;%nClrInstanceID=%4" />
<string id="RuntimePublisher.ThreadPoolWorkerThreadAdjustmentStatsEventMessage" value="Duration=%1;%nThroughput=%2;%nThreadWave=%3;%nThroughputWave=%4;%nThroughputErrorEstimate=%5;%nAverageThroughputErrorEstimate=%6;%nThroughputRatio=%7;%nConfidence=%8;%nNewControlSetting=%9;%nNewThreadWaveMagnitude=%10;%nClrInstanceID=%11" />
Expand Down Expand Up @@ -7410,6 +7434,7 @@
<string id="RuntimePublisher.JitInstrumentationDataTaskMessage" value="JitInstrumentationData" />
<string id="RuntimePublisher.ExecutionCheckpointTaskMessage" value="ExecutionCheckpoint" />
<string id="RuntimePublisher.ProfilerTaskMessage" value="Profiler" />
<string id="RuntimePublisher.YieldProcessorMeasurementTaskMessage" value="YieldProcessorMeasurement" />

<string id="RundownPublisher.EEStartupTaskMessage" value="Runtime" />
<string id="RundownPublisher.MethodTaskMessage" value="Method" />
Expand Down
8 changes: 5 additions & 3 deletions src/coreclr/vm/ClrEtwAllMeta.lst
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,9 @@ nomac:GarbageCollection:::GCJoin_V2

nostack:Type:::BulkType

###################
# Threadpool events
###################
#################################
# Threading and Threadpool events
#################################
nomac:WorkerThreadCreation:::WorkerThreadCreate
noclrinstanceid:WorkerThreadCreation:::WorkerThreadCreate
nomac:WorkerThreadCreation:::WorkerThreadTerminate
Expand Down Expand Up @@ -170,6 +170,8 @@ nomac:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentSample
nostack:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentSample
nomac:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentAdjustment
nostack:ThreadPoolWorkerThreadAdjustment:::ThreadPoolWorkerThreadAdjustmentAdjustment
nomac:YieldProcessorMeasurement:::YieldProcessorMeasurement
nostack:YieldProcessorMeasurement:::YieldProcessorMeasurement

##################
# Exception events
Expand Down
17 changes: 4 additions & 13 deletions src/coreclr/vm/comsynchronizable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1089,22 +1089,13 @@ FCIMPL1(void, ThreadNative::SetIsThreadpoolThread, ThreadBaseObject* thread)
}
FCIMPLEND

INT32 QCALLTYPE ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration()
FCIMPL0(INT32, ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
{
QCALL_CONTRACT;

INT32 optimalMaxNormalizedYieldsPerSpinIteration;

BEGIN_QCALL;

// RuntimeThread calls this function only once lazily and caches the result, so ensure initialization
EnsureYieldProcessorNormalizedInitialized();
optimalMaxNormalizedYieldsPerSpinIteration = g_optimalMaxNormalizedYieldsPerSpinIteration;

END_QCALL;
FCALL_CONTRACT;

return optimalMaxNormalizedYieldsPerSpinIteration;
return (INT32)YieldProcessorNormalization::GetOptimalMaxNormalizedYieldsPerSpinIteration();
}
FCIMPLEND

FCIMPL1(void, ThreadNative::SpinWait, int iterations)
{
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/vm/comsynchronizable.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ friend class ThreadBaseObject;
UINT64 QCALLTYPE GetProcessDefaultStackSize();

static FCDECL1(INT32, GetManagedThreadId, ThreadBaseObject* th);
static INT32 QCALLTYPE GetOptimalMaxSpinWaitsPerSpinIteration();
static FCDECL0(INT32, GetOptimalMaxSpinWaitsPerSpinIteration);
static FCDECL1(void, SpinWait, int iterations);
static BOOL QCALLTYPE YieldThread();
static FCDECL0(Object*, GetCurrentThread);
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/vm/ecalllist.h
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,7 @@ FCFuncStart(gThreadFuncs)
#endif // FEATURE_COMINTEROP
FCFuncElement("Interrupt", ThreadNative::Interrupt)
FCFuncElement("Join", ThreadNative::Join)
QCFuncElement("GetOptimalMaxSpinWaitsPerSpinIterationInternal", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
FCFuncElement("get_OptimalMaxSpinWaitsPerSpinIteration", ThreadNative::GetOptimalMaxSpinWaitsPerSpinIteration)
FCFuncElement("GetCurrentProcessorNumber", ThreadNative::GetCurrentProcessorNumber)
FCFuncEnd()

Expand Down
6 changes: 6 additions & 0 deletions src/coreclr/vm/eventtrace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4417,6 +4417,12 @@ VOID EtwCallbackCommon(
{
ETW::TypeSystemLog::OnKeywordsChanged();
}

if (g_fEEStarted && !g_fEEShutDown)
{
// Emit the YieldProcessor measured values at the beginning of the trace
YieldProcessorNormalization::FireMeasurementEvents();
}
}

// Individual callbacks for each EventPipe provider.
Expand Down
5 changes: 0 additions & 5 deletions src/coreclr/vm/finalizerthread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -379,11 +379,6 @@ DWORD WINAPI FinalizerThread::FinalizerThreadStart(void *args)
{
GetFinalizerThread()->SetBackground(TRUE);

{
GCX_PREEMP();
EnsureYieldProcessorNormalizedInitialized();
}

while (!fQuitFinalizer)
{
// This will apply any policy for swallowing exceptions during normal
Expand Down
Loading