Port yield normalization from CoreCLR to Native AOT (dotnet#103675)
eduardo-vp authored Jul 17, 2024
1 parent 8c95a64 commit d35f302
Showing 12 changed files with 390 additions and 653 deletions.
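This commit replaces Native AOT's one-shot startup calibration with the re-measuring yield normalization used by CoreCLR: the runtime times the processor's yield/pause instruction (roughly an order of magnitude more expensive on Intel Skylake and later than on earlier microarchitectures) and scales spin-wait yield counts so a normalized yield takes roughly constant wall-clock time on any processor. As a minimal sketch of how a spin-wait might consume the normalized API (the type and function names are from this commit; the loop itself is hypothetical, not code from the diff):

    // Hypothetical spin-wait built on the normalized yield API.
    void SpinUntilSet(volatile int* flag)
    {
        YieldProcessorNormalizationInfo normalizationInfo; // snapshots the current scaling factors
        for (unsigned int spinIteration = 0; *flag == 0; ++spinIteration)
        {
            // Burns an exponentially growing, processor-independent amount of
            // time per iteration, capped at the measured optimal maximum.
            YieldProcessorWithBackOffNormalized(normalizationInfo, spinIteration);
        }
    }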
6 changes: 0 additions & 6 deletions src/coreclr/gc/env/gcenv.os.h
@@ -6,12 +6,6 @@
#ifndef __GCENV_OS_H__
#define __GCENV_OS_H__

#ifdef HAS_SYSTEM_YIELDPROCESSOR
// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
#undef YieldProcessor
#define YieldProcessor System_YieldProcessor
#endif

#define NUMA_NODE_UNDEFINED UINT16_MAX

bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);
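The block deleted above was the GC's escape hatch from a macro-poisoning pattern: yieldprocessornormalized.h redefines YieldProcessor to Dont_Use_YieldProcessor so that direct calls fail to compile, and gcenv.os.h restored the system default for GC code. A generic sketch of that pattern, using the names from this diff:

    // Sketch of the poison/restore macro pattern.
    #define YieldProcessor Dont_Use_YieldProcessor   // poison: direct uses now fail to compile

    // ...later, in the one component allowed to use the raw instruction:
    #undef YieldProcessor                             // restore the system default
    #define YieldProcessor System_YieldProcessor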
39 changes: 18 additions & 21 deletions src/coreclr/inc/yieldprocessornormalized.h
@@ -3,14 +3,11 @@

#pragma once

// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
// the intention is to use the system-default implementation of YieldProcessor().
#define HAS_SYSTEM_YIELDPROCESSOR
#ifdef FEATURE_NATIVEAOT
FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
#else
FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
#ifdef YieldProcessor
#undef YieldProcessor
#endif
#define YieldProcessor Dont_Use_YieldProcessor

#define DISABLE_COPY(T) \
T(const T &) = delete; \
@@ -144,17 +141,17 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
{
_ASSERTE(count != 0);

if (sizeof(SIZE_T) <= sizeof(unsigned int))
if (sizeof(size_t) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below
// On platforms with a small size_t, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (count > MaxCount)
{
count = MaxCount;
}
}

SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
_ASSERTE(n != 0);
do
{
@@ -189,9 +186,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
{
_ASSERTE(preSkylakeCount != 0);

if (sizeof(SIZE_T) <= sizeof(unsigned int))
if (sizeof(size_t) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below
// On platforms with a small size_t, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
@@ -200,7 +197,7 @@
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
{
n = 1;
@@ -227,9 +224,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl

_ASSERTE(preSkylakeCount != 0);

if (sizeof(SIZE_T) <= sizeof(unsigned int))
if (sizeof(size_t) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below
// On platforms with a small size_t, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
@@ -238,8 +235,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
SIZE_T n =
(SIZE_T)preSkylakeCount *
size_t n =
(size_t)preSkylakeCount *
YieldProcessorNormalization::s_yieldsPerNormalizedYield /
PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
@@ -268,11 +265,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
unsigned int spinIteration)
{
// This shift value should be adjusted based on the asserted conditions below
const UINT8 MaxShift = 3;
static_assert_no_msg(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
static_assert_no_msg(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
const uint8_t MaxShift = 3;
static_assert(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
static_assert(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");

unsigned int n;
if (spinIteration <= MaxShift &&
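For reference, the MaxShift constant checked by the rewritten static_asserts above bounds the exponential back-off inside YieldProcessorWithBackOffNormalized. A condensed sketch of that arithmetic, assuming the member names from the CoreCLR version of this header (optimalMaxYieldsPerSpinIteration is the pre-multiplied cap):

    // Condensed back-off arithmetic (a sketch of the function body, not the full diff).
    unsigned int n;
    if (spinIteration <= MaxShift &&
        ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
    {
        // Iterations 0..MaxShift yield 1, 2, 4, 8 normalized units.
        n = normalizationInfo.yieldsPerNormalizedYield << spinIteration;
    }
    else
    {
        // Later iterations stay at the measured optimal maximum.
        n = normalizationInfo.optimalMaxYieldsPerSpinIteration;
    }
    do
    {
        System_YieldProcessor();
    } while (--n != 0);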
1 change: 0 additions & 1 deletion src/coreclr/nativeaot/Runtime/Crst.h
@@ -20,7 +20,6 @@ enum CrstType
CrstRestrictedCallouts,
CrstGcStressControl,
CrstThreadStore,
CrstYieldProcessorNormalized,
CrstEventPipe,
CrstEventPipeConfig,
CrstGcEvent,
8 changes: 5 additions & 3 deletions src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -48,9 +48,6 @@ uint32_t WINAPI FinalizerStart(void* pContext)

g_pFinalizerThread = PTR_Thread(pThread);

// We have some time until the first finalization request - use the time to calibrate normalized waits.
EnsureYieldProcessorNormalizedInitialized();

// Wait for a finalization request.
uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
ASSERT(uResult == WAIT_OBJECT_0);
@@ -184,6 +181,11 @@ EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
{
FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
g_FinalizerDoneEvent.Set();

if (YieldProcessorNormalization::IsMeasurementScheduled())
{
YieldProcessorNormalization::PerformMeasurement();
}
}

//
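The lines added above replace eager startup calibration with on-demand recalibration: when the normalization code decides its measurement is stale, it schedules one, and the finalizer thread performs it after signaling finalization complete. A minimal sketch of that schedule/perform handshake, using a std::atomic flag as a stand-in for the runtime's actual bookkeeping:

    // Sketch of the schedule/perform pattern (the flag is an illustrative stand-in).
    #include <atomic>

    static std::atomic<bool> s_measurementScheduled{false};

    void ScheduleMeasurement()           // called when the current measurement looks stale
    {
        s_measurementScheduled.store(true, std::memory_order_relaxed);
    }

    bool IsMeasurementScheduled()
    {
        return s_measurementScheduled.load(std::memory_order_relaxed);
    }

    void PerformMeasurementIfScheduled() // called from a low-priority housekeeping thread
    {
        if (s_measurementScheduled.exchange(false, std::memory_order_relaxed))
        {
            // ... re-run the yield timing loop and publish new scaling factors ...
        }
    }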
@@ -113,3 +113,4 @@ ThreadPoolWorkingThreadCount
ThreadRunning
WaitHandleWaitStart
WaitHandleWaitStop
YieldProcessorMeasurement
2 changes: 0 additions & 2 deletions src/coreclr/nativeaot/Runtime/startup.cpp
@@ -133,8 +133,6 @@ static bool InitDLL(HANDLE hPalInstance)
#endif
#endif // !USE_PORTABLE_HELPERS

InitializeYieldProcessorNormalizedCrst();

#ifdef STRESS_LOG
uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize();
uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel();
20 changes: 20 additions & 0 deletions src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -56,6 +56,26 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pD
return _InterlockedCompareExchange64(pDst, iValue, iComparand);
}

#ifdef HOST_X86
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
{
int64_t iOldValue;
do {
iOldValue = *pDst;
} while (PalInterlockedCompareExchange64(pDst,
iValue,
iOldValue) != iOldValue);
return iOldValue;
}
#else // HOST_X86
EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
#pragma intrinsic(_InterlockedExchange64)
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
{
return _InterlockedExchange64(pDst, iValue);
}
#endif // HOST_X86

#if defined(HOST_AMD64) || defined(HOST_ARM64)
EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *);
#pragma intrinsic(_InterlockedCompareExchange128)
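The x86 branch added above emulates a 64-bit atomic exchange with a compare-exchange loop, since the _InterlockedExchange64 intrinsic is unavailable there. The same pattern expressed with portable C++ atomics, for illustration only:

    // Portable illustration of exchange-via-CAS; the runtime's version above
    // uses the Win32-style intrinsics directly.
    #include <atomic>
    #include <cstdint>

    int64_t ExchangeViaCas(std::atomic<int64_t>& dst, int64_t value)
    {
        int64_t old = dst.load(std::memory_order_relaxed);
        // On failure, compare_exchange_weak reloads 'old' with the current value,
        // so the loop retries until no other thread raced between load and CAS.
        while (!dst.compare_exchange_weak(old, value))
        {
        }
        return old;
    }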
102 changes: 2 additions & 100 deletions src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -15,104 +15,6 @@
#include "volatile.h"
#include "yieldprocessornormalized.h"

#define ULONGLONG int64_t
#include "../../utilcode/yieldprocessornormalized.cpp"

static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
static CrstStatic s_initializeYieldProcessorNormalizedCrst;

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;

void InitializeYieldProcessorNormalizedCrst()
{
WRAPPER_NO_CONTRACT;
s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
}

static void InitializeYieldProcessorNormalized()
{
WRAPPER_NO_CONTRACT;

CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);

if (s_isYieldProcessorNormalizedInitialized)
{
return;
}

// Intel pre-Skylake processor: measured typically 14-17 cycles per yield
// Intel post-Skylake processor: measured typically 125-150 cycles per yield
const int MeasureDurationMs = 10;
const int NsPerSecond = 1000 * 1000 * 1000;

ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();

if (ticksPerSecond < 1000 / MeasureDurationMs)
{
// High precision clock not available or clock resolution is too low, resort to defaults
s_isYieldProcessorNormalizedInitialized = true;
return;
}

// Measure the nanosecond delay per yield
ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
unsigned int yieldCount = 0;
ULONGLONG startTicks = PalQueryPerformanceCounter();
ULONGLONG elapsedTicks;
do
{
// On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
// the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
// low microsecond range.
for (int i = 0; i < 1000; ++i)
{
System_YieldProcessor();
}
yieldCount += 1000;

ULONGLONG nowTicks = PalQueryPerformanceCounter();
elapsedTicks = nowTicks - startTicks;
} while (elapsedTicks < measureDurationTicks);
double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
if (nsPerYield < 1)
{
nsPerYield = 1;
}

// Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
// value is naturally limited to MinNsPerNormalizedYield.
int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
if (yieldsPerNormalizedYield < 1)
{
yieldsPerNormalizedYield = 1;
}
_ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);

// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
// better job of allowing other work to run.
int optimalMaxNormalizedYieldsPerSpinIteration =
(int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
{
optimalMaxNormalizedYieldsPerSpinIteration = 1;
}

g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
s_isYieldProcessorNormalizedInitialized = true;

GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
}

void EnsureYieldProcessorNormalizedInitialized()
{
WRAPPER_NO_CONTRACT;

if (!s_isYieldProcessorNormalizedInitialized)
{
InitializeYieldProcessorNormalized();
}
}
#include "../../vm/yieldprocessornormalizedshared.cpp"
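The measurement logic removed here now lives in the shared vm/yieldprocessornormalizedshared.cpp. Its core arithmetic, condensed into a sketch (constants and variables are those of the removed code above; the example numbers are only illustrative):

    // Condensed sketch of the normalization arithmetic from the removed code.
    double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
    if (nsPerYield < 1)
        nsPerYield = 1; // floor: keeps the divisions below bounded

    // Yields needed to span one normalized yield. For example, at 9 ns per yield
    // and a 37 ns normalization target, this rounds 37 / 9 to 4.
    int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
    if (yieldsPerNormalizedYield < 1)
        yieldsPerNormalizedYield = 1;

    // Cap on normalized yields per spin iteration, so late iterations do not burn
    // thousands of cycles that SwitchToThread/Sleep would spend better.
    int optimalMaxNormalizedYieldsPerSpinIteration =
        (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);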
(4 more changed files not shown)
