From d35f3021b91d67eeac232a0370c6efb6c256f060 Mon Sep 17 00:00:00 2001
From: Eduardo Velarde <32459232+eduardo-vp@users.noreply.github.com>
Date: Wed, 17 Jul 2024 16:19:42 -0700
Subject: [PATCH] Port yield normalization from CoreCLR to Native AOT (#103675)

---
 src/coreclr/gc/env/gcenv.os.h                 |   6 -
 src/coreclr/inc/yieldprocessornormalized.h    |  39 +-
 src/coreclr/nativeaot/Runtime/Crst.h          |   1 -
 .../nativeaot/Runtime/FinalizerHelpers.cpp    |   8 +-
 .../eventpipe/gen-eventing-event-inc.lst      |   1 +
 src/coreclr/nativeaot/Runtime/startup.cpp     |   2 -
 .../Runtime/windows/PalRedhawkInline.h        |  20 +
 .../Runtime/yieldprocessornormalized.cpp      | 102 +-----
 .../Runtime/yieldprocessornormalized.h        | 228 +-----------
 .../utilcode/yieldprocessornormalized.cpp     |   1 -
 src/coreclr/vm/yieldprocessornormalized.cpp   | 294 +--------------
 .../vm/yieldprocessornormalizedshared.cpp     | 341 ++++++++++++++++++
 12 files changed, 390 insertions(+), 653 deletions(-)
 create mode 100644 src/coreclr/vm/yieldprocessornormalizedshared.cpp

diff --git a/src/coreclr/gc/env/gcenv.os.h b/src/coreclr/gc/env/gcenv.os.h
index 01ed27dac3e59..aa7223850eaa9 100644
--- a/src/coreclr/gc/env/gcenv.os.h
+++ b/src/coreclr/gc/env/gcenv.os.h
@@ -6,12 +6,6 @@
 #ifndef __GCENV_OS_H__
 #define __GCENV_OS_H__
 
-#ifdef HAS_SYSTEM_YIELDPROCESSOR
-// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
-#undef YieldProcessor
-#define YieldProcessor System_YieldProcessor
-#endif
-
 #define NUMA_NODE_UNDEFINED UINT16_MAX
 
 bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);
diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index 121e60b033356..e37bf79f0c508 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -3,14 +3,11 @@
 
 #pragma once
 
-// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
-// the intention is to use the system-default implementation of YieldProcessor().
-#define HAS_SYSTEM_YIELDPROCESSOR
+#ifdef FEATURE_NATIVEAOT // when FEATURE_NATIVEAOT is defined, System_YieldProcessor() forwards to PalYieldProcessor()
+FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
+#else // otherwise System_YieldProcessor() forwards to YieldProcessor()
 FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
-#ifdef YieldProcessor
-#undef YieldProcessor
 #endif
-#define YieldProcessor Dont_Use_YieldProcessor
 
 #define DISABLE_COPY(T) \
     T(const T &) = delete; \
@@ -144,9 +141,9 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
 {
     _ASSERTE(count != 0);
 
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    if (sizeof(size_t) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        // On platforms with a small size_t, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (count > MaxCount)
         {
@@ -154,7 +151,7 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
         }
     }
 
-    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
+    size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
     _ASSERTE(n != 0);
     do
     {
@@ -189,9 +186,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
 {
     _ASSERTE(preSkylakeCount != 0);
 
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    if (sizeof(size_t) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        // On platforms with a small size_t, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (preSkylakeCount > MaxCount)
         {
@@ -200,7 +197,7 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
     }
 
     const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
+    size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
     if (n == 0)
     {
         n = 1;
@@ -227,9 +224,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
 
     _ASSERTE(preSkylakeCount != 0);
 
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    if (sizeof(size_t) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        // On platforms with a small size_t, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (preSkylakeCount > MaxCount)
         {
@@ -238,8 +235,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
     }
 
     const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    SIZE_T n =
-        (SIZE_T)preSkylakeCount *
+    size_t n =
+        (size_t)preSkylakeCount *
         YieldProcessorNormalization::s_yieldsPerNormalizedYield /
         PreSkylakeCountToSkylakeCountDivisor;
     if (n == 0)
@@ -268,11 +265,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
     unsigned int spinIteration)
 {
     // This shift value should be adjusted based on the asserted conditions below
-    const UINT8 MaxShift = 3;
-    static_assert_no_msg(
-        ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-    static_assert_no_msg(
-        ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+    const uint8_t MaxShift = 3;
+    static_assert(
+        ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
+    static_assert(
+        ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
 
     unsigned int n;
     if (spinIteration <= MaxShift &&
diff --git a/src/coreclr/nativeaot/Runtime/Crst.h b/src/coreclr/nativeaot/Runtime/Crst.h
index 31bf8fde9eec8..4ab9db08e0f5e 100644
--- a/src/coreclr/nativeaot/Runtime/Crst.h
+++ b/src/coreclr/nativeaot/Runtime/Crst.h
@@ -20,7 +20,6 @@ enum CrstType
     CrstRestrictedCallouts,
     CrstGcStressControl,
     CrstThreadStore,
-    CrstYieldProcessorNormalized,
     CrstEventPipe,
     CrstEventPipeConfig,
     CrstGcEvent,
diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
index dd9f1e096842f..8fa6053818969 100644
--- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -48,9 +48,6 @@ uint32_t WINAPI FinalizerStart(void* pContext)
 
     g_pFinalizerThread = PTR_Thread(pThread);
 
-    // We have some time until the first finalization request - use the time to calibrate normalized waits.
-    EnsureYieldProcessorNormalizedInitialized();
-
     // Wait for a finalization request.
     uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
     ASSERT(uResult == WAIT_OBJECT_0);
@@ -184,6 +181,11 @@ EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
 {
     FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
     g_FinalizerDoneEvent.Set();
+
+    if (YieldProcessorNormalization::IsMeasurementScheduled()) // only after the done event has been set: run a yield-normalization measurement if one is scheduled
+    {
+        YieldProcessorNormalization::PerformMeasurement();
+    }
 }
 
 //
diff --git a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
index 901af659ff84b..0f4c932719a39 100644
--- a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
+++ b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
@@ -113,3 +113,4 @@ ThreadPoolWorkingThreadCount
 ThreadRunning
 WaitHandleWaitStart
 WaitHandleWaitStop
+YieldProcessorMeasurement
diff --git a/src/coreclr/nativeaot/Runtime/startup.cpp b/src/coreclr/nativeaot/Runtime/startup.cpp
index db2802dcb115e..af835018e1823 100644
--- a/src/coreclr/nativeaot/Runtime/startup.cpp
+++ b/src/coreclr/nativeaot/Runtime/startup.cpp
@@ -133,8 +133,6 @@ static bool InitDLL(HANDLE hPalInstance)
 #endif
 #endif // !USE_PORTABLE_HELPERS
 
-    InitializeYieldProcessorNormalizedCrst();
-
 #ifdef STRESS_LOG
     uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize();
     uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel();
diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
index 187ad26fb8bf1..1f2a74dcd1510 100644
--- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
+++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -56,6 +56,26 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pD
     return _InterlockedCompareExchange64(pDst, iValue, iComparand);
 }
 
+#ifdef HOST_X86 // on this host the 64-bit exchange is built from a compare-exchange retry loop
+FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
+{
+    int64_t iOldValue;
+    do {
+        iOldValue = *pDst; // snapshot of the current value, used as the comparand below
+    } while (PalInterlockedCompareExchange64(pDst,
+                                          iValue,
+                                          iOldValue) != iOldValue); // retry until the compare-exchange hands back the snapshot value
+    return iOldValue; // the snapshot that the last compare-exchange matched
+}
+#else // HOST_X86
+EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t); // declared here so the intrinsic pragma below can name it
+#pragma intrinsic(_InterlockedExchange64)
+FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
+{
+    return _InterlockedExchange64(pDst, iValue); // forwards directly to the compiler intrinsic
+}
+#endif // HOST_X86
+
 #if defined(HOST_AMD64) || defined(HOST_ARM64)
 EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *);
 #pragma intrinsic(_InterlockedCompareExchange128)
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
index 444d52b0114c0..efaf4e8bb2070 100644
--- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
+++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -15,104 +15,6 @@
 #include "volatile.h"
 #include "yieldprocessornormalized.h"
 
-#define ULONGLONG int64_t
+#include "../../utilcode/yieldprocessornormalized.cpp"
 
-static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
-static CrstStatic s_initializeYieldProcessorNormalizedCrst;
-
-// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
-// tuned for Skylake processors
-unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
-unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
-
-void InitializeYieldProcessorNormalizedCrst()
-{
-    WRAPPER_NO_CONTRACT;
-    s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
-}
-
-static void InitializeYieldProcessorNormalized()
-{
-    WRAPPER_NO_CONTRACT;
-
-    CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
-
-    if (s_isYieldProcessorNormalizedInitialized)
-    {
-        return;
-    }
-
-    // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
-    // Intel post-Skylake processor: measured typically 125-150 cycles per yield
-    const int MeasureDurationMs = 10;
-    const int NsPerSecond = 1000 * 1000 * 1000;
-
-    ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();
-
-    if (ticksPerSecond < 1000 / MeasureDurationMs)
-    {
-        // High precision clock not available or clock resolution is too low, resort to defaults
-        s_isYieldProcessorNormalizedInitialized = true;
-        return;
-    }
-
-    // Measure the nanosecond delay per yield
-    ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
-    unsigned int yieldCount = 0;
-      ULONGLONG startTicks = PalQueryPerformanceCounter();
-    ULONGLONG elapsedTicks;
-    do
-    {
-        // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
-        // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
-        // low microsecond range.
-        for (int i = 0; i < 1000; ++i)
-        {
-            System_YieldProcessor();
-        }
-        yieldCount += 1000;
-
-        ULONGLONG nowTicks = PalQueryPerformanceCounter();
-        elapsedTicks = nowTicks - startTicks;
-    } while (elapsedTicks < measureDurationTicks);
-    double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
-    if (nsPerYield < 1)
-    {
-        nsPerYield = 1;
-    }
-
-    // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
-    // value is naturally limited to MinNsPerNormalizedYield.
-    int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
-    if (yieldsPerNormalizedYield < 1)
-    {
-        yieldsPerNormalizedYield = 1;
-    }
-    _ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);
-
-    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
-    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
-    // better job of allowing other work to run.
-    int optimalMaxNormalizedYieldsPerSpinIteration =
-        (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
-    if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
-    {
-        optimalMaxNormalizedYieldsPerSpinIteration = 1;
-    }
-
-    g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
-    g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
-    s_isYieldProcessorNormalizedInitialized = true;
-
-    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
-}
-
-void EnsureYieldProcessorNormalizedInitialized()
-{
-    WRAPPER_NO_CONTRACT;
-
-    if (!s_isYieldProcessorNormalizedInitialized)
-    {
-        InitializeYieldProcessorNormalized();
-    }
-}
+#include "../../vm/yieldprocessornormalizedshared.cpp"
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
index 8c74bf3cfe300..5539ebf90561b 100644
--- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
+++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
@@ -1,229 +1,5 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-#pragma once
-
-#include <limits.h>
-
-// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
-// the intention is to use the system-default implementation of YieldProcessor().
-#define HAS_SYSTEM_YIELDPROCESSOR
-FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
-#ifdef YieldProcessor
-#undef YieldProcessor
-#endif
-#define YieldProcessor Dont_Use_YieldProcessor
-#ifdef PalYieldProcessor
-#undef PalYieldProcessor
-#endif
-#define PalYieldProcessor Dont_Use_PalYieldProcessor
-
-#define SIZE_T uintptr_t
-
-const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
-const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
-
-extern unsigned int g_yieldsPerNormalizedYield;
-extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
-
-void InitializeYieldProcessorNormalizedCrst();
-void EnsureYieldProcessorNormalizedInitialized();
-
-class YieldProcessorNormalizationInfo
-{
-private:
-    unsigned int yieldsPerNormalizedYield;
-    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;
-    unsigned int optimalMaxYieldsPerSpinIteration;
-
-public:
-    YieldProcessorNormalizationInfo()
-        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
-        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
-        optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
-    {
-    }
-
-    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
-    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
-    friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
-    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
-};
-
-// See YieldProcessorNormalized() for preliminary info. Typical usage:
-//     if (!condition)
-//     {
-//         YieldProcessorNormalizationInfo normalizationInfo;
-//         do
-//         {
-//             YieldProcessorNormalized(normalizationInfo);
-//         } while (!condition);
-//     }
-FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
-{
-    unsigned int n = normalizationInfo.yieldsPerNormalizedYield;
-    _ASSERTE(n != 0);
-    do
-    {
-        System_YieldProcessor();
-    } while (--n != 0);
-}
-
-// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the
-// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following:
-//   - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value
-//     for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessary increase CPU usage
-//     and decrease scalability of the operation.
-//         while(!condition)
-//         {
-//             YieldProcessorNormalized();
-//         }
-//   - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the
-//     condition, otherwise it may unnecessarily increase latency of the operation
-//   - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in
-//     yield count per iteration for each failed check of the condition, the progression can significantly magnify the second
-//     issue above on later iterations.
-//   - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each
-//     issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using
-//     System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method.
-FORCEINLINE void YieldProcessorNormalized()
-{
-    YieldProcessorNormalized(YieldProcessorNormalizationInfo());
-}
-
-// See YieldProcessorNormalized(count) for preliminary info. Typical usage:
-//     if (!moreExpensiveCondition)
-//     {
-//         YieldProcessorNormalizationInfo normalizationInfo;
-//         do
-//         {
-//             YieldProcessorNormalized(normalizationInfo, 2);
-//         } while (!moreExpensiveCondition);
-//     }
-FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
-{
-    _ASSERTE(count != 0);
-
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
-    {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
-        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
-        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
-        if (count > MaxCount)
-        {
-            count = MaxCount;
-        }
-    }
-
-    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
-    _ASSERTE(n != 0);
-    do
-    {
-        System_YieldProcessor();
-    } while (--n != 0);
-}
-
-// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is
-// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage:
-//     while(!moreExpensiveCondition)
-//     {
-//         YieldProcessorNormalized(2);
-//     }
-FORCEINLINE void YieldProcessorNormalized(unsigned int count)
-{
-    YieldProcessorNormalized(YieldProcessorNormalizationInfo(), count);
-}
-
-// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary
-// info. Typical usage:
-//     if (!condition)
-//     {
-//         YieldProcessorNormalizationInfo normalizationInfo;
-//         do
-//         {
-//             YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100);
-//         } while (!condition);
-//     }
-FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
-    const YieldProcessorNormalizationInfo &normalizationInfo,
-    unsigned int preSkylakeCount)
-{
-    _ASSERTE(preSkylakeCount != 0);
-
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
-    {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
-        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
-        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
-        if (preSkylakeCount > MaxCount)
-        {
-            preSkylakeCount = MaxCount;
-        }
-    }
-
-    const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
-    if (n == 0)
-    {
-        n = 1;
-    }
-    do
-    {
-        System_YieldProcessor();
-    } while (--n != 0);
-}
-
-// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned
-// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in
-// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a
-// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage:
-//     while(!condition)
-//     {
-//         YieldProcessorNormalizedForPreSkylakeCount(100);
-//     }
-FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
-{
-    YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount);
-}
-
-// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
-// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait
-// iteration exponentially up to a limit. Typical usage:
-//     if (!conditionThatMayNotBeSatisfiedSoon)
-//     {
-//         YieldProcessorNormalizationInfo normalizationInfo;
-//         do
-//         {
-//             YieldProcessorWithBackOffNormalized(normalizationInfo); // maybe Sleep(0) occasionally
-//         } while (!conditionThatMayNotBeSatisfiedSoon);
-//     }
-FORCEINLINE void YieldProcessorWithBackOffNormalized(
-    const YieldProcessorNormalizationInfo &normalizationInfo,
-    unsigned int spinIteration)
-{
-    // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
-    // InitializeYieldProcessorNormalized()
-    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
-        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
-    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    // This shift value should be adjusted based on the asserted condition below
-    const uint8_t MaxShift = 3;
-    static_assert(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
-
-    unsigned int n;
-    if (spinIteration <= MaxShift &&
-        ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
-    {
-        n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield;
-    }
-    else
-    {
-        n = normalizationInfo.optimalMaxYieldsPerSpinIteration;
-    }
-    _ASSERTE(n != 0);
-    do
-    {
-        System_YieldProcessor();
-    } while (--n != 0);
-}
+#include "PalRedhawk.h"
+#include "../../inc/yieldprocessornormalized.h"
diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp
index 020d8d7cc79e4..c6aaaa19557fa 100644
--- a/src/coreclr/utilcode/yieldprocessornormalized.cpp
+++ b/src/coreclr/utilcode/yieldprocessornormalized.cpp
@@ -1,7 +1,6 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-#include "stdafx.h"
 #include "yieldprocessornormalized.h"
 
 bool YieldProcessorNormalization::s_isMeasurementScheduled;
diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 14166de34dd64..258e30d634c7c 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -7,296 +7,4 @@
 
 #include "finalizerthread.h"
 
-enum class NormalizationState : UINT8
-{
-    Uninitialized,
-    Initialized,
-    Failed
-};
-
-static const int NsPerYieldMeasurementCount = 8;
-static const unsigned int MeasurementPeriodMs = 4000;
-
-static const unsigned int NsPerS = 1000 * 1000 * 1000;
-
-static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
-static unsigned int s_previousNormalizationTimeMs;
-
-static UINT64 s_performanceCounterTicksPerS;
-static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount];
-static int s_nextMeasurementIndex;
-static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield;
-
-static unsigned int DetermineMeasureDurationUs()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_PREEMPTIVE;
-    }
-    CONTRACTL_END;
-
-    _ASSERTE(s_normalizationState != NormalizationState::Failed);
-
-    // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
-    // if the overhead seems high relative to the measure duration.
-    unsigned int measureDurationUs = 1;
-    LARGE_INTEGER li;
-    QueryPerformanceCounter(&li);
-    UINT64 startTicks = li.QuadPart;
-    QueryPerformanceCounter(&li);
-    UINT64 elapsedTicks = li.QuadPart - startTicks;
-    if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
-    {
-        measureDurationUs *= 4;
-    }
-    return measureDurationUs;
-}
-
-static double MeasureNsPerYield(unsigned int measureDurationUs)
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_PREEMPTIVE;
-    }
-    CONTRACTL_END;
-
-    _ASSERTE(s_normalizationState != NormalizationState::Failed);
-
-    int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1;
-    UINT64 ticksPerS = s_performanceCounterTicksPerS;
-    UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000);
-
-    LARGE_INTEGER li;
-    QueryPerformanceCounter(&li);
-    UINT64 startTicks = li.QuadPart;
-
-    for (int i = 0; i < yieldCount; ++i)
-    {
-        System_YieldProcessor();
-    }
-
-    QueryPerformanceCounter(&li);
-    UINT64 elapsedTicks = li.QuadPart - startTicks;
-    while (elapsedTicks < measureDurationTicks)
-    {
-        int nextYieldCount =
-            Max(4,
-                elapsedTicks == 0
-                    ? yieldCount / 4
-                    : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1);
-        for (int i = 0; i < nextYieldCount; ++i)
-        {
-            System_YieldProcessor();
-        }
-
-        QueryPerformanceCounter(&li);
-        elapsedTicks = li.QuadPart - startTicks;
-        yieldCount += nextYieldCount;
-    }
-
-    // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
-    const double MinNsPerYield = 0.1;
-
-    // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
-    // really take this long. Limit the maximum to keep the recorded values reasonable.
-    const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
-
-    return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));
-}
-
-void YieldProcessorNormalization::PerformMeasurement()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_PREEMPTIVE;
-    }
-    CONTRACTL_END;
-
-    _ASSERTE(s_isMeasurementScheduled);
-
-    double latestNsPerYield;
-    if (s_normalizationState == NormalizationState::Initialized)
-    {
-        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
-        {
-            return;
-        }
-
-        int nextMeasurementIndex = s_nextMeasurementIndex;
-        latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
-        AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield);
-        if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
-        {
-            nextMeasurementIndex = 0;
-        }
-        s_nextMeasurementIndex = nextMeasurementIndex;
-    }
-    else if (s_normalizationState == NormalizationState::Uninitialized)
-    {
-        LARGE_INTEGER li;
-        if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
-        {
-            // High precision clock not available or clock resolution is too low, resort to defaults
-            s_normalizationState = NormalizationState::Failed;
-            return;
-        }
-        s_performanceCounterTicksPerS = li.QuadPart;
-
-        unsigned int measureDurationUs = DetermineMeasureDurationUs();
-        for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
-        {
-            latestNsPerYield = MeasureNsPerYield(measureDurationUs);
-            AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
-            if (i == 0 || latestNsPerYield < s_establishedNsPerYield)
-            {
-                AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
-            }
-
-            if (i < NsPerYieldMeasurementCount - 1)
-            {
-                FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
-            }
-        }
-    }
-    else
-    {
-        _ASSERTE(s_normalizationState == NormalizationState::Failed);
-        return;
-    }
-
-    double establishedNsPerYield = s_nsPerYieldMeasurements[0];
-    for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
-    {
-        double nsPerYield = s_nsPerYieldMeasurements[i];
-        if (nsPerYield < establishedNsPerYield)
-        {
-            establishedNsPerYield = nsPerYield;
-        }
-    }
-    if (establishedNsPerYield != s_establishedNsPerYield)
-    {
-        AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
-    }
-
-    FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
-
-    // Calculate the number of yields required to span the duration of a normalized yield
-    unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
-    _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
-    s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
-
-    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
-    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
-    // better job of allowing other work to run.
-    s_optimalMaxNormalizedYieldsPerSpinIteration =
-        Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
-    _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
-
-    s_previousNormalizationTimeMs = GetTickCount();
-    s_normalizationState = NormalizationState::Initialized;
-    s_isMeasurementScheduled = false;
-}
-
-
-void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_ANY;
-    }
-    CONTRACTL_END;
-
-    NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
-    if (normalizationState == NormalizationState::Initialized)
-    {
-        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
-        {
-            return;
-        }
-    }
-    else if (normalizationState == NormalizationState::Uninitialized)
-    {
-    }
-    else
-    {
-        _ASSERTE(normalizationState == NormalizationState::Failed);
-        return;
-    }
-
-    // !g_fEEStarted is required for FinalizerThread::EnableFinalization() below
-    if (s_isMeasurementScheduled || !g_fEEStarted)
-    {
-        return;
-    }
-
-    s_isMeasurementScheduled = true;
-    FinalizerThread::EnableFinalization();
-}
-
-
-void YieldProcessorNormalization::FireMeasurementEvents()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_ANY;
-    }
-    CONTRACTL_END;
-
-    if (!EventEnabledYieldProcessorMeasurement())
-    {
-        return;
-    }
-
-    // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
-    // recorded information, so try to enumerate the array with some care.
-    double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
-    int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
-    for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
-    {
-        double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]);
-        if (nsPerYield != 0) // the array may not be fully initialized yet
-        {
-            FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
-        }
-
-        if (++nextIndex >= NsPerYieldMeasurementCount)
-        {
-            nextIndex = 0;
-        }
-    }
-}
-
-double YieldProcessorNormalization::AtomicLoad(double *valueRef)
-{
-    WRAPPER_NO_CONTRACT;
-
-#ifdef TARGET_64BIT
-    return VolatileLoadWithoutBarrier(valueRef);
-#else
-    return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
-#endif
-}
-
-void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
-{
-    WRAPPER_NO_CONTRACT;
-
-#ifdef TARGET_64BIT
-    *valueRef = value;
-#else
-    InterlockedExchangeT(valueRef, value);
-#endif
-}
-
+#include "yieldprocessornormalizedshared.cpp"
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
new file mode 100644
index 0000000000000..05daee2194737
--- /dev/null
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -0,0 +1,341 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+enum class NormalizationState : uint8_t
+{
+    Uninitialized, // no measurement has completed yet
+    Initialized, // set at the end of a completed PerformMeasurement()
+    Failed // high-precision counter unavailable or below 1 MHz; no measurements are taken
+};
+
+static const int NsPerYieldMeasurementCount = 8; // number of slots in s_nsPerYieldMeasurements
+static const unsigned int MeasurementPeriodMs = 4000; // minimum time between measurements once initialized
+
+static const unsigned int NsPerS = 1000 * 1000 * 1000; // nanoseconds per second
+
+static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
+static unsigned int s_previousNormalizationTimeMs; // GetTickCountPortable() value stored at the end of the last PerformMeasurement()
+
+static uint64_t s_performanceCounterTicksPerS; // counter frequency, stored by PerformMeasurement() before the first measurement
+static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount]; // recorded ns-per-yield values; a slot stays 0 until first written
+static int s_nextMeasurementIndex; // slot the next periodic measurement will overwrite
+static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield; // minimum of the recorded values once measured
+
+void RhEnableFinalization(); // forward declaration; only called under FEATURE_NATIVEAOT in the code below
+
+inline unsigned int GetTickCountPortable() // Tick count in milliseconds from whichever API this build provides
+{
+#ifdef FEATURE_NATIVEAOT
+    return (unsigned int)PalGetTickCount64(); // truncated to 32 bits; callers below only use differences between samples
+#else
+    return GetTickCount();
+#endif
+}
+
+static uint64_t GetPerformanceCounter() // Current performance counter value, in ticks
+{
+#ifdef FEATURE_NATIVEAOT
+    return PalQueryPerformanceCounter();
+#else
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li); // the call's return value is not checked here
+    return li.QuadPart;
+#endif
+}
+
+static unsigned int DetermineMeasureDurationUs() // Returns how long one yield measurement should run, in microseconds (1 or 4)
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+#ifndef FEATURE_NATIVEAOT
+        MODE_PREEMPTIVE;
+#endif
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_normalizationState != NormalizationState::Failed); // s_performanceCounterTicksPerS is read below and must already be set
+
+    // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
+    // if the overhead seems high relative to the measure duration.
+    unsigned int measureDurationUs = 1;
+    uint64_t startTicks = GetPerformanceCounter();
+    uint64_t elapsedTicks = GetPerformanceCounter() - startTicks; // ticks between two back-to-back counter queries
+    if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
+    {
+        measureDurationUs *= 4; // use 4 us instead of the default 1 us
+    }
+    return measureDurationUs;
+}
+
+static double MeasureNsPerYield(unsigned int measureDurationUs) // Times System_YieldProcessor() calls; returns clamped ns per yield
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+#ifndef FEATURE_NATIVEAOT
+        MODE_PREEMPTIVE;
+#endif
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_normalizationState != NormalizationState::Failed);
+
+    int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1; // first guess, from the current estimate
+    uint64_t ticksPerS = s_performanceCounterTicksPerS;
+    uint64_t measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000); // requested duration in counter ticks
+
+    uint64_t startTicks = GetPerformanceCounter();
+
+    for (int i = 0; i < yieldCount; ++i)
+    {
+        System_YieldProcessor();
+    }
+
+    uint64_t elapsedTicks = GetPerformanceCounter() - startTicks;
+    while (elapsedTicks < measureDurationTicks) // first guess fell short: keep yielding until the requested duration has elapsed
+    {
+        int nextYieldCount = // at least 4; otherwise extrapolated from the rate observed so far
+            max(4,
+                elapsedTicks == 0
+                    ? yieldCount / 4
+                    : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1);
+        for (int i = 0; i < nextYieldCount; ++i)
+        {
+            System_YieldProcessor();
+        }
+
+        elapsedTicks = GetPerformanceCounter() - startTicks;
+        yieldCount += nextYieldCount; // yieldCount always totals the yields issued since startTicks
+    }
+
+    // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
+    const double MinNsPerYield = 0.1;
+
+    // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
+    // really take this long. Limit the maximum to keep the recorded values reasonable.
+    const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
+
+    return max(MinNsPerYield, min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield)); // clamp to [Min, Max]
+}
+
+void YieldProcessorNormalization::PerformMeasurement() // Takes one measurement (or the initial batch) and republishes the derived values
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+#ifndef FEATURE_NATIVEAOT
+        MODE_PREEMPTIVE;
+#endif
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_isMeasurementScheduled);
+
+    double latestNsPerYield;
+    if (s_normalizationState == NormalizationState::Initialized)
+    {
+        if (GetTickCountPortable() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+        {
+            return; // too soon since the last measurement; s_isMeasurementScheduled is left unchanged on this path
+        }
+
+        int nextMeasurementIndex = s_nextMeasurementIndex;
+        latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
+        AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield);
+        if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
+        {
+            nextMeasurementIndex = 0; // wrap around to the first slot
+        }
+        s_nextMeasurementIndex = nextMeasurementIndex;
+    }
+    else if (s_normalizationState == NormalizationState::Uninitialized)
+    {
+#ifdef FEATURE_NATIVEAOT
+        if ((s_performanceCounterTicksPerS = PalQueryPerformanceFrequency()) < 1000 * 1000) // stores the frequency, then tests it
+#else
+        LARGE_INTEGER li;
+        if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
+#endif
+        {
+            // High precision clock not available or clock resolution is too low, resort to defaults
+            s_normalizationState = NormalizationState::Failed;
+            return;
+        }
+
+#ifndef FEATURE_NATIVEAOT
+        s_performanceCounterTicksPerS = li.QuadPart;
+#endif
+
+        unsigned int measureDurationUs = DetermineMeasureDurationUs(); // determined once and reused for the whole initial batch
+        for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
+        {
+            latestNsPerYield = MeasureNsPerYield(measureDurationUs);
+            AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
+            if (i == 0 || latestNsPerYield < s_establishedNsPerYield) // the first measurement always replaces the initial default
+            {
+                AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
+            }
+            if (i < NsPerYieldMeasurementCount - 1) // the event for the last measurement is fired after the branches rejoin
+            {
+                FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+            }
+        }
+    }
+    else
+    {
+        _ASSERTE(s_normalizationState == NormalizationState::Failed);
+        return;
+    }
+
+    double establishedNsPerYield = s_nsPerYieldMeasurements[0]; // recompute the minimum over every recorded measurement
+    for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
+    {
+        double nsPerYield = s_nsPerYieldMeasurements[i];
+        if (nsPerYield < establishedNsPerYield)
+        {
+            establishedNsPerYield = nsPerYield;
+        }
+    }
+    if (establishedNsPerYield != s_establishedNsPerYield) // store only when the minimum actually changed
+    {
+        AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
+    }
+
+    FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+
+    // Calculate the number of yields required to span the duration of a normalized yield
+    unsigned int yieldsPerNormalizedYield = max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
+    _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
+    s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
+
+    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
+    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
+    // better job of allowing other work to run.
+    s_optimalMaxNormalizedYieldsPerSpinIteration =
+        max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
+    _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield); // hand the same factor to the GC
+
+    s_previousNormalizationTimeMs = GetTickCountPortable(); // restarts the MeasurementPeriodMs interval
+    s_normalizationState = NormalizationState::Initialized;
+    s_isMeasurementScheduled = false; // allows ScheduleMeasurementIfNecessary() to schedule again
+}
+
+
+void YieldProcessorNormalization::ScheduleMeasurementIfNecessary() // Marks a measurement as scheduled and enables finalization, if one is due
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_ANY;
+    }
+    CONTRACTL_END;
+
+    NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
+    if (normalizationState == NormalizationState::Initialized)
+    {
+        if (GetTickCountPortable() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+        {
+            return; // the last measurement is recent enough
+        }
+    }
+    else if (normalizationState == NormalizationState::Uninitialized)
+    { // nothing measured yet, so fall through to the scheduling checks
+    }
+    else
+    {
+        _ASSERTE(normalizationState == NormalizationState::Failed);
+        return;
+    }
+
+#ifdef FEATURE_NATIVEAOT
+    if (s_isMeasurementScheduled)
+#else
+    // !g_fEEStarted is required for FinalizerThread::EnableFinalization() below
+    if (s_isMeasurementScheduled || !g_fEEStarted)
+#endif
+    {
+        return; // already scheduled (or, outside NativeAOT, the EE has not started yet)
+    }
+
+    s_isMeasurementScheduled = true; // cleared again at the end of PerformMeasurement()
+#ifdef FEATURE_NATIVEAOT
+    RhEnableFinalization();
+#else
+    FinalizerThread::EnableFinalization();
+#endif
+}
+
+void YieldProcessorNormalization::FireMeasurementEvents() // Fires one YieldProcessorMeasurement event per recorded measurement
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_ANY;
+    }
+    CONTRACTL_END;
+
+    if (!EventEnabledYieldProcessorMeasurement())
+    {
+        return; // the event is not enabled, so there is nothing to fire
+    }
+
+    // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
+    // recorded information, so try to enumerate the array with some care.
+    double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield); // read once; the same value is reported with every event
+    int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex); // start at the slot that would be overwritten next
+    for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
+    {
+        double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]);
+        if (nsPerYield != 0) // the array may not be fully initialized yet
+        {
+            FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
+        }
+
+        if (++nextIndex >= NsPerYieldMeasurementCount)
+        {
+            nextIndex = 0; // wrap around so that every slot is visited exactly once
+        }
+    }
+}
+
+double YieldProcessorNormalization::AtomicLoad(double *valueRef) // Reads *valueRef; 32-bit builds go through an interlocked operation
+{
+    WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+    return VolatileLoadWithoutBarrier(valueRef);
+#else
+#ifdef FEATURE_NATIVEAOT
+    static_assert(sizeof(int64_t) == sizeof(double), "");
+    int64_t intRes = PalInterlockedCompareExchange64((int64_t*)valueRef, 0, 0); // exchange and comparand are both 0; only the result is used
+    return *(double*)(int64_t*)(&intRes); // NOTE(review): pointer-cast type punning; confirm the build does not rely on strict aliasing
+#else
+    return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
+#endif
+#endif
+}
+
+void YieldProcessorNormalization::AtomicStore(double *valueRef, double value) // Writes value to *valueRef; 32-bit builds use an interlocked exchange
+{
+    WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+    *valueRef = value; // plain store on 64-bit targets
+#else
+#ifdef FEATURE_NATIVEAOT
+    static_assert(sizeof(int64_t) == sizeof(double), "");
+    PalInterlockedExchange64((int64_t *)valueRef, *(int64_t *)(double*)&value); // NOTE(review): pointer-cast type punning; confirm aliasing rules
+#else
+    InterlockedExchangeT(valueRef, value);
+#endif
+#endif
+}
+