From d7924cc6e3279236ec32c54b0fb6207c9604cbc8 Mon Sep 17 00:00:00 2001 From: Vladimir Sadov Date: Thu, 15 Dec 2022 10:56:15 -0800 Subject: [PATCH] [NativeAOT] Thin locks (#79519) * switch to managed thread ID in Lock * fattening the lock * __declspec(selectany) * few tweaks * fairness * more room for thread ids * remove CurrentNativeThreadId * couple fixes * fix win-arm64 build * win-arm64 build , another try * Apply suggestions from code review Co-authored-by: Jan Kotas * fix after renaming * do not report successful spin if thread has waited * keep extern and undo mangling of tls_CurrentThread in asm * use SyncTable indexer in less perf-sensitive places. * GetNewHashCode just delegate to shared random * Apply suggestions from code review Co-authored-by: Jan Kotas * unchecked const conversion * some refactoring comments and typos * min number of spins in the backoff * moved CurrentManagedThreadIdUnchecked to ManagedThreadId * Use `-1` to report success and allow using element #1 in the SyncTable * use threadstatic for managed thread ID * check before calling RhGetProcessCpuCount * use 0 as default thread ID Co-authored-by: Jan Kotas --- src/coreclr/nativeaot/Runtime/MiscHelpers.cpp | 4 +- src/coreclr/nativeaot/Runtime/thread.cpp | 7 +- src/coreclr/nativeaot/Runtime/thread.h | 5 +- src/coreclr/nativeaot/Runtime/threadstore.cpp | 17 +- src/coreclr/nativeaot/Runtime/threadstore.inl | 7 +- .../SynchronizedMethodHelpers.cs | 38 +- .../src/System/Environment.NativeAot.Unix.cs | 2 - .../System/Environment.NativeAot.Windows.cs | 2 - .../RuntimeHelpers.NativeAot.cs | 13 +- .../src/System/RuntimeExceptionHelpers.cs | 2 +- .../src/System/Threading/Lock.cs | 353 +++++++++----- .../src/System/Threading/ManagedThreadId.cs | 2 + .../src/System/Threading/Monitor.NativeAot.cs | 89 ++-- .../src/System/Threading/ObjectHeader.cs | 435 ++++++++++++++---- .../src/System/Threading/SyncTable.cs | 46 +- .../src/System/Threading/Thread.NativeAot.cs | 28 +- 16 files changed, 744 insertions(+), 306 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp index c385e82792113..9a38973db122a 100644 --- a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp @@ -51,11 +51,11 @@ EXTERN_C NATIVEAOT_API void __cdecl RhSpinWait(int32_t iterations) ASSERT(iterations > 0); // limit the spin count in coop mode. - ASSERT_MSG(iterations <= 10000 || !ThreadStore::GetCurrentThread()->IsCurrentThreadInCooperativeMode(), + ASSERT_MSG(iterations <= 1024 || !ThreadStore::GetCurrentThread()->IsCurrentThreadInCooperativeMode(), "This is too long wait for coop mode. You must p/invoke with GC transition."); YieldProcessorNormalizationInfo normalizationInfo; - YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, iterations); + YieldProcessorNormalized(normalizationInfo, iterations); } // Yield the cpu to another thread ready to process, if one is available. diff --git a/src/coreclr/nativeaot/Runtime/thread.cpp b/src/coreclr/nativeaot/Runtime/thread.cpp index fd70a59aef864..f55a96d9e9d55 100644 --- a/src/coreclr/nativeaot/Runtime/thread.cpp +++ b/src/coreclr/nativeaot/Runtime/thread.cpp @@ -260,7 +260,10 @@ void Thread::Construct() // alloc_context ever needs different initialization, a matching change to the tls_CurrentThread // static initialization will need to be made. - m_uPalThreadIdForLogging = PalGetCurrentThreadIdForLogging(); + m_pTransitionFrame = TOP_OF_STACK_MARKER; + m_pDeferredTransitionFrame = TOP_OF_STACK_MARKER; + m_hPalThread = INVALID_HANDLE_VALUE; + m_threadId.SetToCurrentThread(); HANDLE curProcessPseudo = PalGetCurrentProcess(); @@ -328,7 +331,7 @@ bool Thread::CatchAtSafePoint() uint64_t Thread::GetPalThreadIdForLogging() { - return m_uPalThreadIdForLogging; + return *(uint64_t*)&m_threadId; } bool Thread::IsCurrentThread() diff --git a/src/coreclr/nativeaot/Runtime/thread.h b/src/coreclr/nativeaot/Runtime/thread.h index a83cc952199a4..138f8e998e49a 100644 --- a/src/coreclr/nativeaot/Runtime/thread.h +++ b/src/coreclr/nativeaot/Runtime/thread.h @@ -95,8 +95,7 @@ struct ThreadBuffer PTR_VOID m_pStackLow; PTR_VOID m_pStackHigh; PTR_UInt8 m_pTEB; // Pointer to OS TEB structure for this thread - uint64_t m_uPalThreadIdForLogging; // @TODO: likely debug-only - EEThreadId m_threadId; + EEThreadId m_threadId; // OS thread ID PTR_VOID m_pThreadStressLog; // pointer to head of thread's StressLogChunks NATIVE_CONTEXT* m_interruptedContext; // context for an asynchronously interrupted thread. #ifdef FEATURE_SUSPEND_REDIRECTION @@ -192,7 +191,7 @@ class Thread : private ThreadBuffer gc_alloc_context * GetAllocContext(); // @TODO: I would prefer to not expose this in this way #ifndef DACCESS_COMPILE - uint64_t GetPalThreadIdForLogging(); + uint64_t GetPalThreadIdForLogging(); bool IsCurrentThread(); void GcScanRoots(void * pfnEnumCallback, void * pvCallbackData); diff --git a/src/coreclr/nativeaot/Runtime/threadstore.cpp b/src/coreclr/nativeaot/Runtime/threadstore.cpp index 5b3aaf9b1e29d..edb374ac2442e 100644 --- a/src/coreclr/nativeaot/Runtime/threadstore.cpp +++ b/src/coreclr/nativeaot/Runtime/threadstore.cpp @@ -415,20 +415,9 @@ COOP_PINVOKE_HELPER(void, RhpCancelThreadAbort, (void* thread)) C_ASSERT(sizeof(Thread) == sizeof(ThreadBuffer)); -EXTERN_C DECLSPEC_THREAD ThreadBuffer tls_CurrentThread; -DECLSPEC_THREAD ThreadBuffer tls_CurrentThread = -{ - { 0 }, // m_rgbAllocContextBuffer - Thread::TSF_Unknown, // m_ThreadStateFlags - TOP_OF_STACK_MARKER, // m_pTransitionFrame - TOP_OF_STACK_MARKER, // m_pDeferredTransitionFrame - 0, // m_pCachedTransitionFrame - 0, // m_pNext - INVALID_HANDLE_VALUE, // m_hPalThread - 0, // m_ppvHijackedReturnAddressLocation - 0, // m_pvHijackedReturnAddress - 0, // all other fields are initialized by zeroes -}; +#ifndef _MSC_VER +__thread ThreadBuffer tls_CurrentThread; +#endif EXTERN_C ThreadBuffer* RhpGetThread() { diff --git a/src/coreclr/nativeaot/Runtime/threadstore.inl b/src/coreclr/nativeaot/Runtime/threadstore.inl index aa450b72fb448..29495046a9827 100644 --- a/src/coreclr/nativeaot/Runtime/threadstore.inl +++ b/src/coreclr/nativeaot/Runtime/threadstore.inl @@ -1,7 +1,12 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -EXTERN_C DECLSPEC_THREAD ThreadBuffer tls_CurrentThread; +#ifdef _MSC_VER +// a workaround to prevent tls_CurrentThread from becoming dynamically checked/initialized. +EXTERN_C __declspec(selectany) __declspec(thread) ThreadBuffer tls_CurrentThread; +#else +EXTERN_C __thread ThreadBuffer tls_CurrentThread; +#endif // static inline Thread * ThreadStore::RawGetCurrentThread() diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/Internal/Runtime/CompilerHelpers/SynchronizedMethodHelpers.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/Internal/Runtime/CompilerHelpers/SynchronizedMethodHelpers.cs index b19f4622deaf6..2beff5cecf7e4 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/Internal/Runtime/CompilerHelpers/SynchronizedMethodHelpers.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/Internal/Runtime/CompilerHelpers/SynchronizedMethodHelpers.cs @@ -14,20 +14,33 @@ internal static class SynchronizedMethodHelpers private static void MonitorEnter(object obj, ref bool lockTaken) { // Inlined Monitor.Enter with a few tweaks - Lock lck = Monitor.GetLock(obj); + int resultOrIndex = ObjectHeader.Acquire(obj); + if (resultOrIndex < 0) + { + lockTaken = true; + return; + } + + Lock lck = resultOrIndex == 0 ? + ObjectHeader.GetLockObject(obj) : + SyncTable.GetLockObject(resultOrIndex); + if (lck.TryAcquire(0)) { lockTaken = true; return; } + Monitor.TryAcquireContended(lck, obj, Timeout.Infinite); lockTaken = true; } private static void MonitorExit(object obj, ref bool lockTaken) { // Inlined Monitor.Exit with a few tweaks - if (!lockTaken) return; - Monitor.GetLock(obj).Release(); + if (!lockTaken) + return; + + ObjectHeader.Release(obj); lockTaken = false; } @@ -35,21 +48,34 @@ private static void MonitorEnterStatic(IntPtr pEEType, ref bool lockTaken) { // Inlined Monitor.Enter with a few tweaks object obj = GetStaticLockObject(pEEType); - Lock lck = Monitor.GetLock(obj); + int resultOrIndex = ObjectHeader.Acquire(obj); + if (resultOrIndex < 0) + { + lockTaken = true; + return; + } + + Lock lck = resultOrIndex == 0 ? + ObjectHeader.GetLockObject(obj) : + SyncTable.GetLockObject(resultOrIndex); + if (lck.TryAcquire(0)) { lockTaken = true; return; } + Monitor.TryAcquireContended(lck, obj, Timeout.Infinite); lockTaken = true; } private static void MonitorExitStatic(IntPtr pEEType, ref bool lockTaken) { // Inlined Monitor.Exit with a few tweaks - if (!lockTaken) return; + if (!lockTaken) + return; + object obj = GetStaticLockObject(pEEType); - Monitor.GetLock(obj).Release(); + ObjectHeader.Release(obj); lockTaken = false; } diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Unix.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Unix.cs index d7df4693019c7..98cb722517117 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Unix.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Unix.cs @@ -9,8 +9,6 @@ namespace System { public static partial class Environment { - internal static int CurrentNativeThreadId => ManagedThreadId.Current; - public static long TickCount64 => (long)RuntimeImports.RhpGetTickCount64(); [DoesNotReturn] diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Windows.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Windows.cs index e26fdcca48acb..9aaa5980fc68d 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Windows.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Windows.cs @@ -7,8 +7,6 @@ namespace System { public static partial class Environment { - internal static int CurrentNativeThreadId => unchecked((int)Interop.Kernel32.GetCurrentThreadId()); - public static long TickCount64 => (long)Interop.Kernel32.GetTickCount64(); [DoesNotReturn] diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/CompilerServices/RuntimeHelpers.NativeAot.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/CompilerServices/RuntimeHelpers.NativeAot.cs index 5c6bf96ff7049..5b8a86cee2878 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/CompilerServices/RuntimeHelpers.NativeAot.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/CompilerServices/RuntimeHelpers.NativeAot.cs @@ -94,17 +94,9 @@ public static object GetObjectValue(object? obj) return RuntimeImports.RhCompareObjectContentsAndPadding(o1, o2); } - [ThreadStatic] - private static int t_hashSeed; - internal static int GetNewHashCode() { - int multiplier = Environment.CurrentManagedThreadId * 4 + 5; - // Every thread has its own generator for hash codes so that we won't get into a situation - // where two threads consistently give out the same hash codes. - // Choice of multiplier guarantees period of 2**32 - see Knuth Vol 2 p16 (3.2.1.2 Theorem A). - t_hashSeed = t_hashSeed * multiplier + 1; - return t_hashSeed; + return Random.Shared.Next(); } public static unsafe int GetHashCode(object o) @@ -228,6 +220,9 @@ internal static unsafe ushort GetElementSize(this Array array) internal static unsafe MethodTable* GetMethodTable(this object obj) => obj.m_pEEType; + internal static unsafe ref MethodTable* GetMethodTableRef(this object obj) + => ref obj.m_pEEType; + internal static unsafe EETypePtr GetEETypePtr(this object obj) => new EETypePtr(obj.m_pEEType); diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/RuntimeExceptionHelpers.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/RuntimeExceptionHelpers.cs index 6d146348ee816..465dbe83356a0 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/RuntimeExceptionHelpers.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/RuntimeExceptionHelpers.cs @@ -464,7 +464,7 @@ private static void SerializeExceptionsForDump(Exception currentException, IntPt LowLevelList exceptions = new LowLevelList(curThreadExceptions); LowLevelList nonThrownInnerExceptions = new LowLevelList(); - uint currentThreadId = (uint)Environment.CurrentNativeThreadId; + uint currentThreadId = (uint)Environment.CurrentManagedThreadId; // Reset nesting levels for exceptions on this thread that might not be currently in flight foreach (KeyValuePair item in s_exceptionDataTable) diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Lock.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Lock.cs index fa81ae7a43684..1b2aea5b0653f 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Lock.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Lock.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Diagnostics; +using System.IO; using System.Runtime; using System.Runtime.CompilerServices; @@ -10,17 +11,43 @@ namespace System.Threading [ReflectionBlocked] public sealed class Lock : IDisposable { - // The following constants define characteristics of spinning logic in the Lock class - private const int SpinningNotInitialized = 0; - private const int SpinningDisabled = -1; - private const int MaxSpinningValue = 10000; + // + // This lock is a hybrid spinning/blocking lock with dynamically adjusted spinning. + // On a multiprocessor machine an acquiring thread will try to acquire multiple times + // before going to sleep. The amount of spinning is dynamically adjusted based on past + // history of the lock and will stay in the following range. + // + private const uint MaxSpinLimit = 200; + private const uint MinSpinLimit = 10; + private const uint SpinningNotInitialized = MaxSpinLimit + 1; + private const uint SpinningDisabled = 0; + + // + // We will use exponential backoff in rare cases when we need to change state atomically and cannot + // make progress due to concurrent state changes by other threads. + // While we cannot know the ideal amount of wait needed before making a successfull attempt, + // the exponential backoff will generally be not more than 2X worse than the perfect guess and + // will do a lot less attempts than an simple retry. On multiprocessor machine fruitless attempts + // will cause unnecessary sharing of the contended state which may make modifying the state more expensive. + // To protect against degenerate cases we will cap the per-iteration wait to 1024 spinwaits. + // + private const uint MaxExponentialBackoffBits = 10; + + // + // This lock is unfair and permits acquiring a contended lock by a nonwaiter in the presence of waiters. + // It is possible for one thread to keep holding the lock long enough that waiters go to sleep and + // then release and reacquire fast enough that waiters have no chance to get the lock. + // In extreme cases one thread could keep retaking the lock starving everybody else. + // If we see woken waiters not able to take the lock for too long we will ask nonwaiters to wait. + // + private const uint WaiterWatchdogTicks = 100; // // NOTE: Lock must not have a static (class) constructor, as Lock itself is used to synchronize // class construction. If Lock has its own class constructor, this can lead to infinite recursion. - // All static data in Lock must be pre-initialized. + // All static data in Lock must be lazy-initialized. // - private static int s_maxSpinCount; + internal static int s_processorCount; // // m_state layout: @@ -30,28 +57,38 @@ public sealed class Lock : IDisposable // bit 1: True if we've set the event to wake a waiting thread. The waiter resets this to false when it // wakes up. This avoids the overhead of setting the event multiple times. // + // bit 2: True if nonwaiters must not get ahead of waiters when acquiring a contended lock. + // // everything else: A count of the number of threads waiting on the event. // + private const int Uncontended = 0; private const int Locked = 1; private const int WaiterWoken = 2; - private const int WaiterCountIncrement = 4; + private const int YieldToWaiters = 4; + private const int WaiterCountIncrement = 8; - private const int Uncontended = 0; + // state of the lock + private AutoResetEvent? _lazyEvent; + private int _owningThreadId; + private uint _recursionCount; + private int _state; + private uint _spinLimit = SpinningNotInitialized; + private int _wakeWatchDog; - private volatile int _state; + // used to transfer the state when inflating thin locks + internal void InitializeLocked(int threadId, int recursionCount) + { + Debug.Assert(recursionCount == 0 || threadId != 0); - private uint _recursionCount; - private IntPtr _owningThreadId; - private volatile AutoResetEvent? _lazyEvent; + _state = threadId == 0 ? Uncontended : Locked; + _owningThreadId = threadId; + _recursionCount = (uint)recursionCount; + } private AutoResetEvent Event { get { - // - // Can't use LazyInitializer.EnsureInitialized because Lock needs to stay low level enough - // for the purposes of lazy generic lookups. LazyInitializer uses a generic delegate. - // if (_lazyEvent == null) Interlocked.CompareExchange(ref _lazyEvent, new AutoResetEvent(false), null); @@ -64,27 +101,14 @@ public void Dispose() _lazyEvent?.Dispose(); } - private static IntPtr CurrentNativeThreadId => (IntPtr)RuntimeImports.RhCurrentNativeThreadId(); + private static int CurrentThreadId => Environment.CurrentManagedThreadId; - // On platforms where CurrentNativeThreadId redirects to ManagedThreadId.Current the inlined - // version of Lock.Acquire has the ManagedThreadId.Current call not inlined, while the non-inlined - // version has it inlined. So it saves code to keep this function not inlined while having - // the same runtime cost. [MethodImpl(MethodImplOptions.NoInlining)] public void Acquire() { - IntPtr currentThreadId = CurrentNativeThreadId; - - // - // Make one quick attempt to acquire an uncontended lock - // - if (Interlocked.CompareExchange(ref _state, Locked, Uncontended) == Uncontended) - { - Debug.Assert(_owningThreadId == IntPtr.Zero); - Debug.Assert(_recursionCount == 0); - _owningThreadId = currentThreadId; + int currentThreadId = CurrentThreadId; + if (TryAcquireOneShot(currentThreadId)) return; - } // // Fall back to the slow path for contention @@ -103,14 +127,14 @@ public bool TryAcquire(int millisecondsTimeout, bool trackContentions = false) if (millisecondsTimeout < -1) throw new ArgumentOutOfRangeException(nameof(millisecondsTimeout), SR.ArgumentOutOfRange_NeedNonNegOrNegative1); - IntPtr currentThreadId = CurrentNativeThreadId; + int currentThreadId = CurrentThreadId; // // Make one quick attempt to acquire an uncontended lock // if (Interlocked.CompareExchange(ref _state, Locked, Uncontended) == Uncontended) { - Debug.Assert(_owningThreadId == IntPtr.Zero); + Debug.Assert(_owningThreadId == 0); Debug.Assert(_recursionCount == 0); _owningThreadId = currentThreadId; return true; @@ -122,7 +146,38 @@ public bool TryAcquire(int millisecondsTimeout, bool trackContentions = false) return TryAcquireContended(currentThreadId, millisecondsTimeout, trackContentions); } - private bool TryAcquireContended(IntPtr currentThreadId, int millisecondsTimeout, bool trackContentions = false) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool TryAcquireOneShot(int currentThreadId) + { + // + // Make one quick attempt to acquire an uncontended lock + // + if (Interlocked.CompareExchange(ref _state, Locked, Uncontended) == Uncontended) + { + Debug.Assert(_owningThreadId == 0); + Debug.Assert(_recursionCount == 0); + _owningThreadId = currentThreadId; + return true; + } + + return false; + } + + private static unsafe void ExponentialBackoff(uint iteration) + { + if (iteration > 0) + { + // no need for much randomness here, we will just hash the stack address + iteration. + uint rand = ((uint)&iteration + iteration) * 2654435769u; + // set the highmost bit to ensure minimum number of spins is exponentialy increasing + // that is in case some stack location results in a sequence of very low spin counts + rand |= (1 << 32); + uint spins = rand >> (byte)(32 - Math.Min(iteration, MaxExponentialBackoffBits)); + Thread.SpinWaitInternal((int)spins); + } + } + + private bool TryAcquireContended(int currentThreadId, int millisecondsTimeout, bool trackContentions = false) { // // If we already own the lock, just increment the recursion count. @@ -139,115 +194,137 @@ private bool TryAcquireContended(IntPtr currentThreadId, int millisecondsTimeout if (millisecondsTimeout == 0) return false; - int spins = 1; - - if (s_maxSpinCount == SpinningNotInitialized) + if (_spinLimit == SpinningNotInitialized) { // Use RhGetProcessCpuCount directly to avoid Environment.ProcessorCount->ClassConstructorRunner->Lock->Environment.ProcessorCount cycle - s_maxSpinCount = (RuntimeImports.RhGetProcessCpuCount() > 1) ? MaxSpinningValue : SpinningDisabled; + if (s_processorCount == 0) + s_processorCount = RuntimeImports.RhGetProcessCpuCount(); + + _spinLimit = (s_processorCount > 1) ? MaxSpinLimit : SpinningDisabled; } + bool hasWaited = false; + // we will retry after waking up while (true) { - // - // Try to grab the lock. We may take the lock here even if there are existing waiters. This creates the possibility - // of starvation of waiters, but it also prevents lock convoys from destroying perf. - // The starvation issue is largely mitigated by the priority boost the OS gives to a waiter when we set - // the event, after we release the lock. Eventually waiters will be boosted high enough to preempt this thread. - // - int oldState = _state; - if ((oldState & Locked) == 0 && Interlocked.CompareExchange(ref _state, oldState | Locked, oldState) == oldState) - goto GotTheLock; - - // - // Back off by a factor of 2 for each attempt, up to MaxSpinCount - // - if (spins <= s_maxSpinCount) - { - RuntimeImports.RhSpinWait(spins); - spins *= 2; - } - else if (oldState != 0) + uint iteration = 0; + uint localSpinLimit = _spinLimit; + // inner loop where we try acquiring the lock or registering as a waiter + while (true) { // - // We reached our spin limit, and need to wait. Increment the waiter count. - // Note that we do not do any overflow checking on this increment. In order to overflow, - // we'd need to have about 1 billion waiting threads, which is inconceivable anytime in the - // forseeable future. + // Try to grab the lock. We may take the lock here even if there are existing waiters. This creates the possibility + // of starvation of waiters, but it also prevents lock convoys and preempted waiters from destroying perf. + // However, if we do not see _wakeWatchDog cleared for long enough, we go into YieldToWaiters mode to ensure some + // waiter progress. // - int newState = (oldState + WaiterCountIncrement) & ~WaiterWoken; - if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) - break; - } - } + int oldState = _state; + bool canAcquire = ((oldState & Locked) == 0) && + (hasWaited || ((oldState & YieldToWaiters) == 0)); - // - // Now we wait. - // + if (canAcquire) + { + int newState = oldState | Locked; + if (hasWaited) + newState = (newState - WaiterCountIncrement) & ~WaiterWoken & ~YieldToWaiters; - if (trackContentions) - { - Monitor.IncrementLockContentionCount(); - } + if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) + { + if (hasWaited) + _wakeWatchDog = 0; + else + // if spinning was successful, update spin count + if (iteration < localSpinLimit && localSpinLimit < MaxSpinLimit) + _spinLimit = localSpinLimit + 1; + + // GOT THE LOCK!! + Debug.Assert((_state | Locked) != 0); + Debug.Assert(_owningThreadId == 0); + Debug.Assert(_recursionCount == 0); + _owningThreadId = currentThreadId; + return true; + } + } - TimeoutTracker timeoutTracker = TimeoutTracker.Start(millisecondsTimeout); - AutoResetEvent ev = Event; + // spinning was unsuccessful. reduce spin count. + if (iteration == localSpinLimit && localSpinLimit > MinSpinLimit) + _spinLimit = localSpinLimit - 1; - while (true) - { + if (iteration++ < localSpinLimit) + { + Thread.SpinWaitInternal(1); + continue; + } + else if (!canAcquire) + { + // + // We reached our spin limit, and need to wait. Increment the waiter count. + // Note that we do not do any overflow checking on this increment. In order to overflow, + // we'd need to have about 1 billion waiting threads, which is inconceivable anytime in the + // forseeable future. + // + int newState = oldState + WaiterCountIncrement; + if (hasWaited) + newState = (newState - WaiterCountIncrement) & ~WaiterWoken; + + if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) + break; + } + + Debug.Assert(iteration >= localSpinLimit); + ExponentialBackoff(iteration - localSpinLimit); + } + + // + // Now we wait. + // + + if (trackContentions) + { + Monitor.IncrementLockContentionCount(); + } + + TimeoutTracker timeoutTracker = TimeoutTracker.Start(millisecondsTimeout); + Debug.Assert(_state >= WaiterCountIncrement); + bool waitSucceeded = Event.WaitOne(millisecondsTimeout); Debug.Assert(_state >= WaiterCountIncrement); - bool waitSucceeded = ev.WaitOne(timeoutTracker.Remaining); + if (!waitSucceeded) + break; + + // we did not time out and will try acquiring the lock + hasWaited = true; + millisecondsTimeout = timeoutTracker.Remaining; + } + // We timed out. We're not going to wait again. + { + uint iteration = 0; while (true) { int oldState = _state; Debug.Assert(oldState >= WaiterCountIncrement); - // Clear the "waiter woken" bit. - int newState = oldState & ~WaiterWoken; + int newState = oldState - WaiterCountIncrement; - if ((oldState & Locked) == 0) - { - // The lock is available, try to get it. - newState |= Locked; - newState -= WaiterCountIncrement; + // We could not have consumed a wake, or the wait would've succeeded. + // If we are the last waiter though, we will clear WaiterWoken and YieldToWaiters + // just so that lock would not look like contended. + if (newState < WaiterCountIncrement) + newState = newState & ~WaiterWoken & ~YieldToWaiters; - if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) - goto GotTheLock; - } - else if (!waitSucceeded) - { - // The lock is not available, and we timed out. We're not going to wait agin. - newState -= WaiterCountIncrement; + if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) + return false; - if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) - return false; - } - else - { - // The lock is not available, and we didn't time out. We're going to wait again. - if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) - break; - } + ExponentialBackoff(iteration++); } } - - GotTheLock: - Debug.Assert((_state | Locked) != 0); - Debug.Assert(_owningThreadId == IntPtr.Zero); - Debug.Assert(_recursionCount == 0); - _owningThreadId = currentThreadId; - return true; } public bool IsAcquired { get { - // - // The comment below is for platforms where CurrentNativeThreadId redirects to - // ManagedThreadId.Current instead of being a compiler intrinsic. // // Compare the current owning thread ID with the current thread ID. We need // to read the current thread's ID before we read m_owningThreadId. Otherwise, @@ -264,24 +341,38 @@ public bool IsAcquired // because while we're doing this check the current thread is definitely still // alive. // - IntPtr currentThreadId = CurrentNativeThreadId; - bool acquired = (currentThreadId == _owningThreadId); - if (acquired) - Debug.Assert((_state & Locked) != 0); - return acquired; + int currentThreadId = CurrentThreadId; + return IsAcquiredByThread(currentThreadId); } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool IsAcquiredByThread(int currentThreadId) + { + bool acquired = (currentThreadId == _owningThreadId); + Debug.Assert(!acquired || (_state & Locked) != 0); + return acquired; + } + [MethodImpl(MethodImplOptions.NoInlining)] public void Release() { - if (!IsAcquired) + ReleaseByThread(CurrentThreadId); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void ReleaseByThread(int threadId) + { + if (threadId != _owningThreadId) throw new SynchronizationLockException(); - if (_recursionCount > 0) - _recursionCount--; - else + if (_recursionCount == 0) + { ReleaseCore(); + return; + } + + _recursionCount--; } internal uint ReleaseAll() @@ -300,7 +391,7 @@ internal uint ReleaseAll() private void ReleaseCore() { Debug.Assert(_recursionCount == 0); - _owningThreadId = IntPtr.Zero; + _owningThreadId = 0; // // Make one quick attempt to release an uncontended lock @@ -317,8 +408,9 @@ private void ReleaseCore() private void ReleaseContended() { Debug.Assert(_recursionCount == 0); - Debug.Assert(_owningThreadId == IntPtr.Zero); + Debug.Assert(_owningThreadId == 0); + uint iteration = 0; while (true) { int oldState = _state; @@ -330,8 +422,21 @@ private void ReleaseContended() { // there are waiters, and nobody has woken one. newState |= WaiterWoken; + + int lastWakeTicks = _wakeWatchDog; + if (lastWakeTicks != 0 && Environment.TickCount - lastWakeTicks > 100) + { + newState |= YieldToWaiters; + } + if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) { + if (lastWakeTicks == 0) + { + // nonzero timestamp of the last wake + _wakeWatchDog = Environment.TickCount | 1; + } + Event.Set(); return; } @@ -342,6 +447,8 @@ private void ReleaseContended() if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) return; } + + ExponentialBackoff(iteration++); } } diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ManagedThreadId.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ManagedThreadId.cs index 08872af88871f..38672d2348e5a 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ManagedThreadId.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ManagedThreadId.cs @@ -246,6 +246,8 @@ public static void RecycleId(int id) } } + internal static int CurrentManagedThreadIdUnchecked => t_currentManagedThreadId; + public static int Current { get diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Monitor.NativeAot.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Monitor.NativeAot.cs index f5317ff6d22ed..087745d3b71aa 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Monitor.NativeAot.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Monitor.NativeAot.cs @@ -24,19 +24,8 @@ public static partial class Monitor { #region Object->Lock/Condition mapping - private static ConditionalWeakTable s_conditionTable = new ConditionalWeakTable(); - private static ConditionalWeakTable.CreateValueCallback s_createCondition = (o) => new Condition(GetLock(o)); - - internal static Lock GetLock(object obj) - { - if (obj == null) - throw new ArgumentNullException(nameof(obj)); - - Debug.Assert(!(obj is Lock), - "Do not use Monitor.Enter or TryEnter on a Lock instance; use Lock methods directly instead."); - - return ObjectHeader.GetLockObject(obj); - } + private static readonly ConditionalWeakTable s_conditionTable = new ConditionalWeakTable(); + private static readonly ConditionalWeakTable.CreateValueCallback s_createCondition = (o) => new Condition(ObjectHeader.GetLockObject(o)); private static Condition GetCondition(object obj) { @@ -49,77 +38,98 @@ private static Condition GetCondition(object obj) #region Public Enter/Exit methods + [MethodImpl(MethodImplOptions.NoInlining)] public static void Enter(object obj) { - Lock lck = GetLock(obj); + int resultOrIndex = ObjectHeader.Acquire(obj); + if (resultOrIndex < 0) + return; + + Lock lck = resultOrIndex == 0 ? + ObjectHeader.GetLockObject(obj) : + SyncTable.GetLockObject(resultOrIndex); + if (lck.TryAcquire(0)) return; + TryAcquireContended(lck, obj, Timeout.Infinite); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void Enter(object obj, ref bool lockTaken) { + // we are inlining lockTaken check as the check is likely be optimized away if (lockTaken) throw new ArgumentException(SR.Argument_MustBeFalse, nameof(lockTaken)); - Lock lck = GetLock(obj); - if (lck.TryAcquire(0)) - { - lockTaken = true; - return; - } - TryAcquireContended(lck, obj, Timeout.Infinite); + Enter(obj); lockTaken = true; } + [MethodImpl(MethodImplOptions.NoInlining)] public static bool TryEnter(object obj) { - return GetLock(obj).TryAcquire(0); + int resultOrIndex = ObjectHeader.TryAcquire(obj); + if (resultOrIndex < 0) + return true; + + if (resultOrIndex == 0) + return false; + + Lock lck = SyncTable.GetLockObject(resultOrIndex); + return lck.TryAcquire(0); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void TryEnter(object obj, ref bool lockTaken) { + // we are inlining lockTaken check as the check is likely be optimized away if (lockTaken) throw new ArgumentException(SR.Argument_MustBeFalse, nameof(lockTaken)); - lockTaken = GetLock(obj).TryAcquire(0); + lockTaken = TryEnter(obj); } + [MethodImpl(MethodImplOptions.NoInlining)] public static bool TryEnter(object obj, int millisecondsTimeout) { if (millisecondsTimeout < -1) throw new ArgumentOutOfRangeException(nameof(millisecondsTimeout), SR.ArgumentOutOfRange_NeedNonNegOrNegative1); - Lock lck = GetLock(obj); + int resultOrIndex = ObjectHeader.TryAcquire(obj); + if (resultOrIndex < 0) + return true; + + Lock lck = resultOrIndex == 0 ? + ObjectHeader.GetLockObject(obj) : + SyncTable.GetLockObject(resultOrIndex); + if (lck.TryAcquire(0)) return true; + return TryAcquireContended(lck, obj, millisecondsTimeout); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void TryEnter(object obj, int millisecondsTimeout, ref bool lockTaken) { + // we are inlining lockTaken check as the check is likely be optimized away if (lockTaken) throw new ArgumentException(SR.Argument_MustBeFalse, nameof(lockTaken)); - if (millisecondsTimeout < -1) - throw new ArgumentOutOfRangeException(nameof(millisecondsTimeout), SR.ArgumentOutOfRange_NeedNonNegOrNegative1); - Lock lck = GetLock(obj); - if (lck.TryAcquire(0)) - { - lockTaken = true; - return; - } - lockTaken = TryAcquireContended(lck, obj, millisecondsTimeout); + lockTaken = TryEnter(obj, millisecondsTimeout); } + [MethodImpl(MethodImplOptions.NoInlining)] public static void Exit(object obj) { - GetLock(obj).Release(); + ObjectHeader.Release(obj); } + [MethodImpl(MethodImplOptions.NoInlining)] public static bool IsEntered(object obj) { - return GetLock(obj).IsAcquired; + return ObjectHeader.IsAcquired(obj); } #endregion @@ -192,20 +202,19 @@ private enum DebugBlockingItemType } // Represents an item a thread is blocked on. This structure is allocated on the stack and accessed by the debugger. - // Fields are volatile to avoid potential compiler optimizations. private struct DebugBlockingItem { // The object the thread is waiting on - public volatile object _object; + public object _object; // Indicates how the thread is blocked on the item - public volatile DebugBlockingItemType _blockingType; + public DebugBlockingItemType _blockingType; // Blocking timeout in milliseconds or Timeout.Infinite for no timeout - public volatile int _timeout; + public int _timeout; // Next pointer in the linked list of DebugBlockingItem records - public volatile IntPtr _next; + public IntPtr _next; } private unsafe struct DebugBlockingScope : IDisposable diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ObjectHeader.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ObjectHeader.cs index c9c5d5c0bc1cf..0de12219eb913 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ObjectHeader.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ObjectHeader.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using Internal.Runtime; using System.Diagnostics; using System.Runtime.CompilerServices; @@ -16,39 +17,36 @@ namespace System.Threading /// internal static class ObjectHeader { - // The following two header bits are used by the GC engine: + // The following three header bits are reserved for the GC engine: + // BIT_SBLK_UNUSED = 0x80000000 // BIT_SBLK_FINALIZER_RUN = 0x40000000 // BIT_SBLK_GC_RESERVE = 0x20000000 // // All other bits may be used to store runtime data: hash code, sync entry index, etc. // Here we use the same bit layout as in CLR: if bit 26 (BIT_SBLK_IS_HASHCODE) is set, // all the lower bits 0..25 store the hash code, otherwise they store either the sync - // entry index or all zero. - // - // If needed, the MASK_HASHCODE_INDEX bit mask may be made wider or narrower than the - // current 26 bits; the BIT_SBLK_IS_HASHCODE bit is not required to be adjacent to the - // mask. The code only assumes that MASK_HASHCODE_INDEX occupies the lowest bits of the - // header (i.e. ends with bit 0) and that (MASK_HASHCODE_INDEX + 1) does not overflow - // the Int32 type (i.e. the mask may be no longer than 30 bits). - + // entry index (indicated by BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) or thin lock data. private const int IS_HASHCODE_BIT_NUMBER = 26; + private const int IS_HASH_OR_SYNCBLKINDEX_BIT_NUMBER = 27; private const int BIT_SBLK_IS_HASHCODE = 1 << IS_HASHCODE_BIT_NUMBER; internal const int MASK_HASHCODE_INDEX = BIT_SBLK_IS_HASHCODE - 1; + private const int BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX = 1 << IS_HASH_OR_SYNCBLKINDEX_BIT_NUMBER; + + // if BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX is clear, the rest of the header dword is laid out as follows: + // - lower sixteen bits (bits 0 thru 15) is thread id used for the thin locks + // value is zero if no thread is holding the lock + // - following six bits (bits 16 thru 21) is recursion level used for the thin locks + // value is zero if lock is not taken or only taken once by the same thread + private const int SBLK_MASK_LOCK_THREADID = 0x0000FFFF; // special value of 0 + 65535 thread ids + private const int SBLK_MASK_LOCK_RECLEVEL = 0x003F0000; // 64 recursion levels + private const int SBLK_LOCK_RECLEVEL_INC = 0x00010000; // each level is this much higher than the previous one + private const int SBLK_RECLEVEL_SHIFT = 16; // shift right this much to get recursion level -#if TARGET_ARM || TARGET_ARM64 - [MethodImpl(MethodImplOptions.NoInlining)] -#else [MethodImpl(MethodImplOptions.AggressiveInlining)] -#endif - public static unsafe int ReadVolatileMemory(int* pHeader) + private static unsafe int* GetHeaderPtr(MethodTable** ppMethodTable) { - // While in x86/amd64 Volatile.Read is cheap, in arm we have to pay the - // cost of a barrier. We do no inlining to get around that. -#if TARGET_ARM || TARGET_ARM64 - return *pHeader; -#else - return Volatile.Read(ref *pHeader); -#endif + // The header is 4 bytes before m_pEEType field on all architectures + return (int*)ppMethodTable - 1; } /// @@ -58,16 +56,12 @@ public static unsafe int ReadVolatileMemory(int* pHeader) public static unsafe int GetHashCode(object o) { if (o == null) - { return 0; - } - fixed (byte* pRawData = &o.GetRawData()) + fixed (MethodTable** ppMethodTable = &o.GetMethodTableRef()) { - // The header is 4 bytes before m_pEEType field on all architectures - int* pHeader = (int*)(pRawData - sizeof(IntPtr) - sizeof(int)); - - int bits = ReadVolatileMemory(pHeader); + int* pHeader = GetHeaderPtr(ppMethodTable); + int bits = *pHeader; int hashOrIndex = bits & MASK_HASHCODE_INDEX; if ((bits & BIT_SBLK_IS_HASHCODE) != 0) { @@ -75,7 +69,8 @@ public static unsafe int GetHashCode(object o) Debug.Assert(hashOrIndex != 0); return hashOrIndex; } - if (hashOrIndex != 0) + + if ((bits & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) != 0) { // Look up the hash code in the SyncTable int hashCode = SyncTable.GetHashCode(hashOrIndex); @@ -84,19 +79,18 @@ public static unsafe int GetHashCode(object o) return hashCode; } } + // The hash code has not yet been set. Assign some value. - return AssignHashCode(pHeader); + return AssignHashCode(o, pHeader); } } /// /// Assigns a hash code to the object in a thread-safe way. /// - private static unsafe int AssignHashCode(int* pHeader) + private static unsafe int AssignHashCode(object o, int* pHeader) { int newHash = RuntimeHelpers.GetNewHashCode() & MASK_HASHCODE_INDEX; - int bitAndValue; - // Never use the zero hash code. SyncTable treats the zero value as "not assigned". if (newHash == 0) { @@ -105,73 +99,82 @@ private static unsafe int AssignHashCode(int* pHeader) while (true) { - int oldBits = Volatile.Read(ref *pHeader); - bitAndValue = oldBits & (BIT_SBLK_IS_HASHCODE | MASK_HASHCODE_INDEX); - if (bitAndValue != 0) + int oldBits = *pHeader; + + // if have hashcode, just return it + if ((oldBits & BIT_SBLK_IS_HASHCODE) != 0) + { + // Found the hash code in the header + int h = oldBits & MASK_HASHCODE_INDEX; + Debug.Assert(h != 0); + return h; + } + + // if have something else, break, we need a syncblock. + if ((oldBits & MASK_HASHCODE_INDEX) != 0) { - // The header already stores some value break; } - // The header stores nothing. Try to store the hash code. - int newBits = oldBits | BIT_SBLK_IS_HASHCODE | newHash; + // there is nothing - try set hashcode inline + Debug.Assert((oldBits & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) == 0); + int newBits = BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX | BIT_SBLK_IS_HASHCODE | oldBits | newHash; if (Interlocked.CompareExchange(ref *pHeader, newBits, oldBits) == oldBits) { return newHash; } - // Another thread modified the header; try again + // contention, try again } - if ((bitAndValue & BIT_SBLK_IS_HASHCODE) == 0) + if (!GetSyncEntryIndex(*pHeader, out int syncIndex)) { - // Set the hash code in SyncTable. This call will resolve the potential race. - return SyncTable.SetHashCode(bitAndValue, newHash); + // Assign a new sync entry + syncIndex = SyncTable.AssignEntry(o, pHeader); } - // Another thread set the hash code, use it - Debug.Assert((bitAndValue & ~BIT_SBLK_IS_HASHCODE) != 0); - return bitAndValue & ~BIT_SBLK_IS_HASHCODE; + // Set the hash code in SyncTable. This call will resolve the potential race. + return SyncTable.SetHashCode(syncIndex, newHash); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool HasSyncEntryIndex(int header) + { + return (header & (BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX | BIT_SBLK_IS_HASHCODE)) == BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX; } /// /// Extracts the sync entry index or the hash code from the header value. Returns true /// if the header value stores the sync entry index. /// - // Inlining is important for lock performance [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool GetSyncEntryIndex(int header, out int hashOrIndex) + public static bool GetSyncEntryIndex(int header, out int index) { - hashOrIndex = header & MASK_HASHCODE_INDEX; - // The following is equivalent to: - // return (hashOrIndex != 0) && ((header & BIT_SBLK_IS_HASHCODE) == 0); - // Shifting the BIT_SBLK_IS_HASHCODE bit to the sign bit saves one branch. - int bitAndValue = header & (BIT_SBLK_IS_HASHCODE | MASK_HASHCODE_INDEX); - return (bitAndValue << (31 - IS_HASHCODE_BIT_NUMBER)) > 0; + index = header & MASK_HASHCODE_INDEX; + return HasSyncEntryIndex(header); } /// /// Returns the Monitor synchronization object assigned to this object. If no synchronization /// object has yet been assigned, it assigns one in a thread-safe way. /// - // Called from Monitor.Enter only; inlining is important for lock performance - [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Lock GetLockObject(object o) { - fixed (byte* pRawData = &o.GetRawData()) - { - // The header is 4 bytes before m_pEEType field on all architectures - int* pHeader = (int*)(pRawData - sizeof(IntPtr) - sizeof(int)); + return SyncTable.GetLockObject(GetSyncIndex(o)); + } - if (GetSyncEntryIndex(ReadVolatileMemory(pHeader), out int hashOrIndex)) + private static unsafe int GetSyncIndex(object o) + { + fixed (MethodTable** ppMethodTable = &o.GetMethodTableRef()) + { + int* pHeader = GetHeaderPtr(ppMethodTable); + if (GetSyncEntryIndex(*pHeader, out int syncIndex)) { - // Already have a sync entry for this object, return the synchronization object - // stored in the entry. - return SyncTable.GetLockObject(hashOrIndex); + return syncIndex; } + // Assign a new sync entry - int syncIndex = SyncTable.AssignEntry(o, pHeader); - return SyncTable.GetLockObject(syncIndex); + return SyncTable.AssignEntry(o, pHeader); } } @@ -183,32 +186,304 @@ public static unsafe void SetSyncEntryIndex(int* pHeader, int syncIndex) // Holding this lock implies there is at most one thread setting the sync entry index at // any given time. We also require that the sync entry index has not been already set. Debug.Assert(SyncTable.s_lock.IsAcquired); - Debug.Assert((syncIndex & MASK_HASHCODE_INDEX) == syncIndex); - int oldBits, newBits, hashOrIndex; + int oldBits, newBits; do { - oldBits = Volatile.Read(ref *pHeader); - newBits = oldBits; + oldBits = *pHeader; + // we should not have a sync index yet. + Debug.Assert(!HasSyncEntryIndex(oldBits)); - if (GetSyncEntryIndex(oldBits, out hashOrIndex)) + if ((oldBits & BIT_SBLK_IS_HASHCODE) != 0) { - // Must not get here; see the contract - throw new InvalidOperationException(); + // set the hash code in the sync entry + SyncTable.MoveHashCodeToNewEntry(syncIndex, oldBits & MASK_HASHCODE_INDEX); + // reset the lock info, in case we have set it in the previous iteration + SyncTable.MoveThinLockToNewEntry(syncIndex, 0, 0); } - - Debug.Assert(((oldBits & BIT_SBLK_IS_HASHCODE) == 0) || (hashOrIndex != 0)); - if (hashOrIndex != 0) + else { - // Move the hash code to the sync entry - SyncTable.MoveHashCodeToNewEntry(syncIndex, hashOrIndex); + // set the lock info + SyncTable.MoveThinLockToNewEntry( + syncIndex, + oldBits & SBLK_MASK_LOCK_THREADID, + (oldBits & SBLK_MASK_LOCK_RECLEVEL) >> SBLK_RECLEVEL_SHIFT); } // Store the sync entry index - newBits &= ~(BIT_SBLK_IS_HASHCODE | MASK_HASHCODE_INDEX); - newBits |= syncIndex; + newBits = oldBits & ~(BIT_SBLK_IS_HASHCODE | MASK_HASHCODE_INDEX); + newBits |= syncIndex | BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX; } while (Interlocked.CompareExchange(ref *pHeader, newBits, oldBits) != oldBits); } + + // + // A few words about spinning choices: + // + // Most locks are not contentious. In fact most locks provide exclusive access safety, but in reality are used by + // one thread. And then there are locks that do see multiple threads, but the accesses are short and not overlapping. + // Thin lock is an optimization for such scenarios. + // + // If we see a thin lock held by other thread for longer than ~5 microseconds, we will "inflate" the lock + // and let the adaptive spinning in the fat Lock sort it out whether we have a contentious lock or a long-held lock. + // + // Another thing to consider is that SpinWait(1) is calibrated to about 35-50 nanoseconds. + // It can take much longer only if nop/pause takes much longer, which it should not, as that would be getting + // close to the RAM latency. + // + // Considering that taking and releasing the lock takes 2 CAS instructions + some overhead, we can estimate shortest + // time the lock can be held to be in hundreds of nanoseconds. Thus it is unlikely to see more than + // 8-10 threads contending for the lock without inflating it. Therefore we can expect to acquire a thin lock in + // under 16 tries. + // + // As for the backoff strategy we have two choices: + // Exponential back-off with a lmit: + // 0, 1, 2, 4, 8, 8, 8, 8, 8, 8, 8, . . . . + // + // Linear back-off + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, . . . . + // + // In this case these strategies are close in terms of average and worst case latency, so we will prefer linear + // back-off as it favors micro-contention scenario, which we expect. + // + + // Returs: + // -1 - success + // 0 - failed + // syncIndex - retry with the Lock + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe int Acquire(object obj) + { + return TryAcquire(obj, oneShot: false); + } + + // -1 - success + // 0 - failed + // syncIndex - retry with the Lock + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe int TryAcquire(object obj, bool oneShot = true) + { + ArgumentNullException.ThrowIfNull(obj); + + Debug.Assert(!(obj is Lock), + "Do not use Monitor.Enter or TryEnter on a Lock instance; use Lock methods directly instead."); + + int currentThreadID = ManagedThreadId.CurrentManagedThreadIdUnchecked; + + // if thread ID is uninitialized or too big, we do "uncommon" part. + if ((uint)(currentThreadID - 1) <= (uint)SBLK_MASK_LOCK_THREADID) + { + // for an object used in locking there are two common cases: + // - header bits are unused or + // - there is a sync entry + fixed (MethodTable** ppMethodTable = &obj.GetMethodTableRef()) + { + int* pHeader = GetHeaderPtr(ppMethodTable); + int oldBits = *pHeader; + // if unused for anything, try setting our thread id + // N.B. hashcode, thread ID and sync index are never 0, and hashcode is largest of all + if ((oldBits & MASK_HASHCODE_INDEX) == 0) + { + if (Interlocked.CompareExchange(ref *pHeader, oldBits | currentThreadID, oldBits) == oldBits) + { + return -1; + } + } + else if (GetSyncEntryIndex(oldBits, out int syncIndex)) + { + if (SyncTable.GetLockObject(syncIndex).TryAcquireOneShot(currentThreadID)) + { + return -1; + } + + // has sync entry -> slow path + return syncIndex; + } + } + } + + return TryAcquireUncommon(obj, oneShot); + } + + // handling uncommon cases here - recursive lock, contention, retries + // -1 - success + // 0 - failed + // syncIndex - retry with the Lock + private static unsafe int TryAcquireUncommon(object obj, bool oneShot) + { + // does thread ID fit? + int currentThreadID = Environment.CurrentManagedThreadId; + if (currentThreadID > SBLK_MASK_LOCK_THREADID) + return GetSyncIndex(obj); + + // Lock.s_processorCount is lazy-initialized at fist contended acquire + // untill then it is 0 and we assume we have multicore machine + int retries = oneShot || Lock.s_processorCount == 1 ? 0 : 16; + + // retry when the lock is owned by somebody else. + // this loop will spinwait between iterations. + for (int i = 0; i <= retries; i++) + { + fixed (MethodTable** ppMethodTable = &obj.GetMethodTableRef()) + { + int* pHeader = GetHeaderPtr(ppMethodTable); + + // rare retries when lock is not owned by somebody else. + // these do not count as iterations and do not spinwait. + while (true) + { + int oldBits = *pHeader; + + // if unused for anything, try setting our thread id + // N.B. hashcode, thread ID and sync index are never 0, and hashcode is largest of all + if ((oldBits & MASK_HASHCODE_INDEX) == 0) + { + int newBits = oldBits | currentThreadID; + if (Interlocked.CompareExchange(ref *pHeader, newBits, oldBits) == oldBits) + { + return -1; + } + + // contention on a lock that noone owned, + // but we do not know if there is an owner yet, so try again + continue; + } + + // has sync entry -> slow path + if (GetSyncEntryIndex(oldBits, out int syncIndex)) + { + return syncIndex; + } + + // has hashcode -> slow path + if ((oldBits & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) != 0) + { + return SyncTable.AssignEntry(obj, pHeader); + } + + // we own the lock already + if ((oldBits & SBLK_MASK_LOCK_THREADID) == currentThreadID) + { + // try incrementing recursion level, check for overflow + int newBits = oldBits + SBLK_LOCK_RECLEVEL_INC; + if ((newBits & SBLK_MASK_LOCK_RECLEVEL) != 0) + { + if (Interlocked.CompareExchange(ref *pHeader, newBits, oldBits) == oldBits) + { + return -1; + } + + // rare contention on owned lock, + // perhaps hashcode was installed or finalization bits were touched. + // we still own the lock though and may be able to increment, try again + continue; + } + else + { + // overflow, transition to a fat Lock + return SyncTable.AssignEntry(obj, pHeader); + } + } + + // someone else owns. + break; + } + } + + // spin a bit before retrying (1 spinwait is roughly 35 nsec) + // the object is not pinned here + Thread.SpinWaitInternal(i); + } + + // owned by somebody else + return 0; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void Release(object obj) + { + ArgumentNullException.ThrowIfNull(obj); + + Debug.Assert(!(obj is Lock), + "Do not use Monitor.Enter or TryEnter on a Lock instance; use Lock methods directly instead."); + + int currentThreadID = ManagedThreadId.CurrentManagedThreadIdUnchecked; + // transform uninitialized ID into -1, so it will not match any possible lock owner + currentThreadID |= (currentThreadID - 1) >> 31; + + Lock fatLock; + fixed (MethodTable** ppMethodTable = &obj.GetMethodTableRef()) + { + int* pHeader = GetHeaderPtr(ppMethodTable); + while (true) + { + int oldBits = *pHeader; + + // if we own the lock + if ((oldBits & SBLK_MASK_LOCK_THREADID) == currentThreadID && + (oldBits & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) == 0) + { + // decrement count or release entirely. + int newBits = (oldBits & SBLK_MASK_LOCK_RECLEVEL) != 0 ? + oldBits - SBLK_LOCK_RECLEVEL_INC : + oldBits & ~SBLK_MASK_LOCK_THREADID; + + if (Interlocked.CompareExchange(ref *pHeader, newBits, oldBits) == oldBits) + { + return; + } + + // rare contention on owned lock, + // we still own the lock, try again + continue; + } + + if (!GetSyncEntryIndex(oldBits, out int syncIndex)) + { + // someone else owns or noone. + throw new SynchronizationLockException(); + } + + fatLock = SyncTable.GetLockObject(syncIndex); + break; + } + } + + fatLock.ReleaseByThread(currentThreadID); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe bool IsAcquired(object obj) + { + ArgumentNullException.ThrowIfNull(obj); + + Debug.Assert(!(obj is Lock), + "Do not use Monitor.Enter or TryEnter on a Lock instance; use Lock methods directly instead."); + + int currentThreadID = ManagedThreadId.CurrentManagedThreadIdUnchecked; + // transform uninitialized ID into -1, so it will not match any possible lock owner + currentThreadID |= (currentThreadID - 1) >> 31; + + fixed (MethodTable** ppMethodTable = &obj.GetMethodTableRef()) + { + int* pHeader = GetHeaderPtr(ppMethodTable); + int oldBits = *pHeader; + + // if we own the lock + if ((oldBits & SBLK_MASK_LOCK_THREADID) == currentThreadID && + (oldBits & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) == 0) + { + return true; + } + + if (GetSyncEntryIndex(oldBits, out int syncIndex)) + { + return SyncTable.GetLockObject(syncIndex).IsAcquiredByThread(currentThreadID); + } + + // someone else owns or noone. + return false; + } + } } } diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/SyncTable.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/SyncTable.cs index e531323d62a81..de07c986834ec 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/SyncTable.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/SyncTable.cs @@ -4,6 +4,7 @@ using System.Diagnostics; using System.Runtime; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; namespace System.Threading { @@ -62,9 +63,9 @@ internal static class SyncTable private const int DoublingSizeThreshold = 1 << 20; /// - /// Protects all mutable operations on s_entrie, s_freeEntryList, s_unusedEntryIndex. Also protects growing the table. + /// Protects all mutable operations on s_entries, s_freeEntryList, s_unusedEntryIndex. Also protects growing the table. /// - internal static Lock s_lock = new Lock(); + internal static readonly Lock s_lock = new Lock(); /// /// The dynamically growing array of sync entries. @@ -83,6 +84,16 @@ internal static class SyncTable /// private static int s_unusedEntryIndex = 1; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ref Entry UnsafeEntryRef(int i) + { +#if DEBUG + return ref s_entries[i]; +#else + return ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(s_entries), i); +#endif + } + /// /// Assigns a sync table entry to the object in a thread-safe way. /// @@ -98,12 +109,11 @@ public static unsafe int AssignEntry(object obj, int* pHeader) using (LockHolder.Hold(s_lock)) { // After acquiring the lock check whether another thread already assigned the sync entry - if (ObjectHeader.GetSyncEntryIndex(*pHeader, out int hashOrIndex)) + if (ObjectHeader.GetSyncEntryIndex(*pHeader, out int syncIndex)) { - return hashOrIndex; + return syncIndex; } - int syncIndex; if (s_freeEntryList != 0) { // Grab a free entry from the list @@ -219,7 +229,7 @@ public static int GetHashCode(int syncIndex) // This thread may be looking at an old version of s_entries. If the old version had // no hash code stored, GetHashCode returns zero and the subsequent SetHashCode call // will resolve the potential race. - return s_entries[syncIndex].HashCode; + return UnsafeEntryRef(syncIndex).HashCode; } /// @@ -255,6 +265,18 @@ public static void MoveHashCodeToNewEntry(int syncIndex, int hashCode) s_entries[syncIndex].HashCode = hashCode; } + /// + /// Initializes the Lock assuming the caller holds s_lock. Use for not yet + /// published entries only. + /// + public static void MoveThinLockToNewEntry(int syncIndex, int threadId, int recursionLevel) + { + Debug.Assert(s_lock.IsAcquired); + Debug.Assert((0 < syncIndex) && (syncIndex < s_unusedEntryIndex)); + + s_entries[syncIndex].Lock.InitializeLocked(threadId, recursionLevel); + } + /// /// Returns the Monitor synchronization object. The return value is never null. /// @@ -262,7 +284,7 @@ public static Lock GetLockObject(int syncIndex) { // Note that we do not take a lock here. When we replace s_entries, we preserve all // indices and Lock references. - return s_entries[syncIndex].Lock; + return UnsafeEntryRef(syncIndex).Lock; } private sealed class DeadEntryCollector @@ -281,7 +303,7 @@ public DeadEntryCollector() return; Lock? lockToDispose = default; - DependentHandle dependentHadleToDispose = default; + DependentHandle dependentHandleToDispose = default; using (LockHolder.Hold(s_lock)) { @@ -294,7 +316,7 @@ public DeadEntryCollector() return; } - dependentHadleToDispose = entry.Owner; + dependentHandleToDispose = entry.Owner; entry.Owner = default; lockToDispose = entry.Lock; @@ -305,7 +327,7 @@ public DeadEntryCollector() } // Dispose outside the lock - dependentHadleToDispose.Dispose(); + dependentHandleToDispose.Dispose(); lockToDispose?.Dispose(); } } @@ -331,6 +353,10 @@ private struct Entry /// public DependentHandle Owner; + // Unused field to make the entry even number of words. + // we will likely put something in here eventually. + internal nint _unusedPadding; + /// /// For entries in use, this property gets or sets the hash code of the owner object. /// The zero value indicates the hash code has not yet been assigned. diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Thread.NativeAot.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Thread.NativeAot.cs index 543ba774c39a9..7c46ced230300 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Thread.NativeAot.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Thread.NativeAot.cs @@ -320,9 +320,22 @@ public bool Join(int millisecondsTimeout) /// appropriate for the processor. /// TODO: See issue https://github.com/dotnet/corert/issues/4430 /// - internal const int OptimalMaxSpinWaitsPerSpinIteration = 64; + internal const int OptimalMaxSpinWaitsPerSpinIteration = 8; - [MethodImpl(MethodImplOptions.NoInlining)] + // Max iterations to be done in RhSpinWait. + // RhSpinWait does not switch GC modes and we want to avoid native spinning in coop mode for too long. + private const int SpinWaitCoopThreshold = 1024; + + internal static void SpinWaitInternal(int iterations) + { + Debug.Assert(iterations <= SpinWaitCoopThreshold); + if (iterations > 0) + { + RuntimeImports.RhSpinWait(iterations); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] // Slow path method. Make sure that the caller frame does not pay for PInvoke overhead. private static void LongSpinWait(int iterations) { RuntimeImports.RhLongSpinWait(iterations); @@ -330,20 +343,13 @@ private static void LongSpinWait(int iterations) public static void SpinWait(int iterations) { - if (iterations <= 0) - return; - - // Max iterations to be done in RhSpinWait. - // RhSpinWait does not switch GC modes and we want to avoid native spinning in coop mode for too long. - const int spinWaitCoopThreshold = 10000; - - if (iterations > spinWaitCoopThreshold) + if (iterations > SpinWaitCoopThreshold) { LongSpinWait(iterations); } else { - RuntimeImports.RhSpinWait(iterations); + SpinWaitInternal(iterations); } }