[NativeAOT] A few scalability fixes for the fat Lock #88633

Merged · 1 commit · Aug 2, 2023
@@ -25,13 +25,7 @@ private static void MonitorEnter(object obj, ref bool lockTaken)
ObjectHeader.GetLockObject(obj) :
SyncTable.GetLockObject(resultOrIndex);

if (lck.TryAcquire(0))
@VSadov (Member Author), Jul 10, 2023:

This acquire attempt is unnecessary and is also too soon. It was probably here to catch "unusual" cases like recursive locking, because TryAcquire(0) would defer internally to the slow path for one iteration. Calling the slow path directly is simpler, though, and just as good.

{
lockTaken = true;
return;
}

Monitor.TryAcquireContended(lck, obj, Timeout.Infinite);
@VSadov (Member Author), Jul 10, 2023:

I've renamed TryAcquireContended -> TryAcquireSlow. The old name was deceiving: this method does not just handle contention, it handles all possible scenarios, while the initial quick-acquire attempts can handle only a few common cases.

Monitor.TryAcquireSlow(lck, obj, Timeout.Infinite);
lockTaken = true;
}
private static void MonitorExit(object obj, ref bool lockTaken)
@@ -59,13 +53,7 @@ private static unsafe void MonitorEnterStatic(MethodTable* pMT, ref bool lockTaken)
ObjectHeader.GetLockObject(obj) :
SyncTable.GetLockObject(resultOrIndex);

if (lck.TryAcquire(0))
{
lockTaken = true;
return;
}

Monitor.TryAcquireContended(lck, obj, Timeout.Infinite);
Monitor.TryAcquireSlow(lck, obj, Timeout.Infinite);
lockTaken = true;
}
private static unsafe void MonitorExitStatic(MethodTable* pMT, ref bool lockTaken)
@@ -16,10 +16,20 @@ public sealed class Lock : IDisposable
// before going to sleep. The amount of spinning is dynamically adjusted based on past
// history of the lock and will stay in the following range.
//
private const uint MaxSpinLimit = 200;
private const uint MinSpinLimit = 10;
private const uint SpinningNotInitialized = MaxSpinLimit + 1;
private const uint SpinningDisabled = 0;
// We use doubling-up delays with a cap while spinning (1,2,4,8,16,32,64,64,64,64, ...)
// Thus 20 iterations is about 1000 spinwaits (20-50 ns each)
// Context switch costs may vary and are typically in the 2-20 usec range
// Even if we are the only thread trying to acquire the lock, at 20-50 usec the cost of being
// blocked+awakened may be no more than 2x of what we have already spent, so that is the max CPU time
// that we will allow to burn while spinning.
//
// This may not always be optimal, but should be close enough.
// E.g. in a system consisting of exactly 2 threads, unlimited spinning may work better, but we
// will not optimize specifically for that.
private const ushort MaxSpinLimit = 20;
private const ushort MinSpinLimit = 3;
private const ushort SpinningNotInitialized = MaxSpinLimit + 1;
private const ushort SpinningDisabled = 0;

//
// We will use exponential backoff in rare cases when we need to change state atomically and cannot
@@ -71,8 +81,8 @@ public sealed class Lock : IDisposable
private int _owningThreadId;
private uint _recursionCount;
private int _state;
private uint _spinLimit = SpinningNotInitialized;
private int _wakeWatchDog;
private ushort _spinLimit = SpinningNotInitialized;
private short _wakeWatchDog;

// used to transfer the state when inflating thin locks
internal void InitializeLocked(int threadId, int recursionCount)
@@ -112,7 +122,7 @@ public void Acquire()
//
// Fall back to the slow path for contention
//
bool success = TryAcquireContended(currentThreadId, Timeout.Infinite);
bool success = TryAcquireSlow(currentThreadId, Timeout.Infinite);
Debug.Assert(success);
}

@@ -121,36 +131,48 @@ public bool TryAcquire(TimeSpan timeout)
return TryAcquire(WaitHandle.ToTimeoutMilliseconds(timeout));
}

public bool TryAcquire(int millisecondsTimeout, bool trackContentions = false)
public bool TryAcquire(int millisecondsTimeout)
{
ArgumentOutOfRangeException.ThrowIfLessThan(millisecondsTimeout, -1);

int currentThreadId = CurrentThreadId;
if (TryAcquireOneShot(currentThreadId))
return true;

//
// Fall back to the slow path for contention
//
return TryAcquireSlow(currentThreadId, millisecondsTimeout, trackContentions: false);
}

internal bool TryAcquireNoSpin()
{
//
// Make one quick attempt to acquire an uncontended lock
//
if (Interlocked.CompareExchange(ref _state, Locked, Uncontended) == Uncontended)
{
Debug.Assert(_owningThreadId == 0);
Debug.Assert(_recursionCount == 0);
_owningThreadId = currentThreadId;
int currentThreadId = CurrentThreadId;
if (TryAcquireOneShot(currentThreadId))
return true;
}

//
// Fall back to the slow path for contention
// If we already own the lock, just increment the recursion count.
//
return TryAcquireContended(currentThreadId, millisecondsTimeout, trackContentions);
if (_owningThreadId == currentThreadId)
{
checked { _recursionCount++; }
return true;
}

return false;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool TryAcquireOneShot(int currentThreadId)
{
//
// Make one quick attempt to acquire an uncontended lock
//
if (Interlocked.CompareExchange(ref _state, Locked, Uncontended) == Uncontended)
int origState = _state;
int expectedState = origState & ~(YieldToWaiters | Locked);
int newState = origState | Locked;
if (Interlocked.CompareExchange(ref _state, newState, expectedState) == expectedState)
{
Debug.Assert(_owningThreadId == 0);
Debug.Assert(_recursionCount == 0);
@@ -169,13 +191,14 @@ private static unsafe void ExponentialBackoff(uint iteration)
uint rand = ((uint)&iteration + iteration) * 2654435769u;
// set the highest bit to ensure the minimum number of spins is exponentially increasing
// that is in case some stack location results in a sequence of very low spin counts
rand |= (1 << 32);
// it basically guarantees that we spin at least 1, 2, 4, 8, 16 times, and so on
rand |= (1u << 31);
Member:

Not relevant to this particular change, but why is this additional randomization necessary? Given that the upper bit is set, it seems the randomization effect would be minuscule, especially at higher iterations. To me the added complication doesn't seem necessary.

@VSadov (Member Author), Aug 2, 2023:

> Given that the upper bit is set, it seems the randomization effect would be minuscule, especially at higher iterations.

With upper bit set and other bits random, the range of possible waits should still be ~50% of the minimum for the given iteration.

The cost of this "randomization" is basically free next to the cost of occupying the core even for one spinwait cycle. It is a cheap insurance against threads arriving at the same time or synchronizing with each other via effects of cache misses.

Some scenarios/environments may not need this, and those that do may need it less after this PR, since we are trying to ensure that we do not see too much contentious spinning in the first place.
It is hard to prove that there will be no benefit, though, since this code will run on a very wide range of hardware. So nearly free insurance still seems good to have.
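A standalone sketch of the arithmetic under discussion (not the PR's code): it assumes MaxExponentialBackoffBits is 10, as implied by the 2^10 discussion below, and substitutes a seeded Random for the stack-address entropy.

```csharp
using System;

class BackoffSketch
{
    const int MaxExponentialBackoffBits = 10; // assumed to mirror the PR's constant

    static uint Spins(uint iteration, uint rand)
    {
        rand |= (1u << 31); // pin the top bit so the minimum doubles each iteration
        return rand >> (byte)(32 - Math.Min(iteration, (uint)MaxExponentialBackoffBits));
    }

    static void Main()
    {
        var rng = new Random(42); // stands in for the stack-address trick
        for (uint i = 1; i <= 6; i++)
        {
            // for iteration i the wait lands in [2^(i-1), 2^i), so the jitter spans
            // up to ~2x the minimum and keeps threads from re-reading in lockstep
            Console.WriteLine($"iteration {i}: {Spins(i, (uint)rng.Next())} spinwaits (min {1u << (int)(i - 1)})");
        }
    }
}
```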

Member:

I see, it seems the spin-wait backoff can yield fairly large gaps between checks at higher iterations, whereas CoreCLR's monitor's spin-wait backoff maxes out fairly quickly in favor of reducing latency. I guess a small amount of randomization may not hurt, and as you said it would probably be difficult to prove or disprove whether it actually helps.

uint spins = rand >> (byte)(32 - Math.Min(iteration, MaxExponentialBackoffBits));
Thread.SpinWaitInternal((int)spins);
}
}

private bool TryAcquireContended(int currentThreadId, int millisecondsTimeout, bool trackContentions = false)
internal bool TryAcquireSlow(int currentThreadId, int millisecondsTimeout, bool trackContentions = false)
{
//
// If we already own the lock, just increment the recursion count.
@@ -192,20 +215,32 @@ private bool TryAcquireContended(int currentThreadId, int millisecondsTimeout, bool trackContentions = false)
if (millisecondsTimeout == 0)
return false;

// since we have just made an attempt to acquire and failed, do a small pause
Thread.SpinWaitInternal(1);

if (_spinLimit == SpinningNotInitialized)
{
// Use RhGetProcessCpuCount directly to avoid Environment.ProcessorCount->ClassConstructorRunner->Lock->Environment.ProcessorCount cycle
if (s_processorCount == 0)
s_processorCount = RuntimeImports.RhGetProcessCpuCount();

_spinLimit = (s_processorCount > 1) ? MaxSpinLimit : SpinningDisabled;
_spinLimit = (s_processorCount > 1) ? MinSpinLimit : SpinningDisabled;
}

bool hasWaited = false;
// we will retry after waking up
while (true)
{
uint iteration = 0;

// We will count the times we failed to change the state of the lock and increase pauses
// so that bursts of activity are better tolerated. This should not happen often.
uint collisions = 0;

// We will track the changes of ownership while we are trying to acquire the lock.
int oldOwner = _owningThreadId;
uint ownerChanged = 0;

uint localSpinLimit = _spinLimit;
// inner loop where we try acquiring the lock or registering as a waiter
while (true)
@@ -224,18 +259,34 @@ private bool TryAcquireContended(int currentThreadId, int millisecondsTimeout, bool trackContentions = false)
{
int newState = oldState | Locked;
if (hasWaited)
newState = (newState - WaiterCountIncrement) & ~WaiterWoken & ~YieldToWaiters;
newState = (newState - WaiterCountIncrement) & ~(WaiterWoken | YieldToWaiters);

if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState)
{
// GOT THE LOCK!!
if (hasWaited)
_wakeWatchDog = 0;
else
// if spinning was successful, update spin count
if (iteration < localSpinLimit && localSpinLimit < MaxSpinLimit)
_spinLimit = localSpinLimit + 1;

// GOT THE LOCK!!
// now we can estimate how busy the lock is and adjust spinning accordingly
ushort spinLimit = _spinLimit;
if (ownerChanged != 0)
{
// The lock has changed ownership while we were trying to acquire it.
// It is a signal that we might want to spin less next time.
// Pursuing a lock that is being "stolen" by other threads is inefficient
// due to cache misses and unnecessary sharing of state that keeps getting invalidated.
if (spinLimit > MinSpinLimit)
{
_spinLimit = (ushort)(spinLimit - 1);
}
}
else if (spinLimit < MaxSpinLimit && iteration > spinLimit / 2)
{
// we used more than 50% of allowed iterations, but the lock does not look very contested,
// we can allow a bit more spinning.
_spinLimit = (ushort)(spinLimit + 1);
}
Comment on lines +270 to +288
Member:

A useful aspect of a spin-tuning heuristic would be to avoid spin-waiting when it just uses CPU time but does not help to avoid context-switching. I'm not sure that this is addressed by the spin-tuning heuristic in this change. I suggest creating a test case where spin-waiting is wasted, and comparing the CPU time with profiling, with and without spin-waiting. Ideally, a self-tuning spin-wait heuristic would show roughly the same CPU time usage as disabling spin-waiting.
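A rough harness in that spirit might look like the following (hypothetical; the thread count, iteration count, and hold time are arbitrary, and the CPU-time comparison is done across runs with spinning enabled and disabled):

```csharp
using System;
using System.Diagnostics;
using System.Threading;

class WastedSpinHarness
{
    private static readonly object s_gate = new object();

    static void Main()
    {
        var wall = Stopwatch.StartNew();
        var threads = new Thread[4];
        for (int i = 0; i < threads.Length; i++)
        {
            threads[i] = new Thread(() =>
            {
                for (int n = 0; n < 1000; n++)
                {
                    lock (s_gate)
                    {
                        Thread.Sleep(1); // hold the lock far longer than a context switch
                    }
                }
            });
            threads[i].Start();
        }
        foreach (var t in threads) t.Join();

        // If the self-tuning works, CPU time should be close to a no-spin build,
        // since spinning can never outlast the millisecond-scale hold time.
        Console.WriteLine($"wall: {wall.Elapsed}, cpu: {Process.GetCurrentProcess().TotalProcessorTime}");
    }
}
```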

@VSadov (Member Author), Aug 2, 2023:

As I understand the concern - this is about a lock that is typically long-held.
If the lock is held for much longer than it takes to block/awaken a thread, then spinning is useless.

Note that even if a thread decides to block right away, it will still pay thousands of cycles for blocking/waking. To a first approximation, if the max spinning that we allow does not cost much more than the cost of block/awake, then the overall overhead even with max spin should not be much worse than ideal.
Also, since such a long-held lock will typically be acquired by an awoken waiter without any spinning, it would not raise the spin limit, so we will likely stay at the initial min spin setting.

We may have an occasional new thread arrive in the precise time window around lock release, spin, acquire, and thus cause the spin count to slowly drift up and reach the max. But it is also possible for such "new" threads to contest the lock with an awoken waiter and reduce the spin.

Since the lock is long-held, there cannot be many spinners. If the lock is very popular, eventually all the interested threads will block.

I think the possibility of really bad behavior is not high, but it may be possible to further improve the heuristics for long-held locks. It could be interesting to construct some adversarial scenarios to see if we can intentionally get into trouble, and how likely and how bad it can get.

@kouvel (Member), Aug 4, 2023:

> Note that even if a thread decides to block right away, it will still pay thousands of cycles for blocking/waking. To a first approximation, if the max spinning that we allow does not cost much more than the cost of block/awake, then the overall overhead even with max spin should not be much worse than ideal.

It seems like the max spin count would use a not-insignificant amount of CPU time relative to context switching, though I haven't measured with this heuristic.

> Also, since such a long-held lock will typically be acquired by an awoken waiter without any spinning, it would not raise the spin limit, so we will likely stay at the initial min spin setting.

True, but if some other less-frequent lock contention on the same lock happens to increase the spin count, the spin-tuning would not tune down the spin-waiting for more-frequent contention that doesn't benefit from spin-waiting. It may not be a significant issue at the moment, since the spin-tuning heuristic seems to favor less spin-waiting anyway.


Debug.Assert((_state & Locked) != 0);
Debug.Assert(_owningThreadId == 0);
Debug.Assert(_recursionCount == 0);
@@ -244,13 +295,26 @@ private bool TryAcquireContended(int currentThreadId, int millisecondsTimeout, bool trackContentions = false)
}
}

// spinning was unsuccessful. reduce spin count.
if (iteration == localSpinLimit && localSpinLimit > MinSpinLimit)
_spinLimit = localSpinLimit - 1;

if (iteration++ < localSpinLimit)
{
Thread.SpinWaitInternal(1);
int newOwner = _owningThreadId;
if (newOwner != 0 && newOwner != oldOwner)
{
ownerChanged++;
oldOwner = newOwner;
}

if (canAcquire)
{
collisions++;
}

// We failed to acquire the lock and want to retry after a pause.
// Ideally we will retry right when the lock becomes free, but we cannot know when that will happen.
// We will use a pause that doubles up on every iteration. It will not be more than 2x worse
// than the ideal guess, while minimizing the number of retries.
// We will allow pauses up to 64~128 spinwaits, or more if there are collisions.
ExponentialBackoff(Math.Min(iteration, 6) + collisions);
Member:

It seems the added collisions counter would make this exponential backoff a lot more than power-of-2 exponential, and perhaps fairly quickly depending on the situation. Is there a reason for this change?

@VSadov (Member Author), Aug 2, 2023:

There are two factors that inform the backoff here - iteration and collisions.
The retry period would normally be driven by iterations. As we spin more, we reduce the rate of reads, as the comment explains. The goal is to acquire the lock within 2X of the optimal wait if no one wants it, while avoiding possible interference with other threads acquiring/releasing the lock. This is a power-of-2 exponential backoff with a limit.

When we see collisions (i.e. our interlocked operations start failing), it is a signal that we may still have more competition at the lock than we would like, so we back off more.
Collisions should be relatively rare - basically a "panic mode" to resolve unexpected bursts of activity.
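As a rough illustration of how the two factors combine, assuming ExponentialBackoff(arg) waits roughly [2^(arg-1), 2^arg) spinwaits per the top-bit trick above (the exact scaling may differ):

```csharp
using System;

class BackoffFactors
{
    static void Main()
    {
        // iteration alone caps the argument at 6; each collision adds one more doubling
        foreach (uint collisions in new uint[] { 0, 2 })
        {
            for (uint iteration = 1; iteration <= 8; iteration++)
            {
                uint arg = Math.Min(iteration, 6u) + collisions;
                Console.WriteLine(
                    $"iter={iteration} collisions={collisions}: " +
                    $"~{1u << (int)(arg - 1)}..{(1u << (int)arg) - 1} spinwaits");
            }
        }
    }
}
```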

Member:

Should there be a cap on the collisions too, similar to the iteration cap, so as to avoid rare but potentially very large backoffs?

@VSadov (Member Author):

ExponentialBackoff has a total limit controlled by MaxExponentialBackoffBits.
It would be extremely unlikely to have so many collisions, but we have a limit just in case.

Member:

Unless I'm missing something, 10 bits seems like a lot to me as a cap for one spin iteration. If we assume SpinWait(1) takes on the order of 10 ns, SpinWait((1 << 10) / 2) would take on the order of 5 us, which is quite a lot for one spin-wait iteration, despite it being rare.

@VSadov (Member Author):

Because of double-up increases, by the time the pause reaches 5us, we have already spent 5us spinning. Basically, the horribly long pauses will take a horribly long time to achieve. The math kind of tells us that the limit is unnecessary, yet for many people, me included, an unbounded spin would trigger an alarm, so we need some limit. Because, who knows, we may not be considering everything ...

There is not much science behind this number though. I wanted to pick something that does not stand in the way in a typical scenario, yet protects from degenerate cases (i.e. not spinning for seconds). Thus I picked 2^10. I did not see it reached in experiments (not even close), yet it feels safer than 2^16, the other number I'd have picked.

If there is a way to estimate a better number, we could change it.

It would be interesting to get access to a 1024-core machine, or however high the core count goes these days, and see how high the collision counts could get when trying for it intentionally (also to see if randomization helps).
I think experimenting on 32 cores will mostly tell us that this number does not matter.
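Concretely, the geometric-series arithmetic behind "long pauses take a long time to achieve": with doubling pauses, the total time spent before a pause of length 2^n is 1 + 2 + 4 + ... + 2^(n-1) = 2^n - 1 ≈ 2^n, so every pause is roughly matched by the backoff already paid, and a pause near the 2^10 cap can only occur after about 2^10 spinwaits of accumulated waiting.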

continue;
}
else if (!canAcquire)
@@ -267,10 +331,11 @@ private bool TryAcquireContended(int currentThreadId, int millisecondsTimeout, bool trackContentions = false)

if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState)
break;

collisions++;
}

Debug.Assert(iteration >= localSpinLimit);
ExponentialBackoff(iteration - localSpinLimit);
ExponentialBackoff(collisions);
}

//
@@ -390,39 +455,31 @@ private void ReleaseCore()
{
Debug.Assert(_recursionCount == 0);
_owningThreadId = 0;

//
// Make one quick attempt to release an uncontended lock
//
if (Interlocked.CompareExchange(ref _state, Uncontended, Locked) == Locked)
int origState = Interlocked.Decrement(ref _state);
if (origState < WaiterCountIncrement || (origState & WaiterWoken) != 0)
{
return;
}

//
// We have waiters; take the slow path.
//
ReleaseContended();
AwakeWaiterIfNeeded();
}

private void ReleaseContended()
private void AwakeWaiterIfNeeded()
{
Debug.Assert(_recursionCount == 0);
Debug.Assert(_owningThreadId == 0);

uint iteration = 0;
while (true)
{
int oldState = _state;

// clear the lock bit.
int newState = oldState & ~Locked;

if (oldState >= WaiterCountIncrement && (oldState & WaiterWoken) == 0)
{
// there are waiters, and nobody has woken one.
newState |= WaiterWoken;
int newState = oldState | WaiterWoken;

int lastWakeTicks = _wakeWatchDog;
if (lastWakeTicks != 0 && Environment.TickCount - lastWakeTicks > 100)
short lastWakeTicks = _wakeWatchDog;
if (lastWakeTicks != 0 && (short)Environment.TickCount - lastWakeTicks > WaiterWatchdogTicks)
{
newState |= YieldToWaiters;
}
Expand All @@ -432,7 +489,7 @@ private void ReleaseContended()
if (lastWakeTicks == 0)
{
// nonzero timestamp of the last wake
_wakeWatchDog = Environment.TickCount | 1;
_wakeWatchDog = (short)(Environment.TickCount | 1);
}

Event.Set();
@@ -442,8 +499,7 @@ private void ReleaseContended()
else
{
// no need to wake a waiter.
if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState)
return;
return;
}

ExponentialBackoff(iteration++);
@@ -49,10 +49,7 @@ public static void Enter(object obj)
ObjectHeader.GetLockObject(obj) :
SyncTable.GetLockObject(resultOrIndex);

if (lck.TryAcquire(0))
return;

TryAcquireContended(lck, obj, Timeout.Infinite);
TryAcquireSlow(lck, obj, Timeout.Infinite);
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -103,10 +100,10 @@ public static bool TryEnter(object obj, int millisecondsTimeout)
ObjectHeader.GetLockObject(obj) :
SyncTable.GetLockObject(resultOrIndex);

if (lck.TryAcquire(0))
return true;
if (millisecondsTimeout == 0)
return lck.TryAcquireNoSpin();

return TryAcquireContended(lck, obj, millisecondsTimeout);
return TryAcquireSlow(lck, obj, millisecondsTimeout);
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -164,11 +161,11 @@ public static void PulseAll(object obj)

#region Slow path for Entry/TryEnter methods.

internal static bool TryAcquireContended(Lock lck, object obj, int millisecondsTimeout)
internal static bool TryAcquireSlow(Lock lck, object obj, int millisecondsTimeout)
{
using (new DebugBlockingScope(obj, DebugBlockingItemType.MonitorCriticalSection, millisecondsTimeout, out _))
{
return lck.TryAcquire(millisecondsTimeout, trackContentions: true);
return lck.TryAcquireSlow(Environment.CurrentManagedThreadId, millisecondsTimeout, trackContentions: true);
}
}
