diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs
index 7f4a1c21e..1e2e30726 100644
--- a/regex-automata/src/util/pool.rs
+++ b/regex-automata/src/util/pool.rs
@@ -87,6 +87,8 @@ needing to re-create the scratch space for every search, which could wind up
 being quite expensive.
 */
 
+use alloc::boxed::Box;
+
 /// A thread safe pool that works in an `alloc`-only context.
 ///
 /// Getting a value out comes with a guard. When that guard is dropped, the
@@ -151,13 +153,13 @@ being quite expensive.
 /// let expected = Some(Match::must(0, 3..14));
 /// assert_eq!(expected, RE.find(&mut CACHE.get(), b"zzzfoo12345barzzz"));
 /// ```
-pub struct Pool<T, F = fn() -> T>(alloc::boxed::Box<inner::Pool<T, F>>);
+pub struct Pool<T, F = fn() -> T>(Box<inner::Pool<T, F>>);
 
 impl<T, F> Pool<T, F> {
     /// Create a new pool. The given closure is used to create values in
     /// the pool when necessary.
     pub fn new(create: F) -> Pool<T, F> {
-        Pool(alloc::boxed::Box::new(inner::Pool::new(create)))
+        Pool(Box::new(inner::Pool::new(create)))
     }
 }
 
@@ -235,7 +237,7 @@ mod inner {
         sync::atomic::{AtomicUsize, Ordering},
     };
 
-    use alloc::{boxed::Box, vec, vec::Vec};
+    use alloc::{boxed::Box, vec::Vec};
 
     use std::{sync::Mutex, thread_local};
 
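The doc comments above describe the guard-based API: `get` hands out a guard, and dropping the guard returns the value to the pool. For a self-contained illustration of that lifecycle, here is a sketch assuming the pool is reachable at `regex_automata::util::pool::Pool` as in current releases; the `String` scratch value is a stand-in for a real regex `Cache`:

    use regex_automata::util::pool::Pool;

    fn main() {
        // The closure creates a fresh scratch value on demand. A
        // non-capturing closure coerces to the default `F = fn() -> T`.
        let pool: Pool<String> = Pool::new(|| String::with_capacity(1024));
        {
            // On the owning thread this costs a pair of atomic ops; other
            // threads go through the sharded stacks added by this patch.
            let mut guard = pool.get();
            guard.push_str("scratch");
        } // Dropping the guard puts the value back into the pool.
        assert!(pool.get().capacity() >= 1024);
    }
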
@@ -268,6 +270,64 @@ mod inner {
     /// do.
     static THREAD_ID_DROPPED: usize = 2;
 
+    /// The number of stacks we use inside of the pool. These are only used
+    /// for non-owners. That is, these represent the "slow" path.
+    ///
+    /// In the original implementation of this pool, we only used a single
+    /// stack. While this might be okay for a couple threads, the prevalence
+    /// of 32, 64 and even 128 core CPUs has made it untenable. The contention
+    /// such an environment introduces when threads are doing a lot of
+    /// searches on short haystacks (a not uncommon use case) is palpable and
+    /// leads to huge slowdowns.
+    ///
+    /// This constant reflects a change from using one stack to the number of
+    /// stacks that this constant is set to. The stack for a particular thread
+    /// is simply chosen by `thread_id % MAX_POOL_STACKS`. The idea behind
+    /// this setup is that there should be a good chance that accesses to the
+    /// pool will be distributed over several stacks instead of all of them
+    /// converging to one.
+    ///
+    /// This is not a particularly smart or dynamic strategy. Fixing this to a
+    /// specific number has at least two downsides. First is that it will
+    /// help, say, an 8 core CPU more than it will a 128 core CPU. (But,
+    /// crucially, it will still help the 128 core case.) Second is that this
+    /// may wind up being a little wasteful with respect to memory usage.
+    /// Namely, if a regex is used on one thread and then moved to another
+    /// thread, then it could result in creating a new copy of the data in the
+    /// pool even though only one is actually needed.
+    ///
+    /// And that memory usage bit is why this is set to 8 and not, say, 64.
+    /// Keeping it at 8 limits, to an extent, how much unnecessary memory can
+    /// be allocated.
+    ///
+    /// In an ideal world, we'd be able to have something like this:
+    ///
+    /// * Grow the number of stacks as the number of concurrent callers
+    /// increases. I spent a little time trying this, but even just adding an
+    /// atomic addition/subtraction for each pop/push for tracking concurrent
+    /// callers led to a big perf hit. Since even more work would seemingly be
+    /// required than just an addition/subtraction, I abandoned this approach.
+    /// * The maximum amount of memory used should scale with respect to the
+    /// number of concurrent callers and *not* the total number of existing
+    /// threads. This is primarily why the `thread_local` crate isn't used,
+    /// as some environments spin up a lot of threads. This led to multiple
+    /// reports of extremely high memory usage (often described as memory
+    /// leaks).
+    /// * Even more ideally, the pool should contract in size. That is, it
+    /// should grow with bursts and then shrink. But this is a pretty thorny
+    /// issue to tackle and it might be better to just not.
+    /// * It would be nice to explore the use of, say, a lock-free stack
+    /// instead of using a mutex to guard a `Vec` that is ultimately just
+    /// treated as a stack. The main thing preventing me from exploring this
+    /// is the ABA problem. The `crossbeam` crate has tools for dealing with
+    /// this sort of problem (via its epoch based memory reclamation
+    /// strategy), but I can't justify bringing in all of `crossbeam` as a
+    /// dependency of `regex` for this.
+    ///
+    /// See this issue for more context and discussion:
+    /// https://github.com/rust-lang/regex/issues/934
+    const MAX_POOL_STACKS: usize = 8;
+
     thread_local!(
         /// A thread local used to assign an ID to a thread.
         static THREAD_ID: usize = {
@@ -291,6 +351,17 @@ mod inner {
         };
     );
 
+    /// This puts each stack in the pool below into its own cache line. This
+    /// is an absolutely critical optimization that tends to have the most
+    /// impact in high contention workloads. Without forcing each mutex
+    /// protected value into its own cache line, high contention exacerbates
+    /// the performance problem by causing "false sharing." By putting each
+    /// mutex in its own cache line, we avoid the false sharing problem and
+    /// the effects of contention are greatly reduced.
+    #[derive(Debug)]
+    #[repr(C, align(64))]
+    struct CacheLine<T>(T);
+
     /// A thread safe pool utilizing std-only features.
     ///
     /// The main difference between this and the simplistic alloc-only pool is
@@ -299,12 +370,16 @@ mod inner {
     /// This makes the common case of running a regex within a single thread
     /// faster by avoiding mutex unlocking.
     pub(super) struct Pool<T, F> {
-        /// A stack of T values to hand out. These are used when a Pool is
-        /// accessed by a thread that didn't create it.
-        stack: Mutex<Vec<Box<T>>>,
         /// A function to create more T values when stack is empty and a caller
        /// has requested a T.
         create: F,
+        /// Multiple stacks of T values to hand out. These are used when a Pool
+        /// is accessed by a thread that didn't create it.
+        ///
+        /// Conceptually this is `Mutex<Vec<Box<T>>>`, but sharded out to make
+        /// it scale better under high contention workloads. We index into
+        /// this sequence via `thread_id % stacks.len()`.
+        stacks: Vec<CacheLine<Mutex<Vec<Box<T>>>>>,
         /// The ID of the thread that owns this pool. The owner is the thread
         /// that makes the first call to 'get'. When the owner calls 'get', it
         /// gets 'owner_val' directly instead of returning a T from 'stack'.
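To see the sharding strategy from the `MAX_POOL_STACKS` comment in isolation, here is a minimal standalone sketch with the same `thread_id % len` shard selection, cache-line padding and try-lock policy. `ShardedStack` and `shard_count` are illustrative names, not part of this patch:

    use std::sync::Mutex;

    // Pad each shard to (at least) its own cache line so contended locks
    // don't false-share. 64 bytes matches the patch's assumption; some CPUs
    // (e.g. Apple's AArch64 cores) have 128-byte lines.
    #[repr(C, align(64))]
    struct CacheLine<T>(T);

    struct ShardedStack<T> {
        shards: Vec<CacheLine<Mutex<Vec<T>>>>,
    }

    impl<T> ShardedStack<T> {
        fn new(shard_count: usize) -> ShardedStack<T> {
            let shards = (0..shard_count)
                .map(|_| CacheLine(Mutex::new(Vec::new())))
                .collect();
            ShardedStack { shards }
        }

        // Pop from this thread's shard. `try_lock` means a contended shard
        // is simply skipped; the caller creates a fresh value instead of
        // waiting.
        fn pop(&self, thread_id: usize) -> Option<T> {
            let shard = thread_id % self.shards.len();
            self.shards[shard].0.try_lock().ok()?.pop()
        }

        fn push(&self, thread_id: usize, value: T) {
            let shard = thread_id % self.shards.len();
            if let Ok(mut stack) = self.shards[shard].0.try_lock() {
                stack.push(value);
            } // Otherwise drop the value rather than contend for the lock.
        }
    }
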
@@ -354,9 +429,17 @@ mod inner {
     unsafe impl<T: Send, F: Send + Sync> Sync for Pool<T, F> {}
 
     // If T is UnwindSafe, then since we provide exclusive access to any
-    // particular value in the pool, it should therefore also be considered
-    // RefUnwindSafe. Also, since we use std::sync::Mutex, we get poisoning
-    // from it if another thread panics while the lock is held.
+    // particular value in the pool, the pool should therefore also be
+    // considered UnwindSafe.
+    //
+    // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any
+    // point on demand, so it needs to be unwind safe on both dimensions for
+    // the entire Pool to be unwind safe.
+    impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> UnwindSafe for Pool<T, F> {}
+
+    // If T is UnwindSafe, then since we provide exclusive access to any
+    // particular value in the pool, the pool should therefore also be
+    // considered RefUnwindSafe.
     //
     // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any
     // point on demand, so it needs to be unwind safe on both dimensions for
@@ -375,9 +458,13 @@ mod inner {
             // 'Pool::new' method as 'const' too. (The alloc-only Pool::new
             // is already 'const', so that should "just work" too.) The only
             // thing we're waiting for is Mutex::new to be const.
+            let mut stacks = Vec::with_capacity(MAX_POOL_STACKS);
+            for _ in 0..stacks.capacity() {
+                stacks.push(CacheLine(Mutex::new(alloc::vec![])));
+            }
             let owner = AtomicUsize::new(THREAD_ID_UNOWNED);
             let owner_val = UnsafeCell::new(None); // init'd on first access
-            Pool { stack: Mutex::new(vec![]), create, owner, owner_val }
+            Pool { create, stacks, owner, owner_val }
         }
     }
 
@@ -401,6 +488,9 @@ mod inner {
             let caller = THREAD_ID.with(|id| *id);
             let owner = self.owner.load(Ordering::Acquire);
             if caller == owner {
+                // N.B. We could also do a CAS here instead of a load/store,
+                // but ad hoc benchmarking suggests it is slower. And a lot
+                // slower in the case where `get_slow` is common.
                 self.owner.store(THREAD_ID_INUSE, Ordering::Release);
                 return self.guard_owned(caller);
             }
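The `N.B.` above weighs a single CAS against the load/store pair. Here are both variants, sketched as free functions for comparison. The names are illustrative, and the relative-speed claim is the patch author's ad hoc benchmark result, not something verified here:

    use std::sync::atomic::{AtomicUsize, Ordering};

    const THREAD_ID_INUSE: usize = 0;

    // What the patch does: a load followed by a store. Two cheap atomic
    // ops, and the load alone suffices on the miss path.
    fn try_claim_load_store(owner: &AtomicUsize, caller: usize) -> bool {
        if owner.load(Ordering::Acquire) == caller {
            owner.store(THREAD_ID_INUSE, Ordering::Release);
            return true;
        }
        false
    }

    // The rejected alternative: a single compare-and-swap. One atomic
    // read-modify-write, which the comment reports benchmarking slower,
    // especially when the miss path (`get_slow`) is common.
    fn try_claim_cas(owner: &AtomicUsize, caller: usize) -> bool {
        owner
            .compare_exchange(
                caller,
                THREAD_ID_INUSE,
                Ordering::AcqRel,
                Ordering::Acquire,
            )
            .is_ok()
    }
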
@@ -444,37 +534,82 @@ mod inner {
                     return self.guard_owned(caller);
                 }
             }
-            let mut stack = self.stack.lock().unwrap();
-            let value = match stack.pop() {
-                None => Box::new((self.create)()),
-                Some(value) => value,
-            };
-            self.guard_stack(value)
+            let stack_id = caller % self.stacks.len();
+            // We try to acquire exclusive access to this thread's stack, and
+            // if so, grab a value from it if we can. We put this in a loop so
+            // that it's easy to tweak and experiment with a different number
+            // of tries. In the end, I couldn't see anything obviously better
+            // than one attempt in ad hoc testing.
+            for _ in 0..1 {
+                let mut stack = match self.stacks[stack_id].0.try_lock() {
+                    Err(_) => continue,
+                    Ok(stack) => stack,
+                };
+                if let Some(value) = stack.pop() {
+                    return self.guard_stack(value);
+                }
+                // Unlock the mutex guarding the stack before creating a fresh
+                // value since we no longer need the stack.
+                drop(stack);
+                let value = Box::new((self.create)());
+                return self.guard_stack(value);
+            }
+            // We're only here if we could not get access to our stack, so
+            // just create a new value. This seems like it could be wasteful,
+            // but waiting for exclusive access to a stack when there's high
+            // contention is brutal for perf.
+            self.guard_stack_transient(Box::new((self.create)()))
         }
 
         /// Puts a value back into the pool. Callers don't need to call this.
         /// Once the guard that's returned by 'get' is dropped, it is put back
         /// into the pool automatically.
         fn put_value(&self, value: Box<T>) {
-            let mut stack = self.stack.lock().unwrap();
-            stack.push(value);
+            let caller = THREAD_ID.with(|id| *id);
+            let stack_id = caller % self.stacks.len();
+            // As with trying to pop a value from this thread's stack, we
+            // merely attempt to get access to push this value back on the
+            // stack. If there's too much contention, we just give up and
+            // throw the value away.
+            //
+            // Interestingly, in ad hoc benchmarking, it is beneficial to
+            // attempt to push the value back more than once, unlike when
+            // popping the value. I don't have a good theory for why this is.
+            // I guess if we drop too many values then that winds up forcing
+            // the pop operation to create new fresh values and thus leads to
+            // less reuse. There's definitely a balancing act here.
+            for _ in 0..10 {
+                let mut stack = match self.stacks[stack_id].0.try_lock() {
+                    Err(_) => continue,
+                    Ok(stack) => stack,
+                };
+                stack.push(value);
+                return;
+            }
         }
 
         /// Create a guard that represents the special owned T.
         fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> {
-            PoolGuard { pool: self, value: Err(caller) }
+            PoolGuard { pool: self, value: Err(caller), discard: false }
         }
 
         /// Create a guard that contains a value from the pool's stack.
         fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T, F> {
-            PoolGuard { pool: self, value: Ok(value) }
+            PoolGuard { pool: self, value: Ok(value), discard: false }
+        }
+
+        /// Create a guard that contains a value from the pool's stack with an
+        /// instruction to throw away the value instead of putting it back
+        /// into the pool.
+        fn guard_stack_transient(&self, value: Box<T>) -> PoolGuard<'_, T, F> {
+            PoolGuard { pool: self, value: Ok(value), discard: true }
         }
     }
 
     impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> {
         fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
             f.debug_struct("Pool")
-                .field("stack", &self.stack)
+                .field("stacks", &self.stacks)
                 .field("owner", &self.owner)
                 .field("owner_val", &self.owner_val)
                 .finish()
@@ -490,6 +625,12 @@ mod inner {
         /// in the special case of `Err(THREAD_ID_DROPPED)`, it means the
         /// guard has been put back into the pool and should no longer be used.
         value: Result<Box<T>, usize>,
+        /// When true, the value should be discarded instead of being pushed
+        /// back into the pool. We tend to use this under high contention, and
+        /// this allows us to avoid inflating the size of the pool. (Because
+        /// under contention, we tend to create more values instead of waiting
+        /// for access to a stack of existing values.)
+        discard: bool,
     }
 
     impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
@@ -557,7 +698,17 @@ mod inner {
         #[inline(always)]
         fn put_imp(&mut self) {
            match core::mem::replace(&mut self.value, Err(THREAD_ID_DROPPED)) {
-                Ok(value) => self.pool.put_value(value),
+                Ok(value) => {
+                    // If we were told to discard this value then don't bother
+                    // trying to put it back into the pool. This occurs when
+                    // the pop operation failed to acquire a lock and we
+                    // decided to create a new value in lieu of contending for
+                    // the lock.
+                    if self.discard {
+                        return;
+                    }
+                    self.pool.put_value(value);
+                }
                 // If this guard has a value "owned" by the thread, then
                 // the Pool guarantees that this is the ONLY such guard.
                 // Therefore, in order to place it back into the pool and make
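Taken together, the pop/push policy means a value's life can now end in a plain drop rather than a put-back, depending on the guard's `discard` flag and on whether the bounded `try_lock` retries succeed. A condensed sketch of that interaction, where `MiniPool` and `MiniGuard` are illustrative stand-ins for the patch's `Pool` and `PoolGuard`:

    use std::sync::Mutex;

    struct MiniPool<T> {
        stack: Mutex<Vec<Box<T>>>,
    }

    struct MiniGuard<'a, T> {
        pool: &'a MiniPool<T>,
        value: Option<Box<T>>,
        // Set when the value was created because the stack's lock was
        // contended; such "transient" values are never pushed back.
        discard: bool,
    }

    impl<T> MiniPool<T> {
        fn put_value(&self, value: Box<T>) {
            // Bounded retries, as in the patch's put_value: if the lock
            // stays contended, give up and let the value drop.
            for _ in 0..10 {
                if let Ok(mut stack) = self.stack.try_lock() {
                    stack.push(value);
                    return;
                }
            }
        }
    }

    impl<'a, T> Drop for MiniGuard<'a, T> {
        fn drop(&mut self) {
            if let Some(value) = self.value.take() {
                if !self.discard {
                    self.pool.put_value(value);
                }
            }
        }
    }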