diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs
index 7f4a1c21e..1e2e30726 100644
--- a/regex-automata/src/util/pool.rs
+++ b/regex-automata/src/util/pool.rs
@@ -87,6 +87,8 @@ needing to re-create the scratch space for every search, which could wind up
 being quite expensive.
 */
 
+use alloc::boxed::Box;
+
 /// A thread safe pool that works in an `alloc`-only context.
 ///
 /// Getting a value out comes with a guard. When that guard is dropped, the
@@ -151,13 +153,13 @@ being quite expensive.
 /// let expected = Some(Match::must(0, 3..14));
 /// assert_eq!(expected, RE.find(&mut CACHE.get(), b"zzzfoo12345barzzz"));
 /// ```
-pub struct Pool<T, F = fn() -> T>(alloc::boxed::Box<inner::Pool<T, F>>);
+pub struct Pool<T, F = fn() -> T>(Box<inner::Pool<T, F>>);
 
 impl<T, F> Pool<T, F> {
     /// Create a new pool. The given closure is used to create values in
     /// the pool when necessary.
     pub fn new(create: F) -> Pool<T, F> {
-        Pool(alloc::boxed::Box::new(inner::Pool::new(create)))
+        Pool(Box::new(inner::Pool::new(create)))
     }
 }
 
@@ -235,7 +237,7 @@ mod inner {
         sync::atomic::{AtomicUsize, Ordering},
     };
 
-    use alloc::{boxed::Box, vec, vec::Vec};
+    use alloc::{boxed::Box, vec::Vec};
 
     use std::{sync::Mutex, thread_local};
 
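The doc comments above describe the guard-based API: `get` hands out a guard, and dropping the guard returns the value to the pool. For a self-contained illustration of that lifecycle, here is a sketch assuming the pool is reachable at `regex_automata::util::pool::Pool` as in current releases; the `String` scratch value is a stand-in for a real regex `Cache`:

    use regex_automata::util::pool::Pool;

    fn main() {
        // The closure creates a fresh scratch value on demand. A
        // non-capturing closure coerces to the default `F = fn() -> T`.
        let pool: Pool<String> = Pool::new(|| String::with_capacity(1024));
        {
            // On the owning thread this costs a pair of atomic ops; other
            // threads go through the sharded stacks added by this patch.
            let mut guard = pool.get();
            guard.push_str("scratch");
        } // Dropping the guard puts the value back into the pool.
        assert!(pool.get().capacity() >= 1024);
    }
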
@@ -268,6 +270,64 @@ mod inner {
     /// do.
     static THREAD_ID_DROPPED: usize = 2;
 
+    /// The number of stacks we use inside of the pool. These are only used
+    /// for non-owners. That is, these represent the "slow" path.
+    ///
+    /// In the original implementation of this pool, we only used a single
+    /// stack. While this might be okay for a couple threads, the prevalence
+    /// of 32, 64 and even 128 core CPUs has made it untenable. The contention
+    /// such an environment introduces when threads are doing a lot of
+    /// searches on short haystacks (a not uncommon use case) is palpable and
+    /// leads to huge slowdowns.
+    ///
+    /// This constant reflects a change from using one stack to the number of
+    /// stacks that this constant is set to. The stack for a particular thread
+    /// is simply chosen by `thread_id % MAX_POOL_STACKS`. The idea behind
+    /// this setup is that there should be a good chance that accesses to the
+    /// pool will be distributed over several stacks instead of all of them
+    /// converging to one.
+    ///
+    /// This is not a particularly smart or dynamic strategy. Fixing this to a
+    /// specific number has at least two downsides. First is that it will
+    /// help, say, an 8 core CPU more than it will a 128 core CPU. (But,
+    /// crucially, it will still help the 128 core case.) Second is that this
+    /// may wind up being a little wasteful with respect to memory usage.
+    /// Namely, if a regex is used on one thread and then moved to another
+    /// thread, then it could result in creating a new copy of the data in the
+    /// pool even though only one is actually needed.
+    ///
+    /// And that memory usage bit is why this is set to 8 and not, say, 64.
+    /// Keeping it at 8 limits, to an extent, how much unnecessary memory can
+    /// be allocated.
+    ///
+    /// In an ideal world, we'd be able to have something like this:
+    ///
+    /// * Grow the number of stacks as the number of concurrent callers
+    /// increases. I spent a little time trying this, but even just adding an
+    /// atomic addition/subtraction for each pop/push for tracking concurrent
+    /// callers led to a big perf hit. Since even more work would seemingly be
+    /// required than just an addition/subtraction, I abandoned this approach.
+    /// * The maximum amount of memory used should scale with respect to the
+    /// number of concurrent callers and *not* the total number of existing
+    /// threads. This is primarily why the `thread_local` crate isn't used,
+    /// as some environments spin up a lot of threads. This led to multiple
+    /// reports of extremely high memory usage (often described as memory
+    /// leaks).
+    /// * Even more ideally, the pool should contract in size. That is, it
+    /// should grow with bursts and then shrink. But this is a pretty thorny
+    /// issue to tackle and it might be better to just not.
+    /// * It would be nice to explore the use of, say, a lock-free stack
+    /// instead of using a mutex to guard a `Vec` that is ultimately just
+    /// treated as a stack. The main thing preventing me from exploring this
+    /// is the ABA problem. The `crossbeam` crate has tools for dealing with
+    /// this sort of problem (via its epoch based memory reclamation
+    /// strategy), but I can't justify bringing in all of `crossbeam` as a
+    /// dependency of `regex` for this.
+    ///
+    /// See this issue for more context and discussion:
+    /// https://github.com/rust-lang/regex/issues/934
+    const MAX_POOL_STACKS: usize = 8;
+
     thread_local!(
         /// A thread local used to assign an ID to a thread.
         static THREAD_ID: usize = {
@@ -291,6 +351,17 @@ mod inner {
         };
     );
 
+    /// This puts each stack in the pool below into its own cache line. This
+    /// is an absolutely critical optimization that tends to have the most
+    /// impact in high contention workloads. Without forcing each mutex
+    /// protected value into its own cache line, high contention exacerbates
+    /// the performance problem by causing "false sharing." By putting each
+    /// mutex in its own cache line, we avoid the false sharing problem and
+    /// the effects of contention are greatly reduced.
+    #[derive(Debug)]
+    #[repr(C, align(64))]
+    struct CacheLine<T>(T);
+
     /// A thread safe pool utilizing std-only features.
     ///
     /// The main difference between this and the simplistic alloc-only pool is
@@ -299,12 +370,16 @@ mod inner {
     /// This makes the common case of running a regex within a single thread
     /// faster by avoiding mutex unlocking.
     pub(super) struct Pool<T, F> {
-        /// A stack of T values to hand out. These are used when a Pool is
-        /// accessed by a thread that didn't create it.
-        stack: Mutex<Vec<Box<T>>>,
         /// A function to create more T values when stack is empty and a caller
        /// has requested a T.
         create: F,
+        /// Multiple stacks of T values to hand out. These are used when a Pool
+        /// is accessed by a thread that didn't create it.
+        ///
+        /// Conceptually this is `Mutex<Vec<Box<T>>>`, but sharded out to make
+        /// it scale better under high contention workloads. We index into
+        /// this sequence via `thread_id % stacks.len()`.
+        stacks: Vec<CacheLine<Mutex<Vec<Box<T>>>>>,
         /// The ID of the thread that owns this pool. The owner is the thread
         /// that makes the first call to 'get'. When the owner calls 'get', it
         /// gets 'owner_val' directly instead of returning a T from 'stack'.
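To see the sharding strategy from the `MAX_POOL_STACKS` comment in isolation, here is a minimal standalone sketch with the same `thread_id % len` shard selection, cache-line padding and try-lock policy. `ShardedStack` and `shard_count` are illustrative names, not part of this patch:

    use std::sync::Mutex;

    // Pad each shard to (at least) its own cache line so contended locks
    // don't false-share. 64 bytes matches the patch's assumption; some CPUs
    // (e.g. Apple's AArch64 cores) have 128-byte lines.
    #[repr(C, align(64))]
    struct CacheLine<T>(T);

    struct ShardedStack<T> {
        shards: Vec<CacheLine<Mutex<Vec<T>>>>,
    }

    impl<T> ShardedStack<T> {
        fn new(shard_count: usize) -> ShardedStack<T> {
            let shards = (0..shard_count)
                .map(|_| CacheLine(Mutex::new(Vec::new())))
                .collect();
            ShardedStack { shards }
        }

        // Pop from this thread's shard. `try_lock` means a contended shard
        // is simply skipped; the caller creates a fresh value instead of
        // waiting.
        fn pop(&self, thread_id: usize) -> Option<T> {
            let shard = thread_id % self.shards.len();
            self.shards[shard].0.try_lock().ok()?.pop()
        }

        fn push(&self, thread_id: usize, value: T) {
            let shard = thread_id % self.shards.len();
            if let Ok(mut stack) = self.shards[shard].0.try_lock() {
                stack.push(value);
            } // Otherwise drop the value rather than contend for the lock.
        }
    }
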
@@ -354,9 +429,17 @@ mod inner {
     unsafe impl<T: Send, F: Send + Sync> Sync for Pool<T, F> {}
 
     // If T is UnwindSafe, then since we provide exclusive access to any
-    // particular value in the pool, it should therefore also be considered
-    // RefUnwindSafe. Also, since we use std::sync::Mutex, we get poisoning
-    // from it if another thread panics while the lock is held.
+    // particular value in the pool, the pool should therefore also be
+    // considered UnwindSafe.
+    //
+    // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any
+    // point on demand, so it needs to be unwind safe on both dimensions for
+    // the entire Pool to be unwind safe.
+    impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> UnwindSafe for Pool<T, F> {}
+
+    // If T is UnwindSafe, then since we provide exclusive access to any
+    // particular value in the pool, the pool should therefore also be
+    // considered RefUnwindSafe.
     //
     // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any
     // point on demand, so it needs to be unwind safe on both dimensions for
@@ -375,9 +458,13 @@ mod inner {
             // 'Pool::new' method as 'const' too. (The alloc-only Pool::new
             // is already 'const', so that should "just work" too.) The only
             // thing we're waiting for is Mutex::new to be const.
+            let mut stacks = Vec::with_capacity(MAX_POOL_STACKS);
+            for _ in 0..stacks.capacity() {
+                stacks.push(CacheLine(Mutex::new(alloc::vec![])));
+            }
             let owner = AtomicUsize::new(THREAD_ID_UNOWNED);
             let owner_val = UnsafeCell::new(None); // init'd on first access
-            Pool { stack: Mutex::new(vec![]), create, owner, owner_val }
+            Pool { create, stacks, owner, owner_val }
         }
     }
 
@@ -401,6 +488,9 @@ mod inner {
             let caller = THREAD_ID.with(|id| *id);
             let owner = self.owner.load(Ordering::Acquire);
             if caller == owner {
+                // N.B. We could also do a CAS here instead of a load/store,
+                // but ad hoc benchmarking suggests it is slower. And a lot
+                // slower in the case where `get_slow` is common.
                 self.owner.store(THREAD_ID_INUSE, Ordering::Release);
                 return self.guard_owned(caller);
             }
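The `N.B.` above weighs a single CAS against the load/store pair. Here are both variants, sketched as free functions for comparison. The names are illustrative, and the relative-speed claim is the patch author's ad hoc benchmark result, not something verified here:

    use std::sync::atomic::{AtomicUsize, Ordering};

    const THREAD_ID_INUSE: usize = 0;

    // What the patch does: a load followed by a store. Two cheap atomic
    // ops, and the load alone suffices on the miss path.
    fn try_claim_load_store(owner: &AtomicUsize, caller: usize) -> bool {
        if owner.load(Ordering::Acquire) == caller {
            owner.store(THREAD_ID_INUSE, Ordering::Release);
            return true;
        }
        false
    }

    // The rejected alternative: a single compare-and-swap. One atomic
    // read-modify-write, which the comment reports benchmarking slower,
    // especially when the miss path (`get_slow`) is common.
    fn try_claim_cas(owner: &AtomicUsize, caller: usize) -> bool {
        owner
            .compare_exchange(
                caller,
                THREAD_ID_INUSE,
                Ordering::AcqRel,
                Ordering::Acquire,
            )
            .is_ok()
    }
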
@@ -444,37 +534,82 @@ mod inner {
                     return self.guard_owned(caller);
                 }
             }
-            let mut stack = self.stack.lock().unwrap();
-            let value = match stack.pop() {
-                None => Box::new((self.create)()),
-                Some(value) => value,
-            };
-            self.guard_stack(value)
+            let stack_id = caller % self.stacks.len();
+            // We try to acquire exclusive access to this thread's stack, and
+            // if so, grab a value from it if we can. We put this in a loop so
+            // that it's easy to tweak and experiment with a different number
+            // of tries. In the end, I couldn't see anything obviously better
+            // than one attempt in ad hoc testing.
+            for _ in 0..1 {
+                let mut stack = match self.stacks[stack_id].0.try_lock() {
+                    Err(_) => continue,
+                    Ok(stack) => stack,
+                };
+                if let Some(value) = stack.pop() {
+                    return self.guard_stack(value);
+                }
+                // Unlock the mutex guarding the stack before creating a fresh
+                // value since we no longer need the stack.
+                drop(stack);
+                let value = Box::new((self.create)());
+                return self.guard_stack(value);
+            }
+            // We're only here if we could not get access to our stack, so
+            // just create a new value. This seems like it could be wasteful,
+            // but waiting for exclusive access to a stack when there's high
+            // contention is brutal for perf.
+            self.guard_stack_transient(Box::new((self.create)()))
         }
 
         /// Puts a value back into the pool. Callers don't need to call this.
         /// Once the guard that's returned by 'get' is dropped, it is put back
         /// into the pool automatically.
         fn put_value(&self, value: Box<T>) {
-            let mut stack = self.stack.lock().unwrap();
-            stack.push(value);
+            let caller = THREAD_ID.with(|id| *id);
+            let stack_id = caller % self.stacks.len();
+            // As with trying to pop a value from this thread's stack, we
+            // merely attempt to get access to push this value back on the
+            // stack. If there's too much contention, we just give up and
+            // throw the value away.
+            //
+            // Interestingly, in ad hoc benchmarking, it is beneficial to
+            // attempt to push the value back more than once, unlike when
+            // popping the value. I don't have a good theory for why this is.
+            // I guess if we drop too many values then that winds up forcing
+            // the pop operation to create new fresh values and thus leads to
+            // less reuse. There's definitely a balancing act here.
+            for _ in 0..10 {
+                let mut stack = match self.stacks[stack_id].0.try_lock() {
+                    Err(_) => continue,
+                    Ok(stack) => stack,
+                };
+                stack.push(value);
+                return;
+            }
         }
 
         /// Create a guard that represents the special owned T.
         fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> {
-            PoolGuard { pool: self, value: Err(caller) }
+            PoolGuard { pool: self, value: Err(caller), discard: false }
         }
 
         /// Create a guard that contains a value from the pool's stack.
         fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T, F> {
-            PoolGuard { pool: self, value: Ok(value) }
+            PoolGuard { pool: self, value: Ok(value), discard: false }
+        }
+
+        /// Create a guard that contains a value from the pool's stack with an
+        /// instruction to throw away the value instead of putting it back
+        /// into the pool.
+        fn guard_stack_transient(&self, value: Box<T>) -> PoolGuard<'_, T, F> {
+            PoolGuard { pool: self, value: Ok(value), discard: true }
         }
     }
 
     impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> {
         fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
             f.debug_struct("Pool")
-                .field("stack", &self.stack)
+                .field("stacks", &self.stacks)
                 .field("owner", &self.owner)
                 .field("owner_val", &self.owner_val)
                 .finish()
@@ -490,6 +625,12 @@ mod inner {
         /// in the special case of `Err(THREAD_ID_DROPPED)`, it means the
         /// guard has been put back into the pool and should no longer be used.
         value: Result<Box<T>, usize>,
+        /// When true, the value should be discarded instead of being pushed
+        /// back into the pool. We tend to use this under high contention, and
+        /// this allows us to avoid inflating the size of the pool. (Because
+        /// under contention, we tend to create more values instead of waiting
+        /// for access to a stack of existing values.)
+        discard: bool,
     }
 
     impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
@@ -557,7 +698,17 @@ mod inner {
         #[inline(always)]
         fn put_imp(&mut self) {
            match core::mem::replace(&mut self.value, Err(THREAD_ID_DROPPED)) {
-                Ok(value) => self.pool.put_value(value),
+                Ok(value) => {
+                    // If we were told to discard this value then don't bother
+                    // trying to put it back into the pool. This occurs when
+                    // the pop operation failed to acquire a lock and we
+                    // decided to create a new value in lieu of contending for
+                    // the lock.
+                    if self.discard {
+                        return;
+                    }
+                    self.pool.put_value(value);
+                }
                 // If this guard has a value "owned" by the thread, then
                 // the Pool guarantees that this is the ONLY such guard.
                 // Therefore, in order to place it back into the pool and make
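Taken together, the pop/push policy means a value's life can now end in a plain drop rather than a put-back, depending on the guard's `discard` flag and on whether the bounded `try_lock` retries succeed. A condensed sketch of that interaction, where `MiniPool` and `MiniGuard` are illustrative stand-ins for the patch's `Pool` and `PoolGuard`:

    use std::sync::Mutex;

    struct MiniPool<T> {
        stack: Mutex<Vec<Box<T>>>,
    }

    struct MiniGuard<'a, T> {
        pool: &'a MiniPool<T>,
        value: Option<Box<T>>,
        // Set when the value was created because the stack's lock was
        // contended; such "transient" values are never pushed back.
        discard: bool,
    }

    impl<T> MiniPool<T> {
        fn put_value(&self, value: Box<T>) {
            // Bounded retries, as in the patch's put_value: if the lock
            // stays contended, give up and let the value drop.
            for _ in 0..10 {
                if let Ok(mut stack) = self.stack.try_lock() {
                    stack.push(value);
                    return;
                }
            }
        }
    }

    impl<'a, T> Drop for MiniGuard<'a, T> {
        fn drop(&mut self) {
            if let Some(value) = self.value.take() {
                if !self.discard {
                    self.pool.put_value(value);
                }
            }
        }
    }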