diff --git a/libs/vkd3d/swapchain.c b/libs/vkd3d/swapchain.c
index d8c8baadbb..0d937b7105 100644
--- a/libs/vkd3d/swapchain.c
+++ b/libs/vkd3d/swapchain.c
@@ -90,6 +90,7 @@ struct dxgi_vk_swap_chain
     vkd3d_native_sync_handle frame_latency_event_internal;
     vkd3d_native_sync_handle present_request_done_event;
     bool outstanding_present_request;
+    uint32_t frame_latency_event_internal_wait_counts;
 
     UINT frame_latency;
     UINT frame_latency_internal;
@@ -885,11 +886,81 @@ static bool dxgi_vk_swap_chain_present_is_occluded(struct dxgi_vk_swap_chain *ch
 
 static void dxgi_vk_swap_chain_present_callback(void *chain);
 
+static void dxgi_vk_swap_chain_wait_internal_handle(struct dxgi_vk_swap_chain *chain, bool low_latency_enable)
+{
+    const struct vkd3d_vk_device_procs *vk_procs = &chain->queue->device->vk_procs;
+    bool non_blocking_internal_handle_wait = low_latency_enable;
+    uint64_t completed_submissions = 0;
+    uint64_t user_submissions = 0;
+
+    chain->frame_latency_event_internal_wait_counts++; /* One acquire owed per call; unpaid ones carry over. */
+
+    if (non_blocking_internal_handle_wait)
+    {
+        /* In low latency mode we expect the application to sleep on its own in LatencySleep.
+         * If we also sleep here, we sometimes end up fighting with NV's LL2 implementation over
+         * which sleep cycle gets to dominate. This can manifest as a random pumping pattern.
+         *
+         * If our sleep dominates, we end up in an unstable situation where LL2 may think we're
+         * more CPU bound than we actually are.
+         *
+         * In a FIFO bound scenario, however, where the GPU completes long before vblank hits,
+         * we should rely on frame latency sleeps.
+         *
+         * Use a very simple heuristic: if present.vk_complete_semaphore lags user.blit_count by more than 2 frames,
+         * assume we're fully GPU bound, back off, and let low latency deal with it more gracefully. */
+        user_submissions = chain->user.blit_count;
+
+        if (VK_CALL(vkGetSemaphoreCounterValue(chain->queue->device->vk_device,
+                chain->present.vk_complete_semaphore,
+                &completed_submissions)) == VK_SUCCESS)
+        {
+            /* We just submitted frame N. If N - 2 is already complete, there are <= 2 frames worth of GPU work
+             * queued up. For a FIFO bound or CPU bound game, this is the case we expect, so we should use latency fences here.
+             * If we're GPU bound with <= 2 frames queued up, we'll likely not block in our own latency handles anyway. */
+            if (completed_submissions + 2 >= user_submissions)
+            {
+                non_blocking_internal_handle_wait = false;
+            }
+            else if (chain->debug_latency)
+            {
+                INFO("Completed count: %"PRIu64", submitted count: %"PRIu64". GPU queue is too deep, deferring to low latency sleep.\n",
+                        completed_submissions, user_submissions);
+            }
+        }
+        else
+        {
+            ERR("Failed to query semaphore complete value.\n");
+            non_blocking_internal_handle_wait = false; /* Query failed: take the blocking branch below. */
+        }
+    }
+
+    if (non_blocking_internal_handle_wait)
+    {
+        /* Zero-timeout acquires: retire each owed wait that succeeds so the counter doesn't grow unbounded. */
+        while (chain->frame_latency_event_internal_wait_counts &&
+                vkd3d_native_sync_handle_acquire_timeout(chain->frame_latency_event_internal, 0))
+        {
+            chain->frame_latency_event_internal_wait_counts--;
+        }
+    }
+    else
+    {
+        while (chain->frame_latency_event_internal_wait_counts)
+        {
+            vkd3d_native_sync_handle_acquire(chain->frame_latency_event_internal);
+            chain->frame_latency_event_internal_wait_counts--;
+        }
+    }
+}
+
 static HRESULT STDMETHODCALLTYPE dxgi_vk_swap_chain_Present(IDXGIVkSwapChain *iface, UINT SyncInterval, UINT PresentFlags, const DXGI_PRESENT_PARAMETERS *pPresentParameters)
 {
     struct dxgi_vk_swap_chain *chain = impl_from_IDXGIVkSwapChain(iface);
     struct dxgi_vk_swap_chain_present_request *request;
     struct vkd3d_queue_timeline_trace_cookie cookie;
+    bool low_latency_enable;
+
     TRACE("iface %p, SyncInterval %u, PresentFlags #%x, pPresentParameters %p.\n",
             iface, SyncInterval, PresentFlags, pPresentParameters);
     (void)pPresentParameters;
@@ -937,12 +1008,14 @@ static HRESULT STDMETHODCALLTYPE dxgi_vk_swap_chain_Present(IDXGIVkSwapChain *if
         request->requested_low_latency_state = chain->requested_low_latency_state;
         request->low_latency_update_requested = chain->low_latency_update_requested;
         chain->low_latency_update_requested = false;
+        low_latency_enable = chain->requested_low_latency_state.mode;
         pthread_mutex_unlock(&chain->present.low_latency_state_update_lock);
     }
     else
     {
         memset(&request->requested_low_latency_state, 0, sizeof(request->requested_low_latency_state));
         request->low_latency_update_requested = false;
+        low_latency_enable = false;
     }
 
     /* Need to process this task in queue thread to deal with wait-before-signal.
@@ -960,7 +1033,7 @@ static HRESULT STDMETHODCALLTYPE dxgi_vk_swap_chain_Present(IDXGIVkSwapChain *if
 
     /* Relevant if application does not use latency fence, or we force a lower latency through VKD3D_SWAPCHAIN_FRAME_LATENCY overrides. */
     if (vkd3d_native_sync_handle_is_valid(chain->frame_latency_event_internal))
-        vkd3d_native_sync_handle_acquire(chain->frame_latency_event_internal);
+        dxgi_vk_swap_chain_wait_internal_handle(chain, low_latency_enable);
 
     if (vkd3d_native_sync_handle_is_valid(chain->present_request_done_event))
     {