From 3f29808c3ba8f1a5fb364d73f83a469ad791b516 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Fri, 25 Aug 2023 10:13:53 +0200
Subject: [PATCH] Busy-wait before doing nonblocking synchronization.

This should improve the latency of short operations.
---
 lib/cudadrv/synchronization.jl | 106 +++++++++++++++++++--------------
 1 file changed, 60 insertions(+), 46 deletions(-)

diff --git a/lib/cudadrv/synchronization.jl b/lib/cudadrv/synchronization.jl
index a055a50ae0..d512650511 100644
--- a/lib/cudadrv/synchronization.jl
+++ b/lib/cudadrv/synchronization.jl
@@ -64,13 +64,46 @@ Base.lock(c::BidirectionalChannel) = lock(c.cond_take)
 Base.unlock(c::BidirectionalChannel) = unlock(c.cond_take)
 
 
+#
+# fast-path synchronization
+#
+
+# before using a nonblocking mechanism, which has some overhead, use a busy-loop
+# that queries the state of the object to synchronize. this reduces latency,
+# especially for short operations. note that because it does not actually perform
+# the synchronization, when it returns true (indicating that the object is synchronized)
+# the actual synchronization API should be called again.
+
+function fast_synchronization(f, obj)
+    # fast path
+    f(obj) && return true
+
+    # minimize latency of short operations by busy-waiting,
+    # initially without even yielding to other tasks
+    spins = 0
+    while spins < 256
+        if spins < 32
+            ccall(:jl_cpu_pause, Cvoid, ())
+            # temporary solution before we have gc transition support in codegen.
+            ccall(:jl_gc_safepoint, Cvoid, ())
+        else
+            yield()
+        end
+        f(obj) && return true
+        spins += 1
+    end
+
+    return false
+end
+
+
 #
 # nonblocking sync
 #
 
 @static if VERSION >= v"1.9.2"
 
-# if we support foreign threads, perform the synchronization on a separate thread.
+# if we support foreign threads, perform the actual synchronization on a separate thread.
 
 const MAX_SYNC_THREADS = 4
 const sync_channels = Array{BidirectionalChannel{Any}}(undef, MAX_SYNC_THREADS)
@@ -133,29 +166,37 @@ end
 
 function device_synchronize()
     if use_nonblocking_synchronization
-        nonblocking_synchronize(context())
+        if fast_synchronization(isdone, legacy_stream())
+            cuCtxSynchronize()
+        else
+            nonblocking_synchronize(context())
+        end
     else
         cuCtxSynchronize()
     end
+
     check_exceptions()
 end
 
 function synchronize(stream::CuStream=stream())
     if use_nonblocking_synchronization
-        if !isdone(stream)
-            # slow path
+        if fast_synchronization(isdone, stream)
+            cuStreamSynchronize(stream)
+        else
             nonblocking_synchronize(stream)
         end
     else
         cuStreamSynchronize(stream)
     end
+
     check_exceptions()
 end
 
 function synchronize(event::CuEvent)
     if use_nonblocking_synchronization
-        if !isdone(event)
-            # slow path
+        if fast_synchronization(isdone, event)
+            cuEventSynchronize(event)
+        else
             nonblocking_synchronize(event)
         end
     else
@@ -171,32 +212,16 @@ else
 # requiring us to perform the actual API call again after nonblocking synchronization.
 
 function nonblocking_synchronize(stream::CuStream)
-    # fast path
-    isdone(stream) && return
-
-    # minimize latency of short operations by busy-waiting,
-    # initially without even yielding to other tasks
-    spins = 0
-    while spins < 256
-        if spins < 32
-            ccall(:jl_cpu_pause, Cvoid, ())
-            # Temporary solution before we have gc transition support in codegen.
-            ccall(:jl_gc_safepoint, Cvoid, ())
-        else
-            yield()
-        end
-        isdone(stream) && return
-        spins += 1
-    end
-
-    # minimize CPU usage of long-running kernels by waiting for an event signalled by CUDA
+    # wait for an event signalled by CUDA
     event = Base.Event()
     launch(; stream) do
         notify(event)
     end
+
     # if an error occurs, the callback may never fire, so use a timer to detect such cases
     dev = device()
     timer = Timer(0; interval=1)
+
     Base.@sync begin
         Threads.@spawn try
             device!(dev)
@@ -226,7 +251,10 @@ end
 
 function device_synchronize()
     if use_nonblocking_synchronization
-        nonblocking_synchronize(legacy_stream())
+        stream = legacy_stream()
+        if !fast_synchronization(isdone, stream)
+            nonblocking_synchronize(stream)
+        end
     end
     cuCtxSynchronize()
 
@@ -235,34 +263,20 @@ end
 
 function synchronize(stream::CuStream=stream())
     if use_nonblocking_synchronization
-        nonblocking_synchronize(stream)
+        if !fast_synchronization(isdone, stream)
+            nonblocking_synchronize(stream)
+        end
     end
     cuStreamSynchronize(stream)
 
     check_exceptions()
 end
 
-function synchronize(e::CuEvent)
+function synchronize(event::CuEvent)
     if use_nonblocking_synchronization
-        # fast path
-        isdone(e) && return
-
-        # spin (initially without yielding to minimize latency)
-        spins = 0
-        while spins < 256
-            if spins < 32
-                ccall(:jl_cpu_pause, Cvoid, ())
-                # Temporary solution before we have gc transition support in codegen.
-                ccall(:jl_gc_safepoint, Cvoid, ())
-            else
-                yield()
-            end
-            isdone(e) && return
-            spins += 1
-        end
+        fast_synchronization(isdone, event)
     end
-
-    cuEventSynchronize(e)
+    cuEventSynchronize(event)
 end
 
 end