Handle cache improvements #2352

Merged · 7 commits · Apr 27, 2024
8 changes: 6 additions & 2 deletions .buildkite/pipeline.yml
@@ -140,8 +140,12 @@ steps:
 try
     Pkg.instantiate()
 catch
-    # if we fail to instantiate, assume that we need a newer CUDA.jl
-    Pkg.develop(path=".")
+    # if we fail to instantiate, assume that we need newer dependencies
+    deps = [PackageSpec(path=".")]
+    if "{{matrix.package}}" == "cuTensorNet"
+        push!(deps, PackageSpec(path="lib/cutensor"))
+    end
+    Pkg.develop(deps)
 end

 Pkg.add("CUDA_Runtime_jll")
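
For reference, `Pkg.develop` accepts a vector of `PackageSpec`s, which is what lets the fallback above dev both the checked-out package and the vendored cuTensor in a single call. A minimal sketch, assuming it runs from the root of a CUDA.jl checkout:

    using Pkg

    # Develop the top-level package plus a subpackage from the same checkout;
    # both paths are resolved relative to the current working directory.
    deps = [PackageSpec(path="."), PackageSpec(path="lib/cutensor")]
    Pkg.develop(deps)

Using `PackageSpec(path=...)` keeps CI testing the in-repo versions of both packages rather than their registered releases.
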
79 changes: 46 additions & 33 deletions lib/cublas/CUBLAS.jl
@@ -71,9 +71,20 @@ function math_mode!(handle, mode)
     return
 end

-# cache for created, but unused handles
-const idle_handles = HandleCache{CuContext,cublasHandle_t}()
-const idle_xt_handles = HandleCache{Any,cublasXtHandle_t}()
+
+## handles
+
+function handle_ctor(ctx)
+    context!(ctx) do
+        cublasCreate()
+    end
+end
+function handle_dtor(ctx, handle)
+    context!(ctx; skip_destroyed=true) do
+        cublasDestroy_v2(handle)
+    end
+end
+const idle_handles = HandleCache{CuContext,cublasHandle_t}(handle_ctor, handle_dtor)

 function handle()
     cuda = CUDA.active_state()

@@ -86,20 +97,12 @@ function handle()

     # get library state
     @noinline function new_state(cuda)
-        new_handle = pop!(idle_handles, cuda.context) do
-            cublasCreate()
-        end
-
+        new_handle = pop!(idle_handles, cuda.context)
         finalizer(current_task()) do task
-            push!(idle_handles, cuda.context, new_handle) do
-                context!(cuda.context; skip_destroyed=true) do
-                    cublasDestroy_v2(new_handle)
-                end
-            end
+            push!(idle_handles, cuda.context, new_handle)
         end

         cublasSetStream_v2(new_handle, cuda.stream)
-
         math_mode!(new_handle, cuda.math_mode)

         (; handle=new_handle, cuda.stream, cuda.math_mode)

@@ -129,6 +132,34 @@ function handle()
     return state.handle
 end

+
+## xt handles
+
+function xt_handle_ctor(ctx)
+    context!(ctx) do
+        cublasXtCreate()
+    end
+end
+function xt_handle_dtor(ctx, handle)
+    context!(ctx; skip_destroyed=true) do
+        cublasXtDestroy(handle)
+    end
+end
+const idle_xt_handles =
+    HandleCache{CuContext,cublasXtHandle_t}(xt_handle_ctor, xt_handle_dtor)
+
+function devices!(devs::Vector{CuDevice})
+    task_local_storage(:CUBLASxt_devices, sort(devs; by=deviceid))
+    return
+end
+
+devices() = get!(task_local_storage(), :CUBLASxt_devices) do
+    # by default, select all devices
+    sort(collect(CUDA.devices()); by=deviceid)
+end::Vector{CuDevice}
+
+ndevices() = length(devices())
+
 function xt_handle()
     cuda = CUDA.active_state()

@@ -147,15 +178,9 @@ function xt_handle()

     # get library state
     @noinline function new_state(cuda)
-        new_handle = pop!(idle_xt_handles, cuda.context) do
-            cublasXtCreate()
-        end
-
+        new_handle = pop!(idle_xt_handles, cuda.context)
         finalizer(current_task()) do task
-            push!(idle_xt_handles, cuda.context, new_handle) do
-                # TODO: which context do we need to destroy this on?
-                cublasXtDestroy(new_handle)
-            end
+            push!(idle_xt_handles, cuda.context, new_handle)
         end

         devs = convert.(Cint, devices())

@@ -170,18 +195,6 @@ function xt_handle()
     return state.handle
 end

-function devices!(devs::Vector{CuDevice})
-    task_local_storage(:CUBLASxt_devices, sort(devs; by=deviceid))
-    return
-end
-
-devices() = get!(task_local_storage(), :CUBLASxt_devices) do
-    # by default, select all devices
-    sort(collect(CUDA.devices()); by=deviceid)
-end::Vector{CuDevice}
-
-ndevices() = length(devices())
-

 ## logging
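
The same refactoring recurs in each wrapper below: the constructor and destructor are now baked into the `HandleCache` at construction time, so call sites just `pop!` and `push!` without passing `do`-blocks. A minimal sketch of what such a cache could look like (hypothetical; the field and parameter names are invented here, and the real `HandleCache` in CUDA.jl additionally deals with locking and with tracking handles that are in use):

    struct HandleCache{K,V}
        ctor::Function            # key -> new handle
        dtor::Function            # (key, handle) -> nothing
        idle::Dict{K,Vector{V}}   # unused handles, grouped per key
        max_idle::Int             # how many idle handles to keep around
    end
    HandleCache{K,V}(ctor, dtor; max_idle=32) where {K,V} =
        HandleCache{K,V}(ctor, dtor, Dict{K,Vector{V}}(), max_idle)

    # reuse an idle handle if one is available, otherwise create a new one
    function Base.pop!(cache::HandleCache{K,V}, key::K) where {K,V}
        handles = get(cache.idle, key, V[])
        return isempty(handles) ? cache.ctor(key) : pop!(handles)
    end

    # return a handle to the cache, destroying it if the cache is full
    function Base.push!(cache::HandleCache{K,V}, key::K, handle::V) where {K,V}
        handles = get!(() -> V[], cache.idle, key)
        if length(handles) < cache.max_idle
            push!(handles, handle)
        else
            cache.dtor(key, handle)
        end
        return cache
    end

Centralizing construction and destruction like this makes the cache self-contained: any code holding the cache can shrink or clear it, instead of only the call site that happened to know how to destroy a handle.
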
2 changes: 1 addition & 1 deletion lib/cudnn/Project.toml
@@ -11,7 +11,7 @@ CUDNN_jll = "62b44479-cb7b-5706-934f-f13b2eb2e645"

 [compat]
 CEnum = "0.2, 0.3, 0.4, 0.5"
-CUDA = "~5.3, ~5.4"
+CUDA = "~5.4"
 CUDA_Runtime_Discovery = "0.2"
 CUDNN_jll = "~9.0"
 julia = "1.8"
27 changes: 16 additions & 11 deletions lib/cudnn/src/cuDNN.jl
@@ -62,8 +62,20 @@ function math_mode(mode=CUDA.math_mode())
     end
 end

-# cache for created, but unused handles
-const idle_handles = HandleCache{CuContext,cudnnHandle_t}()
+
+## handles
+
+function handle_ctor(ctx)
+    context!(ctx) do
+        cudnnCreate()
+    end
+end
+function handle_dtor(ctx, handle)
+    context!(ctx; skip_destroyed=true) do
+        cudnnDestroy(handle)
+    end
+end
+const idle_handles = HandleCache{CuContext,cudnnHandle_t}(handle_ctor, handle_dtor)

 function handle()
     cuda = CUDA.active_state()

@@ -76,16 +88,9 @@ function handle()

     # get library state
     @noinline function new_state(cuda)
-        new_handle = pop!(idle_handles, cuda.context) do
-            cudnnCreate()
-        end
-
+        new_handle = pop!(idle_handles, cuda.context)
         finalizer(current_task()) do task
-            push!(idle_handles, cuda.context, new_handle) do
-                context!(cuda.context; skip_destroyed=true) do
-                    cudnnDestroy(new_handle)
-                end
-            end
+            push!(idle_handles, cuda.context, new_handle)
         end

         cudnnSetStream(new_handle, cuda.stream)
30 changes: 18 additions & 12 deletions lib/cufft/wrappers.jl
@@ -43,7 +43,7 @@ function cufftMakePlan(xtype::cufftType_t, xdims::Dims, region)
     end
     if ((region...,) == ((1:nrank)...,))
         # handle simple case, transforming the first nrank dimensions, ... simply! (for robustness)
-        # arguments are: plan, rank, transform-sizes, inembed, istride, idist, onembed, ostride, odist, type batch
+        # arguments are: plan, rank, transform-sizes, inembed, istride, idist, onembed, ostride, odist, type batch
         cufftMakePlanMany(handle, nrank, Cint[rsz...], C_NULL, 1, 1, C_NULL, 1, 1,
                           xtype, batch, worksize_ref)
     else

@@ -151,29 +151,35 @@ function cufftMakePlan(xtype::cufftType_t, xdims::Dims, region)
     handle, worksize_ref[]
 end

-# plan cache
-const cufftHandleCacheKey = Tuple{CuContext, cufftType_t, Dims, Any}
-const idle_handles = HandleCache{cufftHandleCacheKey, cufftHandle}()
-function cufftGetPlan(args...)
-
-    ctx = context()
-    handle = pop!(idle_handles, (ctx, args...)) do
+
+## plan cache
+
+const cufftHandleCacheKey = Tuple{CuContext, cufftType_t, Dims, Any}
+function handle_ctor((ctx, args...))
+    context!(ctx) do
         # make the plan
         handle, worksize = cufftMakePlan(args...)

         # NOTE: we currently do not use the worksize to allocate our own workarea,
         # instead relying on the automatic allocation strategy.
         handle
     end
+end
+function handle_dtor((ctx, args...), handle)
+    context!(ctx; skip_destroyed=true) do
+        cufftDestroy(handle)
+    end
+end
+const idle_handles = HandleCache{cufftHandleCacheKey, cufftHandle}(handle_ctor, handle_dtor)
+
+function cufftGetPlan(args...)
+    ctx = context()
+    handle = pop!(idle_handles, (ctx, args...))

     # assign to the current stream
     cufftSetStream(handle, stream())

     return handle
 end
 function cufftReleasePlan(plan)
-    push!(idle_handles, plan) do
-        cufftDestroy(plan)
-    end
+    push!(idle_handles, plan)
 end
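
With the plan cache factored this way, consuming code wraps each transform in a get/release pair. A hedged usage sketch (the plan arguments are assumptions for illustration: a `CUFFT_C2C` transform over a 256×256 array along both dimensions):

    plan = cufftGetPlan(CUFFT_C2C, (256, 256), 1:2)
    try
        # ... execute transforms with `plan` on the current stream ...
    finally
        # return the plan to the idle cache, so an identically-shaped
        # transform on this context can reuse it instead of re-planning
        cufftReleasePlan(plan)
    end

Note that the cache key includes the `CuContext`, so plans are never reused across contexts.
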
27 changes: 18 additions & 9 deletions lib/curand/CURAND.jl
@@ -21,8 +21,21 @@ include("wrappers.jl")
 # high-level integrations
 include("random.jl")

-# cache for created, but unused handles
-const idle_curand_rngs = HandleCache{CuContext,RNG}()
+
+## handles
+
+function handle_ctor(ctx)
+    context!(ctx) do
+        RNG()
+    end
+end
+function handle_dtor(ctx, handle)
+    context!(ctx; skip_destroyed=true) do
+        # no need to do anything, as the RNG is collected by its finalizer
+        # TODO: early free?
+    end
+end
+const idle_curand_rngs = HandleCache{CuContext,RNG}(handle_ctor, handle_dtor)

 function default_rng()
     cuda = CUDA.active_state()

@@ -35,17 +48,13 @@ function default_rng()

     # get library state
     @noinline function new_state(cuda)
-        new_rng = pop!(idle_curand_rngs, cuda.context) do
-            RNG()
-        end
-
+        new_rng = pop!(idle_curand_rngs, cuda.context)
         finalizer(current_task()) do task
-            push!(idle_curand_rngs, cuda.context, new_rng) do
-                # no need to do anything, as the RNG is collected by its finalizer
-            end
+            push!(idle_curand_rngs, cuda.context, new_rng)
         end

         Random.seed!(new_rng)

         (; rng=new_rng)
     end
     state = get!(states, cuda.context) do
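
All of these wrappers rely on the same task-bound lifetime trick: the handle lives in task-local state, and a finalizer registered on the current `Task` object pushes it back into the idle cache once the task becomes unreachable. A stripped-down sketch of the pattern, with every name hypothetical and a plain `Vector` plus counter standing in for real handle creation (not thread-safe as written):

    const idle = Int[]         # stand-in for the idle-handle cache
    const counter = Ref(0)     # stand-in for expensive handle creation

    function task_handle()
        get!(task_local_storage(), :handle) do
            h = isempty(idle) ? (counter[] += 1) : pop!(idle)
            # recycle the handle once this task is garbage-collected
            finalizer(current_task()) do task
                push!(idle, h)
            end
            h
        end::Int
    end

The real wrappers layer library-specific setup on top of this, as the diffs above show: stream assignment, math mode, and RNG seeding.
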