Merge pull request #2224 from JuliaGPU/tb/backports_5.1

Backports for v5.1
JuliaGPU · Jan 7, 2024 · fc99b1d · fc99b1d · maleadt · Jan 7, 2024
2 parents ffcd7e3 + 3f1cf19
commit fc99b1d
Show file tree

Hide file tree

Showing 10 changed files with 80 additions and 116 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "CUDA"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
-version = "5.1.1"
+version = "5.1.2"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"

diff --git a/docs/src/development/kernel.md b/docs/src/development/kernel.md
@@ -66,28 +66,6 @@ julia> a
  42
 ```
 
-Kernels can also mutate `Ref` boxes:
-
-```julia
-function my_kernel(a)
-    a[] = 42
-    return
-end
-```
-
-```julia-repl
-julia> box = Ref(1)
-
-julia> CUDA.@sync @cuda my_kernel(box);
-
-julia> box[]
-42
-```
-
-Note the `CUDA.@sync` here: GPU operations always execute asynchronously, so we need to
-wait for the GPU to finish before we can access the result. This is not needed when using
-`CuArray`s, as they automatically synchronize on access.
-
 
 ## Launch configuration and indexing
 

diff --git a/lib/cusolver/sparse_factorizations.jl b/lib/cusolver/sparse_factorizations.jl
@@ -17,7 +17,6 @@ mutable struct SparseQR{T <: BlasFloat} <: Factorization{T}
   m::Cint
   nnzA::Cint
   mu::T
-  handle::cusolverSpHandle_t
   descA::CuMatrixDescriptor
   info::SparseQRInfo
   buffer::Union{CuPtr{Cvoid},CuVector{UInt8}}
@@ -27,12 +26,10 @@ function SparseQR(A::CuSparseMatrixCSR{T,Cint}, index::Char='O') where T <: Blas
     m,n = size(A)
     nnzA = nnz(A)
     mu = zero(T)
-    handle = sparse_handle()
     descA = CuMatrixDescriptor('G', 'L', 'N', index)
-    handle = sparse_handle()
     info = SparseQRInfo()
     buffer = CU_NULL
-    F = SparseQR{T}(n, m, nnzA, mu, handle, descA, info, buffer)
+    F = SparseQR{T}(n, m, nnzA, mu, descA, info, buffer)
     spqr_analyse(F, A)
     spqr_buffer(F, A)
     return F
@@ -50,7 +47,7 @@ end
 # const int *              csrColIndA,
 # csrqrInfo_t              info);
 function spqr_analyse(F::SparseQR{T}, A::CuSparseMatrixCSR{T,Cint}) where T <: BlasFloat
-    cusolverSpXcsrqrAnalysis(F.handle, F.m, F.n, F.nnzA, F.descA, A.rowPtr, A.colVal, F.info)
+    cusolverSpXcsrqrAnalysis(sparse_handle(), F.m, F.n, F.nnzA, F.descA, A.rowPtr, A.colVal, F.info)
     return F
 end
 
@@ -77,7 +74,7 @@ for (bname, iname, fname, sname, pname, elty, relty) in
         function spqr_buffer(F::SparseQR{$elty}, A::CuSparseMatrixCSR{$elty,Cint})
             internalDataInBytes = Ref{Csize_t}(0)
             workspaceInBytes = Ref{Csize_t}(0)
-            $bname(F.handle, F.m, F.n, F.nnzA, F.descA, A.nzVal, A.rowPtr, A.colVal, F.info, internalDataInBytes, workspaceInBytes)
+            $bname(sparse_handle(), F.m, F.n, F.nnzA, F.descA, A.nzVal, A.rowPtr, A.colVal, F.info, internalDataInBytes, workspaceInBytes)
             F.buffer = CuVector{UInt8}(undef, workspaceInBytes[])
             return F
         end
@@ -116,19 +113,19 @@ for (bname, iname, fname, sname, pname, elty, relty) in
         # double             tol,
         # int *              position);
         function spqr_factorise(F::SparseQR{$elty}, A::CuSparseMatrixCSR{$elty,Cint}, tol::$relty)
-            $iname(F.handle, F.m, F.n, F.nnzA, F.descA, A.nzVal, A.rowPtr, A.colVal, F.mu, F.info)
-            $fname(F.handle, F.m, F.n, F.nnzA, CU_NULL, CU_NULL, F.info, F.buffer)
+            $iname(sparse_handle(), F.m, F.n, F.nnzA, F.descA, A.nzVal, A.rowPtr, A.colVal, F.mu, F.info)
+            $fname(sparse_handle(), F.m, F.n, F.nnzA, CU_NULL, CU_NULL, F.info, F.buffer)
             singularity = Ref{Cint}(0)
-            $pname(F.handle, F.info, tol, singularity)
+            $pname(sparse_handle(), F.info, tol, singularity)
             (singularity[] ≥ 0) && throw(SingularException(singularity[]))
             return F
         end
 
         function spqr_factorise_solve(F::SparseQR{$elty}, A::CuSparseMatrixCSR{$elty,Cint}, b::CuVector{$elty}, x::CuVector{$elty}, tol::$relty)
-            $iname(F.handle, F.m, F.n, F.nnzA, F.descA, A.nzVal, A.rowPtr, A.colVal, F.mu, F.info)
-            $fname(F.handle, F.m, F.n, F.nnzA, b, x, F.info, F.buffer)
+            $iname(sparse_handle(), F.m, F.n, F.nnzA, F.descA, A.nzVal, A.rowPtr, A.colVal, F.mu, F.info)
+            $fname(sparse_handle(), F.m, F.n, F.nnzA, b, x, F.info, F.buffer)
             singularity = Ref{Cint}(0)
-            $pname(F.handle, F.info, tol, singularity)
+            $pname(sparse_handle(), F.info, tol, singularity)
             (singularity[] ≥ 0) && throw(SingularException(singularity[]))
             return F
         end
@@ -144,14 +141,14 @@ for (bname, iname, fname, sname, pname, elty, relty) in
         # csrqrInfo_t        info,
         # void *             pBuffer);
         function spqr_solve(F::SparseQR{$elty}, b::CuVector{$elty}, x::CuVector{$elty})
-            $sname(F.handle, F.m, F.n, b, x, F.info, F.buffer)
+            $sname(sparse_handle(), F.m, F.n, b, x, F.info, F.buffer)
             return x
         end
 
         function spqr_solve(F::SparseQR{$elty}, B::CuMatrix{$elty}, X::CuMatrix{$elty})
             m, p = size(B)
             for j=1:p
-                $sname(F.handle, F.m, F.n, view(B,:,j), view(X,:,j), F.info, F.buffer)
+                $sname(sparse_handle(), F.m, F.n, view(B,:,j), view(X,:,j), F.info, F.buffer)
             end
             return X
         end
@@ -175,7 +172,6 @@ Base.unsafe_convert(::Type{csrcholInfo_t}, info::SparseCholeskyInfo) = info.info
 mutable struct SparseCholesky{T <: BlasFloat} <: Factorization{T}
     n::Cint
     nnzA::Cint
-    handle::cusolverSpHandle_t
     descA::CuMatrixDescriptor
     info::SparseCholeskyInfo
     buffer::Union{CuPtr{Cvoid},CuVector{UInt8}}
@@ -184,11 +180,10 @@ end
 function SparseCholesky(A::Union{CuSparseMatrixCSC{T,Cint},CuSparseMatrixCSR{T,Cint}}, index::Char='O') where T <: BlasFloat
     n = checksquare(A)
     nnzA = nnz(A)
-    handle = sparse_handle()
     descA = CuMatrixDescriptor('G', 'L', 'N', index)
     info = SparseCholeskyInfo()
     buffer = CU_NULL
-    F = SparseCholesky{T}(n, nnzA, handle, descA, info, buffer)
+    F = SparseCholesky{T}(n, nnzA, descA, info, buffer)
     spcholesky_analyse(F, A)
     spcholesky_buffer(F, A)
     return F
@@ -206,9 +201,9 @@ end
 #   csrcholInfo_t            info);
 function spcholesky_analyse(F::SparseCholesky{T}, A::Union{CuSparseMatrixCSC{T,Cint},CuSparseMatrixCSR{T,Cint}}) where T <: BlasFloat
     if A isa CuSparseMatrixCSC
-        cusolverSpXcsrcholAnalysis(F.handle, F.n, F.nnzA, F.descA, A.colPtr, A.rowVal, F.info)
+        cusolverSpXcsrcholAnalysis(sparse_handle(), F.n, F.nnzA, F.descA, A.colPtr, A.rowVal, F.info)
     else
-        cusolverSpXcsrcholAnalysis(F.handle, F.n, F.nnzA, F.descA, A.rowPtr, A.colVal, F.info)
+        cusolverSpXcsrcholAnalysis(sparse_handle(), F.n, F.nnzA, F.descA, A.rowPtr, A.colVal, F.info)
     end
     return F
 end
@@ -236,9 +231,9 @@ for (bname, fname, pname, elty, relty) in
             internalDataInBytes = Ref{Csize_t}(0)
             workspaceInBytes = Ref{Csize_t}(0)
             if A isa CuSparseMatrixCSC
-                $bname(F.handle, F.n, F.nnzA, F.descA, A.nzVal, A.colPtr, A.rowVal, F.info, internalDataInBytes, workspaceInBytes)
+                $bname(sparse_handle(), F.n, F.nnzA, F.descA, A.nzVal, A.colPtr, A.rowVal, F.info, internalDataInBytes, workspaceInBytes)
             else
-                $bname(F.handle, F.n, F.nnzA, F.descA, A.nzVal, A.rowPtr, A.colVal, F.info, internalDataInBytes, workspaceInBytes)
+                $bname(sparse_handle(), F.n, F.nnzA, F.descA, A.nzVal, A.rowPtr, A.colVal, F.info, internalDataInBytes, workspaceInBytes)
             end
             F.buffer = CuVector{UInt8}(undef, workspaceInBytes[])
             return F
@@ -267,12 +262,12 @@ for (bname, fname, pname, elty, relty) in
         function spcholesky_factorise(F::SparseCholesky{$elty}, A::Union{CuSparseMatrixCSC{$elty,Cint},CuSparseMatrixCSR{$elty,Cint}}, tol::$relty)
             if A isa CuSparseMatrixCSC
                 nzval = $elty <: Complex ? conj(A.nzVal) : A.nzVal
-                $fname(F.handle, F.n, F.nnzA, F.descA, nzval, A.colPtr, A.rowVal, F.info, F.buffer)
+                $fname(sparse_handle(), F.n, F.nnzA, F.descA, nzval, A.colPtr, A.rowVal, F.info, F.buffer)
             else
-                $fname(F.handle, F.n, F.nnzA, F.descA, A.nzVal, A.rowPtr, A.colVal, F.info, F.buffer)
+                $fname(sparse_handle(), F.n, F.nnzA, F.descA, A.nzVal, A.rowPtr, A.colVal, F.info, F.buffer)
             end
             singularity = Ref{Cint}(0)
-            $pname(F.handle, F.info, tol, singularity)
+            $pname(sparse_handle(), F.info, tol, singularity)
             (singularity[] ≥ 0) && throw(SingularException(singularity[]))
             return F
         end
@@ -294,14 +289,14 @@ for (sname, dname, elty, relty) in ((:cusolverSpScsrcholSolve, :cusolverSpScsrch
         #   csrcholInfo_t          info,
         #   void *                 pBuffer);
         function spcholesky_solve(F::SparseCholesky{$elty}, b::CuVector{$elty}, x::CuVector{$elty})        
-            $sname(F.handle, F.n, b, x, F.info, F.buffer)
+            $sname(sparse_handle(), F.n, b, x, F.info, F.buffer)
             return x
         end
 
         function spcholesky_solve(F::SparseCholesky{$elty}, B::CuMatrix{$elty}, X::CuMatrix{$elty})
             n, p = size(B)
             for j=1:p
-                $sname(F.handle, F.n, view(B,:,j), view(X,:,j), F.info, F.buffer)
+                $sname(sparse_handle(), F.n, view(B,:,j), view(X,:,j), F.info, F.buffer)
             end
             return X
         end
@@ -313,7 +308,7 @@ for (sname, dname, elty, relty) in ((:cusolverSpScsrcholSolve, :cusolverSpScsrch
         #   csrcholInfo_t      info,
         #   float *            diag);
         function spcholesky_diag(F::SparseCholesky{$elty}, diag::CuVector{$relty})
-            $dname(F.handle, F.info, diag)
+            $dname(sparse_handle(), F.info, diag)
             return diag
         end
     end

diff --git a/src/array.jl b/src/array.jl
@@ -210,12 +210,15 @@ end
 ## unsafe_wrap
 
 """
+  # simple case, wrapping a CuArray around an existing GPU pointer
   unsafe_wrap(CuArray, ptr::CuPtr{T}, dims; own=false, ctx=context())
 
-  # requires
+  # wraps a CPU array object around a unified GPU array
   unsafe_wrap(Array, a::CuArray)
 
-  # requires HMM
+  # wraps a GPU array object around a CPU array.
+  # if your system supports HMM, this is a fast operation.
+  # in other cases, it has to use page locking, which can be slow.
   unsafe_wrap(CuArray, ptr::ptr{T}, dims)
   unsafe_wrap(CuArray, a::Array)
 

diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
@@ -153,38 +153,15 @@ function Adapt.adapt_storage(::KernelAdaptor, xs::DenseCuArray{T,N}) where {T,N}
 end
 
 # Base.RefValue isn't GPU compatible, so provide a compatible alternative.
-# however, `Ref` is commonly used for two different purposes:
-# - as a way to box a value and pass that box by (mutable) reference;
-# - to force treating an argument to broadcast as a scalar.
-# as the latter is often used with complex inputs like `CuArrays`, we need to adapt.
-# however, that breaks the ability to mutate, as adapting allocates a new object.
-# to support both, we differentiate based on the type of the value being boxed.
+# Note that it isn't safe to use unified or heterogeneous memory to support a
+# mutable Ref, because there's no guarantee that the memory would be kept alive
+# long enough (especially with broadcast using ephemeral Refs for scalar args).
 struct CuRefValue{T} <: Ref{T}
     val::T
 end
 Base.getindex(r::CuRefValue{T}) where T = r.val
-struct CuRefPointer{T} <: Ref{T}
-    ptr::Ptr{T}
-end
-Base.getindex(r::CuRefPointer{T}) where T = unsafe_load(r.ptr)
-Base.setindex!(r::CuRefPointer{T}, v) where T = unsafe_store!(r.ptr, convert(T, v))
-function Adapt.adapt_structure(to::KernelAdaptor, ref::Base.RefValue{T}) where T
-    if isbitstype(T) && sizeof(T) > 0
-        ptr = Base.unsafe_convert(Ptr{T}, ref)
-        if driver_version() < v"12.2" ||
-           attribute(device(), DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS) != 1
-            # no HMM, need to register this memory
-            ctx = context()
-            Mem.__pin(ptr, sizeof(T))
-            finalizer(ref) do _
-                Mem.__unpin(ptr, ctx)
-            end
-        end
-        CuRefPointer{T}(ptr)
-    else
-        CuRefValue(adapt(to, ref[]))
-    end
-end
+Adapt.adapt_structure(to::KernelAdaptor, ref::Base.RefValue) =
+    CuRefValue(adapt(to, ref[]))
 
 # broadcast sometimes passes a ref(type), resulting in a GPU-incompatible DataType box.
 # avoid that by using a special kind of ref that knows about the boxed type.

diff --git a/src/device/random.jl b/src/device/random.jl
@@ -50,9 +50,6 @@ function initialize_rng_state()
     @inbounds global_random_counters()[warpId] = 0
 end
 
-@device_override Random.make_seed() = clock(UInt32)
-
-
 # generators
 
 using Random123: philox2x_round, philox2x_bumpkey
@@ -108,8 +105,23 @@ function Random.seed!(rng::Philox2x32, seed::Integer, counter::Integer=0)
     return
 end
 
-@device_override Random.seed!(::Random._GLOBAL_RNG, seed) =
-    Random.seed!(Random.default_rng(), seed)
+if VERSION >= v"1.11-"
+    # `Random.seed!(::AbstractRNG)` now passes a `nothing` seed value
+    Random.seed!(rng::Philox2x32, seed::Nothing) =
+        Random.seed!(rng, clock(UInt32))
+else
+    # ... where it used to call `Random.make_seed()`
+    @device_override Random.make_seed() = clock(UInt32)
+end
+
+# seeding the implicit default RNG
+if VERSION >= v"1.11-"
+    @device_override Random.seed!(seed) =
+        Random.seed!(Random.default_rng(), seed)
+else
+    @device_override Random.seed!(::Random._GLOBAL_RNG, seed) =
+        Random.seed!(Random.default_rng(), seed)
+end
 
 """
     Random.rand(rng::Philox2x32, UInt32)

diff --git a/test/Project.toml b/test/Project.toml
@@ -23,3 +23,6 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[compat]
+Aqua = "0.8"
diff --git a/test/base/aqua.jl b/test/base/aqua.jl
@@ -1,20 +1,21 @@
 using Aqua
 
-# FIXME: Adapt.WrappedArray contains subtypes that do not bind the N typevar
-#Aqua.test_unbound_args(CUDA)
-
 # FIXME: we have plenty of ambiguities, let's at least ensure that we don't create more
-#Aqua.test_ambiguities(CUDA)
 let ambs = Aqua.detect_ambiguities(CUDA; recursive=true)
     pkg_match(pkgname, pkgdir::Nothing) = false
     pkg_match(pkgname, pkgdir::AbstractString) = occursin(pkgname, pkgdir)
     filter!(x -> pkg_match("CUDA", pkgdir(last(x).module)), ambs)
     @test length(ambs) ≤ 18
 end
 
-Aqua.test_undefined_exports(CUDA)
-Aqua.test_stale_deps(CUDA; ignore=[:CUDA_Runtime_Discovery, :CUDA_Runtime_jll,
-                                   :SpecialFunctions])
-Aqua.test_deps_compat(CUDA)
-Aqua.test_project_extras(CUDA)
-Aqua.test_piracy(CUDA)
+Aqua.test_all(CUDA;
+    stale_deps=(ignore=[:CUDA_Runtime_Discovery, :CUDA_Runtime_jll,
+                        :SpecialFunctions],),
+
+    # tested above
+    ambiguities=false,
+
+    # FIXME: Adapt.WrappedArray contains subtypes that do not bind the N typevar
+    #Aqua.test_unbound_args(CUDA)
+    unbound_args=false
+)
diff --git a/test/core/execution.jl b/test/core/execution.jl
@@ -595,17 +595,6 @@ end
     @test f(2) == 2
 end
 
-@testset "Ref boxes" begin
-    function kernel(x)
-        x[] += 1
-        return
-    end
-
-    box = Ref(1)
-    CUDA.@sync @cuda kernel(box)
-    @test box[] == 2
-end
-
 end
 
 ############################################################################################