From 54bf5b08d7531f3b51a3fbb4626c157674317a4f Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Nov 2024 15:13:15 +0100 Subject: [PATCH 1/5] Native RNG: Improve randn determinism by using fixed grid. --- src/random.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/random.jl b/src/random.jl index f5a05ae3e1..c46f914275 100644 --- a/src/random.jl +++ b/src/random.jl @@ -150,11 +150,11 @@ function Random.randn!(rng::RNG, A::AnyCuArray{<:Union{AbstractFloat,Complex{<:A return end - kernel = @cuda launch=false name="rand!" kernel(A, rng.seed, rng.counter) - config = launch_configuration(kernel.fun; max_threads=64) - threads = max(32, min(config.threads, length(A)÷2)) - blocks = min(config.blocks, cld(cld(length(A), 2), threads)) - kernel(A, rng.seed, rng.counter; threads, blocks) + # see note in `rand!` about the launch configuration + threads = 32 + blocks = cld(cld(length(A), 2), threads) + + @cuda threads=threads blocks=blocks name="randn!" kernel(A, rng.seed, rng.counter) new_counter = Int64(rng.counter) + length(A) overflow, remainder = fldmod(new_counter, typemax(UInt32)) From 0a515fca1d49646ea0f483d49935af0d60c19a81 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Nov 2024 15:13:33 +0100 Subject: [PATCH 2/5] Native RNG: Support large arrays by avoiding counter overflowing. 
--- src/random.jl | 12 ++++++------ test/base/random.jl | 6 ++++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/random.jl b/src/random.jl index c46f914275..ff26d96a1d 100644 --- a/src/random.jl +++ b/src/random.jl @@ -52,8 +52,8 @@ function Random.rand!(rng::RNG, A::AnyCuArray) # grid-stride loop threadId = threadIdx().x - window = blockDim().x * gridDim().x - offset = (blockIdx().x - 1) * blockDim().x + window = widemul(blockDim().x, gridDim().x) + offset = widemul(blockIdx().x - 1i32, blockDim().x) while offset < length(A) i = threadId + offset if i <= length(A) @@ -96,8 +96,8 @@ function Random.randn!(rng::RNG, A::AnyCuArray{<:Union{AbstractFloat,Complex{<:A # grid-stride loop threadId = threadIdx().x - window = (blockDim().x - 1) * gridDim().x - offset = (blockIdx().x - 1) * blockDim().x + window = widemul(blockDim().x - 1i32, gridDim().x) + offset = widemul(blockIdx().x - 1i32, blockDim().x) while offset < length(A) i = threadId + offset j = threadId + offset + window @@ -129,8 +129,8 @@ function Random.randn!(rng::RNG, A::AnyCuArray{<:Union{AbstractFloat,Complex{<:A # grid-stride loop threadId = threadIdx().x - window = (blockDim().x - 1) * gridDim().x - offset = (blockIdx().x - 1) * blockDim().x + window = widemul(blockDim().x - 1i32, gridDim().x) + offset = widemul(blockIdx().x - 1i32, blockDim().x) while offset < length(A) i = threadId + offset if i <= length(A) diff --git a/test/base/random.jl b/test/base/random.jl index 85e0799458..d98a2bf04d 100644 --- a/test/base/random.jl +++ b/test/base/random.jl @@ -198,3 +198,9 @@ end end end +@testset "counter overflow" begin + rng = CUDA.RNG() + c = CUDA.zeros(Float16, (64, 32, 512, 32, 64)) + rand!(rng, c) + randn!(rng, c) +end From 0bbeb9766d1fea3fa387663445218b3d1cdcb231 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Nov 2024 16:58:34 +0100 Subject: [PATCH 3/5] Native RNG: Fix randn window calculation. 
--- src/random.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/random.jl b/src/random.jl index ff26d96a1d..b8b1f222c0 100644 --- a/src/random.jl +++ b/src/random.jl @@ -96,7 +96,7 @@ function Random.randn!(rng::RNG, A::AnyCuArray{<:Union{AbstractFloat,Complex{<:A # grid-stride loop threadId = threadIdx().x - window = widemul(blockDim().x - 1i32, gridDim().x) + window = widemul(blockDim().x, gridDim().x) offset = widemul(blockIdx().x - 1i32, blockDim().x) while offset < length(A) i = threadId + offset @@ -129,7 +129,7 @@ function Random.randn!(rng::RNG, A::AnyCuArray{<:Union{AbstractFloat,Complex{<:A # grid-stride loop threadId = threadIdx().x - window = widemul(blockDim().x - 1i32, gridDim().x) + window = widemul(blockDim().x, gridDim().x) offset = widemul(blockIdx().x - 1i32, blockDim().x) while offset < length(A) i = threadId + offset From c23e09f43d534216e6bbaf5ba29499c3d31364f5 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 10 Dec 2024 14:56:44 +0100 Subject: [PATCH 4/5] Use host memory to prevent OOM. --- test/base/random.jl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/base/random.jl b/test/base/random.jl index d98a2bf04d..d48cf7288b 100644 --- a/test/base/random.jl +++ b/test/base/random.jl @@ -200,7 +200,12 @@ end @testset "counter overflow" begin rng = CUDA.RNG() - c = CUDA.zeros(Float16, (64, 32, 512, 32, 64)) - rand!(rng, c) - randn!(rng, c) + # we may not be able to allocate over 4GB on the GPU, so use CPU memory + #c = CUDA.zeros(Float16, (64, 32, 512, 32, 64)) + c = Array{Float16}(undef, 64, 32, 512, 32, 64) + GC.@preserve c begin + dc = unsafe_wrap(CuArray, c) + rand!(rng, dc) + randn!(rng, dc) + end end From 150af15565c0ca260f58cf7ee573f2969308d2be Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 11 Dec 2024 08:42:34 +0100 Subject: [PATCH 5/5] Try using unified memory. 
--- test/base/random.jl | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/test/base/random.jl b/test/base/random.jl index d48cf7288b..62a6811539 100644 --- a/test/base/random.jl +++ b/test/base/random.jl @@ -200,12 +200,8 @@ end @testset "counter overflow" begin rng = CUDA.RNG() - # we may not be able to allocate over 4GB on the GPU, so use CPU memory - #c = CUDA.zeros(Float16, (64, 32, 512, 32, 64)) - c = Array{Float16}(undef, 64, 32, 512, 32, 64) - GC.@preserve c begin - dc = unsafe_wrap(CuArray, c) - rand!(rng, dc) - randn!(rng, dc) - end + # we may not be able to allocate over 4GB on the GPU, so use unified memory + c = CuArray{Float16, 5, CUDA.UnifiedMemory}(undef, 64, 32, 512, 32, 64) + rand!(rng, c) + randn!(rng, c) end