From 5f2e58e94937a653dccb96b5a9fa116cb79c9aa4 Mon Sep 17 00:00:00 2001
From: Charles Kawczynski
Date: Fri, 17 May 2024 12:08:17 -0400
Subject: [PATCH] Specialize cases in run_field_matrix_solver, add debug info

Try unrolled_all

Fix
---
 ext/cuda/cuda_utils.jl                        | 54 ++++++++++++-------
 .../matrix_fields_multiple_field_solve.jl     | 35 ++++--------
 ext/cuda/matrix_fields_single_field_solve.jl  | 43 +++++++++++----
 src/MatrixFields/field_matrix_solver.jl       | 26 ++++++---
 4 files changed, 97 insertions(+), 61 deletions(-)

diff --git a/ext/cuda/cuda_utils.jl b/ext/cuda/cuda_utils.jl
index 962219fd20..ae2006700c 100644
--- a/ext/cuda/cuda_utils.jl
+++ b/ext/cuda/cuda_utils.jl
@@ -10,6 +10,7 @@ get_n_items(arr::AbstractArray) = prod(size(parent(arr)))
 get_n_items(tup::Tuple) = prod(tup)
 
 const reported_stats = Dict()
+# Call via ClimaCore.DataLayouts.empty_kernel_stats()
 empty_kernel_stats(::ClimaComms.CUDADevice) = empty!(reported_stats)
 
 collect_kernel_stats() = false
@@ -22,58 +23,75 @@ collect_kernel_stats() = false
         AbstractData,
         Field,
     };
+    auto = false,
     threads_s,
     blocks_s,
     always_inline = true
 )
 
-Launch a cuda kernel, using `CUDA.launch_configuration`
+Launch a CUDA kernel, using `CUDA.launch_configuration` (if `auto = true`)
 to determine the number of threads/blocks.
 
 Suggested threads and blocks (`threads_s`, `blocks_s`) can be given
-to benchmark compare against auto-determined threads/blocks.
+to benchmark against the auto-determined threads/blocks (if `auto = false`).
 """
 function auto_launch!(
     f!::F!,
     args,
     data;
-    threads_s,
-    blocks_s,
+    auto = false,
+    threads_s = nothing,
+    blocks_s = nothing,
     always_inline = true,
+    caller = :unknown,
 ) where {F!}
-    nitems = get_n_items(data)
-    # For now, we'll simply use the
-    # suggested threads and blocks:
-    kernel =
-        CUDA.@cuda always_inline = always_inline threads = threads_s blocks =
-            blocks_s f!(args...)
+    if auto
+        nitems = get_n_items(data)
+        kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
+        config = CUDA.launch_configuration(kernel.fun)
+        threads = min(nitems, config.threads)
+        blocks = cld(nitems, threads)
+        kernel(args...; threads, blocks) # This knows to use always_inline from above.
+    else
+        kernel =
+            CUDA.@cuda always_inline = always_inline threads = threads_s blocks =
+                blocks_s f!(args...)
+    end
     if collect_kernel_stats() # only for development use
-        key = (F!, typeof(args))
+        key = (F!, typeof(args), CUDA.registers(kernel))
+        # CUDA.registers(kernel) > 50 || return nothing # for debugging
+        # occursin("single_field_solve_kernel", string(nameof(F!))) || return nothing
        if !haskey(reported_stats, key)
+            nitems = get_n_items(data)
             kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
             config = CUDA.launch_configuration(kernel.fun)
             threads = min(nitems, config.threads)
             blocks = cld(nitems, threads)
+            # For now, let's just collect info; later we can benchmark
+#! format: off
             s = ""
             s *= "Launching kernel $f! with following config:\n"
             s *= " nitems: $(nitems)\n"
-            s *= " threads_s: $(threads_s)\n"
-            s *= " blocks_s: $(blocks_s)\n"
+            isnothing(threads_s) || (s *= " threads_s: $(threads_s)\n")
+            isnothing(blocks_s) || (s *= " blocks_s: $(blocks_s)\n")
             s *= " threads: $(threads)\n"
             s *= " blocks: $(blocks)\n"
-            s *= " Δthreads: $(threads - prod(threads_s))\n"
-            s *= " Δblocks: $(blocks - prod(blocks_s))\n"
+            isnothing(threads_s) || (s *= " Δthreads: $(threads - prod(threads_s))\n")
+            isnothing(blocks_s) || (s *= " Δblocks: $(blocks - prod(blocks_s))\n")
             s *= " maxthreads: $(CUDA.maxthreads(kernel))\n"
             s *= " registers: $(CUDA.registers(kernel))\n"
-            s *= " threads_s_frac: $(prod(threads_s)/CUDA.maxthreads(kernel))\n"
+            isnothing(threads_s) || (s *= " threads_s_frac: $(prod(threads_s)/CUDA.maxthreads(kernel))\n")
             s *= " memory: $(CUDA.memory(kernel))\n"
             @info s
+#! format: on
             reported_stats[key] = true
+            # error("Oops") # for debugging
+            # Main.Infiltrator.@exfiltrate # for debugging/performance optimization
         end
-        # For now, let's just collect info, later we can benchmark
-        # kernel(args...; threads, blocks) # This knows to use always_inline from above.
+        # end
     end
+    return nothing
 end
 
 """
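
For reference, a minimal usage sketch of the two launch paths that `auto_launch!` now supports. This assumes CUDA.jl and the `auto_launch!` defined above are in scope; the kernel `scale_kernel!`, the array sizes, and the suggested launch numbers are made up for illustration and are not part of this patch.

import CUDA

# Illustrative kernel; any top-level GPU kernel works here.
function scale_kernel!(y, x)
    i = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
    if i <= length(y)
        @inbounds y[i] = 2 * x[i]
    end
    return nothing
end

y = CUDA.zeros(Float32, 10_000)
x = CUDA.ones(Float32, 10_000)

# Occupancy-based path: CUDA.launch_configuration picks threads/blocks.
auto_launch!(scale_kernel!, (y, x), y; auto = true)

# Explicitly suggested launch configuration, e.g. to compare against the
# occupancy-based choice when benchmarking.
auto_launch!(scale_kernel!, (y, x), y; threads_s = 256, blocks_s = cld(10_000, 256))
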
diff --git a/ext/cuda/matrix_fields_multiple_field_solve.jl b/ext/cuda/matrix_fields_multiple_field_solve.jl
index 0da8418e23..64590a0c65 100644
--- a/ext/cuda/matrix_fields_multiple_field_solve.jl
+++ b/ext/cuda/matrix_fields_multiple_field_solve.jl
@@ -50,25 +50,15 @@ NVTX.@annotate function multiple_field_solve!(
     )
 end
 
-column_A(A::UniformScaling, i, j, h) = A
-column_A(A, i, j, h) = Spaces.column(A, i, j, h)
-
-function get_ijhn(Ni, Nj, Nh, Nnames, blockIdx, threadIdx, blockDim, gridDim)
-    tidx = (blockIdx.x - 1) * blockDim.x + threadIdx.x
-    (i, j, h, n) = if 1 ≤ tidx ≤ prod((Ni, Nj, Nh, Nnames))
-        CartesianIndices((1:Ni, 1:Nj, 1:Nh, 1:Nnames))[tidx].I
-    else
-        (-1, -1, -1, -1)
-    end
-    return (i, j, h, n)
-end
+Base.@propagate_inbounds column_A(A::UniformScaling, i, j, h) = A
+Base.@propagate_inbounds column_A(A, i, j, h) = Spaces.column(A, i, j, h)
 
 @generated function generated_single_field_solve!(
+    device,
     caches,
     xs,
     As,
     bs,
-    device,
     i,
     j,
     h,
@@ -78,11 +68,11 @@ end
     return quote
         Base.Cartesian.@nif $Nnames ξ -> (iname == ξ) ξ -> begin
             _single_field_solve!(
+                device,
                 column_A(caches[ξ], i, j, h),
                 column_A(xs[ξ], i, j, h),
                 column_A(As[ξ], i, j, h),
                 column_A(bs[ξ], i, j, h),
-                device,
             )
         end
     end
@@ -99,23 +89,16 @@ function multiple_field_solve_kernel!(
 ) where {Nnames}
     @inbounds begin
         Ni, Nj, _, _, Nh = size(Fields.field_values(x1))
-        (i, j, h, iname) = get_ijhn(
-            Ni,
-            Nj,
-            Nh,
-            Nnames,
-            CUDA.blockIdx(),
-            CUDA.threadIdx(),
-            CUDA.blockDim(),
-            CUDA.gridDim(),
-        )
-        if 1 ≤ i <= Ni && 1 ≤ j ≤ Nj && 1 ≤ h ≤ Nh && 1 ≤ iname ≤ Nnames
+        tidx = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
+        if 1 ≤ tidx ≤ prod((Ni, Nj, Nh, Nnames))
+            (i, j, h, iname) =
+                CartesianIndices((1:Ni, 1:Nj, 1:Nh, 1:Nnames))[tidx].I
             generated_single_field_solve!(
+                device,
                 caches,
                 xs,
                 As,
                 bs,
-                device,
                 i,
                 j,
                 h,
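
The kernel above relies on two patterns: decoding a flat thread index into `(i, j, h, iname)` with `CartesianIndices`, and selecting the `iname`-th entry of a heterogeneous tuple through `Base.Cartesian.@nif` so the per-name dispatch stays statically unrolled on the GPU. A small CPU-side sketch of both; the names `decode_flat_index` and `apply_nth` are made up for illustration and are not part of this patch.

# Same arithmetic the kernel uses: flat index -> (i, j, h, n), with i fastest.
decode_flat_index(tidx, Ni, Nj, Nh, Nnames) =
    CartesianIndices((1:Ni, 1:Nj, 1:Nh, 1:Nnames))[tidx].I

decode_flat_index(1, 4, 4, 10, 3)  # == (1, 1, 1, 1)
decode_flat_index(17, 4, 4, 10, 3) # == (1, 1, 2, 1)

# Select the n-th element of a (possibly heterogeneously typed) tuple with a
# statically unrolled if/elseif chain instead of dynamic dispatch.
@generated function apply_nth(f::F, xs::Tuple, n::Integer) where {F}
    N = length(xs.parameters)
    return quote
        Base.Cartesian.@nif $N ξ -> (n == ξ) ξ -> f(xs[ξ])
    end
end

apply_nth(typeof, (1, 2.0, "three"), 2) # == Float64
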
diff --git a/ext/cuda/matrix_fields_single_field_solve.jl b/ext/cuda/matrix_fields_single_field_solve.jl
index 9fa78d82b4..4390061898 100644
--- a/ext/cuda/matrix_fields_single_field_solve.jl
+++ b/ext/cuda/matrix_fields_single_field_solve.jl
@@ -4,19 +4,44 @@ import LinearAlgebra: UniformScaling
 import ClimaCore.Operators
 import ClimaCore.Fields: Field
 import ClimaCore.Fields
+import ClimaCore.Spaces
+import ClimaCore.Topologies
+import ClimaCore.MatrixFields: single_field_solve!
 import ClimaCore.MatrixFields: _single_field_solve!
 import ClimaCore.MatrixFields: band_matrix_solve!, unzip_tuple_field_values
 import ClimaCore.RecursiveApply: ⊠, ⊞, ⊟, rmap, rzero, rdiv
 
-# called by TuplesOfNTuples.jl's `inner_dispatch`:
-# which requires a particular argument order:
-_single_field_solve!(
-    cache::Fields.Field,
-    x::Fields.Field,
-    A::Union{Fields.Field, UniformScaling},
-    b::Fields.Field,
-    dev::ClimaComms.CUDADevice,
-) = _single_field_solve!(dev, cache, x, A, b)
+function single_field_solve!(device::ClimaComms.CUDADevice, cache, x, A, b)
+    Ni, Nj, _, _, Nh = size(Fields.field_values(A))
+    nitems = Ni * Nj * Nh
+    nthreads = min(256, nitems)
+    nblocks = cld(nitems, nthreads)
+    args = (device, cache, x, A, b)
+    auto_launch!(
+        single_field_solve_kernel!,
+        args,
+        x;
+        threads_s = nthreads,
+        blocks_s = nblocks,
+    )
+end
+
+function single_field_solve_kernel!(device, cache, x, A, b)
+    idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
+    Ni, Nj, _, _, Nh = size(Fields.field_values(A))
+    if idx <= Ni * Nj * Nh
+        i, j, h = Topologies._get_idx((Ni, Nj, Nh), idx)
+        _single_field_solve!(
+            device,
+            Spaces.column(cache, i, j, h),
+            Spaces.column(x, i, j, h),
+            Spaces.column(A, i, j, h),
+            Spaces.column(b, i, j, h),
+        )
+    end
+    return nothing
+end
 
 function _single_field_solve!(
     ::ClimaComms.CUDADevice,
diff --git a/src/MatrixFields/field_matrix_solver.jl b/src/MatrixFields/field_matrix_solver.jl
index 74624b1445..1ade13250a 100644
--- a/src/MatrixFields/field_matrix_solver.jl
+++ b/src/MatrixFields/field_matrix_solver.jl
@@ -247,14 +247,24 @@ function check_field_matrix_solver(::BlockDiagonalSolve, _, A, b)
     end
 end
 
-NVTX.@annotate run_field_matrix_solver!(::BlockDiagonalSolve, cache, x, A, b) =
-    multiple_field_solve!(cache, x, A, b)
-
-# This may be helpful for debugging:
-# run_field_matrix_solver!(::BlockDiagonalSolve, cache, x, A, b) =
-#     foreach(matrix_row_keys(keys(A))) do name
-#         single_field_solve!(cache[name], x[name], A[name, name], b[name])
-#     end
+NVTX.@annotate function run_field_matrix_solver!(
+    ::BlockDiagonalSolve,
+    cache,
+    x,
+    A,
+    b,
+)
+    names = matrix_row_keys(keys(A))
+    if length(names) == 1 ||
+       all(name -> A[name, name] isa UniformScaling, names.values)
+        foreach(names) do name
+            single_field_solve!(cache[name], x[name], A[name, name], b[name])
+        end
+    else
+        multiple_field_solve!(cache, x, A, b)
+    end
+    return nothing
+end
 
 """
     BlockLowerTriangularSolve(names₁...; [alg₁], [alg₂])
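
To summarize the new `BlockDiagonalSolve` branch in `run_field_matrix_solver!`: when the system has a single field, or every diagonal block is a `UniformScaling`, each field can be solved on its own, and otherwise the fused `multiple_field_solve!` kernel is used. Below is a simplified standalone sketch of that decision, with plain `Dict`s and tuples standing in for ClimaCore's field-matrix types; `solve_block_diagonal!` and the toy data are illustrative only, not the actual API.

import LinearAlgebra: UniformScaling, I

# Stand-in for the specialization above: per-field solves for the "easy" case,
# fused multi-field solve otherwise.
function solve_block_diagonal!(single!, multiple!, cache, x, A, b, names)
    if length(names) == 1 || all(name -> A[(name, name)] isa UniformScaling, names)
        for name in names
            single!(cache[name], x[name], A[(name, name)], b[name])
        end
    else
        multiple!(cache, x, A, b)
    end
    return nothing
end

# Toy usage: every diagonal block is a UniformScaling, so each field is
# solved independently.
names = (:c, :ρ)
A = Dict((:c, :c) => 2I, (:ρ, :ρ) => 4I)
b = Dict(:c => [1.0, 2.0], :ρ => [4.0, 8.0])
x = Dict(:c => zeros(2), :ρ => zeros(2))
cache = Dict(:c => zeros(2), :ρ => zeros(2))
single!(cache_n, x_n, A_nn, b_n) = (x_n .= A_nn \ b_n)
multiple!(args...) = error("fused path not needed in this toy example")
solve_block_diagonal!(single!, multiple!, cache, x, A, b, names)
x[:c], x[:ρ] # == ([0.5, 1.0], [1.0, 2.0])
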