From 5f2e58e94937a653dccb96b5a9fa116cb79c9aa4 Mon Sep 17 00:00:00 2001
From: Charles Kawczynski
Date: Fri, 17 May 2024 12:08:17 -0400
Subject: [PATCH] Specialize cases in run_field_matrix_solver, add debug info

Try unrolled_all

Fix
---
 ext/cuda/cuda_utils.jl                        | 54 ++++++++++++-------
 .../matrix_fields_multiple_field_solve.jl     | 35 ++++--------
 ext/cuda/matrix_fields_single_field_solve.jl  | 43 +++++++++++----
 src/MatrixFields/field_matrix_solver.jl       | 26 ++++++---
 4 files changed, 97 insertions(+), 61 deletions(-)

diff --git a/ext/cuda/cuda_utils.jl b/ext/cuda/cuda_utils.jl
index 962219fd20..ae2006700c 100644
--- a/ext/cuda/cuda_utils.jl
+++ b/ext/cuda/cuda_utils.jl
@@ -10,6 +10,7 @@ get_n_items(arr::AbstractArray) = prod(size(parent(arr)))
 get_n_items(tup::Tuple) = prod(tup)
 
 const reported_stats = Dict()
+# Call via ClimaCore.DataLayouts.empty_kernel_stats()
 empty_kernel_stats(::ClimaComms.CUDADevice) = empty!(reported_stats)
 
 collect_kernel_stats() = false
@@ -22,58 +23,75 @@ collect_kernel_stats() = false
         AbstractData,
         Field,
     };
+    auto = false,
     threads_s,
     blocks_s,
     always_inline = true
 )
 
-Launch a cuda kernel, using `CUDA.launch_configuration`
+Launch a CUDA kernel, using `CUDA.launch_configuration` (if `auto = true`)
 to determine the number of threads/blocks.
 
 Suggested threads and blocks (`threads_s`, `blocks_s`) can be given
-to benchmark compare against auto-determined threads/blocks.
+to benchmark against the auto-determined threads/blocks (if `auto = false`).
 """
 function auto_launch!(
     f!::F!,
     args,
     data;
-    threads_s,
-    blocks_s,
+    auto = false,
+    threads_s = nothing,
+    blocks_s = nothing,
     always_inline = true,
+    caller = :unknown,
 ) where {F!}
-    nitems = get_n_items(data)
-    # For now, we'll simply use the
-    # suggested threads and blocks:
-    kernel =
-        CUDA.@cuda always_inline = always_inline threads = threads_s blocks =
-            blocks_s f!(args...)
+    if auto
+        nitems = get_n_items(data)
+        kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
+        config = CUDA.launch_configuration(kernel.fun)
+        threads = min(nitems, config.threads)
+        blocks = cld(nitems, threads)
+        kernel(args...; threads, blocks) # This knows to use always_inline from above.
+    else
+        kernel =
+            CUDA.@cuda always_inline = always_inline threads = threads_s blocks =
+                blocks_s f!(args...)
+    end
     if collect_kernel_stats() # only for development use
-        key = (F!, typeof(args))
+        key = (F!, typeof(args), CUDA.registers(kernel))
+        # CUDA.registers(kernel) > 50 || return nothing # for debugging
+        # occursin("single_field_solve_kernel", string(nameof(F!))) || return nothing
        if !haskey(reported_stats, key)
+            nitems = get_n_items(data)
             kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
             config = CUDA.launch_configuration(kernel.fun)
             threads = min(nitems, config.threads)
             blocks = cld(nitems, threads)
+            # For now, let's just collect info; later we can benchmark
+#! format: off
             s = ""
             s *= "Launching kernel $f! with following config:\n"
             s *= " nitems: $(nitems)\n"
-            s *= " threads_s: $(threads_s)\n"
-            s *= " blocks_s: $(blocks_s)\n"
+            isnothing(threads_s) || (s *= " threads_s: $(threads_s)\n")
+            isnothing(blocks_s) || (s *= " blocks_s: $(blocks_s)\n")
             s *= " threads: $(threads)\n"
             s *= " blocks: $(blocks)\n"
-            s *= " Δthreads: $(threads - prod(threads_s))\n"
-            s *= " Δblocks: $(blocks - prod(blocks_s))\n"
+            isnothing(threads_s) || (s *= " Δthreads: $(threads - prod(threads_s))\n")
+            isnothing(blocks_s) || (s *= " Δblocks: $(blocks - prod(blocks_s))\n")
             s *= " maxthreads: $(CUDA.maxthreads(kernel))\n"
             s *= " registers: $(CUDA.registers(kernel))\n"
-            s *= " threads_s_frac: $(prod(threads_s)/CUDA.maxthreads(kernel))\n"
+            isnothing(threads_s) || (s *= " threads_s_frac: $(prod(threads_s)/CUDA.maxthreads(kernel))\n")
             s *= " memory: $(CUDA.memory(kernel))\n"
             @info s
+#! format: on
             reported_stats[key] = true
+            # error("Oops") # for debugging
+            # Main.Infiltrator.@exfiltrate # for debugging/performance optimization
         end
-        # For now, let's just collect info, later we can benchmark
-        # kernel(args...; threads, blocks) # This knows to use always_inline from above.
+        # end
     end
+    return nothing
 end
 
 """
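
For reference, a minimal usage sketch of the two launch paths that `auto_launch!` now supports. This assumes CUDA.jl and the `auto_launch!` defined above are in scope; the kernel `scale_kernel!`, the array sizes, and the suggested launch numbers are made up for illustration and are not part of this patch.

import CUDA

# Illustrative kernel; any top-level GPU kernel works here.
function scale_kernel!(y, x)
    i = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
    if i <= length(y)
        @inbounds y[i] = 2 * x[i]
    end
    return nothing
end

y = CUDA.zeros(Float32, 10_000)
x = CUDA.ones(Float32, 10_000)

# Occupancy-based path: CUDA.launch_configuration picks threads/blocks.
auto_launch!(scale_kernel!, (y, x), y; auto = true)

# Explicitly suggested launch configuration, e.g. to compare against the
# occupancy-based choice when benchmarking.
auto_launch!(scale_kernel!, (y, x), y; threads_s = 256, blocks_s = cld(10_000, 256))
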
diff --git a/ext/cuda/matrix_fields_multiple_field_solve.jl b/ext/cuda/matrix_fields_multiple_field_solve.jl
index 0da8418e23..64590a0c65 100644
--- a/ext/cuda/matrix_fields_multiple_field_solve.jl
+++ b/ext/cuda/matrix_fields_multiple_field_solve.jl
@@ -50,25 +50,15 @@ NVTX.@annotate function multiple_field_solve!(
     )
 end
 
-column_A(A::UniformScaling, i, j, h) = A
-column_A(A, i, j, h) = Spaces.column(A, i, j, h)
-
-function get_ijhn(Ni, Nj, Nh, Nnames, blockIdx, threadIdx, blockDim, gridDim)
-    tidx = (blockIdx.x - 1) * blockDim.x + threadIdx.x
-    (i, j, h, n) = if 1 ≤ tidx ≤ prod((Ni, Nj, Nh, Nnames))
-        CartesianIndices((1:Ni, 1:Nj, 1:Nh, 1:Nnames))[tidx].I
-    else
-        (-1, -1, -1, -1)
-    end
-    return (i, j, h, n)
-end
+Base.@propagate_inbounds column_A(A::UniformScaling, i, j, h) = A
+Base.@propagate_inbounds column_A(A, i, j, h) = Spaces.column(A, i, j, h)
 
 @generated function generated_single_field_solve!(
+    device,
     caches,
     xs,
     As,
     bs,
-    device,
     i,
     j,
     h,
@@ -78,11 +68,11 @@ end
     return quote
         Base.Cartesian.@nif $Nnames ξ -> (iname == ξ) ξ -> begin
             _single_field_solve!(
+                device,
                 column_A(caches[ξ], i, j, h),
                 column_A(xs[ξ], i, j, h),
                 column_A(As[ξ], i, j, h),
                 column_A(bs[ξ], i, j, h),
-                device,
             )
         end
     end
@@ -99,23 +89,16 @@ function multiple_field_solve_kernel!(
 ) where {Nnames}
     @inbounds begin
         Ni, Nj, _, _, Nh = size(Fields.field_values(x1))
-        (i, j, h, iname) = get_ijhn(
-            Ni,
-            Nj,
-            Nh,
-            Nnames,
-            CUDA.blockIdx(),
-            CUDA.threadIdx(),
-            CUDA.blockDim(),
-            CUDA.gridDim(),
-        )
-        if 1 ≤ i <= Ni && 1 ≤ j ≤ Nj && 1 ≤ h ≤ Nh && 1 ≤ iname ≤ Nnames
+        tidx = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
+        if 1 ≤ tidx ≤ prod((Ni, Nj, Nh, Nnames))
+            (i, j, h, iname) =
+                CartesianIndices((1:Ni, 1:Nj, 1:Nh, 1:Nnames))[tidx].I
             generated_single_field_solve!(
+                device,
                 caches,
                 xs,
                 As,
                 bs,
-                device,
                 i,
                 j,
                 h,
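
The kernel above relies on two patterns: decoding a flat thread index into `(i, j, h, iname)` with `CartesianIndices`, and selecting the `iname`-th entry of a heterogeneous tuple through `Base.Cartesian.@nif` so the per-name dispatch stays statically unrolled on the GPU. A small CPU-side sketch of both; the names `decode_flat_index` and `apply_nth` are made up for illustration and are not part of this patch.

# Same arithmetic the kernel uses: flat index -> (i, j, h, n), with i fastest.
decode_flat_index(tidx, Ni, Nj, Nh, Nnames) =
    CartesianIndices((1:Ni, 1:Nj, 1:Nh, 1:Nnames))[tidx].I

decode_flat_index(1, 4, 4, 10, 3)  # == (1, 1, 1, 1)
decode_flat_index(17, 4, 4, 10, 3) # == (1, 1, 2, 1)

# Select the n-th element of a (possibly heterogeneously typed) tuple with a
# statically unrolled if/elseif chain instead of dynamic dispatch.
@generated function apply_nth(f::F, xs::Tuple, n::Integer) where {F}
    N = length(xs.parameters)
    return quote
        Base.Cartesian.@nif $N ξ -> (n == ξ) ξ -> f(xs[ξ])
    end
end

apply_nth(typeof, (1, 2.0, "three"), 2) # == Float64
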
diff --git a/ext/cuda/matrix_fields_single_field_solve.jl b/ext/cuda/matrix_fields_single_field_solve.jl
index 9fa78d82b4..4390061898 100644
--- a/ext/cuda/matrix_fields_single_field_solve.jl
+++ b/ext/cuda/matrix_fields_single_field_solve.jl
@@ -4,19 +4,44 @@ import LinearAlgebra: UniformScaling
 import ClimaCore.Operators
 import ClimaCore.Fields: Field
 import ClimaCore.Fields
+import ClimaCore.Spaces
+import ClimaCore.Topologies
+import ClimaCore.MatrixFields: single_field_solve!
 import ClimaCore.MatrixFields: _single_field_solve!
 import ClimaCore.MatrixFields: band_matrix_solve!, unzip_tuple_field_values
 import ClimaCore.RecursiveApply: ⊠, ⊞, ⊟, rmap, rzero, rdiv
 
-# called by TuplesOfNTuples.jl's `inner_dispatch`:
-# which requires a particular argument order:
-_single_field_solve!(
-    cache::Fields.Field,
-    x::Fields.Field,
-    A::Union{Fields.Field, UniformScaling},
-    b::Fields.Field,
-    dev::ClimaComms.CUDADevice,
-) = _single_field_solve!(dev, cache, x, A, b)
+function single_field_solve!(device::ClimaComms.CUDADevice, cache, x, A, b)
+    Ni, Nj, _, _, Nh = size(Fields.field_values(A))
+    nitems = Ni * Nj * Nh
+    nthreads = min(256, nitems)
+    nblocks = cld(nitems, nthreads)
+    args = (device, cache, x, A, b)
+    auto_launch!(
+        single_field_solve_kernel!,
+        args,
+        x;
+        threads_s = nthreads,
+        blocks_s = nblocks,
+    )
+end
+
+function single_field_solve_kernel!(device, cache, x, A, b)
+    idx = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
+    Ni, Nj, _, _, Nh = size(Fields.field_values(A))
+    if idx <= Ni * Nj * Nh
+        i, j, h = Topologies._get_idx((Ni, Nj, Nh), idx)
+        _single_field_solve!(
+            device,
+            Spaces.column(cache, i, j, h),
+            Spaces.column(x, i, j, h),
+            Spaces.column(A, i, j, h),
+            Spaces.column(b, i, j, h),
+        )
+    end
+    return nothing
+end
 
 function _single_field_solve!(
     ::ClimaComms.CUDADevice,
diff --git a/src/MatrixFields/field_matrix_solver.jl b/src/MatrixFields/field_matrix_solver.jl
index 74624b1445..1ade13250a 100644
--- a/src/MatrixFields/field_matrix_solver.jl
+++ b/src/MatrixFields/field_matrix_solver.jl
@@ -247,14 +247,24 @@ function check_field_matrix_solver(::BlockDiagonalSolve, _, A, b)
     end
 end
 
-NVTX.@annotate run_field_matrix_solver!(::BlockDiagonalSolve, cache, x, A, b) =
-    multiple_field_solve!(cache, x, A, b)
-
-# This may be helpful for debugging:
-# run_field_matrix_solver!(::BlockDiagonalSolve, cache, x, A, b) =
-#     foreach(matrix_row_keys(keys(A))) do name
-#         single_field_solve!(cache[name], x[name], A[name, name], b[name])
-#     end
+NVTX.@annotate function run_field_matrix_solver!(
+    ::BlockDiagonalSolve,
+    cache,
+    x,
+    A,
+    b,
+)
+    names = matrix_row_keys(keys(A))
+    if length(names) == 1 ||
+       all(name -> A[name, name] isa UniformScaling, names.values)
+        foreach(names) do name
+            single_field_solve!(cache[name], x[name], A[name, name], b[name])
+        end
+    else
+        multiple_field_solve!(cache, x, A, b)
+    end
+    return nothing
+end
 
 """
     BlockLowerTriangularSolve(names₁...; [alg₁], [alg₂])
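
To summarize the new `BlockDiagonalSolve` branch in `run_field_matrix_solver!`: when the system has a single field, or every diagonal block is a `UniformScaling`, each field can be solved on its own, and otherwise the fused `multiple_field_solve!` kernel is used. Below is a simplified standalone sketch of that decision, with plain `Dict`s and tuples standing in for ClimaCore's field-matrix types; `solve_block_diagonal!` and the toy data are illustrative only, not the actual API.

import LinearAlgebra: UniformScaling, I

# Stand-in for the specialization above: per-field solves for the "easy" case,
# fused multi-field solve otherwise.
function solve_block_diagonal!(single!, multiple!, cache, x, A, b, names)
    if length(names) == 1 || all(name -> A[(name, name)] isa UniformScaling, names)
        for name in names
            single!(cache[name], x[name], A[(name, name)], b[name])
        end
    else
        multiple!(cache, x, A, b)
    end
    return nothing
end

# Toy usage: every diagonal block is a UniformScaling, so each field is
# solved independently.
names = (:c, :ρ)
A = Dict((:c, :c) => 2I, (:ρ, :ρ) => 4I)
b = Dict(:c => [1.0, 2.0], :ρ => [4.0, 8.0])
x = Dict(:c => zeros(2), :ρ => zeros(2))
cache = Dict(:c => zeros(2), :ρ => zeros(2))
single!(cache_n, x_n, A_nn, b_n) = (x_n .= A_nn \ b_n)
multiple!(args...) = error("fused path not needed in this toy example")
solve_block_diagonal!(single!, multiple!, cache, x, A, b, names)
x[:c], x[:ρ] # == ([0.5, 1.0], [1.0, 2.0])
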