From 04461160942610dddb578c55f739859591545c49 Mon Sep 17 00:00:00 2001 From: sriharshakandala Date: Thu, 27 Jul 2023 14:18:48 -0700 Subject: [PATCH 1/2] Move to thread-per-node stencil operator kernels --- src/Operators/finitedifference.jl | 52 +++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/src/Operators/finitedifference.jl b/src/Operators/finitedifference.jl index 79b4d0c599..fb00c76116 100644 --- a/src/Operators/finitedifference.jl +++ b/src/Operators/finitedifference.jl @@ -3437,23 +3437,57 @@ function Base.copyto!( Nq = 1 Nh = 1 end - bounds = window_bounds(space, bc) - # executed - @cuda threads = (Nq, Nq) blocks = (Nh,) copyto_stencil_kernel!( + (li, lw, rw, ri) = bounds = window_bounds(space, bc) + nnodes = ri - li + 1 + args = ( strip_space(out, space), strip_space(bc, space), axes(out), bounds, + nnodes, + Nq, + Nh, + ) + kernel = @cuda launch = false copyto_stencil_kernel!(args...) + kernel_config = CUDA.launch_configuration(kernel.fun) + max_threads = kernel_config.threads + nitems = nnodes * Nq * Nq * Nh + nthreads = min(max_threads, nitems) + nblocks = cld(nitems, nthreads) + # executed + @cuda threads = (nthreads,) blocks = (nblocks,) copyto_stencil_kernel!( + args..., ) return out end -function copyto_stencil_kernel!(out, bc, space, bds) - i = threadIdx().x - j = threadIdx().y - h = blockIdx().x - hidx = (i, j, h) - apply_stencil!(space, out, bc, hidx, bds) +function copyto_stencil_kernel!(out, bc, space, bds, nnodes, Nq, Nh) + gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x + if gid ≤ nnodes * Nq * Nq * Nh + (li, lw, rw, ri) = bds + h = cld(gid, nnodes * Nq * Nq) + j = cld(gid - (h - 1) * nnodes * Nq * Nq, nnodes * Nq) + i = cld( + gid - (h - 1) * nnodes * Nq * Nq - (j - 1) * nnodes * Nq, + nnodes, + ) + ndidx = + gid - (h - 1) * nnodes * Nq * Nq - (j - 1) * nnodes * Nq - + (i - 1) * nnodes + hidx = (i, j, h) + fun = + !Topologies.isperiodic(Spaces.vertical_topology(space)) ? + ( + ndidx ≤ lw - 1 ? + LeftBoundaryWindow{Spaces.left_boundary_name(space)}() : + ( + ndidx ≥ rw + 1 ? + RightBoundaryWindow{Spaces.right_boundary_name(space)}() : + Interior() + ) + ) : Interior() + setidx!(space, out, ndidx, hidx, getidx(space, bc, fun, ndidx, hidx)) + end return nothing end From 9b890a2f9e51b98b69f4f0deafa2e780479e4e4c Mon Sep 17 00:00:00 2001 From: sriharshakandala Date: Thu, 27 Jul 2023 15:42:46 -0700 Subject: [PATCH 2/2] Manually specify max_threads --- src/Operators/finitedifference.jl | 110 ++++++++++++++++++++++-------- 1 file changed, 81 insertions(+), 29 deletions(-) diff --git a/src/Operators/finitedifference.jl b/src/Operators/finitedifference.jl index fb00c76116..919d733b66 100644 --- a/src/Operators/finitedifference.jl +++ b/src/Operators/finitedifference.jl @@ -3420,7 +3420,6 @@ function strip_space(bc::StencilBroadcasted{Style}, parent_space) where {Style} ) end - function Base.copyto!( out::Field, bc::Union{ @@ -3438,33 +3437,75 @@ function Base.copyto!( Nh = 1 end (li, lw, rw, ri) = bounds = window_bounds(space, bc) - nnodes = ri - li + 1 - args = ( + + # left window + if !Topologies.isperiodic(Spaces.vertical_topology(space)) + max_threads = 256 + nitems = Nq * Nq * Nh + nthreads = min(max_threads, nitems) + nblocks = cld(nitems, nthreads) + @cuda threads = (nthreads,) blocks = (nblocks,) copyto_stencil_lw_kernel!( + strip_space(out, space), + strip_space(bc, space), + axes(out), + bounds, + Nq, + Nh, + ) + end + # interior nodes + ninteriornodes = rw - lw + 1 + max_threads = 256 + nitems = ninteriornodes * Nq * Nq * Nh + nthreads = min(max_threads, nitems) + nblocks = cld(nitems, nthreads) + @cuda threads = (nthreads,) blocks = (nblocks,) copyto_stencil_interior_kernel!( strip_space(out, space), strip_space(bc, space), axes(out), bounds, - nnodes, + ninteriornodes, Nq, Nh, ) - kernel = @cuda launch = false copyto_stencil_kernel!(args...) - kernel_config = CUDA.launch_configuration(kernel.fun) - max_threads = kernel_config.threads - nitems = nnodes * Nq * Nq * Nh - nthreads = min(max_threads, nitems) - nblocks = cld(nitems, nthreads) - # executed - @cuda threads = (nthreads,) blocks = (nblocks,) copyto_stencil_kernel!( - args..., - ) + # right window + if !Topologies.isperiodic(Spaces.vertical_topology(space)) + max_threads = 256 + nitems = Nq * Nq * Nh + nthreads = min(max_threads, nitems) + nblocks = cld(nitems, nthreads) + @cuda threads = (nthreads,) blocks = (nblocks,) copyto_stencil_rw_kernel!( + strip_space(out, space), + strip_space(bc, space), + axes(out), + bounds, + Nq, + Nh, + ) + end return out end -function copyto_stencil_kernel!(out, bc, space, bds, nnodes, Nq, Nh) +function copyto_stencil_lw_kernel!(out, bc, space, bds, Nq, Nh) gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x - if gid ≤ nnodes * Nq * Nq * Nh + if gid ≤ Nq * Nq * Nh (li, lw, rw, ri) = bds + h = cld(gid, Nq * Nq) + j = cld(gid - (h - 1) * Nq * Nq, Nq) + i = gid - (h - 1) * Nq * Nq - (j - 1) * Nq + hidx = (i, j, h) + lbw = LeftBoundaryWindow{Spaces.left_boundary_name(space)}() + @inbounds for idx in li:(lw - 1) + setidx!(space, out, idx, hidx, getidx(space, bc, lbw, idx, hidx)) + end + end + return nothing +end + +function copyto_stencil_interior_kernel!(out, bc, space, bds, nnodes, Nq, Nh) + gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x + if gid ≤ nnodes * Nq * Nq * Nh + (_, lw, rw, _) = bds h = cld(gid, nnodes * Nq * Nq) j = cld(gid - (h - 1) * nnodes * Nq * Nq, nnodes * Nq) i = cld( @@ -3473,20 +3514,31 @@ function copyto_stencil_kernel!(out, bc, space, bds, nnodes, Nq, Nh) ) ndidx = gid - (h - 1) * nnodes * Nq * Nq - (j - 1) * nnodes * Nq - - (i - 1) * nnodes + (i - 1) * nnodes + lw - 1 hidx = (i, j, h) - fun = - !Topologies.isperiodic(Spaces.vertical_topology(space)) ? - ( - ndidx ≤ lw - 1 ? - LeftBoundaryWindow{Spaces.left_boundary_name(space)}() : - ( - ndidx ≥ rw + 1 ? - RightBoundaryWindow{Spaces.right_boundary_name(space)}() : - Interior() - ) - ) : Interior() - setidx!(space, out, ndidx, hidx, getidx(space, bc, fun, ndidx, hidx)) + setidx!( + space, + out, + ndidx, + hidx, + getidx(space, bc, Interior(), ndidx, hidx), + ) + end + return nothing +end + +function copyto_stencil_rw_kernel!(out, bc, space, bds, Nq, Nh) + gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x + if gid ≤ Nq * Nq * Nh + (li, lw, rw, ri) = bds + h = cld(gid, Nq * Nq) + j = cld(gid - (h - 1) * Nq * Nq, Nq) + i = gid - (h - 1) * Nq * Nq - (j - 1) * Nq + hidx = (i, j, h) + rbw = RightBoundaryWindow{Spaces.right_boundary_name(space)}() + @inbounds for idx in (rw + 1):ri + setidx!(space, out, idx, hidx, getidx(space, bc, rbw, idx, hidx)) + end end return nothing end