Skip to content

Commit

Permalink
Restructure. Set max_threads automatically
Browse files (browse the repository at this point in the history)
  • Loading branch information
sriharshakandala committed Jul 19, 2023
1 parent be26e08 commit 23e6956
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 42 deletions.
44 changes: 9 additions & 35 deletions examples/hybrid/tuning/mwe_tune_ke.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,21 +68,16 @@ function compute_kinetic_ca!(
#end
end

function initialize_mwe()

FT = Float64

context = ClimaComms.SingletonCommsContext(ClimaComms.CUDADevice())
context_cpu =
ClimaComms.SingletonCommsContext(ClimaComms.CPUSingleThreaded()) # CPU context for comparison
function initialize_mwe(device, ::Type{FT}) where {FT}
context = ClimaComms.SingletonCommsContext(device)
R = FT(6.371229e6)

npoly = 3
z_max = FT(30e3)
z_elem = 10
h_elem = 12
println(
"running KE tuning test on $(context.device); h_elem = $h_elem; z_elem = $z_elem; npoly = $npoly; R = $R; z_max = $z_max; FT = $FT",
"initializing on $(context.device); h_elem = $h_elem; z_elem = $z_elem; npoly = $npoly; R = $R; z_max = $z_max; FT = $FT",
)
# horizontal space
domain = Domains.SphereDomain(R)
Expand All @@ -92,14 +87,8 @@ function initialize_mwe()
horizontal_mesh,
Topologies.spacefillingcurve(horizontal_mesh),
)
horizontal_topology_cpu = Topologies.Topology2D(
context_cpu,
horizontal_mesh,
Topologies.spacefillingcurve(horizontal_mesh),
)
quad = Spaces.Quadratures.GLL{npoly + 1}()
h_space = Spaces.SpectralElementSpace2D(horizontal_topology, quad)
h_space_cpu = Spaces.SpectralElementSpace2D(horizontal_topology_cpu, quad)

# vertical space
z_domain = Domains.IntervalDomain(
Expand All @@ -109,42 +98,27 @@ function initialize_mwe()
)
z_mesh = Meshes.IntervalMesh(z_domain, nelems = z_elem)
z_topology = Topologies.IntervalTopology(context, z_mesh)
z_topology_cpu = Topologies.IntervalTopology(context_cpu, z_mesh)

z_center_space = Spaces.CenterFiniteDifferenceSpace(z_topology)
z_center_space_cpu = Spaces.CenterFiniteDifferenceSpace(z_topology_cpu)

z_face_space = Spaces.FaceFiniteDifferenceSpace(z_topology)
z_face_space_cpu = Spaces.FaceFiniteDifferenceSpace(z_topology_cpu)

hv_center_space =
Spaces.ExtrudedFiniteDifferenceSpace(h_space, z_center_space)
hv_face_space = Spaces.FaceExtrudedFiniteDifferenceSpace(hv_center_space)

hv_center_space_cpu =
Spaces.ExtrudedFiniteDifferenceSpace(h_space_cpu, z_center_space_cpu)
hv_face_space_cpu =
Spaces.FaceExtrudedFiniteDifferenceSpace(hv_center_space_cpu)

# CPU
ᶜlocal_geometry_cpu = Fields.local_geometry_field(hv_center_space_cpu)
ᶠlocal_geometry_cpu = Fields.local_geometry_field(hv_face_space_cpu)
uₕ_cpu = center_initial_condition(ᶜlocal_geometry_cpu, R)
uᵥ_cpu = face_initial_condition(ᶠlocal_geometry_cpu)
κ_cpu = init_scalar_field(hv_center_space_cpu)

# GPU
ᶜlocal_geometry = Fields.local_geometry_field(hv_center_space)
ᶠlocal_geometry = Fields.local_geometry_field(hv_face_space)
uₕ = center_initial_condition(ᶜlocal_geometry, R)
uᵥ = face_initial_condition(ᶠlocal_geometry)
κ = init_scalar_field(hv_center_space)

return (; κ = κ, uₕ = uₕ, uᵥ = uᵥ, κ_cpu, uₕ_cpu = uₕ_cpu, uᵥ_cpu = uᵥ_cpu)
return (; κ = κ, uₕ = uₕ, uᵥ = uᵥ)
end

function profile_compute_kinetic()
κ, uₕ, uᵥ, κ_cpu, uₕ_cpu, uᵥ_cpu = initialize_mwe()
function profile_compute_kinetic(::Type{FT}) where {FT}
κ, uₕ, uᵥ = initialize_mwe(ClimaComms.CUDADevice(), FT)
κ_cpu, uₕ_cpu, uᵥ_cpu = initialize_mwe(ClimaComms.CPUSingleThreaded(), FT)
# compute kinetic energy
κ = compute_kinetic_ca!(κ, uₕ, uᵥ)
κ_cpu = compute_kinetic_ca!(κ_cpu, uₕ_cpu, uᵥ_cpu)
Expand All @@ -155,9 +129,9 @@ function profile_compute_kinetic()

for i in 1:nreps
NVTX.@range "compute_kinetic_ca!" color = colorant"blue" payload = i begin
κ = compute_kinetic_ca!(κ, uₕ, uᵥ)
CUDA.@sync κ = compute_kinetic_ca!(κ, uₕ, uᵥ)
end
end
end

profile_compute_kinetic()
profile_compute_kinetic(Float64)
19 changes: 12 additions & 7 deletions src/Operators/finitedifference.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3438,20 +3438,25 @@ function Base.copyto!(
Nh = 1
end
bounds = window_bounds(space, bc)

max_threads = 256
nitems = Nq * Nq * Nh
nthreads = min(max_threads, nitems)
nblocks = cld(nitems, nthreads)
# executed
@cuda threads = (nthreads) blocks = (nblocks,) copyto_stencil_kernel!(
args = (
strip_space(out, space),
strip_space(bc, space),
axes(out),
bounds,
Nq,
Nh,
)
kernel = @cuda launch = false copyto_stencil_kernel!(args...)
kernel_config = CUDA.launch_configuration(kernel.fun)
max_threads = kernel_config.threads
nitems = Nq * Nq * Nh
nthreads = min(max_threads, nitems)
nblocks = cld(nitems, nthreads)
# executed
@cuda threads = (nthreads,) blocks = (nblocks,) copyto_stencil_kernel!(
args...,
)

return out
end

Expand Down

0 comments on commit 23e6956

Please sign in to comment.