Skip to content

Commit

Permalink
Add reference GPU kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
sriharshakandala committed Jul 21, 2023
1 parent 23e6956 commit ff2dd7e
Showing 1 changed file with 113 additions and 1 deletion.
114 changes: 113 additions & 1 deletion examples/hybrid/tuning/mwe_tune_ke.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ import ClimaCore:
Operators,
Spaces,
Topologies,
DataLayouts
DataLayouts,
RecursiveApply

const C1 = ClimaCore.Geometry.Covariant1Vector
const C2 = ClimaCore.Geometry.Covariant2Vector
Expand All @@ -23,6 +24,8 @@ const CT123 = Geometry.Contravariant123Vector
const ᶜinterp = Operators.InterpolateF2C()
const ᶠinterp = Operators.InterpolateC2F()

const = RecursiveApply.radd

init_uθ(ϕ, z, R) = 1.0 / R
init_vθ(ϕ, z, R) = 1.0 / R
init_w(ϕ, z) = 1.0
Expand Down Expand Up @@ -68,6 +71,92 @@ function compute_kinetic_ca!(
#end
end

function compute_kinetic_ref!(
κ::Fields.Field,
uₕ::Fields.Field,
uᵥ::Fields.Field,
)
hvc_lgd = Spaces.local_geometry_data(axes(uₕ)) # local geometry data for center extruded FD space
hvf_lgd = Spaces.local_geometry_data(axes(uᵥ)) # local geometry data for face extruded FD space

κ_val = Fields.field_values(κ) # field values
uₕ_val = Fields.field_values(uₕ)
uᵥ_val = Fields.field_values(uᵥ)


(Nq, _, _, Nvc, Nh) = size(uₕ_val) # query dimensions
(Nq, _, _, Nvf, Nh) = size(uᵥ_val)

nitems = Nvc * Nq * Nq * Nh # # of items that can be independently processed
max_threads = 256
nthreads = min(max_threads, nitems)
nblocks = cld(nitems, nthreads)

@cuda threads = (nthreads,) blocks = (nblocks,) compute_kinetic_ref_kernel!(
κ_val,
uₕ_val,
uᵥ_val,
hvc_lgd,
hvf_lgd,
(Nq, Nvc, Nh),
)

return κ
end

function compute_kinetic_ref_kernel!(
κ_val,
uₕ_val,
uᵥ_val,
hvc_lgd,
hvf_lgd,
dims,
)
gid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
(Nq, Nvc, Nh) = dims
if gid Nvc * Nq * Nq * Nh
h = cld(gid, Nvc * Nq * Nq)
offset = (h - 1) * Nvc * Nq * Nq
j = cld(gid - offset, Nvc * Nq)
offset += (j - 1) * Nvc * Nq
i = cld(gid - offset, Nvc)
vc = gid - offset - (i - 1) * Nvc

idx = CartesianIndex((i, j, 1, vc, h))
idx⁻ = idx
idx⁺ = CartesianIndex((i, j, 1, vc + 1, h))

c_lgd = hvc_lgd[i, j, nothing, vc, h]

f_lgd⁺ = hvf_lgd[i, j, nothing, vc + 1, h]
f_lgd⁻ = hvf_lgd[i, j, nothing, vc, h]

uₕ_val_idx = uₕ_val[idx]
uᵥ_val_idx⁺ = uᵥ_val[idx⁺]
uᵥ_val_idx⁻ = uᵥ_val[idx⁻]

uₕ_c = C123(uₕ_val_idx, c_lgd)
uₕ_ct = CT123(uₕ_val_idx, c_lgd)

uᵥ_c⁺ = C123(uᵥ_val_idx⁺, f_lgd⁺)
uᵥ_c⁻ = C123(uᵥ_val_idx⁻, f_lgd⁻)
uᵥ_c = RecursiveApply.rdiv(uᵥ_c⁺ uᵥ_c⁻, 2)

uᵥ_ct⁺ = CT123(uᵥ_val_idx⁺, f_lgd⁺)
uᵥ_ct⁻ = CT123(uᵥ_val_idx⁻, f_lgd⁻)

κ_val[idx] =
(
dot(uₕ_c, uₕ_ct) RecursiveApply.rdiv(
dot(uᵥ_c⁺, uᵥ_ct⁺) dot(uᵥ_c⁻, uᵥ_ct⁻),
2,
) 2 * dot(uₕ_ct, uᵥ_c)
) / 2
end

return nothing
end

function initialize_mwe(device, ::Type{FT}) where {FT}
context = ClimaComms.SingletonCommsContext(device)
R = FT(6.371229e6)
Expand Down Expand Up @@ -125,13 +214,36 @@ function profile_compute_kinetic(::Type{FT}) where {FT}

@show Array(parent(κ)) parent(κ_cpu)


κ_ref, uₕ_ref, uᵥ_ref = initialize_mwe(ClimaComms.CUDADevice(), FT)

@show "before ref version" maximum(parent(κ_ref))

κ_ref = compute_kinetic_ref!(κ_ref, uₕ_ref, uᵥ_ref)
@show "after ref version" maximum(parent(κ_ref))

nreps = 10

for i in 1:nreps
NVTX.@range "compute_kinetic_ca!" color = colorant"blue" payload = i begin
CUDA.@sync κ = compute_kinetic_ca!(κ, uₕ, uᵥ)
end
end

for i in 1:nreps
NVTX.@range "compute_kinetic_ref!" color = colorant"green" payload = i begin
CUDA.@sync κ_ref = compute_kinetic_ref!(κ_ref, uₕ_ref, uᵥ_ref)
end
end


for i in 1:nreps
t_ca = CUDA.@elapsed κ = compute_kinetic_ca!(κ, uₕ, uᵥ)
t_ref =
CUDA.@elapsed κ_ref = compute_kinetic_ref!(κ_ref, uₕ_ref, uᵥ_ref)
println("t_ca = $t_ca (sec); t_ref = $t_ref (sec)")
end

end

profile_compute_kinetic(Float64)

0 comments on commit ff2dd7e

Please sign in to comment.