Skip to content

Commit

Permalink
Merge pull request #348 from JuliaGPU/jps/localmem-array
Browse files Browse the repository at this point in the history
Add local memory allocation helpers
  • Loading branch information
jpsamaroo authored Jan 14, 2023
2 parents 5b216b3 + 12b60ac commit 191afc4
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 50 deletions.
38 changes: 19 additions & 19 deletions docs/src/memory.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,28 +110,22 @@ wait(@roc groupsize=4 kernel(RC, RA, RB))
@assert Array(RC) ≈ Array(RA) .+ Array(RB) .+ RC_elem
```

Local memory may be allocated within a kernel by calling
`AMDGPU.Device.alloc_local(id, T, len)`, where `id` is some sort of bitstype ID
for the local allocation, `T` is the Julia element type, and `len` is the
number of elements of type `T` to allocate. Local memory does not need to be
freed, as it is automatically freed by the hardware. If `len == 0`, then local
memory is dynamically allocated at kernel runtime; the `localmem` option to
`@roc` must be set appropriately to ensure that enough local memory is
allocated by the hardware.

```julia

```
Local memory may be allocated within a kernel by calling either
`@ROCStaticLocalArray(T, dims)` or `@ROCDynamicLocalArray(T, dims)` - use the
former if `dims` is passed as a constant value, and otherwise use the latter.
Local memory does not need to be freed, as it is automatically freed by the
hardware. If `@ROCDynamicLocalArray` is used, then local memory is dynamically
allocated at kernel execution time; therefore, the `localmem` option to `@roc`
must be set appropriately to ensure that enough local memory is allocated by
the hardware.

```julia
function kernel(C, A, B)
# Allocate local memory dynamically (0 means dynamic)
Ctmp_ptr = AMDGPU.Device.alloc_local(:localmem, Float64, 0)
# Or, allocate local memory statically
# Ctmp_ptr = AMDGPU.Device.alloc_local(:localmem, Float64, 4)
# Allocate local memory dynamically
Ctmp = @ROCDynamicLocalArray(Float64, length(C))
# Or, allocate local memory statically if the size is known ahead-of-time
# Ctmp = @ROCStaticLocalArray(Float64, 8) # if we want 8 elements

# Turn it (a pointer to Float64 elements in Local memory) into a device-side array
Ctmp = ROCDeviceArray(length(C), Ctmp_ptr)
# Use it
idx = AMDGPU.workitemIdx().x
Ctmp[idx] = A[idx] + B[idx] + C[1]
Expand All @@ -140,10 +134,16 @@ function kernel(C, A, B)
nothing
end
# ...
# The `localmem` option isn't necessary if memory is statically allocated
# Note: The `localmem` option isn't necessary if `@ROCStaticLocalArray` is used
wait(@roc groupsize=4 localmem=sizeof(Float64)*length(RC) kernel(RC, RA, RB))
```

Note that like CUDA's shared memory, AMDGPU's local memory is zero-initialized
automatically. If this behavior is unnecessary (and undesired for performance
reasons), zero-initialization can be disabled with `@ROCDynamicLocalArray(T,
dims, false)` or `@ROCStaticLocalArray(T, dims, false)` (the last argument
is `zeroinit`).

## Memory Modification Intrinsics

Like C, AMDGPU.jl provides the `memset!` and `memcpy!` intrinsics, which are
Expand Down
3 changes: 2 additions & 1 deletion src/AMDGPU.jl
Original file line number Diff line number Diff line change
Expand Up @@ -114,12 +114,13 @@ module Device
end
import .Device: malloc, signal_exception, report_exception, report_oom, report_exception_frame
import .Device: ROCDeviceArray, AS, HostCall, hostcall!
import .Device: @ROCDynamicLocalArray, @ROCStaticLocalArray
import .Device: workitemIdx, workgroupIdx, workgroupDim, gridItemDim, gridGroupDim
import .Device: threadIdx, blockIdx, blockDim
import .Device: sync_workgroup
import .Device: @rocprint, @rocprintln, @rocprintf

export ROCDeviceArray
export ROCDeviceArray, @ROCDynamicLocalArray, @ROCStaticLocalArray
export @rocprint, @rocprintln, @rocprintf
export workitemIdx, workgroupIdx, workgroupDim, gridItemDim, gridGroupDim
export sync_workgroup
Expand Down
2 changes: 1 addition & 1 deletion src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ end
# memory

@inline function GPUArrays.LocalMemory(::ROCKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}) where {T,dims,id}
ptr = AMDGPU.Device.alloc_special(Val(id), T, AMDGPU.AS.Local, Val(prod(dims)))
ptr = AMDGPU.Device.alloc_special(Val{id}(), T, Val{AMDGPU.AS.Local}(), Val{prod(dims)}())
ROCDeviceArray(dims, ptr)
end

Expand Down
44 changes: 44 additions & 0 deletions src/compiler.jl
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ function GPUCompiler.process_module!(job::ROCCompilerJob, mod::LLVM.Module)
# Run this early (before optimization) to ensure we link OCKL
emit_exception_user!(mod)
end
function GPUCompiler.process_entry!(job::ROCCompilerJob, mod::LLVM.Module, entry::LLVM.Function)
    # Delegate to the generic GCN entry-point processing first.
    parent_sig = Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod), typeof(entry)}
    invoke(GPUCompiler.process_entry!, parent_sig, job, mod, entry)
    # Workaround for the lack of zeroinitializer support for LDS
    zeroinit_lds!(mod, entry)
end
function GPUCompiler.finish_module!(job::ROCCompilerJob, mod::LLVM.Module)
invoke(GPUCompiler.finish_module!,
Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod)},
Expand All @@ -90,6 +97,43 @@ GPUCompiler.ci_cache(::ROCCompilerJob) = AMDGPU.ci_cache

GPUCompiler.method_table(::ROCCompilerJob) = AMDGPU.method_table

"""
    zeroinit_lds!(mod::LLVM.Module, entry::LLVM.Function)

Insert explicit zero-initialization for local-address-space globals whose names
carry the `__zeroinit` prefix. The AMDGPU backend does not support
`zeroinitializer` on LDS globals, so a `memset` to 0 is emitted for each tagged
global at the very top of `entry`, followed by an `llvm.amdgcn.s.barrier` so no
workitem observes uncleared memory. Non-kernel functions are returned
unchanged. Returns `entry`.
"""
function zeroinit_lds!(mod::LLVM.Module, entry::LLVM.Function)
    # Only kernel entry points need this; device functions are left alone.
    if LLVM.callconv(entry) != LLVM.API.LLVMAMDGPUKERNELCallConv
        return entry
    end
    # Collect the local-AS globals tagged for zero-initialization.
    # (Typed vector instead of `[]` to avoid a Vector{Any}.)
    to_init = [gbl for gbl in LLVM.globals(mod)
               if startswith(LLVM.name(gbl), "__zeroinit") &&
                  LLVM.addrspace(llvmtype(gbl)) == AMDGPU.Device.AS.Local]
    if !isempty(to_init)
        ctx = LLVM.context(mod)
        LLVM.@dispose builder=LLVM.Builder(ctx) begin
            # Make these the first operations we do
            position!(builder, first(LLVM.instructions(first(LLVM.blocks(entry)))))

            # Use memset to clear all values to 0
            for gbl in to_init
                sz = llvmsize(eltype(llvmtype(gbl)))
                if sz > 0
                    LLVM.memset!(builder, gbl, ConstantInt(UInt8(0); ctx), ConstantInt(sz; ctx), LLVM.alignment(gbl))
                end
            end

            # Synchronize the workgroup to prevent races
            sync_f = LLVM.Function(mod, LLVM.Intrinsic("llvm.amdgcn.s.barrier"))
            call!(builder, sync_f)
        end
    end

    return entry
end

"""
rocfunction(f, tt=Tuple{}; kwargs...)
Expand Down
55 changes: 51 additions & 4 deletions src/device/gcn/memory_static.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
"Allocates on-device memory statically from the specified address space."
@generated function alloc_special(::Val{id}, ::Type{T}, ::Val{as}, ::Val{len}) where {id,T,as,len}
@generated function alloc_special(::Val{id}, ::Type{T}, ::Val{as}, ::Val{len}, ::Val{zeroinit}=Val{false}()) where {id,T,as,len,zeroinit}
Context() do ctx
eltyp = convert(LLVMType, T; ctx)

# old versions of GPUArrays invoke _shmem with an integer id; make sure those are unique
if !isa(id, String) || !isa(id, Symbol)
id = "alloc_special_$id"
end
if zeroinit
id = "__zeroinit_" * id
end

T_ptr_i8 = convert(LLVMType, LLVMPtr{T,as}; ctx)

Expand All @@ -18,8 +21,13 @@
gv_typ = LLVM.ArrayType(eltyp, len)
gv = GlobalVariable(mod, gv_typ, string(id), as)
if len > 0
linkage!(gv, LLVM.API.LLVMExternalLinkage)
# NOTE: Backend doesn't support initializer for local AS
if as == AS.Local
linkage!(gv, LLVM.API.LLVMExternalLinkage)
# NOTE: Backend doesn't support initializer for local AS
elseif as == AS.Private
linkage!(gv, LLVM.API.LLVMInternalLinkage)
initializer!(gv, null(gv_typ))
end
end

# by requesting a larger-than-datatype alignment, we might be able to vectorize.
Expand All @@ -41,7 +49,46 @@
end
end

@inline alloc_local(id, T, len) = alloc_special(Val(id), T, Val(AS.Local), Val(len))
# Allocate `len` elements of type `T` in the Local (LDS) address space,
# optionally tagging the allocation for compiler-level zero-initialization.
@inline function alloc_local(id, T, len, zeroinit=false)
    return alloc_special(Val{id}(), T, Val{AS.Local}(), Val{len}(), Val{zeroinit}())
end
# Allocate `len` elements of type `T` in the Private (scratch) address space.
@inline function alloc_scratch(id, T, len)
    return alloc_special(Val{id}(), T, Val{AS.Private}(), Val{len}(), Val{false}())
end

"""
    @ROCStaticLocalArray(T, dims, zeroinit=true)

Allocate a statically-sized local-memory (LDS) array with element type `T` and
size `dims`, returning a `ROCDeviceArray` over it. `dims` must be a literal
`Integer` or a literal tuple of `Integer`s, since the allocation size is fixed
at macro-expansion time. When `zeroinit` is `true` (the default), the
allocation is tagged for zero-initialization by the compiler.
"""
macro ROCStaticLocalArray(T, dims, zeroinit=true)
    # Unwrap a literal tuple expression `(a, b, ...)` into an actual tuple.
    # (Previously only `dims.args[1]` was kept, silently dropping all but the
    # first dimension of a tuple argument.)
    if dims isa Expr
        if dims.head === :tuple && all(d -> d isa Integer, dims.args)
            dims = Tuple(dims.args)
        else
            error("@ROCStaticLocalArray requires a constant `dims` argument")
        end
    end
    # Explicit errors instead of `@assert`, which may be disabled.
    dims isa Integer || dims isa Tuple ||
        error("@ROCStaticLocalArray requires a constant `dims` argument")

    zeroinit = zeroinit isa Expr ? zeroinit.args[1] : zeroinit
    zeroinit isa Bool ||
        error("@ROCStaticLocalArray requires a constant `zeroinit` argument")

    # A fresh ID per macro invocation keeps each allocation's LLVM global unique.
    @gensym id
    len = prod(dims)
    quote
        $ROCDeviceArray($dims, $alloc_local($(QuoteNode(Symbol(:ROCStaticLocalArray_, id))), $T, $len, $zeroinit))
    end
end
"""
    @ROCDynamicLocalArray(T, dims, zeroinit=true)

Allocate a dynamically-sized local-memory (LDS) array with element type `T`
and runtime size `dims`, returning a `ROCDeviceArray` over it. The backing
local memory must be reserved at launch via the `localmem` option to `@roc`.
When `zeroinit` is `true` (the default), the array is cleared to zero inside
the kernel and a workgroup barrier is issued, since the compiler cannot
zero-initialize dynamic LDS allocations.
"""
macro ROCDynamicLocalArray(T, dims, zeroinit=true)
    # Dynamic LDS allocations rely on LLVM 14+ support.
    if Base.libllvm_version < v"14"
        @warn "@ROCDynamicLocalArray is unsupported on LLVM <14\nUndefined behavior may result"
    end

    # Explicit error instead of `@assert`, which may be disabled.
    zeroinit = zeroinit isa Expr ? zeroinit.args[1] : zeroinit
    zeroinit isa Bool ||
        error("@ROCDynamicLocalArray requires a constant `zeroinit` argument")

    # Fresh names: `id` uniquifies the LLVM global, `DA` holds the array.
    @gensym id DA
    quote
        let
            # `len == 0` marks the allocation as dynamically sized.
            $DA = $ROCDeviceArray($(esc(dims)), $alloc_local($(QuoteNode(Symbol(:ROCDynamicLocalArray_, id))), $T, 0, $zeroinit))
            if $zeroinit
                # Zeroinit doesn't work at the compiler level for dynamic LDS
                # allocations, so zero it here
                for idx in 1:prod($(esc(dims)))
                    @inbounds $DA[idx] = zero($T)
                end
                $sync_workgroup()
            end
            $DA
        end
    end
end

@inline @generated function alloc_string(::Val{str}) where str
Context() do ctx
Expand Down
49 changes: 24 additions & 25 deletions test/device/memory.jl
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
@testset "Memory: Static" begin
@testset "Fixed-size Allocation" begin
function memory_static_kernel(a,b)
function memory_static_kernel(A, B, C)
idx = workitemIdx().x

# Local
ptr_local = AMDGPU.Device.alloc_local(:local, Float32, 1)
unsafe_store!(ptr_local, a[1])
b[1] = unsafe_load(ptr_local)

# Region
#= TODO: AMDGPU target cannot select
ptr_region = alloc_special(Val(:region), Float32, Val(AS.Region), Val(1))
unsafe_store!(ptr_region, a[2])
b[2] = unsafe_load(ptr_region)
=#
arr_local = @ROCStaticLocalArray(Float32, 8)
C[idx] = arr_local[idx]
arr_local[idx] = A[idx]
B[idx] = arr_local[idx]

# Private
#= TODO
Expand All @@ -23,38 +19,41 @@
nothing
end

A = ones(Float32, 1)
B = zeros(Float32, 1)

RA = ROCArray(A)
RB = ROCArray(B)
RA = ROCArray(ones(Float32, 8))
RB = ROCArray(zeros(Float32, 8))
RC = ROCArray(ones(Float32, 8))

wait(@roc memory_static_kernel(RA, RB))
wait(@roc groupsize=8 memory_static_kernel(RA, RB, RC))

@test Array(RA) ≈ Array(RB)
# Test zero-initialization
@test all(iszero, Array(RC))
end

# https://reviews.llvm.org/D82496
if Base.libllvm_version.major >= 14
@testset "Dynamic-size Local Allocation" begin
function dynamic_localmem_kernel(RA)
ptr = AMDGPU.Device.alloc_local(:local, Float32, 0)
RB = ROCDeviceArray(length(RA), ptr)
for i in 1:length(RA)
RB[i] = RA[i] + 1f0
function dynamic_localmem_kernel(A, C)
B = @ROCDynamicLocalArray(Float32, length(A))
for i in 1:length(A)
@inbounds C[i] = B[i]
@inbounds B[i] = A[i] + 1f0
end
for i in 1:length(RA)
RA[i] = RB[i]
for i in 1:length(A)
@inbounds A[i] = B[i]
end
end

N = 2^10
A = rand(Float32, N)
RA = ROCArray(A)
RC = ROCArray(ones(Float32, N))

wait(@roc localmem=N*sizeof(Float32) dynamic_localmem_kernel(RA))
wait(@roc localmem=N*sizeof(Float32) dynamic_localmem_kernel(RA, RC))

@test Array(RA) ≈ A .+ 1f0
# Test zero-initialization
@test all(iszero, Array(RC))
end
end

Expand Down

0 comments on commit 191afc4

Please sign in to comment.