Skip to content

Commit

Permalink
Merge pull request #348 from JuliaGPU/jps/localmem-array
Browse files Browse the repository at this point in the history
Add local memory allocation helpers
  • Loading branch information
jpsamaroo authored Jan 14, 2023
2 parents 5b216b3 + 12b60ac commit 191afc4
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 50 deletions.
38 changes: 19 additions & 19 deletions docs/src/memory.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,28 +110,22 @@ wait(@roc groupsize=4 kernel(RC, RA, RB))
@assert Array(RC) ≈ Array(RA) .+ Array(RB) .+ RC_elem
```

Local memory may be allocated within a kernel by calling
`AMDGPU.Device.alloc_local(id, T, len)`, where `id` is some sort of bitstype ID
for the local allocation, `T` is the Julia element type, and `len` is the
number of elements of type `T` to allocate. Local memory does not need to be
freed, as it is automatically freed by the hardware. If `len == 0`, then local
memory is dynamically allocated at kernel runtime; the `localmem` option to
`@roc` must be set appropriately to ensure that enough local memory is
allocated by the hardware.

```julia

```
Local memory may be allocated within a kernel by calling either
`@ROCStaticLocalArray(T, dims)` or `@ROCDynamicLocalArray(T, dims)` - use the
former if `dims` is passed as a constant value, and otherwise use the latter.
Local memory does not need to be freed, as it is automatically freed by the
hardware. If `@ROCDynamicLocalArray` is used, then local memory is dynamically
allocated at kernel execution time; therefore, the `localmem` option to `@roc`
must be set appropriately to ensure that enough local memory is allocated by
the hardware.

```julia
function kernel(C, A, B)
# Allocate local memory dynamically (0 means dynamic)
Ctmp_ptr = AMDGPU.Device.alloc_local(:localmem, Float64, 0)
# Or, allocate local memory statically
# Ctmp_ptr = AMDGPU.Device.alloc_local(:localmem, Float64, 4)
# Allocate local memory dynamically
Ctmp = @ROCDynamicLocalArray(Float64, length(C))
# Or, allocate local memory statically if the size is known ahead-of-time
# Ctmp = @ROCStaticLocalArray(Float64, 8) # if we want 8 elements

# Turn it (a pointer to Float64 elements in Local memory) into a device-side array
Ctmp = ROCDeviceArray(length(C), Ctmp_ptr)
# Use it
idx = AMDGPU.workitemIdx().x
Ctmp[idx] = A[idx] + B[idx] + C[1]
Expand All @@ -140,10 +134,16 @@ function kernel(C, A, B)
nothing
end
# ...
# The `localmem` option isn't necessary if memory is statically allocated
# Note: The `localmem` option isn't necessary if `@ROCStaticLocalArray` is used
wait(@roc groupsize=4 localmem=sizeof(Float64)*length(RC) kernel(RC, RA, RB))
```

Note that like CUDA's shared memory, AMDGPU's local memory is zero-initialized
automatically. If this behavior is unnecessary (and undesired for performance
reasons), zero-initialization can be disabled with `@ROCDynamicLocalArray(T,
dims, false)` or `@ROCStaticLocalArray(T, dims, false)` (the last argument
is `zeroinit`).

## Memory Modification Intrinsics

Like C, AMDGPU.jl provides the `memset!` and `memcpy!` intrinsics, which are
Expand Down
3 changes: 2 additions & 1 deletion src/AMDGPU.jl
Original file line number Diff line number Diff line change
Expand Up @@ -114,12 +114,13 @@ module Device
end
import .Device: malloc, signal_exception, report_exception, report_oom, report_exception_frame
import .Device: ROCDeviceArray, AS, HostCall, hostcall!
import .Device: @ROCDynamicLocalArray, @ROCStaticLocalArray
import .Device: workitemIdx, workgroupIdx, workgroupDim, gridItemDim, gridGroupDim
import .Device: threadIdx, blockIdx, blockDim
import .Device: sync_workgroup
import .Device: @rocprint, @rocprintln, @rocprintf

export ROCDeviceArray
export ROCDeviceArray, @ROCDynamicLocalArray, @ROCStaticLocalArray
export @rocprint, @rocprintln, @rocprintf
export workitemIdx, workgroupIdx, workgroupDim, gridItemDim, gridGroupDim
export sync_workgroup
Expand Down
2 changes: 1 addition & 1 deletion src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ end
# memory

@inline function GPUArrays.LocalMemory(::ROCKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}) where {T,dims,id}
ptr = AMDGPU.Device.alloc_special(Val(id), T, AMDGPU.AS.Local, Val(prod(dims)))
ptr = AMDGPU.Device.alloc_special(Val{id}(), T, Val{AMDGPU.AS.Local}(), Val{prod(dims)}())
ROCDeviceArray(dims, ptr)
end

Expand Down
44 changes: 44 additions & 0 deletions src/compiler.jl
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ function GPUCompiler.process_module!(job::ROCCompilerJob, mod::LLVM.Module)
# Run this early (before optimization) to ensure we link OCKL
emit_exception_user!(mod)
end
function GPUCompiler.process_entry!(job::ROCCompilerJob, mod::LLVM.Module, entry::LLVM.Function)
    # Delegate to the generic GCN entry-point processing first.
    parent_sig = Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod), typeof(entry)}
    invoke(GPUCompiler.process_entry!, parent_sig, job, mod, entry)
    # Workaround for the lack of zeroinitializer support for LDS
    zeroinit_lds!(mod, entry)
end
function GPUCompiler.finish_module!(job::ROCCompilerJob, mod::LLVM.Module)
invoke(GPUCompiler.finish_module!,
Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod)},
Expand All @@ -90,6 +97,43 @@ GPUCompiler.ci_cache(::ROCCompilerJob) = AMDGPU.ci_cache

GPUCompiler.method_table(::ROCCompilerJob) = AMDGPU.method_table

"""
    zeroinit_lds!(mod::LLVM.Module, entry::LLVM.Function)

Insert explicit zero-initialization for local-address-space globals whose names
carry the `__zeroinit` prefix. The AMDGPU backend does not support
`zeroinitializer` on LDS globals, so a `memset` to 0 is emitted for each tagged
global at the very top of `entry`, followed by an `llvm.amdgcn.s.barrier` so no
workitem observes uncleared memory. Non-kernel functions are returned
unchanged. Returns `entry`.
"""
function zeroinit_lds!(mod::LLVM.Module, entry::LLVM.Function)
    # Only kernel entry points need this; device functions are left alone.
    if LLVM.callconv(entry) != LLVM.API.LLVMAMDGPUKERNELCallConv
        return entry
    end
    # Collect the local-AS globals tagged for zero-initialization.
    # (Typed vector instead of `[]` to avoid a Vector{Any}.)
    to_init = [gbl for gbl in LLVM.globals(mod)
               if startswith(LLVM.name(gbl), "__zeroinit") &&
                  LLVM.addrspace(llvmtype(gbl)) == AMDGPU.Device.AS.Local]
    if !isempty(to_init)
        ctx = LLVM.context(mod)
        LLVM.@dispose builder=LLVM.Builder(ctx) begin
            # Make these the first operations we do
            position!(builder, first(LLVM.instructions(first(LLVM.blocks(entry)))))

            # Use memset to clear all values to 0
            for gbl in to_init
                sz = llvmsize(eltype(llvmtype(gbl)))
                if sz > 0
                    LLVM.memset!(builder, gbl, ConstantInt(UInt8(0); ctx), ConstantInt(sz; ctx), LLVM.alignment(gbl))
                end
            end

            # Synchronize the workgroup to prevent races
            sync_f = LLVM.Function(mod, LLVM.Intrinsic("llvm.amdgcn.s.barrier"))
            call!(builder, sync_f)
        end
    end

    return entry
end

"""
rocfunction(f, tt=Tuple{}; kwargs...)
Expand Down
55 changes: 51 additions & 4 deletions src/device/gcn/memory_static.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
"Allocates on-device memory statically from the specified address space."
@generated function alloc_special(::Val{id}, ::Type{T}, ::Val{as}, ::Val{len}) where {id,T,as,len}
@generated function alloc_special(::Val{id}, ::Type{T}, ::Val{as}, ::Val{len}, ::Val{zeroinit}=Val{false}()) where {id,T,as,len,zeroinit}
Context() do ctx
eltyp = convert(LLVMType, T; ctx)

# old versions of GPUArrays invoke _shmem with an integer id; make sure those are unique
if !isa(id, String) || !isa(id, Symbol)
id = "alloc_special_$id"
end
if zeroinit
id = "__zeroinit_" * id
end

T_ptr_i8 = convert(LLVMType, LLVMPtr{T,as}; ctx)

Expand All @@ -18,8 +21,13 @@
gv_typ = LLVM.ArrayType(eltyp, len)
gv = GlobalVariable(mod, gv_typ, string(id), as)
if len > 0
linkage!(gv, LLVM.API.LLVMExternalLinkage)
# NOTE: Backend doesn't support initializer for local AS
if as == AS.Local
linkage!(gv, LLVM.API.LLVMExternalLinkage)
# NOTE: Backend doesn't support initializer for local AS
elseif as == AS.Private
linkage!(gv, LLVM.API.LLVMInternalLinkage)
initializer!(gv, null(gv_typ))
end
end

# by requesting a larger-than-datatype alignment, we might be able to vectorize.
Expand All @@ -41,7 +49,46 @@
end
end

@inline alloc_local(id, T, len) = alloc_special(Val(id), T, Val(AS.Local), Val(len))
# Allocate `len` elements of type `T` in the Local (LDS) address space,
# optionally tagging the allocation for compiler-level zero-initialization.
@inline function alloc_local(id, T, len, zeroinit=false)
    return alloc_special(Val{id}(), T, Val{AS.Local}(), Val{len}(), Val{zeroinit}())
end
# Allocate `len` elements of type `T` in the Private (scratch) address space.
@inline function alloc_scratch(id, T, len)
    return alloc_special(Val{id}(), T, Val{AS.Private}(), Val{len}(), Val{false}())
end

"""
    @ROCStaticLocalArray(T, dims, zeroinit=true)

Allocate a statically-sized local-memory (LDS) array with element type `T` and
size `dims`, returning a `ROCDeviceArray` over it. `dims` must be a literal
`Integer` or a literal tuple of `Integer`s, since the allocation size is fixed
at macro-expansion time. When `zeroinit` is `true` (the default), the
allocation is tagged for zero-initialization by the compiler.
"""
macro ROCStaticLocalArray(T, dims, zeroinit=true)
    # Unwrap a literal tuple expression `(a, b, ...)` into an actual tuple.
    # (Previously only `dims.args[1]` was kept, silently dropping all but the
    # first dimension of a tuple argument.)
    if dims isa Expr
        if dims.head === :tuple && all(d -> d isa Integer, dims.args)
            dims = Tuple(dims.args)
        else
            error("@ROCStaticLocalArray requires a constant `dims` argument")
        end
    end
    # Explicit errors instead of `@assert`, which may be disabled.
    dims isa Integer || dims isa Tuple ||
        error("@ROCStaticLocalArray requires a constant `dims` argument")

    zeroinit = zeroinit isa Expr ? zeroinit.args[1] : zeroinit
    zeroinit isa Bool ||
        error("@ROCStaticLocalArray requires a constant `zeroinit` argument")

    # A fresh ID per macro invocation keeps each allocation's LLVM global unique.
    @gensym id
    len = prod(dims)
    quote
        $ROCDeviceArray($dims, $alloc_local($(QuoteNode(Symbol(:ROCStaticLocalArray_, id))), $T, $len, $zeroinit))
    end
end
"""
    @ROCDynamicLocalArray(T, dims, zeroinit=true)

Allocate a dynamically-sized local-memory (LDS) array with element type `T`
and runtime size `dims`, returning a `ROCDeviceArray` over it. The backing
local memory must be reserved at launch via the `localmem` option to `@roc`.
When `zeroinit` is `true` (the default), the array is cleared to zero inside
the kernel and a workgroup barrier is issued, since the compiler cannot
zero-initialize dynamic LDS allocations.
"""
macro ROCDynamicLocalArray(T, dims, zeroinit=true)
    # Dynamic LDS allocations rely on LLVM 14+ support.
    if Base.libllvm_version < v"14"
        @warn "@ROCDynamicLocalArray is unsupported on LLVM <14\nUndefined behavior may result"
    end

    # Explicit error instead of `@assert`, which may be disabled.
    zeroinit = zeroinit isa Expr ? zeroinit.args[1] : zeroinit
    zeroinit isa Bool ||
        error("@ROCDynamicLocalArray requires a constant `zeroinit` argument")

    # Fresh names: `id` uniquifies the LLVM global, `DA` holds the array.
    @gensym id DA
    quote
        let
            # `len == 0` marks the allocation as dynamically sized.
            $DA = $ROCDeviceArray($(esc(dims)), $alloc_local($(QuoteNode(Symbol(:ROCDynamicLocalArray_, id))), $T, 0, $zeroinit))
            if $zeroinit
                # Zeroinit doesn't work at the compiler level for dynamic LDS
                # allocations, so zero it here
                for idx in 1:prod($(esc(dims)))
                    @inbounds $DA[idx] = zero($T)
                end
                $sync_workgroup()
            end
            $DA
        end
    end
end

@inline @generated function alloc_string(::Val{str}) where str
Context() do ctx
Expand Down
49 changes: 24 additions & 25 deletions test/device/memory.jl
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
@testset "Memory: Static" begin
@testset "Fixed-size Allocation" begin
function memory_static_kernel(a,b)
function memory_static_kernel(A, B, C)
idx = workitemIdx().x

# Local
ptr_local = AMDGPU.Device.alloc_local(:local, Float32, 1)
unsafe_store!(ptr_local, a[1])
b[1] = unsafe_load(ptr_local)

# Region
#= TODO: AMDGPU target cannot select
ptr_region = alloc_special(Val(:region), Float32, Val(AS.Region), Val(1))
unsafe_store!(ptr_region, a[2])
b[2] = unsafe_load(ptr_region)
=#
arr_local = @ROCStaticLocalArray(Float32, 8)
C[idx] = arr_local[idx]
arr_local[idx] = A[idx]
B[idx] = arr_local[idx]

# Private
#= TODO
Expand All @@ -23,38 +19,41 @@
nothing
end

A = ones(Float32, 1)
B = zeros(Float32, 1)

RA = ROCArray(A)
RB = ROCArray(B)
RA = ROCArray(ones(Float32, 8))
RB = ROCArray(zeros(Float32, 8))
RC = ROCArray(ones(Float32, 8))

wait(@roc memory_static_kernel(RA, RB))
wait(@roc groupsize=8 memory_static_kernel(RA, RB, RC))

@test Array(RA) ≈ Array(RB)
# Test zero-initialization
@test all(iszero, Array(RC))
end

# https://reviews.llvm.org/D82496
if Base.libllvm_version.major >= 14
@testset "Dynamic-size Local Allocation" begin
function dynamic_localmem_kernel(RA)
ptr = AMDGPU.Device.alloc_local(:local, Float32, 0)
RB = ROCDeviceArray(length(RA), ptr)
for i in 1:length(RA)
RB[i] = RA[i] + 1f0
function dynamic_localmem_kernel(A, C)
B = @ROCDynamicLocalArray(Float32, length(A))
for i in 1:length(A)
@inbounds C[i] = B[i]
@inbounds B[i] = A[i] + 1f0
end
for i in 1:length(RA)
RA[i] = RB[i]
for i in 1:length(A)
@inbounds A[i] = B[i]
end
end

N = 2^10
A = rand(Float32, N)
RA = ROCArray(A)
RC = ROCArray(ones(Float32, N))

wait(@roc localmem=N*sizeof(Float32) dynamic_localmem_kernel(RA))
wait(@roc localmem=N*sizeof(Float32) dynamic_localmem_kernel(RA, RC))

@test Array(RA) ≈ A .+ 1f0
# Test zero-initialization
@test all(iszero, Array(RC))
end
end

Expand Down

0 comments on commit 191afc4

Please sign in to comment.