From e5daf5d896130c6cac41755701798945466a8ac5 Mon Sep 17 00:00:00 2001
From: Thomas Faingnaert
Date: Tue, 16 Nov 2021 09:26:12 +0100
Subject: [PATCH] Revert "Replace StaticArrays with a simple immutable array
 type (#83)"

This reverts commit db0ba14c104ec18b387a071c7c68fd018a08e5d2.
---
 .buildkite/pipeline.yml |  2 ++
 Project.toml            |  2 ++
 src/GemmKernels.jl      |  1 -
 src/array.jl            | 43 --------------------------------
 src/kernel.jl           | 55 ++++++++++++++++++++---------------------
 src/layout.jl           |  1 +
 6 files changed, 32 insertions(+), 72 deletions(-)
 delete mode 100644 src/array.jl

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index d5086d1c..920d2dfd 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -24,6 +24,8 @@ steps:
       cap: "recent"
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 60
+    soft_fail:
+      - exit_status: 1
 
   - label: "Julia nightly"
     plugins:
diff --git a/Project.toml b/Project.toml
index 45e70e3c..550053c3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,10 +9,12 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 GPUifyLoops = "ba82f77b-6841-5d2e-bd9f-4daf811aec27"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
 [compat]
 CUDA = "3.5"
 ForwardDiff = "0.10"
 GPUifyLoops = "0.2"
 LLVM = "3, 4"
+StaticArrays = "0.12, 1"
 julia = "1.6"
diff --git a/src/GemmKernels.jl b/src/GemmKernels.jl
index 3449c233..3fb7063b 100644
--- a/src/GemmKernels.jl
+++ b/src/GemmKernels.jl
@@ -4,7 +4,6 @@
 include("tiling.jl")
 include("config.jl")
 include("epilogue.jl")
-include("array.jl")
 include("kernel.jl")
 include("layout.jl")
 include("operator.jl")
diff --git a/src/array.jl b/src/array.jl
deleted file mode 100644
index b4d3067b..00000000
--- a/src/array.jl
+++ /dev/null
@@ -1,43 +0,0 @@
-# a simple immutable array type backed by stack memory
-#
-# similar to StaticArrays, but immutable to prevent optimization bugs (JuliaLang/julia#41800)
-
-struct LocalArray{S <: Tuple, T, N, L} <: AbstractArray{T,N}
-    data::NTuple{L,T}
-
-    LocalArray{S,T,N,L}(::UndefInitializer) where {S,T,N,L} = new{S,T,N,L}()
-    LocalArray{S,T,N,L}(data::NTuple{L,T}) where {S,T,N,L} = new{S,T,N,L}(data)
-end
-
-@inline @generated function LocalArray{S,T}(args...) where {S,T}
-    dims = (S.parameters...,)
-    N = length(dims)
-    L = prod(dims)
-    @assert isbitstype(T)
-    quote
-        LocalArray{S, T, $N, $L}(args...)
-    end
-end
-
-# array interface
-Base.IndexStyle(::Type{<:LocalArray}) = IndexLinear()
-Base.size(x::LocalArray{S}) where {S} = (S.parameters...,)
-
-# indexing
-Base.@propagate_inbounds function Base.getindex(v::LocalArray, i::Int)
-    @boundscheck checkbounds(v,i)
-    @inbounds v.data[i]
-end
-Base.@propagate_inbounds function Base.setindex(v::LocalArray{S,T,N,L}, val, i::Int) where {S,T,N,L}
-    @boundscheck checkbounds(v,i)
-    new_data = Base.setindex(v.data, val, i)
-    LocalArray{S,T,N,L}(new_data)
-end
-## XXX: Base's setindex doesn't have an ND version
-Base.@propagate_inbounds function Base.setindex(v::LocalArray{S,T,N,L}, val, is::Int...) where {S,T,N,L}
-    @boundscheck checkbounds(v,is...)
-    I = CartesianIndex(is...)
-    i = LinearIndices(v)[I]
-    new_data = Base.setindex(v.data, val, i)
-    LocalArray{S,T,N,L}(new_data)
-end
diff --git a/src/kernel.jl b/src/kernel.jl
index 5f12e51f..2c7102e0 100644
--- a/src/kernel.jl
+++ b/src/kernel.jl
@@ -4,9 +4,8 @@ module Kernel
 using CUDA
 using GemmKernels
 using GemmKernels.Tiling
-using GemmKernels: LocalArray
 using GPUifyLoops: @unroll
-using Base: setindex
+using StaticArrays
 
 function matmul_singlestage(a, b, c, d,
                             transf_gl2sh_a, transf_gl2sh_b, transf_gl2sh_c, transf_sh2gl_d,
@@ -43,12 +42,12 @@ function matmul_singlestage(a, b, c, d,
 
     # (2) Load a compute_warp.M x compute_warp.N tile of C from shared memory into registers
     warp_tile = subdivide(block_tile.MN, Tile(conf.compute_warp).MN, warpId, conf.warps_per_block)
 
-    c_frags = LocalArray{Tuple{num_fragments_m, num_fragments_n}, Operator.fragtype_accum(conf.operator, conf.shared_c_layout)}(undef)
+    c_frags = MArray{Tuple{num_fragments_m, num_fragments_n}, Operator.fragtype_accum(conf.operator, conf.shared_c_layout)}(undef)
 
     @unroll for i = 1 : num_fragments_m
         @unroll for j = 1 : num_fragments_n
             tile = translate_offset(warp_tile, (M = (i-1)*conf.compute_op_shape.M, N = (j-1)*conf.compute_op_shape.N))
-            @inbounds c_frags = setindex(c_frags, transf_sh2rf_c(Operator.load_c(conf.operator, conf.shared_c_layout, shmem_c, tile), tile), i, j)
+            @inbounds c_frags[i, j] = transf_sh2rf_c(Operator.load_c(conf.operator, conf.shared_c_layout, shmem_c, tile), tile)
         end
     end
@@ -84,25 +83,25 @@ function matmul_singlestage(a, b, c, d,
         # (3.3) Calculate a compute_warp.M x compute_warp.N tile of D, using a compute_warp.M x compute_warp.N x compute_warp.K operation
         @unroll for warp_tile = parallellise(block_tile, Tile(conf.compute_warp), warpId, conf.warps_per_block)
             # (3.3.1) Load a compute_warp.M x compute_warp.K tile of A from shared memory into registers
-            a_frags = LocalArray{Tuple{num_fragments_m}, Operator.fragtype_a(conf.operator, conf.shared_a_layout)}(undef)
+            a_frags = MArray{Tuple{num_fragments_m}, Operator.fragtype_a(conf.operator, conf.shared_a_layout)}(undef)
 
             @unroll for i = 1 : num_fragments_m
                 a_tile = translate_offset(warp_tile.MK, (M = (i-1)*conf.compute_op_shape.M, K = 0))
-                @inbounds a_frags = setindex(a_frags, transf_sh2rf_a(Operator.load_a(conf.operator, conf.shared_a_layout, shmem_a, a_tile), a_tile), i)
+                @inbounds a_frags[i] = transf_sh2rf_a(Operator.load_a(conf.operator, conf.shared_a_layout, shmem_a, a_tile), a_tile)
             end
 
             # (3.3.2) Load a compute_warp.K x compute_warp.N tile of B from shared memory into registers
-            b_frags = LocalArray{Tuple{num_fragments_n}, Operator.fragtype_b(conf.operator, conf.shared_b_layout)}(undef)
+            b_frags = MArray{Tuple{num_fragments_n}, Operator.fragtype_b(conf.operator, conf.shared_b_layout)}(undef)
 
             @unroll for j = 1 : num_fragments_n
                 b_tile = translate_offset(warp_tile.KN, (K = 0, N = (j-1)*conf.compute_op_shape.N))
-                @inbounds b_frags = setindex(b_frags, transf_sh2rf_b(Operator.load_b(conf.operator, conf.shared_b_layout, shmem_b, b_tile), b_tile), j)
+                @inbounds b_frags[j] = transf_sh2rf_b(Operator.load_b(conf.operator, conf.shared_b_layout, shmem_b, b_tile), b_tile)
             end
 
             # (3.3.3) Compute a compute_warp.M x compute_warp.N x compute_warp.K matrix product within one warp
             @unroll for i = 1 : num_fragments_m
                 @unroll for j = 1 : num_fragments_n
-                    @inbounds c_frags = setindex(c_frags, Operator.mma(conf.operator, a_frags[i], b_frags[j], c_frags[i, j]), i, j)
+                    @inbounds c_frags[i, j] = Operator.mma(conf.operator, a_frags[i], b_frags[j], c_frags[i, j])
                 end
             end
         end
@@ -119,7 +118,7 @@ function matmul_singlestage(a, b, c, d,
     @unroll for i = 1 : num_fragments_m
         @unroll for j = 1 : num_fragments_n
             tile = translate_offset(warp_tile, (M = (i-1)*conf.compute_op_shape.M, N = (j-1)*conf.compute_op_shape.N))
-            @inbounds Operator.store_d(conf.operator, conf.shared_d_layout, shmem_d, transf_rf2sh_d(c_frags[i, j], tile), tile)
+            Operator.store_d(conf.operator, conf.shared_d_layout, shmem_d, transf_rf2sh_d(c_frags[i, j], tile), tile)
         end
     end
 
@@ -166,12 +165,12 @@ function matmul_pipelined(a, b, c, d,
 
     # (2) Load a compute_warp.M x compute_warp.N tile of C from shared memory into registers
     warp_tile = subdivide(block_tile.MN, Tile(conf.compute_warp).MN, warpId, conf.warps_per_block)
 
-    c_frags = LocalArray{Tuple{num_fragments_m, num_fragments_n}, Operator.fragtype_accum(conf.operator, conf.shared_c_layout)}(undef)
+    c_frags = MArray{Tuple{num_fragments_m, num_fragments_n}, Operator.fragtype_accum(conf.operator, conf.shared_c_layout)}(undef)
 
     @unroll for i = 1 : num_fragments_m
         @unroll for j = 1 : num_fragments_n
             tile = translate_offset(warp_tile, (M = (i-1)*conf.compute_op_shape.M, N = (j-1)*conf.compute_op_shape.N))
-            @inbounds c_frags = setindex(c_frags, transf_sh2rf_c(Operator.load_c(conf.operator, conf.shared_c_layout, shmem_c, tile), tile), i, j)
+            @inbounds c_frags[i, j] = transf_sh2rf_c(Operator.load_c(conf.operator, conf.shared_c_layout, shmem_c, tile), tile)
         end
     end
@@ -188,24 +187,24 @@ function matmul_pipelined(a, b, c, d,
     b_frag_i = (block_tile.size.K * block_tile.size.N) ÷ (conf.mem_b_warp.K * conf.mem_b_warp.N * conf.warps_per_block)
     b_frag_j = (conf.mem_b_warp.K * conf.mem_b_warp.N) ÷ (conf.mem_b_thread.K * conf.mem_b_thread.N * 32)
 
-    a_fragment = LocalArray{Tuple{a_frag_i, a_frag_j}, Layout.fragtype(conf.global_a_layout, conf.mem_a_thread)}(undef)
-    b_fragment = LocalArray{Tuple{b_frag_i, b_frag_j}, Layout.fragtype(conf.global_b_layout, conf.mem_b_thread)}(undef)
+    a_fragment = MArray{Tuple{a_frag_i, a_frag_j}, Layout.fragtype(conf.global_a_layout, conf.mem_a_thread)}(undef)
+    b_fragment = MArray{Tuple{b_frag_i, b_frag_j}, Layout.fragtype(conf.global_b_layout, conf.mem_b_thread)}(undef)
 
-    a_frags = LocalArray{Tuple{2, num_fragments_m}, Operator.fragtype_a(conf.operator, conf.shared_a_layout)}(undef)
-    b_frags = LocalArray{Tuple{2, num_fragments_n}, Operator.fragtype_b(conf.operator, conf.shared_b_layout)}(undef)
+    a_frags = MArray{Tuple{2, num_fragments_m}, Operator.fragtype_a(conf.operator, conf.shared_a_layout)}(undef)
+    b_frags = MArray{Tuple{2, num_fragments_n}, Operator.fragtype_b(conf.operator, conf.shared_b_layout)}(undef)
 
     warp_tile_mn = subdivide(block_tile, Tile(conf.compute_warp), warpId, conf.warps_per_block)
 
     # ld.global(0 : block_shape.K)
     @unroll for (i, warp_tile) = enumerate(parallellise(block_tile.MK, Tile(conf.mem_a_warp), warpId, conf.warps_per_block, conf.is_a_col_major))
         @unroll for (j, thread_tile) = enumerate(parallellise(warp_tile, Tile(conf.mem_a_thread), laneId, 32, conf.is_a_col_major))
-            @inbounds a_fragment = setindex(a_fragment, Layout.load(conf.global_a_layout, a, translate_base(thread_tile, (M = block_i, K = 0))), i, j)
+            @inbounds a_fragment[i, j] = Layout.load(conf.global_a_layout, a, translate_base(thread_tile, (M = block_i, K = 0)))
         end
     end
 
     @unroll for (i, warp_tile) = enumerate(parallellise(block_tile.KN, Tile(conf.mem_b_warp), warpId, conf.warps_per_block, conf.is_b_col_major))
         @unroll for (j, thread_tile) = enumerate(parallellise(warp_tile, Tile(conf.mem_b_thread), laneId, 32, conf.is_b_col_major))
-            @inbounds b_fragment = setindex(b_fragment, Layout.load(conf.global_b_layout, b, translate_base(thread_tile, (K = 0, N = block_j))), i, j)
+            @inbounds b_fragment[i, j] = Layout.load(conf.global_b_layout, b, translate_base(thread_tile, (K = 0, N = block_j)))
         end
     end
@@ -231,24 +230,24 @@ function matmul_pipelined(a, b, c, d,
 
     @unroll for i = 1 : num_fragments_m
         a_tile = translate_offset(warp_tile.MK, (M = (i-1)*conf.compute_op_shape.M, K = 0))
-        @inbounds a_frags = setindex(a_frags, transf_sh2rf_a(Operator.load_a(conf.operator, conf.shared_a_layout, shmem_a, a_tile), a_tile), 1, i)
+        @inbounds a_frags[1, i] = transf_sh2rf_a(Operator.load_a(conf.operator, conf.shared_a_layout, shmem_a, a_tile), a_tile)
    end
 
     @unroll for j = 1 : num_fragments_n
         b_tile = translate_offset(warp_tile.KN, (K = 0, N = (j-1)*conf.compute_op_shape.N))
-        @inbounds b_frags = setindex(b_frags, transf_sh2rf_b(Operator.load_b(conf.operator, conf.shared_b_layout, shmem_b, b_tile), b_tile), 1, j)
+        @inbounds b_frags[1, j] = transf_sh2rf_b(Operator.load_b(conf.operator, conf.shared_b_layout, shmem_b, b_tile), b_tile)
     end
 
     # ld.global(block_shape.K : 2 * block_shape.K)
     @unroll for (i, warp_tile) = enumerate(parallellise(block_tile.MK, Tile(conf.mem_a_warp), warpId, conf.warps_per_block, conf.is_a_col_major))
         @unroll for (j, thread_tile) = enumerate(parallellise(warp_tile, Tile(conf.mem_a_thread), laneId, 32, conf.is_a_col_major))
-            @inbounds a_fragment = setindex(a_fragment, Layout.load(conf.global_a_layout, a, translate_base(thread_tile, (M = block_i, K = block_tile.size.K))), i, j)
+            @inbounds a_fragment[i, j] = Layout.load(conf.global_a_layout, a, translate_base(thread_tile, (M = block_i, K = block_tile.size.K)))
         end
     end
 
     @unroll for (i, warp_tile) = enumerate(parallellise(block_tile.KN, Tile(conf.mem_b_warp), warpId, conf.warps_per_block, conf.is_b_col_major))
         @unroll for (j, thread_tile) = enumerate(parallellise(warp_tile, Tile(conf.mem_b_thread), laneId, 32, conf.is_b_col_major))
-            @inbounds b_fragment = setindex(b_fragment, Layout.load(conf.global_b_layout, b, translate_base(thread_tile, (K = block_tile.size.K, N = block_j))), i, j)
+            @inbounds b_fragment[i, j] = Layout.load(conf.global_b_layout, b, translate_base(thread_tile, (K = block_tile.size.K, N = block_j)))
         end
     end
@@ -282,13 +281,13 @@ function matmul_pipelined(a, b, c, d,
             # ld.global(block_k + 2 * block_shape.K : block_k + 3 * block_shape.K)
             @unroll for (i, warp_tile) = enumerate(parallellise(block_tile.MK, Tile(conf.mem_a_warp), warpId, conf.warps_per_block, conf.is_a_col_major))
                 @unroll for (j, thread_tile) = enumerate(parallellise(warp_tile, Tile(conf.mem_a_thread), laneId, 32, conf.is_a_col_major))
-                    @inbounds a_fragment = setindex(a_fragment, Layout.load(conf.global_a_layout, a, translate_base(thread_tile, (M = block_i, K = block_k + 2 * block_tile.size.K))), i, j)
+                    @inbounds a_fragment[i, j] = Layout.load(conf.global_a_layout, a, translate_base(thread_tile, (M = block_i, K = block_k + 2 * block_tile.size.K)))
                 end
             end
 
             @unroll for (i, warp_tile) = enumerate(parallellise(block_tile.KN, Tile(conf.mem_b_warp), warpId, conf.warps_per_block, conf.is_b_col_major))
                 @unroll for (j, thread_tile) = enumerate(parallellise(warp_tile, Tile(conf.mem_b_thread), laneId, 32, conf.is_b_col_major))
-                    @inbounds b_fragment = setindex(b_fragment, Layout.load(conf.global_b_layout, b, translate_base(thread_tile, (K = block_k + 2 * block_tile.size.K, N = block_j))), i, j)
+                    @inbounds b_fragment[i, j] = Layout.load(conf.global_b_layout, b, translate_base(thread_tile, (K = block_k + 2 * block_tile.size.K, N = block_j)))
                 end
             end
         end
@@ -299,18 +298,18 @@ function matmul_pipelined(a, b, c, d,
 
         @unroll for i = 1 : num_fragments_m
             a_tile = translate_offset(warp_tile.MK, (M = (i-1)*conf.compute_op_shape.M, K = 0))
-            @inbounds a_frags = setindex(a_frags, transf_sh2rf_a(Operator.load_a(conf.operator, conf.shared_a_layout, shmem_a, a_tile), a_tile), nxt_stage, i)
+            @inbounds a_frags[nxt_stage, i] = transf_sh2rf_a(Operator.load_a(conf.operator, conf.shared_a_layout, shmem_a, a_tile), a_tile)
         end
 
         @unroll for j = 1 : num_fragments_n
             b_tile = translate_offset(warp_tile.KN, (K = 0, N = (j-1)*conf.compute_op_shape.N))
-            @inbounds b_frags = setindex(b_frags, transf_sh2rf_b(Operator.load_b(conf.operator, conf.shared_b_layout, shmem_b, b_tile), b_tile), nxt_stage, j)
+            @inbounds b_frags[nxt_stage, j] = transf_sh2rf_b(Operator.load_b(conf.operator, conf.shared_b_layout, shmem_b, b_tile), b_tile)
         end
 
         # mma(cur_stage)
         @unroll for i = 1 : num_fragments_m
             @unroll for j = 1 : num_fragments_n
-                @inbounds c_frags = setindex(c_frags, Operator.mma(conf.operator, a_frags[cur_stage, i], b_frags[cur_stage, j], c_frags[i, j]), i, j)
+                @inbounds c_frags[i, j] = Operator.mma(conf.operator, a_frags[cur_stage, i], b_frags[cur_stage, j], c_frags[i, j])
             end
         end
     end
@@ -326,7 +325,7 @@ function matmul_pipelined(a, b, c, d,
     @unroll for i = 1 : num_fragments_m
         @unroll for j = 1 : num_fragments_n
             tile = translate_offset(warp_tile, (M = (i-1)*conf.compute_op_shape.M, N = (j-1)*conf.compute_op_shape.N))
-            @inbounds Operator.store_d(conf.operator, conf.shared_d_layout, shmem_d, transf_rf2sh_d(c_frags[i, j], tile), tile)
+            Operator.store_d(conf.operator, conf.shared_d_layout, shmem_d, transf_rf2sh_d(c_frags[i, j], tile), tile)
         end
     end
 
diff --git a/src/layout.jl b/src/layout.jl
index 9854bcd5..801c0e53 100644
--- a/src/layout.jl
+++ b/src/layout.jl
@@ -4,6 +4,7 @@ module Layout
 using CUDA
 using GPUifyLoops: @unroll
 using GemmKernels.Tiling
+using StaticArrays
 
 # ---------------------
 # Customise computation
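
Note for reviewers (illustration only, not part of the patch): the revert
swaps one register-tile storage idiom for another, which is easiest to see
in isolation. The sketch below uses made-up shapes and values; the
LocalArray lines assume the pre-revert src/array.jl shown above is loaded.

using StaticArrays

# StaticArrays style, restored by this revert: MArray is a mutable,
# stack-allocated array that is written in place with ordinary setindex!.
c_frags = MArray{Tuple{2, 2}, Float32}(undef)
c_frags[1, 2] = 42.0f0

# LocalArray style, removed by this revert: the array is immutable, so a
# "write" constructs a new array via Base.setindex and rebinds the variable,
# i.e. the `c_frags = setindex(c_frags, val, i, j)` pattern used throughout
# the pre-revert kernel. (Runs only with the deleted src/array.jl loaded.)
# c_frags = LocalArray{Tuple{2, 2}, Float32}(undef)
# c_frags = Base.setindex(c_frags, 42.0f0, 1, 2)

# The same functional-update idiom on the NTuple backing store, which is
# exactly what LocalArray's setindex does internally:
data = (1.0f0, 2.0f0, 3.0f0, 4.0f0)
data = Base.setindex(data, 42.0f0, 2)   # returns a new tuple; no mutation

As the deleted header comment notes, LocalArray was kept immutable to avoid
optimization bugs with mutable stack memory (JuliaLang/julia#41800); the
revert trades that workaround for StaticArrays' plainer mutation-based style.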