From e692d53ec898d7a60b17de82370a36c65264d68a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 27 Jun 2023 20:59:41 +0200 Subject: [PATCH 1/4] Convert values when doing a vectorized store. --- src/layout.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/layout.jl b/src/layout.jl index 336b7d34..d4eda5cb 100644 --- a/src/layout.jl +++ b/src/layout.jl @@ -4,6 +4,7 @@ module Layout using CUDA using LLVMLoopInfo: @loopinfo using GemmKernels.Tiling +using Base.Cartesian: @ntuple # --------------------- # Customise computation @@ -30,8 +31,9 @@ end alignment = sizeof(T) * N return quote + y = @ntuple $N i -> VecElement{T}(x[i].value) vec_ptr = Base.bitcast(Core.LLVMPtr{NTuple{N, VecElement{T}}, AS}, ptr) - return unsafe_store!(vec_ptr, x, (i-1) ÷ N + 1, Val($alignment)) + return unsafe_store!(vec_ptr, y, (i-1) ÷ N + 1, Val($alignment)) end end From 17e1a0d6eca9e3aad17401d2b6d7ad5008e76223 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 27 Jun 2023 21:00:00 +0200 Subject: [PATCH 2/4] Perform multiple vectorized stores when there's a type mismatch. --- src/layout.jl | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/layout.jl b/src/layout.jl index d4eda5cb..08ac46a6 100644 --- a/src/layout.jl +++ b/src/layout.jl @@ -81,13 +81,21 @@ abstract type AlignedColMajor{T} <: LayoutBase{T} end return vloada(Vec{N, T}, pointer(workspace), linear_base + linear_offset - 1) end -@inline function store!(::Type{<:AlignedColMajor{T}}, workspace, value, tile::Tile{size}) where {T, size} +@inline @generated function store!(::Type{<:AlignedColMajor{T}}, workspace, value, tile::Tile{size}) where {T, size} N = 16 ÷ sizeof(T) - linear_base = linearise(tile.base, Base.size(workspace)) - linear_offset = linearise(tile.offset, Base.size(workspace)) + quote + linear_base = linearise(tile.base, Base.size(workspace)) + linear_offset = linearise(tile.offset, Base.size(workspace)) - vstorea!(Vec{N, T}, pointer(workspace), value, linear_base + linear_offset - 1) + # we may be storing more values than we can using a single vectorized operation + # (e.g., when types mismatch, storing 8 Float16s in a Float32 shared memory layout) + @loopinfo unroll for value_offset = 1:$N:length(value) + x = @ntuple $N i -> (value[value_offset+i-1]) + vstorea!(Vec{$N, T}, pointer(workspace), x, + linear_base + linear_offset + value_offset - 2) + end + end end # -------- From 69e97c1644849ccf833fa787f21d72e1f5573e9f Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Jun 2023 11:24:07 +0200 Subject: [PATCH 3/4] Move the loop into vstorea. --- src/layout.jl | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/layout.jl b/src/layout.jl index 08ac46a6..3d75ec8d 100644 --- a/src/layout.jl +++ b/src/layout.jl @@ -18,7 +18,8 @@ using Base.Cartesian: @ntuple struct Vec{N, T} end -@inline @generated function vloada(::Type{Vec{N, T}}, ptr::Core.LLVMPtr{T, AS}, i::Integer = 1) where {N, T, AS} +@inline @generated function vloada(::Type{Vec{N, T}}, ptr::Core.LLVMPtr{T, AS}, + i::Integer = 1) where {N, T, AS} alignment = sizeof(T) * N return quote @@ -27,13 +28,20 @@ struct Vec{N, T} end end end -@inline @generated function vstorea!(::Type{Vec{N, T}}, ptr::Core.LLVMPtr{T, AS}, x, i::Integer = 1) where {N, T, AS} +@inline @generated function vstorea!(::Type{Vec{N, T}}, ptr::Core.LLVMPtr{T, AS}, x, + i::Integer = 1) where {N, T, AS} alignment = sizeof(T) * N return quote - y = @ntuple $N i -> VecElement{T}(x[i].value) - vec_ptr = Base.bitcast(Core.LLVMPtr{NTuple{N, VecElement{T}}, AS}, ptr) - return unsafe_store!(vec_ptr, y, (i-1) ÷ N + 1, Val($alignment)) + # we may be storing more values than we can using a single vectorized operation + # (e.g., when types mismatch, storing 8 Float16s in a Float32 shared memory layout) + @loopinfo unroll for offset = 1:$N:length(x) + y = @ntuple $N j -> VecElement{T}(x[j+offset-1].value) + vec_ptr = Base.bitcast(Core.LLVMPtr{NTuple{N, VecElement{T}}, AS}, ptr) + unsafe_store!(vec_ptr, y, (i+offset-2) ÷ N + 1, Val($alignment)) + end + + return end end @@ -81,21 +89,13 @@ abstract type AlignedColMajor{T} <: LayoutBase{T} end return vloada(Vec{N, T}, pointer(workspace), linear_base + linear_offset - 1) end -@inline @generated function store!(::Type{<:AlignedColMajor{T}}, workspace, value, tile::Tile{size}) where {T, size} +@inline function store!(::Type{<:AlignedColMajor{T}}, workspace, value, tile::Tile{size}) where {T, size} N = 16 ÷ sizeof(T) - quote - linear_base = linearise(tile.base, Base.size(workspace)) - linear_offset = linearise(tile.offset, Base.size(workspace)) + linear_base = linearise(tile.base, Base.size(workspace)) + linear_offset = linearise(tile.offset, Base.size(workspace)) - # we may be storing more values than we can using a single vectorized operation - # (e.g., when types mismatch, storing 8 Float16s in a Float32 shared memory layout) - @loopinfo unroll for value_offset = 1:$N:length(value) - x = @ntuple $N i -> (value[value_offset+i-1]) - vstorea!(Vec{$N, T}, pointer(workspace), x, - linear_base + linear_offset + value_offset - 2) - end - end + vstorea!(Vec{N, T}, pointer(workspace), value, linear_base + linear_offset - 1) end # -------- From 2b25c994829dd02835000b70225ecabeaa844d31 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 29 Jun 2023 13:38:10 +0200 Subject: [PATCH 4/4] Manually unroll generated code. --- src/layout.jl | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/layout.jl b/src/layout.jl index 3d75ec8d..2dbe3c03 100644 --- a/src/layout.jl +++ b/src/layout.jl @@ -28,21 +28,23 @@ struct Vec{N, T} end end end -@inline @generated function vstorea!(::Type{Vec{N, T}}, ptr::Core.LLVMPtr{T, AS}, x, - i::Integer = 1) where {N, T, AS} +@inline @generated function vstorea!(::Type{Vec{N, T}}, ptr::Core.LLVMPtr{T, AS}, + x::NTuple{M,<:Any}, i::Integer = 1) where {N, T, AS, M} alignment = sizeof(T) * N - return quote - # we may be storing more values than we can using a single vectorized operation - # (e.g., when types mismatch, storing 8 Float16s in a Float32 shared memory layout) - @loopinfo unroll for offset = 1:$N:length(x) - y = @ntuple $N j -> VecElement{T}(x[j+offset-1].value) - vec_ptr = Base.bitcast(Core.LLVMPtr{NTuple{N, VecElement{T}}, AS}, ptr) - unsafe_store!(vec_ptr, y, (i+offset-2) ÷ N + 1, Val($alignment)) - end + ex = quote end - return + # we may be storing more values than we can using a single vectorized operation + # (e.g., when types mismatch, storing 8 Float16s in a Float32 shared memory layout) + for offset = 0:N:M-1 + append!(ex.args, (quote + y = @ntuple $N j -> VecElement{T}(x[j+$offset].value) + vec_ptr = Base.bitcast(Core.LLVMPtr{NTuple{N, VecElement{T}}, AS}, ptr) + unsafe_store!(vec_ptr, y, (i+$offset-1) ÷ N + 1, Val($alignment)) + end).args) end + + return ex end # -----------