Skip to content

Commit

Permalink
Updates for better OffsetArray support. Check the start values of ite…
Browse files Browse the repository at this point in the history
…rables, and handle different starting offsets of OffsetArrays based on linear vs cartesian indexing.
  • Loading branch information
chriselrod committed Mar 5, 2020
1 parent b720d11 commit 73e2ed5
Show file tree
Hide file tree
Showing 10 changed files with 144 additions and 23 deletions.
13 changes: 9 additions & 4 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"

[[OffsetArrays]]
git-tree-sha1 = "707e34562700b81e8aa13548eb6b23b18112e49b"
uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
version = "1.0.2"

[[OrderedCollections]]
deps = ["Random", "Serialization", "Test"]
git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
Expand All @@ -49,9 +54,9 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[[SIMDPirates]]
deps = ["VectorizationBase"]
git-tree-sha1 = "34dff4f4715f871e71b38f31397d96e62621f14d"
git-tree-sha1 = "f91198b7ef74b04028f98e0eed7c556b93538a2e"
uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a"
version = "0.6.5"
version = "0.6.6"

[[SLEEFPirates]]
deps = ["Libdl", "SIMDPirates", "VectorizationBase"]
Expand All @@ -71,6 +76,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[VectorizationBase]]
deps = ["CpuId", "LinearAlgebra"]
git-tree-sha1 = "006d7b7f276db8d728f8bfd70ebf2efd132f9548"
git-tree-sha1 = "8abb5697fb64cadccd1bba444c955942d3181e5c"
uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
version = "0.7.0"
version = "0.7.1"
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ version = "0.6.20"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
SIMDPirates = "21efa798-c60a-11e8-04d3-e1a92915a26a"
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
Expand Down
3 changes: 2 additions & 1 deletion src/LoopVectorization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ using VectorizationBase, SIMDPirates, SLEEFPirates, Parameters
using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector_load_expr,
mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valadd, valsub, _MM,
maybestaticlength, maybestaticsize, staticm1, subsetview, vzero, stridedpointer_for_broadcast,
Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange,
Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange, unwrap, maybestaticrange,
PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct
using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod,
sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
Expand All @@ -22,6 +22,7 @@ export LowDimArray, stridedpointer, vectorizable,
vfilter, vfilter!


include("vectorizationbase_extensions.jl")
include("map.jl")
include("filter.jl")
include("costs.jl")
Expand Down
10 changes: 0 additions & 10 deletions src/add_loads.jl
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,3 @@ function add_loopvalue!(ls::LoopSet, arg::Symbol, elementbytes::Int)
loopsymop
end


# Field-less marker type. The methods below let it stand in where a strided pointer
# is expected; "loading" from it yields a value computed from the index alone.
struct LoopValue end
# Taking the strided pointer of a `LoopValue` just returns another `LoopValue`.
@inline VectorizationBase.stridedpointer(::LoopValue) = LoopValue()
# Vector index: returns an `_MM{W}` built from the index's `.i` field plus 1
# (presumably converting a zero-based offset into the one-based loop value — confirm).
@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
# @inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Unsigned) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
# Masked vector index: the mask argument is ignored and the result matches the unmasked method.
@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Mask) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
# Scalar index: returns the index plus one, in the index's own integer type.
@inline VectorizationBase.vload(::LoopValue, i::Integer) = i + one(i)
# Scalar index wrapped in a 1-tuple: same as the scalar method.
@inline VectorizationBase.vload(::LoopValue, i::Tuple{I}) where {I<:Integer} = @inbounds(i[1]) + one(I)
# Reported element type is `Int8`.
# NOTE(review): presumably chosen so this pseudo-array never limits the vector width — confirm.
@inline Base.eltype(::LoopValue) = Int8

16 changes: 10 additions & 6 deletions src/graphs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ function startloop(loop::Loop, isvectorized, W, itersymbol = loop.itersymbol)
elseif startexact
Expr(:(=), itersymbol, loop.starthint)
else
Expr(:(=), itersymbol, loop.startsym)
Expr(:(=), itersymbol, Expr(:call, lv(:unwrap), loop.startsym))
end
end
function vec_looprange(loop::Loop, isunrolled::Bool, W::Symbol, U::Int)
Expand Down Expand Up @@ -397,9 +397,11 @@ function register_single_loop!(ls::LoopSet, looprange::Expr)
Loop(itersym, L, U)
end
elseif f === :eachindex
N = gensym(Symbol(:loop, itersym))
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticlength), r.args[2])))
Loop(itersym, 0, N)
N = gensym(Symbol(:loopeachindex, itersym))
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticrange), r)))
L = add_loop_bound!(ls, itersym, Expr(:call, :first, N), false)
U = add_loop_bound!(ls, itersym, Expr(:call, :last, N), true)
Loop(itersym, L, U)
elseif f === :OneTo || f == Expr(:(.), :Base, QuoteNode(:OneTo))
otN = r.args[2]
if otN isa Integer
Expand All @@ -416,8 +418,10 @@ function register_single_loop!(ls::LoopSet, looprange::Expr)
elseif isa(r, Symbol)
# Treat similar to `eachindex`
N = gensym(Symbol(:loop, itersym))
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticlength), r)))
loop = Loop(itersym, 0, N)
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticrange), r)))
L = add_loop_bound!(ls, itersym, Expr(:call, :first, N), false)
U = add_loop_bound!(ls, itersym, Expr(:call, :last, N), true)
loop = Loop(itersym, L, U)
else
throw("Unrecognized loop range type: $r.")
end
Expand Down
13 changes: 12 additions & 1 deletion src/reconstruct_loopset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@ function Loop(ls::LoopSet, l::Int, ::Type{StaticLowerUnitRange{L}}) where {L}
pushpreamble!(ls, Expr(:(=), stop, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:(.), Expr(:ref, :lb, l), QuoteNode(:U)))))
Loop(gensym(:n), L, L + 1024, Symbol(""), stop, true, false)::Loop
end
# Is there any likely way to generate such a range?
# function Loop(ls::LoopSet, l::Int, ::Type{StaticLengthUnitRange{N}}) where {N}
# start = gensym(:loopstart); stop = gensym(:loopstop)
# pushpreamble!(ls, Expr(:(=), start, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:(.), Expr(:ref, :lb, l), QuoteNode(:L)))))
# pushpreamble!(ls, Expr(:(=), stop, Expr(:call, :(+), start, N - 1)))
# Loop(gensym(:n), 0, N, start, stop, false, false)::Loop
# end
# Build a `Loop` whose two bounds are taken from the type parameters of
# `StaticUnitRange{L,U}`. Neither `ls` nor `l` is used by this method, and
# empty symbols are passed in the two symbol positions.
# NOTE(review): the trailing `true, true` presumably mark both bounds as exact — confirm
# against the `Loop` constructor.
function Loop(ls, l, ::Type{StaticUnitRange{L,U}}) where {L,U}
    itersym = gensym(:n)
    nosym = Symbol("")
    staticloop = Loop(itersym, L, U, nosym, nosym, true, true)
    return staticloop::Loop
end
Expand Down Expand Up @@ -63,14 +70,18 @@ extract_varg(i) = Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__
# Append the assignment `vptr(ar) = extract_varg(i)` to the preamble of `ls`
# and return whatever `pushpreamble!` returns.
function pushvarg!(ls::LoopSet, ar::ArrayReferenceMeta, i)
    assignment = Expr(:(=), vptr(ar), extract_varg(i))
    return pushpreamble!(ls, assignment)
end
# Row-major counterpart of `pushvarg!`.
#
# Reverses, in place, both `ar.loopedindex` and the index list returned by
# `getindices(ar)`, then appends `vptr(ar) = transpose(extract_varg(i))` to the
# preamble of `ls`, so the reference is handled as a column-major array.
# Returns whatever `pushpreamble!` returns.
function pushvarg′!(ls::LoopSet, ar::ArrayReferenceMeta, i)
    # Reverse the listed indices here; the pointer itself is transposed below.
    reverse!(ar.loopedindex)
    reverse!(getindices(ar))
    # Emit the assignment exactly once, using the lowercase `transpose` function.
    pushpreamble!(ls, Expr(:(=), vptr(ar), Expr(:call, lv(:transpose), extract_varg(i))))
end
# `PackedStridedPointer` argument: register the pointer unchanged via `pushvarg!`.
add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{PackedStridedPointer{T, N}}) where {T, N} =
    pushvarg!(ls, ar, i)
# `RowMajorStridedPointer` argument: register the pointer via the row-major helper `pushvarg′!`.
add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{RowMajorStridedPointer{T, N}}) where {T, N} =
    pushvarg′!(ls, ar, i)
# `OffsetStridedPointer` argument: dispatch again on the wrapped pointer type `P`.
add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{OffsetStridedPointer{T,N,P}}) where {T,N,P} =
    add_mref!(ls, ar, i, P)

function add_mref!(
ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{S}
) where {T, X <: Tuple, S <: VectorizationBase.AbstractStaticStridedPointer{T,X}}
Expand Down
38 changes: 38 additions & 0 deletions src/vectorizationbase_extensions.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@

# Field-less marker type. The methods below let it be used where a strided pointer
# is expected; a "load" from it returns a value derived from the index alone.
struct LoopValue end

# A `LoopValue` serves as its own strided pointer.
@inline function VectorizationBase.stridedpointer(::LoopValue)
    return LoopValue()
end

# Vector index: wrap the index's `.i` field plus 1 in an `_MM{W}`.
@inline function VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}) where {W}
    base = @inbounds(i[1].i)
    return _MM{W}(base + 1)
end
# @inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Unsigned) where {W} = _MM{W}(@inbounds(i[1].i) + 1)

# Masked vector index: the mask is ignored; the result matches the unmasked method.
@inline function VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Mask) where {W}
    base = @inbounds(i[1].i)
    return _MM{W}(base + 1)
end

# Scalar index: index plus one, in the index's own integer type.
@inline function VectorizationBase.vload(::LoopValue, i::Integer)
    return i + one(i)
end

# Scalar index wrapped in a 1-tuple: unwrap it and add one of the same type.
@inline function VectorizationBase.vload(::LoopValue, i::Tuple{I}) where {I<:Integer}
    scalar = @inbounds(i[1])
    return scalar + one(I)
end

# Reported element type is `Int8`.
@inline function Base.eltype(::LoopValue)
    return Int8
end

import OffsetArrays

# One-dimensional `OffsetArray`: no wrapper type is needed. Take the parent's
# strided pointer and shift it with `gesp` by the negated first offset.
@inline function VectorizationBase.stridedpointer(a::OffsetArrays.OffsetArray{<:Any,1})
    parentptr = stridedpointer(parent(a))
    shift = (-@inbounds(a.offsets[1]),)
    return gesp(parentptr, shift)
end

# Strided-pointer wrapper that carries a tuple of `N` integer offsets alongside
# another strided pointer of element type `T`.
struct OffsetStridedPointer{T, N, P <: VectorizationBase.AbstractStridedPointer{T}} <: VectorizationBase.AbstractStridedPointer{T}
# the wrapped strided pointer
ptr::P
# one `Int` offset per dimension
offsets::NTuple{N,Int}
end
# For an `OffsetArray` with two or more dimensions, `eachindex(A)` is a `Base.OneTo`
# (linear indices start at 1), while indexing with several indices goes through the
# offsets. Both must be expressible, so the parent's pointer is wrapped together with
# the offsets in an `OffsetStridedPointer`.
@inline function VectorizationBase.stridedpointer(A::OffsetArrays.OffsetArray)
    parentptr = stridedpointer(parent(A))
    return OffsetStridedPointer(parentptr, A.offsets)
end
# Tuple of length == 1, use ind directly.
# @inline VectorizationBase.offset(ptr::OffsetStridedPointer, ind::Tuple{I}) where {I} = VectorizationBase.offset(ptr.ptr, ind)
# Tuple of length > 1, subtract offsets.
# @inline VectorizationBase.offset(ptr::OffsetStridedPointer{<:Any,N}, ind::Tuple) where {N} = VectorizationBase.offset(ptr.ptr, ntuple(n -> ind[n] + ptr.offsets[n], Val{N}()))
# A 1-tuple index is returned unchanged.
@inline function VectorizationBase.offset(ptr::OffsetStridedPointer, ind::Tuple{I}) where {I}
    return ind
end
# Any other index tuple: subtract the stored offset from each of the first `N` indices.
@inline function VectorizationBase.offset(ptr::OffsetStridedPointer{<:Any,N}, ind::Tuple) where {N}
    return ntuple(Val{N}()) do d
        ind[d] - ptr.offsets[d]
    end
end
# Rebuild the wrapper around `similar(p.ptr, ptr)`, keeping the same offsets.
@inline function Base.similar(p::OffsetStridedPointer, ptr::Ptr)
    rebased = similar(p.ptr, ptr)
    return OffsetStridedPointer(rebased, p.offsets)
end

# When an OffsetArray is indexed by a (loop-)constant value, this particular vptr
# object cannot also be iterated with `eachindex`, so it is safe to drop the wrapper:
# shift the wrapped pointer by the negated offsets and take the subset view of that.
@inline function VectorizationBase.subsetview(ptr::OffsetStridedPointer{<:Any,N}, ::Val{I}, i) where {I,N}
    negated = ntuple(Val{N}()) do d
        0 - @inbounds(ptr.offsets[d])
    end
    shifted = gesp(ptr.ptr, negated)
    return subsetview(shifted, Val{I}(), i)
end

5 changes: 4 additions & 1 deletion test/dot.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using LoopVectorization
using LoopVectorization, OffsetArrays
using Test

@testset "dot" begin
Expand Down Expand Up @@ -190,9 +190,12 @@ using Test
N = 127
R = T <: Integer ? (T(-100):T(100)) : T
a = rand(T, N); b = rand(R, N);
ao = OffsetArray(a, -60:66); bo = OffsetArray(b, -60:66);
s = mydot(a, b)
@test mydotavx(a,b) s
@test mydot_avx(a,b) s
@test mydotavx(ao,bo) s
@test mydot_avx(ao,bo) s
@test dot_unroll2avx(a,b) s
@test dot_unroll3avx(a,b) s
@test dot_unroll2_avx(a,b) s
Expand Down
66 changes: 66 additions & 0 deletions test/offsetarrays.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
using LoopVectorization, OffsetArrays
using Test

@testset "OffsetArrays" begin

# Plain-Julia reference kernel. For every `(i, j)` in `R`, start from `z`, add
# `A[i+ik, j+jk] * kern[ik, jk]` over all indices of `kern` (each `A` entry is first
# converted to the accumulator's type), and store the total in `out[i, j]`.
# Returns `out`.
function old2d!(out::AbstractMatrix, A::AbstractMatrix, kern, R=CartesianIndices(out), z=zero(eltype(out)))
    krows, kcols = axes(kern)
    rows, cols = R.indices
    for col in cols
        for row in rows
            acc = z
            @inbounds for kc in kcols
                for kr in krows
                    acc += oftype(acc, A[row + kr, col + kc]) * kern[kr, kc]
                end
            end
            @inbounds out[row, col] = acc
        end
    end
    return out
end
# Same computation as the plain-Julia reference kernel, but the inner accumulation
# loop over the kernel indices is wrapped in `@avx`. The kernel is read through its
# parent array, with the offsets subtracted from the indices by hand.
# Returns `out`.
function avx2d!(out::AbstractMatrix, A::AbstractMatrix, kern::OffsetArray, R=CartesianIndices(out), z=zero(eltype(out)))
rng1k, rng2k = axes(kern)
rng1, rng2 = R.indices
# Manually unpack the OffsetArray
kernA = parent(kern)
o1, o2 = kern.offsets
for j in rng2, i in rng1
tmp = z
# Only this inner reduction is handed to `@avx`; the outer loops are ordinary Julia loops.
@avx for jk in rng2k, ik in rng1k
tmp += A[i+ik,j+jk]*kernA[ik-o1,jk-o2]
end
out[i,j] = tmp
end
out
end
# Variant of the convolution kernel in which `@avx` is applied to the outer loops over
# `(j, i)`; the accumulation over the kernel indices is nested inside it. The kernel is
# read through its parent array, with the offsets subtracted from the indices by hand.
# Returns `out`.
function avx2douter!(out::AbstractMatrix, A::AbstractMatrix, kern::OffsetArray, R=CartesianIndices(out), z=zero(eltype(out)))
    rng1k, rng2k = axes(kern)
    rng1, rng2 = R.indices
    # Manually unpack the OffsetArray
    kernA = parent(kern)
    o1, o2 = kern.offsets
    @avx for j in rng2, i in rng1
        tmp = z
        for jk in rng2k, ik in rng1k
            tmp += A[i+ik,j+jk]*kernA[ik-o1,jk-o2]
        end
        out[i,j] = tmp
    end
    out
end

# Run all three kernels on random data for both float widths and check that the
# `@avx` versions agree (approximately) with the plain-Julia reference.
for T ∈ (Float32, Float64)
    @show T, @__LINE__
    A = rand(T, 100, 100);
    kern = OffsetArray(rand(T, 3, 3), -1:1, -1:1);
    out1 = OffsetArray(similar(A, size(A).-2), 1, 1);   # stay away from the edges of A
    out2 = similar(out1); out3 = similar(out1);

    old2d!(out1, A, kern);
    avx2d!(out2, A, kern);
    @test out1 ≈ out2
    avx2douter!(out3, A, kern);
    @test out1 ≈ out3
end


end

2 changes: 2 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ end
@time @testset "LoopVectorization.jl" begin

@time include("printmethods.jl")

@time include("offsetarrays.jl")

@time include("map.jl")

Expand Down

0 comments on commit 73e2ed5

Please sign in to comment.