diff --git a/Manifest.toml b/Manifest.toml index 371dd8e06..6306d09a7 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -31,6 +31,11 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" +[[OffsetArrays]] +git-tree-sha1 = "707e34562700b81e8aa13548eb6b23b18112e49b" +uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" +version = "1.0.2" + [[OrderedCollections]] deps = ["Random", "Serialization", "Test"] git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" @@ -49,9 +54,9 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[SIMDPirates]] deps = ["VectorizationBase"] -git-tree-sha1 = "34dff4f4715f871e71b38f31397d96e62621f14d" +git-tree-sha1 = "f91198b7ef74b04028f98e0eed7c556b93538a2e" uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a" -version = "0.6.5" +version = "0.6.6" [[SLEEFPirates]] deps = ["Libdl", "SIMDPirates", "VectorizationBase"] @@ -71,6 +76,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[VectorizationBase]] deps = ["CpuId", "LinearAlgebra"] -git-tree-sha1 = "006d7b7f276db8d728f8bfd70ebf2efd132f9548" +git-tree-sha1 = "8abb5697fb64cadccd1bba444c955942d3181e5c" uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" -version = "0.7.0" +version = "0.7.1" diff --git a/Project.toml b/Project.toml index e76310030..807065699 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.6.20" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" SIMDPirates = "21efa798-c60a-11e8-04d3-e1a92915a26a" SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa" diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl index 88c661da6..056a5678c 100644 --- a/src/LoopVectorization.jl +++ b/src/LoopVectorization.jl @@ -4,7 +4,7 @@ using VectorizationBase, SIMDPirates, SLEEFPirates, Parameters using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector_load_expr, mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valadd, valsub, _MM, maybestaticlength, maybestaticsize, staticm1, subsetview, vzero, stridedpointer_for_broadcast, - Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange, + Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange, unwrap, maybestaticrange, PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod, sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!, @@ -12,6 +12,7 @@ using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_pr vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone using Base.Broadcast: Broadcasted, DefaultArrayStyle using LinearAlgebra: Adjoint, Transpose +using Base.Meta: isexpr const SUPPORTED_TYPES = Union{Float16,Float32,Float64,Integer} @@ -21,6 +22,7 @@ export LowDimArray, stridedpointer, vectorizable, vfilter, vfilter! +include("vectorizationbase_extensions.jl") include("map.jl") include("filter.jl") include("costs.jl") diff --git a/src/add_loads.jl b/src/add_loads.jl index b4d453a44..fe4a0e131 100644 --- a/src/add_loads.jl +++ b/src/add_loads.jl @@ -70,13 +70,3 @@ function add_loopvalue!(ls::LoopSet, arg::Symbol, elementbytes::Int) loopsymop end - -struct LoopValue end -@inline VectorizationBase.stridedpointer(::LoopValue) = LoopValue() -@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}) where {W} = _MM{W}(@inbounds(i[1].i) + 1) -# @inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Unsigned) where {W} = _MM{W}(@inbounds(i[1].i) + 1) -@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Mask) where {W} = _MM{W}(@inbounds(i[1].i) + 1) -@inline VectorizationBase.vload(::LoopValue, i::Integer) = i + one(i) -@inline VectorizationBase.vload(::LoopValue, i::Tuple{I}) where {I<:Integer} = @inbounds(i[1]) + one(I) -@inline Base.eltype(::LoopValue) = Int8 - diff --git a/src/graphs.jl b/src/graphs.jl index ad42a9ecb..66203f4f1 100644 --- a/src/graphs.jl +++ b/src/graphs.jl @@ -33,7 +33,7 @@ # For passing options like array types and mask # struct LoopSetOptions - + # end struct Loop @@ -70,7 +70,7 @@ function startloop(loop::Loop, isvectorized, W, itersymbol = loop.itersymbol) elseif startexact Expr(:(=), itersymbol, loop.starthint) else - Expr(:(=), itersymbol, loop.startsym) + Expr(:(=), itersymbol, Expr(:call, lv(:unwrap), loop.startsym)) end end function vec_looprange(loop::Loop, isunrolled::Bool, W::Symbol, U::Int) @@ -84,7 +84,7 @@ function vec_looprange(loop::Loop, isunrolled::Bool, W::Symbol, U::Int) else Expr(:call, :<, loop.itersymbol, Expr(:call, :-, loop.stopsym, incr)) end -end +end function looprange(loop::Loop, incr::Int, mangledname::Symbol) incr -= 1#one(Int32) if iszero(incr) @@ -369,47 +369,59 @@ This function creates a loop, while switching from 1 to 0 based indices """ function register_single_loop!(ls::LoopSet, looprange::Expr) itersym = (looprange.args[1])::Symbol - r = (looprange.args[2])::Expr - @assert r.head === :call - f = first(r.args) - loop::Loop = if f === :(:) - lower = r.args[2] - upper = r.args[3] - lii::Bool = lower isa Integer - liiv::Int = lii ? (convert(Int, lower)-1) : 0 - uii::Bool = upper isa Integer - if lii & uii # both are integers - Loop(itersym, liiv, convert(Int, upper)) - elseif lii # only lower bound is an integer - if upper isa Symbol - Loop(itersym, liiv, upper) - elseif upper isa Expr - Loop(itersym, liiv, add_loop_bound!(ls, itersym, upper, true)) - else - Loop(itersym, liiv, add_loop_bound!(ls, itersym, upper, true)) + r = looprange.args[2] + if isexpr(r, :call) + f = first(r.args) + loop::Loop = if f === :(:) + lower = r.args[2] + upper = r.args[3] + lii::Bool = lower isa Integer + liiv::Int = lii ? (convert(Int, lower)-1) : 0 + uii::Bool = upper isa Integer + if lii & uii # both are integers + Loop(itersym, liiv, convert(Int, upper)) + elseif lii # only lower bound is an integer + if upper isa Symbol + Loop(itersym, liiv, upper) + elseif upper isa Expr + Loop(itersym, liiv, add_loop_bound!(ls, itersym, upper, true)) + else + Loop(itersym, liiv, add_loop_bound!(ls, itersym, upper, true)) + end + elseif uii # only upper bound is an integer + uiiv = convert(Int, upper) + Loop(itersym, add_loop_bound!(ls, itersym, lower, false), uiiv) + else # neither are integers + L = add_loop_bound!(ls, itersym, lower, false) + U = add_loop_bound!(ls, itersym, upper, true) + Loop(itersym, L, U) end - elseif uii # only upper bound is an integer - uiiv = convert(Int, upper) - Loop(itersym, add_loop_bound!(ls, itersym, lower, false), uiiv) - else # neither are integers - L = add_loop_bound!(ls, itersym, lower, false) - U = add_loop_bound!(ls, itersym, upper, true) + elseif f === :eachindex + N = gensym(Symbol(:loopeachindex, itersym)) + pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticrange), r))) + L = add_loop_bound!(ls, itersym, Expr(:call, :first, N), false) + U = add_loop_bound!(ls, itersym, Expr(:call, :last, N), true) Loop(itersym, L, U) - end - elseif f === :eachindex - N = gensym(Symbol(:loop, itersym)) - pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticlength), r.args[2]))) - Loop(itersym, 0, N) - elseif f === :OneTo || f == Expr(:(.), :Base, QuoteNode(:OneTo)) - otN = r.args[2] - if otN isa Integer - Loop(itersym, 0, otN) + elseif f === :OneTo || f == Expr(:(.), :Base, QuoteNode(:OneTo)) + otN = r.args[2] + if otN isa Integer + Loop(itersym, 0, otN) + else + otN isa Expr && maybestatic!(otN) + N = gensym(Symbol(:loop, itersym)) + pushpreamble!(ls, Expr(:(=), N, otN)) + Loop(itersym, 0, N) + end else - otN isa Expr && maybestatic!(otN) - N = gensym(Symbol(:loop, itersym)) - pushpreamble!(ls, Expr(:(=), N, otN)) - Loop(itersym, 0, N) + throw("Unrecognized loop range type: $r.") end + elseif isa(r, Symbol) + # Treat similar to `eachindex` + N = gensym(Symbol(:loop, itersym)) + pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticrange), r))) + L = add_loop_bound!(ls, itersym, Expr(:call, :first, N), false) + U = add_loop_bound!(ls, itersym, Expr(:call, :last, N), true) + loop = Loop(itersym, L, U) else throw("Unrecognized loop range type: $r.") end @@ -546,7 +558,3 @@ function Base.push!(ls::LoopSet, ex::Expr, elementbytes::Int, position::Int) throw("Don't know how to handle expression:\n$ex") end end - - - - diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl index 68555dc52..d4c695dec 100644 --- a/src/reconstruct_loopset.jl +++ b/src/reconstruct_loopset.jl @@ -14,6 +14,13 @@ function Loop(ls::LoopSet, l::Int, ::Type{StaticLowerUnitRange{L}}) where {L} pushpreamble!(ls, Expr(:(=), stop, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:(.), Expr(:ref, :lb, l), QuoteNode(:U))))) Loop(gensym(:n), L, L + 1024, Symbol(""), stop, true, false)::Loop end +# Is there any likely way to generate such a range? +# function Loop(ls::LoopSet, l::Int, ::Type{StaticLengthUnitRange{N}}) where {N} +# start = gensym(:loopstart); stop = gensym(:loopstop) +# pushpreamble!(ls, Expr(:(=), start, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:(.), Expr(:ref, :lb, l), QuoteNode(:L))))) +# pushpreamble!(ls, Expr(:(=), stop, Expr(:call, :(+), start, N - 1))) +# Loop(gensym(:n), 0, N, start, stop, false, false)::Loop +# end function Loop(ls, l, ::Type{StaticUnitRange{L,U}}) where {L,U} Loop(gensym(:n), L, U, Symbol(""), Symbol(""), true, true)::Loop end @@ -63,7 +70,7 @@ extract_varg(i) = Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__ pushvarg!(ls::LoopSet, ar::ArrayReferenceMeta, i) = pushpreamble!(ls, Expr(:(=), vptr(ar), extract_varg(i))) function pushvarg′!(ls::LoopSet, ar::ArrayReferenceMeta, i) reverse!(ar.loopedindex); reverse!(getindices(ar)) # reverse the listed indices here, and transpose it to make it column major - pushpreamble!(ls, Expr(:(=), vptr(ar), Expr(:call, lv(:Transpose), extract_varg(i)))) + pushpreamble!(ls, Expr(:(=), vptr(ar), Expr(:call, lv(:transpose), extract_varg(i)))) end function add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{PackedStridedPointer{T, N}}) where {T, N} pushvarg!(ls, ar, i) @@ -71,6 +78,10 @@ end function add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{RowMajorStridedPointer{T, N}}) where {T, N} pushvarg′!(ls, ar, i) end +function add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{OffsetStridedPointer{T,N,P}}) where {T,N,P} + add_mref!(ls, ar, i, P) +end + function add_mref!( ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{S} ) where {T, X <: Tuple, S <: VectorizationBase.AbstractStaticStridedPointer{T,X}} diff --git a/src/vectorizationbase_extensions.jl b/src/vectorizationbase_extensions.jl new file mode 100644 index 000000000..613c2687e --- /dev/null +++ b/src/vectorizationbase_extensions.jl @@ -0,0 +1,38 @@ + +struct LoopValue end +@inline VectorizationBase.stridedpointer(::LoopValue) = LoopValue() +@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}) where {W} = _MM{W}(@inbounds(i[1].i) + 1) +# @inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Unsigned) where {W} = _MM{W}(@inbounds(i[1].i) + 1) +@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Mask) where {W} = _MM{W}(@inbounds(i[1].i) + 1) +@inline VectorizationBase.vload(::LoopValue, i::Integer) = i + one(i) +@inline VectorizationBase.vload(::LoopValue, i::Tuple{I}) where {I<:Integer} = @inbounds(i[1]) + one(I) +@inline Base.eltype(::LoopValue) = Int8 + +import OffsetArrays + +# If ndim(::OffsetArray) == 1, we can convert to a regular strided pointer and offset. +@inline VectorizationBase.stridedpointer(a::OffsetArrays.OffsetArray{<:Any,1}) = gesp(stridedpointer(parent(a)), (-@inbounds(a.offsets[1]),)) + +struct OffsetStridedPointer{T, N, P <: VectorizationBase.AbstractStridedPointer{T}} <: VectorizationBase.AbstractStridedPointer{T} + ptr::P + offsets::NTuple{N,Int} +end +# if ndim(A::OffsetArray) ≥ 2, then eachindex(A) isa Base.OneTo, index starting at 1. +# but multiple indexing is calculated using offsets, so we need a special type to express this. +@inline function VectorizationBase.stridedpointer(A::OffsetArrays.OffsetArray) + OffsetStridedPointer(stridedpointer(parent(A)), A.offsets) +end +# Tuple of length == 1, use ind directly. +# @inline VectorizationBase.offset(ptr::OffsetStridedPointer, ind::Tuple{I}) where {I} = VectorizationBase.offset(ptr.ptr, ind) +# Tuple of length > 1, subtract offsets. +# @inline VectorizationBase.offset(ptr::OffsetStridedPointer{<:Any,N}, ind::Tuple) where {N} = VectorizationBase.offset(ptr.ptr, ntuple(n -> ind[n] + ptr.offsets[n], Val{N}())) +@inline VectorizationBase.offset(ptr::OffsetStridedPointer, ind::Tuple{I}) where {I} = ind +# Tuple of length > 1, subtract offsets. +@inline VectorizationBase.offset(ptr::OffsetStridedPointer{<:Any,N}, ind::Tuple) where {N} = ntuple(n -> ind[n] - ptr.offsets[n], Val{N}()) +@inline Base.similar(p::OffsetStridedPointer, ptr::Ptr) = OffsetStridedPointer(similar(p.ptr, ptr), p.offsets) + +# If an OffsetArray is getting indexed by a (loop-)constant value, then this particular vptr object cannot also be eachindexed, so we can safely return a stridedpointer +@inline function VectorizationBase.subsetview(ptr::OffsetStridedPointer{<:Any,N}, ::Val{I}, i) where {I,N} + subsetview(gesp(ptr.ptr, ntuple(n -> 0 - @inbounds(ptr.offsets[n]), Val{N}())), Val{I}(), i) +end + diff --git a/test/dot.jl b/test/dot.jl index 391130dd2..6bdb256ca 100644 --- a/test/dot.jl +++ b/test/dot.jl @@ -1,3 +1,6 @@ +using LoopVectorization, OffsetArrays +using Test + @testset "dot" begin dotq = :(for i ∈ eachindex(a,b) s += a[i]*b[i] @@ -46,6 +49,14 @@ end s end + function myselfdotavx_range(a) + s = zero(eltype(a)) + rng = axes(a, 1) + @avx for i ∈ rng + s += a[i]*a[i] + end + s + end function myselfdot_avx(a) s = zero(eltype(a)) @_avx for i ∈ eachindex(a) @@ -167,7 +178,7 @@ end 4acc/length(x) end - + # @macroexpand @_avx for i = 1:length(a_re) - 1 # c_re[i] = b_re[i] * a_re[i + 1] - b_im[i] * a_im[i + 1] # c_im[i] = b_re[i] * a_im[i + 1] + b_im[i] * a_re[i + 1] @@ -179,9 +190,12 @@ N = 127 R = T <: Integer ? (T(-100):T(100)) : T a = rand(T, N); b = rand(R, N); + ao = OffsetArray(a, -60:66); bo = OffsetArray(b, -60:66); s = mydot(a, b) @test mydotavx(a,b) ≈ s @test mydot_avx(a,b) ≈ s + @test mydotavx(ao,bo) ≈ s + @test mydot_avx(ao,bo) ≈ s @test dot_unroll2avx(a,b) ≈ s @test dot_unroll3avx(a,b) ≈ s @test dot_unroll2_avx(a,b) ≈ s @@ -190,6 +204,7 @@ @test dot_unroll3avx_inline(a,b) ≈ s s = myselfdot(a) @test myselfdotavx(a) ≈ s + @test myselfdotavx_range(a) ≈ s @test myselfdot_avx(a) ≈ s @test myselfdotavx(a) ≈ s @@ -205,7 +220,7 @@ b_re = rand(R, N); b_im = rand(R, N); ac = Complex.(a_re, a_im); bc = Complex.(b_re, b_im); - + @test mydot(ac, bc) ≈ complex_dot_soa(a_re, a_im, b_re, b_im) c_re1 = similar(a_re); c_im1 = similar(a_im); diff --git a/test/gemv.jl b/test/gemv.jl index 1583bcead..765ca1986 100644 --- a/test/gemv.jl +++ b/test/gemv.jl @@ -1,3 +1,6 @@ +using LoopVectorization +using Test + @testset "GEMV" begin gemvq = :(for i ∈ eachindex(y) yᵢ = 0.0 @@ -27,6 +30,16 @@ y[i] = yᵢ end end + function mygemvavx_range!(y, A, x) + rng1, rng2 = axes(A) + @avx for i ∈ rng1 + yᵢ = zero(eltype(y)) + for j ∈ rng2 + yᵢ += A[i,j] * x[j] + end + y[i] = yᵢ + end + end q = :(for i ∈ eachindex(y) yᵢ = zero(eltype(y)) for j ∈ eachindex(x) @@ -150,6 +163,9 @@ @test y1 ≈ y2 fill!(y2, -999.9); mygemv_avx!(y2, A, x) @test y1 ≈ y2 + fill!(y2, -999.9) + mygemvavx_range!(y2, A, x) + @test y1 ≈ y2 B = rand(R, N, N); G1 = Matrix{TC}(undef, N, 1); diff --git a/test/offsetarrays.jl b/test/offsetarrays.jl new file mode 100644 index 000000000..7a3371b3f --- /dev/null +++ b/test/offsetarrays.jl @@ -0,0 +1,66 @@ +using LoopVectorization, OffsetArrays +using Test + +@testset "OffsetArrays" begin + + function old2d!(out::AbstractMatrix, A::AbstractMatrix, kern, R=CartesianIndices(out), z=zero(eltype(out))) + rng1k, rng2k = axes(kern) + rng1, rng2 = R.indices + for j in rng2, i in rng1 + tmp = z + @inbounds for jk in rng2k, ik in rng1k + tmp += oftype(tmp, A[i+ik,j+jk])*kern[ik,jk] + end + @inbounds out[i,j] = tmp + end + out + end + function avx2d!(out::AbstractMatrix, A::AbstractMatrix, kern::OffsetArray, R=CartesianIndices(out), z=zero(eltype(out))) + rng1k, rng2k = axes(kern) + rng1, rng2 = R.indices + # Manually unpack the OffsetArray + kernA = parent(kern) + o1, o2 = kern.offsets + for j in rng2, i in rng1 + tmp = z + @avx for jk in rng2k, ik in rng1k + tmp += A[i+ik,j+jk]*kernA[ik-o1,jk-o2] + end + out[i,j] = tmp + end + out + end + function avx2douter!(out::AbstractMatrix, A::AbstractMatrix, kern::OffsetArray, R=CartesianIndices(out), z=zero(eltype(out))) + rng1k, rng2k = axes(kern) + rng1, rng2 = R.indices + # Manually unpack the OffsetArray + kernA = parent(kern) + o1, o2 = kern.offsets + @avx for j in rng2, i in rng1 + tmp = z + for jk in rng2k, ik in rng1k + tmp += A[i+ik,j+jk]*kernA[ik-o1,jk-o2] + 1 + end + out[i,j] = tmp + end + out + end + + for T ∈ (Float32, Float64) + @show T, @__LINE__ + A = rand(T, 100, 100); + kern = OffsetArray(rand(T, 3, 3), -1:1, -1:1); + out1 = OffsetArray(similar(A, size(A).-2), 1, 1); # stay away from the edges of A + out2 = similar(out1); out3 = similar(out1); + + old2d!(out1, A, kern); + avx2d!(out2, A, kern); + @test out1 ≈ out2 + avx2douter!(out3, A, kern); + @test out1 ≈ out3 + end + + +end + diff --git a/test/runtests.jl b/test/runtests.jl index 6c1f971a2..bc7cda92f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,6 +17,8 @@ end @time @testset "LoopVectorization.jl" begin @time include("printmethods.jl") + + @time include("offsetarrays.jl") @time include("map.jl")