Skip to content

Commit

Permalink
Merge pull request #65 from timholy/teh/loops
Browse files Browse the repository at this point in the history
Fix `for i in iter`
  • Loading branch information
chriselrod authored Mar 5, 2020
2 parents 977528d + 73e2ed5 commit 2ae8a78
Show file tree
Hide file tree
Showing 11 changed files with 216 additions and 62 deletions.
13 changes: 9 additions & 4 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"

[[OffsetArrays]]
git-tree-sha1 = "707e34562700b81e8aa13548eb6b23b18112e49b"
uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
version = "1.0.2"

[[OrderedCollections]]
deps = ["Random", "Serialization", "Test"]
git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
Expand All @@ -49,9 +54,9 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[[SIMDPirates]]
deps = ["VectorizationBase"]
git-tree-sha1 = "34dff4f4715f871e71b38f31397d96e62621f14d"
git-tree-sha1 = "f91198b7ef74b04028f98e0eed7c556b93538a2e"
uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a"
version = "0.6.5"
version = "0.6.6"

[[SLEEFPirates]]
deps = ["Libdl", "SIMDPirates", "VectorizationBase"]
Expand All @@ -71,6 +76,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[VectorizationBase]]
deps = ["CpuId", "LinearAlgebra"]
git-tree-sha1 = "006d7b7f276db8d728f8bfd70ebf2efd132f9548"
git-tree-sha1 = "8abb5697fb64cadccd1bba444c955942d3181e5c"
uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
version = "0.7.0"
version = "0.7.1"
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ version = "0.6.20"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
SIMDPirates = "21efa798-c60a-11e8-04d3-e1a92915a26a"
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
Expand Down
4 changes: 3 additions & 1 deletion src/LoopVectorization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ using VectorizationBase, SIMDPirates, SLEEFPirates, Parameters
using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector_load_expr,
mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valadd, valsub, _MM,
maybestaticlength, maybestaticsize, staticm1, subsetview, vzero, stridedpointer_for_broadcast,
Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange,
Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange, unwrap, maybestaticrange,
PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct
using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod,
sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, #prefetch,
vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone
using Base.Broadcast: Broadcasted, DefaultArrayStyle
using LinearAlgebra: Adjoint, Transpose
using Base.Meta: isexpr

const SUPPORTED_TYPES = Union{Float16,Float32,Float64,Integer}

Expand All @@ -21,6 +22,7 @@ export LowDimArray, stridedpointer, vectorizable,
vfilter, vfilter!


include("vectorizationbase_extensions.jl")
include("map.jl")
include("filter.jl")
include("costs.jl")
Expand Down
10 changes: 0 additions & 10 deletions src/add_loads.jl
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,3 @@ function add_loopvalue!(ls::LoopSet, arg::Symbol, elementbytes::Int)
loopsymop
end


struct LoopValue end
@inline VectorizationBase.stridedpointer(::LoopValue) = LoopValue()
@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
# @inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Unsigned) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Mask) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
@inline VectorizationBase.vload(::LoopValue, i::Integer) = i + one(i)
@inline VectorizationBase.vload(::LoopValue, i::Tuple{I}) where {I<:Integer} = @inbounds(i[1]) + one(I)
@inline Base.eltype(::LoopValue) = Int8

96 changes: 52 additions & 44 deletions src/graphs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

# For passing options like array types and mask
# struct LoopSetOptions

# end

struct Loop
Expand Down Expand Up @@ -70,7 +70,7 @@ function startloop(loop::Loop, isvectorized, W, itersymbol = loop.itersymbol)
elseif startexact
Expr(:(=), itersymbol, loop.starthint)
else
Expr(:(=), itersymbol, loop.startsym)
Expr(:(=), itersymbol, Expr(:call, lv(:unwrap), loop.startsym))
end
end
function vec_looprange(loop::Loop, isunrolled::Bool, W::Symbol, U::Int)
Expand All @@ -84,7 +84,7 @@ function vec_looprange(loop::Loop, isunrolled::Bool, W::Symbol, U::Int)
else
Expr(:call, :<, loop.itersymbol, Expr(:call, :-, loop.stopsym, incr))
end
end
end
function looprange(loop::Loop, incr::Int, mangledname::Symbol)
incr -= 1#one(Int32)
if iszero(incr)
Expand Down Expand Up @@ -369,47 +369,59 @@ This function creates a loop, while switching from 1 to 0 based indices
"""
function register_single_loop!(ls::LoopSet, looprange::Expr)
itersym = (looprange.args[1])::Symbol
r = (looprange.args[2])::Expr
@assert r.head === :call
f = first(r.args)
loop::Loop = if f === :(:)
lower = r.args[2]
upper = r.args[3]
lii::Bool = lower isa Integer
liiv::Int = lii ? (convert(Int, lower)-1) : 0
uii::Bool = upper isa Integer
if lii & uii # both are integers
Loop(itersym, liiv, convert(Int, upper))
elseif lii # only lower bound is an integer
if upper isa Symbol
Loop(itersym, liiv, upper)
elseif upper isa Expr
Loop(itersym, liiv, add_loop_bound!(ls, itersym, upper, true))
else
Loop(itersym, liiv, add_loop_bound!(ls, itersym, upper, true))
r = looprange.args[2]
if isexpr(r, :call)
f = first(r.args)
loop::Loop = if f === :(:)
lower = r.args[2]
upper = r.args[3]
lii::Bool = lower isa Integer
liiv::Int = lii ? (convert(Int, lower)-1) : 0
uii::Bool = upper isa Integer
if lii & uii # both are integers
Loop(itersym, liiv, convert(Int, upper))
elseif lii # only lower bound is an integer
if upper isa Symbol
Loop(itersym, liiv, upper)
elseif upper isa Expr
Loop(itersym, liiv, add_loop_bound!(ls, itersym, upper, true))
else
Loop(itersym, liiv, add_loop_bound!(ls, itersym, upper, true))
end
elseif uii # only upper bound is an integer
uiiv = convert(Int, upper)
Loop(itersym, add_loop_bound!(ls, itersym, lower, false), uiiv)
else # neither are integers
L = add_loop_bound!(ls, itersym, lower, false)
U = add_loop_bound!(ls, itersym, upper, true)
Loop(itersym, L, U)
end
elseif uii # only upper bound is an integer
uiiv = convert(Int, upper)
Loop(itersym, add_loop_bound!(ls, itersym, lower, false), uiiv)
else # neither are integers
L = add_loop_bound!(ls, itersym, lower, false)
U = add_loop_bound!(ls, itersym, upper, true)
elseif f === :eachindex
N = gensym(Symbol(:loopeachindex, itersym))
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticrange), r)))
L = add_loop_bound!(ls, itersym, Expr(:call, :first, N), false)
U = add_loop_bound!(ls, itersym, Expr(:call, :last, N), true)
Loop(itersym, L, U)
end
elseif f === :eachindex
N = gensym(Symbol(:loop, itersym))
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticlength), r.args[2])))
Loop(itersym, 0, N)
elseif f === :OneTo || f == Expr(:(.), :Base, QuoteNode(:OneTo))
otN = r.args[2]
if otN isa Integer
Loop(itersym, 0, otN)
elseif f === :OneTo || f == Expr(:(.), :Base, QuoteNode(:OneTo))
otN = r.args[2]
if otN isa Integer
Loop(itersym, 0, otN)
else
otN isa Expr && maybestatic!(otN)
N = gensym(Symbol(:loop, itersym))
pushpreamble!(ls, Expr(:(=), N, otN))
Loop(itersym, 0, N)
end
else
otN isa Expr && maybestatic!(otN)
N = gensym(Symbol(:loop, itersym))
pushpreamble!(ls, Expr(:(=), N, otN))
Loop(itersym, 0, N)
throw("Unrecognized loop range type: $r.")
end
elseif isa(r, Symbol)
# Treat similar to `eachindex`
N = gensym(Symbol(:loop, itersym))
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticrange), r)))
L = add_loop_bound!(ls, itersym, Expr(:call, :first, N), false)
U = add_loop_bound!(ls, itersym, Expr(:call, :last, N), true)
loop = Loop(itersym, L, U)
else
throw("Unrecognized loop range type: $r.")
end
Expand Down Expand Up @@ -546,7 +558,3 @@ function Base.push!(ls::LoopSet, ex::Expr, elementbytes::Int, position::Int)
throw("Don't know how to handle expression:\n$ex")
end
end




13 changes: 12 additions & 1 deletion src/reconstruct_loopset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@ function Loop(ls::LoopSet, l::Int, ::Type{StaticLowerUnitRange{L}}) where {L}
pushpreamble!(ls, Expr(:(=), stop, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:(.), Expr(:ref, :lb, l), QuoteNode(:U)))))
Loop(gensym(:n), L, L + 1024, Symbol(""), stop, true, false)::Loop
end
# Is there any likely way to generate such a range?
# function Loop(ls::LoopSet, l::Int, ::Type{StaticLengthUnitRange{N}}) where {N}
# start = gensym(:loopstart); stop = gensym(:loopstop)
# pushpreamble!(ls, Expr(:(=), start, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:(.), Expr(:ref, :lb, l), QuoteNode(:L)))))
# pushpreamble!(ls, Expr(:(=), stop, Expr(:call, :(+), start, N - 1)))
# Loop(gensym(:n), 0, N, start, stop, false, false)::Loop
# end
# Build a `Loop` for a range whose lower bound `L` and upper bound `U` are both carried
# as type parameters. No preamble code is emitted: `ls` and `l` are unused, the two
# symbol slots are left empty, and both trailing Bool flags are `true`.
# NOTE(review): the flags presumably mean "start exact" / "stop exact" — confirm against
# the `Loop` struct definition.
Loop(ls, l, ::Type{StaticUnitRange{L,U}}) where {L,U} =
    Loop(gensym(:n), L, U, Symbol(""), Symbol(""), true, true)::Loop
Expand Down Expand Up @@ -63,14 +70,18 @@ extract_varg(i) = Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__
pushvarg!(ls::LoopSet, ar::ArrayReferenceMeta, i) = pushpreamble!(ls, Expr(:(=), vptr(ar), extract_varg(i)))
# Register the preamble assignment for the `i`-th argument when its indices must be
# reversed. The reference's `loopedindex` and index lists are reversed in place, and
# the extracted argument is wrapped in a `transpose` call before being bound to the
# reference's pointer symbol.
#
# Only the `transpose` assignment is pushed. The earlier `Transpose` form assigned the
# same symbol and was immediately overwritten by this one, so it was dead code.
function pushvarg′!(ls::LoopSet, ar::ArrayReferenceMeta, i)
    reverse!(ar.loopedindex); reverse!(getindices(ar)) # reverse the listed indices here, and transpose it to make it column major
    pushpreamble!(ls, Expr(:(=), vptr(ar), Expr(:call, lv(:transpose), extract_varg(i))))
end
# Packed (column-major) pointers: bind the argument as-is.
add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{PackedStridedPointer{T, N}}) where {T, N} =
    pushvarg!(ls, ar, i)

# Row-major pointers: go through the index-reversing variant.
add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{RowMajorStridedPointer{T, N}}) where {T, N} =
    pushvarg′!(ls, ar, i)

# Offset pointers: dispatch again on the wrapped pointer type `P`.
add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{OffsetStridedPointer{T,N,P}}) where {T,N,P} =
    add_mref!(ls, ar, i, P)

function add_mref!(
ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{S}
) where {T, X <: Tuple, S <: VectorizationBase.AbstractStaticStridedPointer{T,X}}
Expand Down
38 changes: 38 additions & 0 deletions src/vectorizationbase_extensions.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@

# Field-less marker type used in place of an array when the value being "loaded"
# is derived from the loop index itself.
struct LoopValue end
# A LoopValue serves as its own strided pointer.
@inline VectorizationBase.stridedpointer(::LoopValue) = LoopValue()
# Vector index: take the `i` field of the `_MM{W}` index, add one, and rewrap as `_MM{W}`.
# NOTE(review): the `+ 1` presumably maps the internal 0-based counter back to a
# 1-based loop value — confirm against the loop-lowering code.
@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
# @inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Unsigned) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
# Masked vector index: same result as the unmasked method; the mask argument is ignored.
@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Mask) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
# Scalar index: return the index plus one, in the index's own integer type.
@inline VectorizationBase.vload(::LoopValue, i::Integer) = i + one(i)
# Scalar index wrapped in a 1-tuple: same as above.
@inline VectorizationBase.vload(::LoopValue, i::Tuple{I}) where {I<:Integer} = @inbounds(i[1]) + one(I)
# Reported element type is `Int8`.
# NOTE(review): presumably chosen so a LoopValue never widens element-size estimates — confirm.
@inline Base.eltype(::LoopValue) = Int8

import OffsetArrays

# One-dimensional OffsetArray: no wrapper type is needed. Take the parent's strided
# pointer and shift it by the negated offset of the single axis.
@inline function VectorizationBase.stridedpointer(a::OffsetArrays.OffsetArray{<:Any,1})
    parentptr = stridedpointer(parent(a))
    shift = -@inbounds(a.offsets[1])
    gesp(parentptr, (shift,))
end

# Strided pointer that pairs an inner strided pointer with `N` integer offsets.
# `T` is the element type of the inner pointer type `P`.
struct OffsetStridedPointer{T, N, P <: VectorizationBase.AbstractStridedPointer{T}} <: VectorizationBase.AbstractStridedPointer{T}
# Inner strided pointer being wrapped.
ptr::P
# One integer offset per dimension.
offsets::NTuple{N,Int}
end
# For an OffsetArray with two or more dimensions, `eachindex(A)` is a `Base.OneTo`
# (linear indices start at 1), while multi-index access is computed using the offsets.
# A dedicated pointer type carries the offsets alongside the parent's pointer so both
# indexing modes can be expressed.
@inline function VectorizationBase.stridedpointer(A::OffsetArrays.OffsetArray)
    parentptr = stridedpointer(parent(A))
    OffsetStridedPointer(parentptr, A.offsets)
end
# Earlier variants, kept for reference: they forwarded to the wrapped pointer's `offset`.
# Tuple of length == 1, use ind directly.
# @inline VectorizationBase.offset(ptr::OffsetStridedPointer, ind::Tuple{I}) where {I} = VectorizationBase.offset(ptr.ptr, ind)
# Tuple of length > 1, subtract offsets.
# @inline VectorizationBase.offset(ptr::OffsetStridedPointer{<:Any,N}, ind::Tuple) where {N} = VectorizationBase.offset(ptr.ptr, ntuple(n -> ind[n] + ptr.offsets[n], Val{N}()))
# Single-index tuple: the index tuple is returned unchanged (no offset is applied).
@inline VectorizationBase.offset(ptr::OffsetStridedPointer, ind::Tuple{I}) where {I} = ind
# Tuple of length > 1, subtract offsets.
# Returns a new N-tuple with `ptr.offsets[n]` subtracted from each `ind[n]`.
@inline VectorizationBase.offset(ptr::OffsetStridedPointer{<:Any,N}, ind::Tuple) where {N} = ntuple(n -> ind[n] - ptr.offsets[n], Val{N}())
# Rebuild the wrapper around a new raw pointer: the inner pointer is re-created via
# `similar(p.ptr, ptr)` and the offsets are carried over unchanged.
@inline Base.similar(p::OffsetStridedPointer, ptr::Ptr) = OffsetStridedPointer(similar(p.ptr, ptr), p.offsets)

# When an OffsetArray is indexed by a (loop-)constant value, that particular vptr
# object cannot also be iterated with `eachindex`, so the wrapper can be dropped:
# shift the inner pointer by the negated offsets and take the subset view of that.
@inline function VectorizationBase.subsetview(ptr::OffsetStridedPointer{<:Any,N}, ::Val{I}, i) where {I,N}
    negated = ntuple(n -> 0 - @inbounds(ptr.offsets[n]), Val{N}())
    shifted = gesp(ptr.ptr, negated)
    subsetview(shifted, Val{I}(), i)
end

19 changes: 17 additions & 2 deletions test/dot.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
using LoopVectorization, OffsetArrays
using Test

@testset "dot" begin
dotq = :(for i ∈ eachindex(a,b)
s += a[i]*b[i]
Expand Down Expand Up @@ -46,6 +49,14 @@
end
s
end
# Sum of squares of `a`, where the loop iterates over a range held in a local
# variable (`rng = axes(a, 1)`) rather than an inline `eachindex(...)` or `lo:hi`
# expression. Exercises `@avx` with a bare symbol as the loop range.
function myselfdotavx_range(a)
    s = zero(eltype(a))
    rng = axes(a, 1)
    # The `∈` operator is required here; `for i rng` does not parse.
    @avx for i ∈ rng
        s += a[i]*a[i]
    end
    s
end
function myselfdot_avx(a)
s = zero(eltype(a))
@_avx for i ∈ eachindex(a)
Expand Down Expand Up @@ -167,7 +178,7 @@
end
4acc/length(x)
end

# @macroexpand @_avx for i = 1:length(a_re) - 1
# c_re[i] = b_re[i] * a_re[i + 1] - b_im[i] * a_im[i + 1]
# c_im[i] = b_re[i] * a_im[i + 1] + b_im[i] * a_re[i + 1]
Expand All @@ -179,9 +190,12 @@
N = 127
R = T <: Integer ? (T(-100):T(100)) : T
a = rand(T, N); b = rand(R, N);
ao = OffsetArray(a, -60:66); bo = OffsetArray(b, -60:66);
s = mydot(a, b)
@test mydotavx(a,b) ≈ s
@test mydot_avx(a,b) ≈ s
@test mydotavx(ao,bo) ≈ s
@test mydot_avx(ao,bo) ≈ s
@test dot_unroll2avx(a,b) s
@test dot_unroll3avx(a,b) s
@test dot_unroll2_avx(a,b) s
Expand All @@ -190,6 +204,7 @@
@test dot_unroll3avx_inline(a,b) s
s = myselfdot(a)
@test myselfdotavx(a) ≈ s
@test myselfdotavx_range(a) ≈ s
@test myselfdot_avx(a) ≈ s
@test myselfdotavx(a) ≈ s

Expand All @@ -205,7 +220,7 @@
b_re = rand(R, N); b_im = rand(R, N);
ac = Complex.(a_re, a_im);
bc = Complex.(b_re, b_im);

@test mydot(ac, bc) ≈ complex_dot_soa(a_re, a_im, b_re, b_im)

c_re1 = similar(a_re); c_im1 = similar(a_im);
Expand Down
16 changes: 16 additions & 0 deletions test/gemv.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
using LoopVectorization
using Test

@testset "GEMV" begin
gemvq = :(for i ∈ eachindex(y)
yᵢ = 0.0
Expand Down Expand Up @@ -27,6 +30,16 @@
y[i] = yᵢ
end
end
# Matrix-vector product `y = A * x`, written with both loop ranges held in local
# variables (`rng1, rng2 = axes(A)`) instead of inline range expressions.
# Exercises `@avx` on nested loops whose ranges are bare symbols. Overwrites `y`.
function mygemvavx_range!(y, A, x)
    rng1, rng2 = axes(A)
    # The `∈` operator is required in both loop headers; `for i rng1` does not parse.
    @avx for i ∈ rng1
        yᵢ = zero(eltype(y))
        for j ∈ rng2
            yᵢ += A[i,j] * x[j]
        end
        y[i] = yᵢ
    end
end
q = :(for i ∈ eachindex(y)
yᵢ = zero(eltype(y))
for j ∈ eachindex(x)
Expand Down Expand Up @@ -150,6 +163,9 @@
@test y1 ≈ y2
fill!(y2, -999.9); mygemv_avx!(y2, A, x)
@test y1 ≈ y2
fill!(y2, -999.9)
mygemvavx_range!(y2, A, x)
@test y1 ≈ y2

B = rand(R, N, N);
G1 = Matrix{TC}(undef, N, 1);
Expand Down
Loading

0 comments on commit 2ae8a78

Please sign in to comment.