diff --git a/Manifest.toml b/Manifest.toml
index 371dd8e06..6306d09a7 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -31,6 +31,11 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
 deps = ["Base64"]
 uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
 
+[[OffsetArrays]]
+git-tree-sha1 = "707e34562700b81e8aa13548eb6b23b18112e49b"
+uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
+version = "1.0.2"
+
 [[OrderedCollections]]
 deps = ["Random", "Serialization", "Test"]
 git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
@@ -49,9 +54,9 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
 [[SIMDPirates]]
 deps = ["VectorizationBase"]
-git-tree-sha1 = "34dff4f4715f871e71b38f31397d96e62621f14d"
+git-tree-sha1 = "f91198b7ef74b04028f98e0eed7c556b93538a2e"
 uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a"
-version = "0.6.5"
+version = "0.6.6"
 
 [[SLEEFPirates]]
 deps = ["Libdl", "SIMDPirates", "VectorizationBase"]
@@ -71,6 +76,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [[VectorizationBase]]
 deps = ["CpuId", "LinearAlgebra"]
-git-tree-sha1 = "006d7b7f276db8d728f8bfd70ebf2efd132f9548"
+git-tree-sha1 = "8abb5697fb64cadccd1bba444c955942d3181e5c"
 uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
-version = "0.7.0"
+version = "0.7.1"
diff --git a/Project.toml b/Project.toml
index e76310030..807065699 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "0.6.20"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
 SIMDPirates = "21efa798-c60a-11e8-04d3-e1a92915a26a"
 SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
index 88c661da6..056a5678c 100644
--- a/src/LoopVectorization.jl
+++ b/src/LoopVectorization.jl
@@ -4,7 +4,7 @@ using VectorizationBase, SIMDPirates, SLEEFPirates, Parameters
 using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector_load_expr,
     mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valadd, valsub, _MM,
     maybestaticlength, maybestaticsize, staticm1, subsetview, vzero, stridedpointer_for_broadcast,
-    Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange,
+    Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange, unwrap, maybestaticrange,
     PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct
 using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod,
     sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
@@ -12,6 +12,7 @@ using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_pr
     vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone
 using Base.Broadcast: Broadcasted, DefaultArrayStyle
 using LinearAlgebra: Adjoint, Transpose
+using Base.Meta: isexpr
 
 const SUPPORTED_TYPES = Union{Float16,Float32,Float64,Integer}
 
@@ -21,6 +22,7 @@ export LowDimArray, stridedpointer, vectorizable,
     vfilter, vfilter!
 
 
+include("vectorizationbase_extensions.jl")
 include("map.jl")
 include("filter.jl")
 include("costs.jl")
diff --git a/src/add_loads.jl b/src/add_loads.jl
index b4d453a44..fe4a0e131 100644
--- a/src/add_loads.jl
+++ b/src/add_loads.jl
@@ -70,13 +70,3 @@ function add_loopvalue!(ls::LoopSet, arg::Symbol, elementbytes::Int)
     loopsymop
 end
 
-
-struct LoopValue end
-@inline VectorizationBase.stridedpointer(::LoopValue) = LoopValue()
-@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
-# @inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Unsigned) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
-@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Mask) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
-@inline VectorizationBase.vload(::LoopValue, i::Integer) = i + one(i)
-@inline VectorizationBase.vload(::LoopValue, i::Tuple{I}) where {I<:Integer} = @inbounds(i[1]) + one(I)
-@inline Base.eltype(::LoopValue) = Int8
-
diff --git a/src/graphs.jl b/src/graphs.jl
index ad42a9ecb..66203f4f1 100644
--- a/src/graphs.jl
+++ b/src/graphs.jl
@@ -33,7 +33,7 @@
 
 # For passing options like array types and mask
 # struct LoopSetOptions
-    
+
 # end
 
 struct Loop
@@ -70,7 +70,7 @@ function startloop(loop::Loop, isvectorized, W, itersymbol = loop.itersymbol)
     elseif startexact
         Expr(:(=), itersymbol, loop.starthint)
     else
-        Expr(:(=), itersymbol, loop.startsym)
+        Expr(:(=), itersymbol, Expr(:call, lv(:unwrap), loop.startsym))
     end
 end
 function vec_looprange(loop::Loop, isunrolled::Bool, W::Symbol, U::Int)
@@ -84,7 +84,7 @@ function vec_looprange(loop::Loop, isunrolled::Bool, W::Symbol, U::Int)
     else
         Expr(:call, :<, loop.itersymbol, Expr(:call, :-, loop.stopsym, incr))
     end
-end                       
+end
 function looprange(loop::Loop, incr::Int, mangledname::Symbol)
     incr -= 1#one(Int32)
     if iszero(incr)
@@ -369,47 +369,59 @@ This function creates a loop, while switching from 1 to 0 based indices
 """
 function register_single_loop!(ls::LoopSet, looprange::Expr)
     itersym = (looprange.args[1])::Symbol
-    r = (looprange.args[2])::Expr
-    @assert r.head === :call
-    f = first(r.args)
-    loop::Loop = if f === :(:)
-        lower = r.args[2]
-        upper = r.args[3]
-        lii::Bool = lower isa Integer
-        liiv::Int = lii ? (convert(Int, lower)-1) : 0
-        uii::Bool = upper isa Integer
-        if lii & uii # both are integers
-            Loop(itersym, liiv, convert(Int, upper))
-        elseif lii # only lower bound is an integer
-            if upper isa Symbol
-                Loop(itersym, liiv, upper)
-            elseif upper isa Expr
-                Loop(itersym, liiv, add_loop_bound!(ls, itersym, upper, true))
-            else
-                Loop(itersym, liiv, add_loop_bound!(ls, itersym, upper, true))
+    r = looprange.args[2]
+    if isexpr(r, :call)
+        f = first(r.args)
+        loop::Loop = if f === :(:)
+            lower = r.args[2]
+            upper = r.args[3]
+            lii::Bool = lower isa Integer
+            liiv::Int = lii ? (convert(Int, lower)-1) : 0
+            uii::Bool = upper isa Integer
+            if lii & uii # both are integers
+                Loop(itersym, liiv, convert(Int, upper))
+            elseif lii # only lower bound is an integer
+                if upper isa Symbol
+                    Loop(itersym, liiv, upper)
+                elseif upper isa Expr
+                    Loop(itersym, liiv, add_loop_bound!(ls, itersym, upper, true))
+                else
+                    Loop(itersym, liiv, add_loop_bound!(ls, itersym, upper, true))
+                end
+            elseif uii # only upper bound is an integer
+                uiiv = convert(Int, upper)
+                Loop(itersym, add_loop_bound!(ls, itersym, lower, false), uiiv)
+            else # neither are integers
+                L = add_loop_bound!(ls, itersym, lower, false)
+                U = add_loop_bound!(ls, itersym, upper, true)
+                Loop(itersym, L, U)
             end
-        elseif uii # only upper bound is an integer
-            uiiv = convert(Int, upper)
-            Loop(itersym, add_loop_bound!(ls, itersym, lower, false), uiiv)
-        else # neither are integers
-            L = add_loop_bound!(ls, itersym, lower, false)
-            U = add_loop_bound!(ls, itersym, upper, true)
+        elseif f === :eachindex
+            N = gensym(Symbol(:loopeachindex, itersym))
+            pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticrange), r)))
+            L = add_loop_bound!(ls, itersym, Expr(:call, :first, N), false)
+            U = add_loop_bound!(ls, itersym, Expr(:call, :last, N), true)
             Loop(itersym, L, U)
-        end
-    elseif f === :eachindex
-        N = gensym(Symbol(:loop, itersym))
-        pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticlength), r.args[2])))
-        Loop(itersym, 0, N)
-    elseif f === :OneTo || f == Expr(:(.), :Base, QuoteNode(:OneTo))
-        otN = r.args[2]
-        if otN isa Integer
-            Loop(itersym, 0, otN)
+        elseif f === :OneTo || f == Expr(:(.), :Base, QuoteNode(:OneTo))
+            otN = r.args[2]
+            if otN isa Integer
+                Loop(itersym, 0, otN)
+            else
+                otN isa Expr && maybestatic!(otN)
+                N = gensym(Symbol(:loop, itersym))
+                pushpreamble!(ls, Expr(:(=), N, otN))
+                Loop(itersym, 0, N)
+            end
         else
-            otN isa Expr && maybestatic!(otN)
-            N = gensym(Symbol(:loop, itersym))
-            pushpreamble!(ls, Expr(:(=), N, otN))
-            Loop(itersym, 0, N)
+            throw("Unrecognized loop range type: $r.")
         end
+    elseif isa(r, Symbol)
+        # Treat similar to `eachindex`
+        N = gensym(Symbol(:loop, itersym))
+        pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticrange), r)))
+        L = add_loop_bound!(ls, itersym, Expr(:call, :first, N), false)
+        U = add_loop_bound!(ls, itersym, Expr(:call, :last, N), true)
+        loop = Loop(itersym, L, U)
     else
         throw("Unrecognized loop range type: $r.")
     end
@@ -546,7 +558,3 @@ function Base.push!(ls::LoopSet, ex::Expr, elementbytes::Int, position::Int)
         throw("Don't know how to handle expression:\n$ex")
     end
 end
-
-
-
-
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
index 68555dc52..d4c695dec 100644
--- a/src/reconstruct_loopset.jl
+++ b/src/reconstruct_loopset.jl
@@ -14,6 +14,13 @@ function Loop(ls::LoopSet, l::Int, ::Type{StaticLowerUnitRange{L}}) where {L}
     pushpreamble!(ls, Expr(:(=), stop, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:(.), Expr(:ref, :lb, l), QuoteNode(:U)))))
     Loop(gensym(:n), L, L + 1024, Symbol(""), stop, true, false)::Loop
 end
+# Is there any likely way to generate such a range?
+# function Loop(ls::LoopSet, l::Int, ::Type{StaticLengthUnitRange{N}}) where {N}
+#     start = gensym(:loopstart); stop = gensym(:loopstop)
+#     pushpreamble!(ls, Expr(:(=), start, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:(.), Expr(:ref, :lb, l), QuoteNode(:L)))))
+#     pushpreamble!(ls, Expr(:(=), stop, Expr(:call, :(+), start, N - 1)))
+#     Loop(gensym(:n), 0, N, start, stop, false, false)::Loop
+# end
 function Loop(ls, l, ::Type{StaticUnitRange{L,U}}) where {L,U}
     Loop(gensym(:n), L, U, Symbol(""), Symbol(""), true, true)::Loop
 end
@@ -63,7 +70,7 @@ extract_varg(i) = Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__
 pushvarg!(ls::LoopSet, ar::ArrayReferenceMeta, i) = pushpreamble!(ls, Expr(:(=), vptr(ar), extract_varg(i)))
 function pushvarg′!(ls::LoopSet, ar::ArrayReferenceMeta, i)
     reverse!(ar.loopedindex); reverse!(getindices(ar)) # reverse the listed indices here, and transpose it to make it column major
-    pushpreamble!(ls, Expr(:(=), vptr(ar), Expr(:call, lv(:Transpose), extract_varg(i))))
+    pushpreamble!(ls, Expr(:(=), vptr(ar), Expr(:call, lv(:transpose), extract_varg(i))))
 end
 function add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{PackedStridedPointer{T, N}}) where {T, N}
     pushvarg!(ls, ar, i)
@@ -71,6 +78,10 @@ end
 function add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{RowMajorStridedPointer{T, N}}) where {T, N}
     pushvarg′!(ls, ar, i)
 end
+function add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{OffsetStridedPointer{T,N,P}}) where {T,N,P}
+    add_mref!(ls, ar, i, P)
+end
+
 function add_mref!(
     ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{S}
 ) where {T, X <: Tuple, S <: VectorizationBase.AbstractStaticStridedPointer{T,X}}
diff --git a/src/vectorizationbase_extensions.jl b/src/vectorizationbase_extensions.jl
new file mode 100644
index 000000000..613c2687e
--- /dev/null
+++ b/src/vectorizationbase_extensions.jl
@@ -0,0 +1,38 @@
+
+struct LoopValue end
+@inline VectorizationBase.stridedpointer(::LoopValue) = LoopValue()
+@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
+# @inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Unsigned) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
+@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Mask) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
+@inline VectorizationBase.vload(::LoopValue, i::Integer) = i + one(i)
+@inline VectorizationBase.vload(::LoopValue, i::Tuple{I}) where {I<:Integer} = @inbounds(i[1]) + one(I)
+@inline Base.eltype(::LoopValue) = Int8
+
+import OffsetArrays
+
+# If ndim(::OffsetArray) == 1, we can convert to a regular strided pointer and offset.
+@inline VectorizationBase.stridedpointer(a::OffsetArrays.OffsetArray{<:Any,1}) = gesp(stridedpointer(parent(a)), (-@inbounds(a.offsets[1]),))
+
+struct OffsetStridedPointer{T, N, P <: VectorizationBase.AbstractStridedPointer{T}} <: VectorizationBase.AbstractStridedPointer{T}
+    ptr::P
+    offsets::NTuple{N,Int}
+end
+# if ndim(A::OffsetArray) ≥ 2, then eachindex(A) isa Base.OneTo, index starting at 1.
+# but multiple indexing is calculated using offsets, so we need a special type to express this.
+@inline function VectorizationBase.stridedpointer(A::OffsetArrays.OffsetArray)
+    OffsetStridedPointer(stridedpointer(parent(A)), A.offsets)
+end
+# Tuple of length == 1, use ind directly.
+# @inline VectorizationBase.offset(ptr::OffsetStridedPointer, ind::Tuple{I}) where {I} = VectorizationBase.offset(ptr.ptr, ind)
+# Tuple of length > 1, subtract offsets.
+# @inline VectorizationBase.offset(ptr::OffsetStridedPointer{<:Any,N}, ind::Tuple) where {N} = VectorizationBase.offset(ptr.ptr, ntuple(n -> ind[n] + ptr.offsets[n], Val{N}()))
+@inline VectorizationBase.offset(ptr::OffsetStridedPointer, ind::Tuple{I}) where {I} = ind
+# Tuple of length > 1, subtract offsets.
+@inline VectorizationBase.offset(ptr::OffsetStridedPointer{<:Any,N}, ind::Tuple) where {N} = ntuple(n -> ind[n] - ptr.offsets[n], Val{N}())
+@inline Base.similar(p::OffsetStridedPointer, ptr::Ptr) = OffsetStridedPointer(similar(p.ptr, ptr), p.offsets)
+
+# If an OffsetArray is getting indexed by a (loop-)constant value, then this particular vptr object cannot also be eachindexed, so we can safely return a stridedpointer
+@inline function VectorizationBase.subsetview(ptr::OffsetStridedPointer{<:Any,N}, ::Val{I}, i) where {I,N}
+    subsetview(gesp(ptr.ptr, ntuple(n -> 0 - @inbounds(ptr.offsets[n]), Val{N}())), Val{I}(), i)
+end
+
diff --git a/test/dot.jl b/test/dot.jl
index 391130dd2..6bdb256ca 100644
--- a/test/dot.jl
+++ b/test/dot.jl
@@ -1,3 +1,6 @@
+using LoopVectorization, OffsetArrays
+using Test
+
 @testset "dot" begin
     dotq = :(for i ∈ eachindex(a,b)
              s += a[i]*b[i]
@@ -46,6 +49,14 @@
         end
         s
     end
+    function myselfdotavx_range(a)
+        s = zero(eltype(a))
+        rng = axes(a, 1)
+        @avx for i ∈ rng
+            s += a[i]*a[i]
+        end
+        s
+    end
     function myselfdot_avx(a)
         s = zero(eltype(a))
         @_avx for i ∈ eachindex(a)
@@ -167,7 +178,7 @@
         end
         4acc/length(x)
     end
-    
+
     # @macroexpand @_avx for i = 1:length(a_re) - 1
     #     c_re[i] = b_re[i] * a_re[i + 1] - b_im[i] * a_im[i + 1]
     #     c_im[i] = b_re[i] * a_im[i + 1] + b_im[i] * a_re[i + 1]
@@ -179,9 +190,12 @@
         N = 127
         R = T <: Integer ? (T(-100):T(100)) : T
         a = rand(T, N); b = rand(R, N);
+        ao = OffsetArray(a, -60:66); bo = OffsetArray(b, -60:66);
         s = mydot(a, b)
         @test mydotavx(a,b) ≈ s
         @test mydot_avx(a,b) ≈ s
+        @test mydotavx(ao,bo) ≈ s
+        @test mydot_avx(ao,bo) ≈ s
         @test dot_unroll2avx(a,b) ≈ s
         @test dot_unroll3avx(a,b) ≈ s
         @test dot_unroll2_avx(a,b) ≈ s
@@ -190,6 +204,7 @@
         @test dot_unroll3avx_inline(a,b) ≈ s
         s = myselfdot(a)
         @test myselfdotavx(a) ≈ s
+        @test myselfdotavx_range(a) ≈ s
         @test myselfdot_avx(a) ≈ s
         @test myselfdotavx(a) ≈ s
 
@@ -205,7 +220,7 @@
         b_re = rand(R, N); b_im = rand(R, N);
         ac = Complex.(a_re, a_im);
         bc = Complex.(b_re, b_im);
-        
+
         @test mydot(ac, bc) ≈ complex_dot_soa(a_re, a_im, b_re, b_im)
 
         c_re1 = similar(a_re); c_im1 = similar(a_im);
diff --git a/test/gemv.jl b/test/gemv.jl
index 1583bcead..765ca1986 100644
--- a/test/gemv.jl
+++ b/test/gemv.jl
@@ -1,3 +1,6 @@
+using LoopVectorization
+using Test
+
 @testset "GEMV" begin
     gemvq = :(for i ∈ eachindex(y)
               yᵢ = 0.0
@@ -27,6 +30,16 @@
             y[i] = yᵢ
         end
     end
+    function mygemvavx_range!(y, A, x)
+        rng1, rng2 = axes(A)
+        @avx for i ∈ rng1
+            yᵢ = zero(eltype(y))
+            for j ∈ rng2
+                yᵢ += A[i,j] * x[j]
+            end
+            y[i] = yᵢ
+        end
+    end
     q = :(for i ∈ eachindex(y)
           yᵢ = zero(eltype(y))
           for j ∈ eachindex(x)
@@ -150,6 +163,9 @@
         @test y1 ≈ y2
         fill!(y2, -999.9); mygemv_avx!(y2, A, x)
         @test y1 ≈ y2
+        fill!(y2, -999.9)
+        mygemvavx_range!(y2, A, x)
+        @test y1 ≈ y2
 
         B = rand(R, N, N);
         G1 = Matrix{TC}(undef, N, 1);
diff --git a/test/offsetarrays.jl b/test/offsetarrays.jl
new file mode 100644
index 000000000..7a3371b3f
--- /dev/null
+++ b/test/offsetarrays.jl
@@ -0,0 +1,66 @@
+using LoopVectorization, OffsetArrays
+using Test
+
+@testset "OffsetArrays" begin
+
+    function old2d!(out::AbstractMatrix, A::AbstractMatrix, kern, R=CartesianIndices(out), z=zero(eltype(out)))
+        rng1k, rng2k = axes(kern)
+        rng1,  rng2  = R.indices
+        for j in rng2, i in rng1
+            tmp = z
+            @inbounds for jk in rng2k, ik in rng1k
+                tmp += oftype(tmp, A[i+ik,j+jk])*kern[ik,jk]
+            end
+            @inbounds out[i,j] = tmp
+        end
+        out
+    end
+    function avx2d!(out::AbstractMatrix, A::AbstractMatrix, kern::OffsetArray, R=CartesianIndices(out), z=zero(eltype(out)))
+        rng1k, rng2k = axes(kern)
+        rng1,  rng2  = R.indices
+        # Manually unpack the OffsetArray
+        kernA = parent(kern)
+        o1, o2 = kern.offsets
+        for j in rng2, i in rng1
+            tmp = z
+            @avx for jk in rng2k, ik in rng1k
+                tmp += A[i+ik,j+jk]*kernA[ik-o1,jk-o2]
+            end
+            out[i,j] = tmp
+        end
+        out
+    end
+    function avx2douter!(out::AbstractMatrix, A::AbstractMatrix, kern::OffsetArray, R=CartesianIndices(out), z=zero(eltype(out)))
+        rng1k, rng2k = axes(kern)
+        rng1,  rng2  = R.indices
+        # Manually unpack the OffsetArray
+        kernA = parent(kern)
+        o1, o2 = kern.offsets
+        @avx for j in rng2, i in rng1
+            tmp = z
+            for jk in rng2k, ik in rng1k
+                tmp += A[i+ik,j+jk]*kernA[ik-o1,jk-o2]
+                1
+            end
+            out[i,j] = tmp
+        end
+        out
+    end
+    
+    for T ∈ (Float32, Float64)
+        @show T, @__LINE__
+        A = rand(T, 100, 100);
+        kern = OffsetArray(rand(T, 3, 3), -1:1, -1:1);
+        out1 = OffsetArray(similar(A, size(A).-2), 1, 1);   # stay away from the edges of A
+        out2 = similar(out1); out3 = similar(out1);
+
+        old2d!(out1, A, kern);
+        avx2d!(out2, A, kern);
+        @test out1 ≈ out2
+        avx2douter!(out3, A, kern);
+        @test out1 ≈ out3
+    end
+
+    
+end
+
diff --git a/test/runtests.jl b/test/runtests.jl
index 6c1f971a2..bc7cda92f 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -17,6 +17,8 @@ end
 @time @testset "LoopVectorization.jl" begin
 
     @time include("printmethods.jl")
+
+    @time include("offsetarrays.jl")
     
     @time include("map.jl")