From 294b02da3d862a097a84f2206ffab4949d6fd2d5 Mon Sep 17 00:00:00 2001 From: Matt Bauman Date: Thu, 7 Jul 2016 14:05:11 -0500 Subject: [PATCH 1/8] Fix setindex! with SubDArray source This method is an optimization wherein we try to chunk accesses based upon the parent DArray's parts. The hard thing is then going backwards and trying to figure out which parts of the assignment indices need to be used in order to access those chunks. This is a four stage process that uses five different types of indices: 1. Find the indices of each portion of the DArray 2. Find the valid subset of indices of the SubArray that index into that portion 3. Find the portion of the indices for the assignment that need to be used for that subset of indices in step 2. This is the hard part. It requires creating another set of indices that represents the mask of valid indices from step 2. With those masks in hand, it's possible to reindex `I` to the indices we need. The trouble is that `setindex!` supports singleton dimensions in the source array in ways that `getindex` does not, so we need to selectively drop singleton dimensions as we restrict the indices. A final complication is that the last index can be a linear index over many indices in either the source or destination. 4. Finally, if the entire DArray chunk isn't getting used, we need to shift the indices from step 2 to refer to the local part of the DArray. --- src/DistributedArrays.jl | 93 +++++++++++++++++++++++++++++++++++++--- test/darray.jl | 16 +++++++ 2 files changed, 103 insertions(+), 6 deletions(-) diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl index 28d9975..bdb6bda 100644 --- a/src/DistributedArrays.jl +++ b/src/DistributedArrays.jl @@ -656,21 +656,102 @@ function Base.setindex!(a::Array, d::DArray, return a end +# Similar to Base.indexin, but just create a logical mask +indexin_mask(a, b::Number) = a .== b +indexin_mask(a, r::Range{Int}) = [i in r for i in a] +indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b)) +indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b)) +indexin_mask(a, b) = [i in b for i in a] + +import Base: tail +# Given a tuple of indices and a tuple of masks, restrict the indices to the +# valid regions. This is, effectively, reversing Base.setindex_shape_check. +# We can't just use indexing into MergedIndices here because getindex is much +# pickier about singleton dimensions than setindex! is. +restrict_indices(::Tuple{}, ::Tuple{}) = () +function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}}) + if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1) + (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...) + elseif length(a[1]) == 1 + (a[1], restrict_indices(tail(a), b)) + elseif length(b[1]) == 1 && b[1][1] + restrict_indices(a, tail(b)) + else + throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue")) + end +end +# The final indices are funky - they're allowed to accumulate together. +# Too many masks is an easy fix -- just use the outer product to merge them: +function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}}) + restrict_indices(a, (map(Bool, vec(vec(b[1])*vec(b[2])')), tail(tail(b))...)) +end +# But too many indices is much harder; this will require merging the indices +# in `a` before applying the final mask in `b`. 
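To make the mask machinery concrete before the hard case below, here is a small sketch of what `indexin_mask` and the simple, dimension-by-dimension case of `restrict_indices` compute. All values are illustrative only:

    # One dimension of the SubDArray view selects J = 2:6 of the parent, and
    # the chunk on some worker owns K_c = 4:8 in that dimension.
    J    = 2:6
    K_c  = 4:8
    K    = intersect(J, K_c)        # 4:6 -- the part of the view on this chunk
    mask = [i in K_c for i in J]    # what indexin_mask(J, K_c) builds:
                                    # Bool[false, false, true, true, true]

    # restrict_indices then selects the destination indices that line up with
    # the surviving elements of J. If the assignment index was I = 10:2:18:
    I    = 10:2:18
    idxs = vec(I)[vec(mask)]        # [14, 16, 18]

The hard case, where a single trailing mask must cover several destination indices, is handled next.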
+function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any}) + if length(a[1]) == 1 + (a[1], restrict_indices(tail(a), b)) + else + # When one mask spans multiple indices, we need to merge the indices + # together. At this point, we can just use indexing to merge them since + # there's no longer special handling of singleton dimensions + (view(MergedIndices(a, map(length, a)), b[1]),) + end +end + +immutable MergedIndices{T,N} <: AbstractArray{CartesianIndex{N}, N} + indices::T + sz::NTuple{N,Int} +end +Base.size(M::MergedIndices) = M.sz +Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = CartesianIndex(map(getindex, M.indices, I)) +# Boundschecking for using MergedIndices as an array index. This is overly +# strict -- even for SubArrays of ReshapedIndices, we require that the entire +# parent array's indices are valid. In this usage, it is just fine... and is a +# huge optimization over exact bounds checking. +typealias ReshapedMergedIndices{T,N,M<:MergedIndices} Base.ReshapedArray{T,N,M} +typealias SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} SubArray{T,N,M} +typealias MergedIndicesOrSub Union{MergedIndices, SubMergedIndices} +import Base: _chkbnds +# Ambiguity with linear indexing: +@inline _chkbnds(A::AbstractVector, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...) +@inline _chkbnds(A::AbstractArray, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...) +# Generic bounds checking +@inline _chkbnds{T,N}(A::AbstractArray{T,N}, checked::NTuple{N,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...) +@inline _chkbnds{T,N,M}(A::AbstractArray{T,N}, checked::NTuple{M,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...) + +# The tricky thing here is that we want to optimize the accesses into the +# distributed array, but in doing so, we lose track of which indices in I we +# should be using. +# +# I’ve come to the conclusion that the function is utterly insane. +# There are *6* flavors of indices with four different reference points: +# 1. Find the indices of each portion of the DArray. +# 2. Find the valid subset of indices for the SubArray into that portion. +# 3. Find the portion of the `I` indices that should be used when you access the +# `K` indices in the subarray. This guy is nasty. It’s totally backwards +# from all other arrays, wherein we simply iterate over the source array’s +# elements. You need to *both* know which elements in `J` were skipped +# (`indexin_mask`) and which dimensions should match up (`restrict_indices`) +# 4. If `K` doesn’t correspond to an entire chunk, reinterpret `K` in terms of +# the local portion of the source array function Base.setindex!(a::Array, s::SubDArray, I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) + Base.setindex_shape_check(s, Base.index_lengths(a, I...)...) n = length(I) d = s.parent - J = s.indexes + J = Base.decolon(d, s.indexes...) if length(J) < n + # TODO: this failsafe only works sometimes; the proper solution is to + # implement `restrict_indices` to merge the indices above. a[I...] = convert(Array,s) return a end - offs = [isa(J[i],Int) ? J[i]-1 : first(J[i])-1 for i=1:n] @sync for i = 1:length(d.pids) - K_c = Any[d.indexes[i]...] 
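In the hunk here, the per-chunk bookkeeping also switches from `Any`-typed vector comprehensions to `map` over tuples, which preserves the index types and lets the mask machinery above apply per dimension. With illustrative values:

    # Tuple-wise intersection of the view's parent-space indices J with the
    # indices K_c owned by one worker's chunk:
    J   = (2:6, 1:4)
    K_c = (4:8, 1:2)
    K   = map(intersect, J, K_c)    # (4:6, 1:2) -- this worker's overlap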
- K = [ intersect(J[j],K_c[j]) for j=1:n ] + K_c = d.indexes[i] + K = map(intersect, J, K_c) if !any(isempty, K) - idxs = [ I[j][K[j]-offs[j]] for j=1:n ] + K_mask = map(indexin_mask, J, K_c) + idxs = restrict_indices(Base.decolon(a, I...), K_mask) if isequal(K, K_c) # whole chunk @async a[idxs...] = chunk(d, i) @@ -678,7 +759,7 @@ function Base.setindex!(a::Array, s::SubDArray, # partial chunk @async a[idxs...] = remotecall_fetch(d.pids[i]) do - view(localpart(d), [K[j]-first(K_c[j])+1 for j=1:n]...) + view(localpart(d), [K[j]-first(K_c[j])+1 for j=1:length(J)]...) end end end diff --git a/test/darray.jl b/test/darray.jl index 5efb6d9..17fc7ef 100644 --- a/test/darray.jl +++ b/test/darray.jl @@ -99,6 +99,22 @@ check_leaks() @test fetch(@spawnat MYID localpart(D)[1,1]) == D[1,1] @test fetch(@spawnat OTHERIDS localpart(D)[1,1]) == D[1,101] close(D2) + + S2 = convert(Vector{Float64}, D[4, 23:176]) + @fact A[4, 23:176] --> S2 + + S3 = convert(Vector{Float64}, D[23:176, 197]) + @fact A[23:176, 197] --> S3 + + S4 = zeros(4) + setindex!(S4, D[3:4, 99:100], :) + @fact S4 --> vec(D[3:4, 99:100]) + @fact S4 --> vec(A[3:4, 99:100]) + + S5 = zeros(2,2) + setindex!(S5, D[1,1:4], :, 1:2) + @fact vec(S5) --> D[1, 1:4] + @fact vec(S5) --> A[1, 1:4] end close(D) end From faae054d9730610520a9e45fab46daecb954e115 Mon Sep 17 00:00:00 2001 From: Matt Bauman Date: Fri, 8 Jul 2016 10:30:55 -0500 Subject: [PATCH 2/8] Remove unnecessary failsafe This is no longer needed -- the comment is from when I only had restrict_indices partially implemented --- src/DistributedArrays.jl | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl index bdb6bda..1d2066e 100644 --- a/src/DistributedArrays.jl +++ b/src/DistributedArrays.jl @@ -740,12 +740,6 @@ function Base.setindex!(a::Array, s::SubDArray, n = length(I) d = s.parent J = Base.decolon(d, s.indexes...) - if length(J) < n - # TODO: this failsafe only works sometimes; the proper solution is to - # implement `restrict_indices` to merge the indices above. - a[I...] = convert(Array,s) - return a - end @sync for i = 1:length(d.pids) K_c = d.indexes[i] K = map(intersect, J, K_c) From 8277b552fb248792d13640faa87ca9faf402bcbf Mon Sep 17 00:00:00 2001 From: Matt Bauman Date: Sat, 9 Jul 2016 12:32:51 -0500 Subject: [PATCH 3/8] Also implement checkbounds_indices --- src/DistributedArrays.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl index 1d2066e..c4d3d7e 100644 --- a/src/DistributedArrays.jl +++ b/src/DistributedArrays.jl @@ -718,6 +718,10 @@ import Base: _chkbnds # Generic bounds checking @inline _chkbnds{T,N}(A::AbstractArray{T,N}, checked::NTuple{N,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...) @inline _chkbnds{T,N,M}(A::AbstractArray{T,N}, checked::NTuple{M,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...) 
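The `checkbounds_indices` methods added below follow the same pattern as the `_chkbnds` ones above: unwrap the index back to the parent `MergedIndices` and check its component ranges, rather than touching every element. The payoff, schematically (using `Base.checkbounds` directly for illustration; this is not the patch's code):

    # Checking component ranges is O(ndims) comparisons; checking the merged
    # index elementwise would be one comparison per CartesianIndex.
    A = zeros(500, 500)
    components = (1:500, 1:500)
    checkbounds(Bool, A, components...)    # two range checks, not 250_000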
+import Base: checkbounds_indices +@inline checkbounds_indices(::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices((), (parent(parent(I[1])).indices..., tail(I)...)) +@inline checkbounds_indices(inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...)) +@inline checkbounds_indices(inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...)) # The tricky thing here is that we want to optimize the accesses into the # distributed array, but in doing so, we lose track of which indices in I we From c5c535264d91afe7d3abf20208533e57ca13618b Mon Sep 17 00:00:00 2001 From: Matt Bauman Date: Sat, 9 Jul 2016 12:33:44 -0500 Subject: [PATCH 4/8] Only enable this method on 0.5 --- src/DistributedArrays.jl | 210 ++++++++++++++++++++------------------- 1 file changed, 107 insertions(+), 103 deletions(-) diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl index c4d3d7e..713bbbe 100644 --- a/src/DistributedArrays.jl +++ b/src/DistributedArrays.jl @@ -656,113 +656,117 @@ function Base.setindex!(a::Array, d::DArray, return a end -# Similar to Base.indexin, but just create a logical mask -indexin_mask(a, b::Number) = a .== b -indexin_mask(a, r::Range{Int}) = [i in r for i in a] -indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b)) -indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b)) -indexin_mask(a, b) = [i in b for i in a] - -import Base: tail -# Given a tuple of indices and a tuple of masks, restrict the indices to the -# valid regions. This is, effectively, reversing Base.setindex_shape_check. -# We can't just use indexing into MergedIndices here because getindex is much -# pickier about singleton dimensions than setindex! is. -restrict_indices(::Tuple{}, ::Tuple{}) = () -function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}}) - if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1) - (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...) - elseif length(a[1]) == 1 - (a[1], restrict_indices(tail(a), b)) - elseif length(b[1]) == 1 && b[1][1] - restrict_indices(a, tail(b)) - else - throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue")) +# We also want to optimize setindex! with a SubDArray source, but this is hard +# and only works on 0.5. +if VERSION > v"0.5.0-dev+5230" + # Similar to Base.indexin, but just create a logical mask + indexin_mask(a, b::Number) = a .== b + indexin_mask(a, r::Range{Int}) = [i in r for i in a] + indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b)) + indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b)) + indexin_mask(a, b) = [i in b for i in a] + + import Base: tail + # Given a tuple of indices and a tuple of masks, restrict the indices to the + # valid regions. This is, effectively, reversing Base.setindex_shape_check. + # We can't just use indexing into MergedIndices here because getindex is much + # pickier about singleton dimensions than setindex! is. + restrict_indices(::Tuple{}, ::Tuple{}) = () + function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}}) + if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1) + (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...) 
+ elseif length(a[1]) == 1 + (a[1], restrict_indices(tail(a), b)) + elseif length(b[1]) == 1 && b[1][1] + restrict_indices(a, tail(b)) + else + throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue")) + end end -end -# The final indices are funky - they're allowed to accumulate together. -# Too many masks is an easy fix -- just use the outer product to merge them: -function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}}) - restrict_indices(a, (map(Bool, vec(vec(b[1])*vec(b[2])')), tail(tail(b))...)) -end -# But too many indices is much harder; this will require merging the indices -# in `a` before applying the final mask in `b`. -function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any}) - if length(a[1]) == 1 - (a[1], restrict_indices(tail(a), b)) - else - # When one mask spans multiple indices, we need to merge the indices - # together. At this point, we can just use indexing to merge them since - # there's no longer special handling of singleton dimensions - (view(MergedIndices(a, map(length, a)), b[1]),) - end -end - -immutable MergedIndices{T,N} <: AbstractArray{CartesianIndex{N}, N} - indices::T - sz::NTuple{N,Int} -end -Base.size(M::MergedIndices) = M.sz -Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = CartesianIndex(map(getindex, M.indices, I)) -# Boundschecking for using MergedIndices as an array index. This is overly -# strict -- even for SubArrays of ReshapedIndices, we require that the entire -# parent array's indices are valid. In this usage, it is just fine... and is a -# huge optimization over exact bounds checking. -typealias ReshapedMergedIndices{T,N,M<:MergedIndices} Base.ReshapedArray{T,N,M} -typealias SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} SubArray{T,N,M} -typealias MergedIndicesOrSub Union{MergedIndices, SubMergedIndices} -import Base: _chkbnds -# Ambiguity with linear indexing: -@inline _chkbnds(A::AbstractVector, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...) -@inline _chkbnds(A::AbstractArray, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...) -# Generic bounds checking -@inline _chkbnds{T,N}(A::AbstractArray{T,N}, checked::NTuple{N,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...) -@inline _chkbnds{T,N,M}(A::AbstractArray{T,N}, checked::NTuple{M,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...) -import Base: checkbounds_indices -@inline checkbounds_indices(::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices((), (parent(parent(I[1])).indices..., tail(I)...)) -@inline checkbounds_indices(inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...)) -@inline checkbounds_indices(inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...)) - -# The tricky thing here is that we want to optimize the accesses into the -# distributed array, but in doing so, we lose track of which indices in I we -# should be using. -# -# I’ve come to the conclusion that the function is utterly insane. -# There are *6* flavors of indices with four different reference points: -# 1. Find the indices of each portion of the DArray. -# 2. Find the valid subset of indices for the SubArray into that portion. -# 3. 
Find the portion of the `I` indices that should be used when you access the -# `K` indices in the subarray. This guy is nasty. It’s totally backwards -# from all other arrays, wherein we simply iterate over the source array’s -# elements. You need to *both* know which elements in `J` were skipped -# (`indexin_mask`) and which dimensions should match up (`restrict_indices`) -# 4. If `K` doesn’t correspond to an entire chunk, reinterpret `K` in terms of -# the local portion of the source array -function Base.setindex!(a::Array, s::SubDArray, - I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) - Base.setindex_shape_check(s, Base.index_lengths(a, I...)...) - n = length(I) - d = s.parent - J = Base.decolon(d, s.indexes...) - @sync for i = 1:length(d.pids) - K_c = d.indexes[i] - K = map(intersect, J, K_c) - if !any(isempty, K) - K_mask = map(indexin_mask, J, K_c) - idxs = restrict_indices(Base.decolon(a, I...), K_mask) - if isequal(K, K_c) - # whole chunk - @async a[idxs...] = chunk(d, i) - else - # partial chunk - @async a[idxs...] = - remotecall_fetch(d.pids[i]) do - view(localpart(d), [K[j]-first(K_c[j])+1 for j=1:length(J)]...) - end + # The final indices are funky - they're allowed to accumulate together. + # Too many masks is an easy fix -- just use the outer product to merge them: + function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}}) + restrict_indices(a, (map(Bool, vec(vec(b[1])*vec(b[2])')), tail(tail(b))...)) + end + # But too many indices is much harder; this will require merging the indices + # in `a` before applying the final mask in `b`. + function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any}) + if length(a[1]) == 1 + (a[1], restrict_indices(tail(a), b)) + else + # When one mask spans multiple indices, we need to merge the indices + # together. At this point, we can just use indexing to merge them since + # there's no longer special handling of singleton dimensions + (view(MergedIndices(a, map(length, a)), b[1]),) + end + end + + immutable MergedIndices{T,N} <: AbstractArray{CartesianIndex{N}, N} + indices::T + sz::NTuple{N,Int} + end + Base.size(M::MergedIndices) = M.sz + Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = CartesianIndex(map(getindex, M.indices, I)) + # Boundschecking for using MergedIndices as an array index. This is overly + # strict -- even for SubArrays of ReshapedIndices, we require that the entire + # parent array's indices are valid. In this usage, it is just fine... and is a + # huge optimization over exact bounds checking. + typealias ReshapedMergedIndices{T,N,M<:MergedIndices} Base.ReshapedArray{T,N,M} + typealias SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} SubArray{T,N,M} + typealias MergedIndicesOrSub Union{MergedIndices, SubMergedIndices} + import Base: _chkbnds + # Ambiguity with linear indexing: + @inline _chkbnds(A::AbstractVector, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...) + @inline _chkbnds(A::AbstractArray, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...) + # Generic bounds checking + @inline _chkbnds{T,N}(A::AbstractArray{T,N}, checked::NTuple{N,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...) + @inline _chkbnds{T,N,M}(A::AbstractArray{T,N}, checked::NTuple{M,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...) 
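For intuition about `MergedIndices` itself: it is a lazy N-dimensional array of `CartesianIndex` values built from per-dimension indices, so linear indexing into it merges several indices into one. An eager, illustrative equivalent:

    # MergedIndices((2:3, 5:7), (2, 3)) behaves like this eager 2x3 array:
    inds = (2:3, 5:7)
    M = [CartesianIndex((i, j)) for i in inds[1], j in inds[2]]
    M[2, 1]      # CartesianIndex(3, 5)
    vec(M)[4]    # linear indexing merges both dimensions: CartesianIndex(3, 6)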
+ import Base: checkbounds_indices + @inline checkbounds_indices(::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices((), (parent(parent(I[1])).indices..., tail(I)...)) + @inline checkbounds_indices(inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...)) + @inline checkbounds_indices(inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...)) + + # The tricky thing here is that we want to optimize the accesses into the + # distributed array, but in doing so, we lose track of which indices in I we + # should be using. + # + # I’ve come to the conclusion that the function is utterly insane. + # There are *6* flavors of indices with four different reference points: + # 1. Find the indices of each portion of the DArray. + # 2. Find the valid subset of indices for the SubArray into that portion. + # 3. Find the portion of the `I` indices that should be used when you access the + # `K` indices in the subarray. This guy is nasty. It’s totally backwards + # from all other arrays, wherein we simply iterate over the source array’s + # elements. You need to *both* know which elements in `J` were skipped + # (`indexin_mask`) and which dimensions should match up (`restrict_indices`) + # 4. If `K` doesn’t correspond to an entire chunk, reinterpret `K` in terms of + # the local portion of the source array + function Base.setindex!(a::Array, s::SubDArray, + I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) + Base.setindex_shape_check(s, Base.index_lengths(a, I...)...) + n = length(I) + d = s.parent + J = Base.decolon(d, s.indexes...) + @sync for i = 1:length(d.pids) + K_c = d.indexes[i] + K = map(intersect, J, K_c) + if !any(isempty, K) + K_mask = map(indexin_mask, J, K_c) + idxs = restrict_indices(Base.decolon(a, I...), K_mask) + if isequal(K, K_c) + # whole chunk + @async a[idxs...] = chunk(d, i) + else + # partial chunk + @async a[idxs...] = + remotecall_fetch(d.pids[i]) do + view(localpart(d), [K[j]-first(K_c[j])+1 for j=1:length(J)]...) + end + end end end + return a end - return a end Base.fill!(A::DArray, x) = begin From 287f56d6ecbb177f8a83a6468664e2d1e3a4e2bd Mon Sep 17 00:00:00 2001 From: Matt Bauman Date: Sat, 23 Jul 2016 17:04:04 -0500 Subject: [PATCH 5/8] Fixup checkbounds_indices to the new APIs Also clarify the comment since I was confused upon coming back to this method a few weeks later --- src/DistributedArrays.jl | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl index 713bbbe..1258d5b 100644 --- a/src/DistributedArrays.jl +++ b/src/DistributedArrays.jl @@ -707,24 +707,24 @@ if VERSION > v"0.5.0-dev+5230" end Base.size(M::MergedIndices) = M.sz Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = CartesianIndex(map(getindex, M.indices, I)) - # Boundschecking for using MergedIndices as an array index. This is overly - # strict -- even for SubArrays of ReshapedIndices, we require that the entire - # parent array's indices are valid. In this usage, it is just fine... and is a - # huge optimization over exact bounds checking. + # Additionally, we optimize bounds checking when using MergedIndices as an + # array index since checking, e.g., A[1:500, 1:500] is *way* faster than + # checking an array of 500^2 elements of CartesianIndex{2}. 
This optimization + # also applies to reshapes of MergedIndices since the outer shape of the + # container doesn't affect the index elements themselves. We can go even + # farther and say that even restricted views of MergedIndices must be valid + # over the entire array. This is overly strict in general, but in this + # use-case all the merged indices must be valid at some point, so it's ok. typealias ReshapedMergedIndices{T,N,M<:MergedIndices} Base.ReshapedArray{T,N,M} typealias SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} SubArray{T,N,M} - typealias MergedIndicesOrSub Union{MergedIndices, SubMergedIndices} - import Base: _chkbnds - # Ambiguity with linear indexing: - @inline _chkbnds(A::AbstractVector, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...) - @inline _chkbnds(A::AbstractArray, checked::NTuple{1,Bool}, I::MergedIndicesOrSub) = _chkbnds(A, checked, parent(parent(I)).indices...) - # Generic bounds checking - @inline _chkbnds{T,N}(A::AbstractArray{T,N}, checked::NTuple{N,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...) - @inline _chkbnds{T,N,M}(A::AbstractArray{T,N}, checked::NTuple{M,Bool}, I1::MergedIndicesOrSub, I...) = _chkbnds(A, checked, parent(parent(I1)).indices..., I...) + typealias MergedIndicesOrSub Union{MergedIndices, ReshapedMergedIndices, SubMergedIndices} import Base: checkbounds_indices - @inline checkbounds_indices(::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices((), (parent(parent(I[1])).indices..., tail(I)...)) - @inline checkbounds_indices(inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...)) - @inline checkbounds_indices(inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = checkbounds_indices(inds, (parent(parent(I[1])).indices..., tail(I)...)) + @inline checkbounds_indices(::Type{Bool}, inds::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = + checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) + @inline checkbounds_indices(::Type{Bool}, inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = + checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) + @inline checkbounds_indices(::Type{Bool}, inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = + checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) # The tricky thing here is that we want to optimize the accesses into the # distributed array, but in doing so, we lose track of which indices in I we From ef1eb7bae07319c5217fadecce3e78aa4b92c466 Mon Sep 17 00:00:00 2001 From: Matt Bauman Date: Sat, 23 Jul 2016 18:29:21 -0500 Subject: [PATCH 6/8] Add a lazy ProductIndices type instead of creating the outer product Both these lazy arrays are effectively generalizations of Tim's MappedArrays.jl package. Doing this generally adds a bit more difficulty in terms of element types, but that is true of the MappedArray type, too. It might be worth breaking this out into a package at some point. --- src/DistributedArrays.jl | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl index 1258d5b..cdd10ab 100644 --- a/src/DistributedArrays.jl +++ b/src/DistributedArrays.jl @@ -659,7 +659,11 @@ end # We also want to optimize setindex! with a SubDArray source, but this is hard # and only works on 0.5. 
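As the commit message notes, both lazy types are essentially special cases of a mapped array: an array whose elements are computed from other arrays on access. Stripped to its core, that pattern looks roughly like this (a hypothetical minimal version in the 0.5-era syntax used throughout this patch series, not the MappedArrays.jl API):

    immutable Mapped{F,A,T,N} <: AbstractArray{T,N}
        f::F
        a::A
    end
    Mapped(f, a::AbstractArray) =
        Mapped{typeof(f), typeof(a), typeof(f(first(a))), ndims(a)}(f, a)
    Base.size(m::Mapped) = size(m.a)
    Base.getindex(m::Mapped, i::Int...) = m.f(m.a[i...])    # computed on access

    m = Mapped(isodd, [0, 1, 0])
    (m[1], m[2])    # (false, true) -- nothing is materialized up front

The patch's own lazy types below specialize this idea to Bool masks and to merging indices into `CartesianIndex` values.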
if VERSION > v"0.5.0-dev+5230" - # Similar to Base.indexin, but just create a logical mask + # Similar to Base.indexin, but just create a logical mask. Note that this + # must return a logical mask in order to support merging multiple masks + # together into one linear index since we need to know how many elements to + # skip at the end. In many cases range intersection would be much faster + # than generating a logical mask, but that loses the endpoint information. indexin_mask(a, b::Number) = a .== b indexin_mask(a, r::Range{Int}) = [i in r for i in a] indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b)) @@ -684,11 +688,12 @@ if VERSION > v"0.5.0-dev+5230" end end # The final indices are funky - they're allowed to accumulate together. - # Too many masks is an easy fix -- just use the outer product to merge them: + # An easy (albeit very inefficient) fix for too many masks is to use the + # outer product to merge them. But we can do that lazily with a custom type: function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}}) - restrict_indices(a, (map(Bool, vec(vec(b[1])*vec(b[2])')), tail(tail(b))...)) + (vec(a[1])[vec(ProductIndices(b, map(length, b)))],) end - # But too many indices is much harder; this will require merging the indices + # But too many indices is much harder; this requires merging the indices # in `a` before applying the final mask in `b`. function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any}) if length(a[1]) == 1 @@ -701,8 +706,15 @@ if VERSION > v"0.5.0-dev+5230" end end - immutable MergedIndices{T,N} <: AbstractArray{CartesianIndex{N}, N} - indices::T + immutable ProductIndices{I,N} <: AbstractArray{Bool, N} + indices::I + sz::NTuple{N,Int} + end + Base.size(P::ProductIndices) = P.sz + Base.getindex{_,N}(P::ProductIndices{_,N}, I::Vararg{Int, N}) = Bool((&)(map(getindex, P.indices, I)...)) + + immutable MergedIndices{I,N} <: AbstractArray{CartesianIndex{N}, N} + indices::I sz::NTuple{N,Int} end Base.size(M::MergedIndices) = M.sz From 85289adc86ce319776702365b9ee71d5e00edb96 Mon Sep 17 00:00:00 2001 From: Matt Bauman Date: Sat, 23 Jul 2016 18:44:14 -0500 Subject: [PATCH 7/8] Propagate inbounds for the lazy array types As a further optimization, (at)inbounds could be added throughout the algorithm once it has received more widespread testing. --- src/DistributedArrays.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl index cdd10ab..4334504 100644 --- a/src/DistributedArrays.jl +++ b/src/DistributedArrays.jl @@ -711,14 +711,18 @@ if VERSION > v"0.5.0-dev+5230" sz::NTuple{N,Int} end Base.size(P::ProductIndices) = P.sz - Base.getindex{_,N}(P::ProductIndices{_,N}, I::Vararg{Int, N}) = Bool((&)(map(getindex, P.indices, I)...)) + # This gets passed to map to avoid breaking propagation of inbounds + Base.@propagate_inbounds propagate_getindex(A, I...) = A[I...] 
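`ProductIndices` is the lazy outer product of the per-dimension masks: element `(i, j)` is simply `indices[1][i] & indices[2][j]`, computed on demand instead of materializing `vec(b[1])*vec(b[2])'`. An eager, illustrative equivalent:

    # Eager stand-in for ProductIndices((m1, m2), (3, 2)):
    m1 = [true, false, true]
    m2 = [false, true]
    P  = [m1[i] & m2[j] for i in 1:3, j in 1:2]   # the real type never builds this
    sum(P)     # 2 -- elements surviving the combined mask
    vec(P)     # usable as a single merged logical index

    # propagate_getindex exists because routing element accesses through map
    # would otherwise drop the caller's @inbounds state; the
    # @propagate_inbounds helper keeps that annotation alive.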
+ Base.@propagate_inbounds Base.getindex{_,N}(P::ProductIndices{_,N}, I::Vararg{Int, N}) = + Bool((&)(map(propagate_getindex, P.indices, I)...)) immutable MergedIndices{I,N} <: AbstractArray{CartesianIndex{N}, N} indices::I sz::NTuple{N,Int} end Base.size(M::MergedIndices) = M.sz - Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = CartesianIndex(map(getindex, M.indices, I)) + Base.@propagate_inbounds Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = + CartesianIndex(map(propagate_getindex, M.indices, I)) # Additionally, we optimize bounds checking when using MergedIndices as an # array index since checking, e.g., A[1:500, 1:500] is *way* faster than # checking an array of 500^2 elements of CartesianIndex{2}. This optimization From feff2c64ab91b430870dac9069b78f6733e91dae Mon Sep 17 00:00:00 2001 From: Andreas Noack Date: Wed, 3 Aug 2016 19:03:23 -0400 Subject: [PATCH 8/8] Adjust to testsets. Remove 0.4 support --- .travis.yml | 2 +- REQUIRE | 3 +- src/DistributedArrays.jl | 334 +++++++++++++++++---------------------- test/darray.jl | 26 ++- 4 files changed, 153 insertions(+), 212 deletions(-) diff --git a/.travis.yml b/.travis.yml index 845cba7..2e8bf63 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,7 @@ os: - linux - osx julia: - - 0.4 + - 0.5 - nightly notifications: email: false diff --git a/REQUIRE b/REQUIRE index e1063f2..1f5e8ec 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,3 +1,2 @@ -julia 0.4 -Compat 0.7.14 +julia 0.5- Primes diff --git a/src/DistributedArrays.jl b/src/DistributedArrays.jl index 4334504..b6695d2 100644 --- a/src/DistributedArrays.jl +++ b/src/DistributedArrays.jl @@ -5,16 +5,8 @@ module DistributedArrays using Compat import Compat.view -if VERSION >= v"0.5.0-dev+4340" - using Primes - using Primes: factor -end - -if VERSION < v"0.5.0-" - typealias Future RemoteRef - typealias RemoteChannel RemoteRef - typealias AbstractSerializer SerializationState # On 0.4 fallback to the only concrete implementation -end +using Primes +using Primes: factor importall Base import Base.Callable @@ -195,40 +187,24 @@ function DArray(refs) DArray(identity, refs, ndims, reshape(npids, dimdist), nindexes, ncuts) end -if VERSION < v"0.5.0-" - macro DArray(ex::Expr) - if ex.head !== :comprehension - throw(ArgumentError("invalid @DArray syntax")) - end - ex.args[1] = esc(ex.args[1]) - ndim = length(ex.args) - 1 - ranges = map(r->esc(r.args[2]), ex.args[2:end]) - for d = 1:ndim - var = ex.args[d+1].args[1] - ex.args[d+1] = :( $(esc(var)) = ($(ranges[d]))[I[$d]] ) - end - return :( DArray((I::Tuple{Vararg{UnitRange{Int}}})->($ex), - tuple($(map(r->:(length($r)), ranges)...))) ) + +macro DArray(ex0::Expr) + if ex0.head !== :comprehension + throw(ArgumentError("invalid @DArray syntax")) end -else - macro DArray(ex0::Expr) - if ex0.head !== :comprehension - throw(ArgumentError("invalid @DArray syntax")) - end - ex = ex0.args[1] - if ex.head !== :generator - throw(ArgumentError("invalid @DArray syntax")) - end - ex.args[1] = esc(ex.args[1]) - ndim = length(ex.args) - 1 - ranges = map(r->esc(r.args[2]), ex.args[2:end]) - for d = 1:ndim - var = ex.args[d+1].args[1] - ex.args[d+1] = :( $(esc(var)) = ($(ranges[d]))[I[$d]] ) - end - return :( DArray((I::Tuple{Vararg{UnitRange{Int}}})->($ex0), - tuple($(map(r->:(length($r)), ranges)...))) ) + ex = ex0.args[1] + if ex.head !== :generator + throw(ArgumentError("invalid @DArray syntax")) end + ex.args[1] = esc(ex.args[1]) + ndim = length(ex.args) - 1 + ranges = map(r->esc(r.args[2]), ex.args[2:end]) + for d = 1:ndim + var = 
ex.args[d+1].args[1] + ex.args[d+1] = :( $(esc(var)) = ($(ranges[d]))[I[$d]] ) + end + return :( DArray((I::Tuple{Vararg{UnitRange{Int}}})->($ex0), + tuple($(map(r->:(length($r)), ranges)...))) ) end # new DArray similar to an existing one @@ -658,131 +634,130 @@ end # We also want to optimize setindex! with a SubDArray source, but this is hard # and only works on 0.5. -if VERSION > v"0.5.0-dev+5230" - # Similar to Base.indexin, but just create a logical mask. Note that this - # must return a logical mask in order to support merging multiple masks - # together into one linear index since we need to know how many elements to - # skip at the end. In many cases range intersection would be much faster - # than generating a logical mask, but that loses the endpoint information. - indexin_mask(a, b::Number) = a .== b - indexin_mask(a, r::Range{Int}) = [i in r for i in a] - indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b)) - indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b)) - indexin_mask(a, b) = [i in b for i in a] - - import Base: tail - # Given a tuple of indices and a tuple of masks, restrict the indices to the - # valid regions. This is, effectively, reversing Base.setindex_shape_check. - # We can't just use indexing into MergedIndices here because getindex is much - # pickier about singleton dimensions than setindex! is. - restrict_indices(::Tuple{}, ::Tuple{}) = () - function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}}) - if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1) - (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...) - elseif length(a[1]) == 1 - (a[1], restrict_indices(tail(a), b)) - elseif length(b[1]) == 1 && b[1][1] - restrict_indices(a, tail(b)) - else - throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue")) - end - end - # The final indices are funky - they're allowed to accumulate together. - # An easy (albeit very inefficient) fix for too many masks is to use the - # outer product to merge them. But we can do that lazily with a custom type: - function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}}) - (vec(a[1])[vec(ProductIndices(b, map(length, b)))],) - end - # But too many indices is much harder; this requires merging the indices - # in `a` before applying the final mask in `b`. - function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any}) - if length(a[1]) == 1 - (a[1], restrict_indices(tail(a), b)) - else - # When one mask spans multiple indices, we need to merge the indices - # together. At this point, we can just use indexing to merge them since - # there's no longer special handling of singleton dimensions - (view(MergedIndices(a, map(length, a)), b[1]),) - end - end - immutable ProductIndices{I,N} <: AbstractArray{Bool, N} - indices::I - sz::NTuple{N,Int} - end - Base.size(P::ProductIndices) = P.sz - # This gets passed to map to avoid breaking propagation of inbounds - Base.@propagate_inbounds propagate_getindex(A, I...) = A[I...] 
- Base.@propagate_inbounds Base.getindex{_,N}(P::ProductIndices{_,N}, I::Vararg{Int, N}) = - Bool((&)(map(propagate_getindex, P.indices, I)...)) - - immutable MergedIndices{I,N} <: AbstractArray{CartesianIndex{N}, N} - indices::I - sz::NTuple{N,Int} - end - Base.size(M::MergedIndices) = M.sz - Base.@propagate_inbounds Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = - CartesianIndex(map(propagate_getindex, M.indices, I)) - # Additionally, we optimize bounds checking when using MergedIndices as an - # array index since checking, e.g., A[1:500, 1:500] is *way* faster than - # checking an array of 500^2 elements of CartesianIndex{2}. This optimization - # also applies to reshapes of MergedIndices since the outer shape of the - # container doesn't affect the index elements themselves. We can go even - # farther and say that even restricted views of MergedIndices must be valid - # over the entire array. This is overly strict in general, but in this - # use-case all the merged indices must be valid at some point, so it's ok. - typealias ReshapedMergedIndices{T,N,M<:MergedIndices} Base.ReshapedArray{T,N,M} - typealias SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} SubArray{T,N,M} - typealias MergedIndicesOrSub Union{MergedIndices, ReshapedMergedIndices, SubMergedIndices} - import Base: checkbounds_indices - @inline checkbounds_indices(::Type{Bool}, inds::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = - checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) - @inline checkbounds_indices(::Type{Bool}, inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = - checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) - @inline checkbounds_indices(::Type{Bool}, inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = - checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) - - # The tricky thing here is that we want to optimize the accesses into the - # distributed array, but in doing so, we lose track of which indices in I we - # should be using. - # - # I’ve come to the conclusion that the function is utterly insane. - # There are *6* flavors of indices with four different reference points: - # 1. Find the indices of each portion of the DArray. - # 2. Find the valid subset of indices for the SubArray into that portion. - # 3. Find the portion of the `I` indices that should be used when you access the - # `K` indices in the subarray. This guy is nasty. It’s totally backwards - # from all other arrays, wherein we simply iterate over the source array’s - # elements. You need to *both* know which elements in `J` were skipped - # (`indexin_mask`) and which dimensions should match up (`restrict_indices`) - # 4. If `K` doesn’t correspond to an entire chunk, reinterpret `K` in terms of - # the local portion of the source array - function Base.setindex!(a::Array, s::SubDArray, - I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) - Base.setindex_shape_check(s, Base.index_lengths(a, I...)...) - n = length(I) - d = s.parent - J = Base.decolon(d, s.indexes...) - @sync for i = 1:length(d.pids) - K_c = d.indexes[i] - K = map(intersect, J, K_c) - if !any(isempty, K) - K_mask = map(indexin_mask, J, K_c) - idxs = restrict_indices(Base.decolon(a, I...), K_mask) - if isequal(K, K_c) - # whole chunk - @async a[idxs...] = chunk(d, i) - else - # partial chunk - @async a[idxs...] = - remotecall_fetch(d.pids[i]) do - view(localpart(d), [K[j]-first(K_c[j])+1 for j=1:length(J)]...) 
- end - end +# Similar to Base.indexin, but just create a logical mask. Note that this +# must return a logical mask in order to support merging multiple masks +# together into one linear index since we need to know how many elements to +# skip at the end. In many cases range intersection would be much faster +# than generating a logical mask, but that loses the endpoint information. +indexin_mask(a, b::Number) = a .== b +indexin_mask(a, r::Range{Int}) = [i in r for i in a] +indexin_mask(a, b::AbstractArray{Int}) = indexin_mask(a, IntSet(b)) +indexin_mask(a, b::AbstractArray) = indexin_mask(a, Set(b)) +indexin_mask(a, b) = [i in b for i in a] + +import Base: tail +# Given a tuple of indices and a tuple of masks, restrict the indices to the +# valid regions. This is, effectively, reversing Base.setindex_shape_check. +# We can't just use indexing into MergedIndices here because getindex is much +# pickier about singleton dimensions than setindex! is. +restrict_indices(::Tuple{}, ::Tuple{}) = () +function restrict_indices(a::Tuple{Any, Vararg{Any}}, b::Tuple{Any, Vararg{Any}}) + if (length(a[1]) == length(b[1]) == 1) || (length(a[1]) > 1 && length(b[1]) > 1) + (vec(a[1])[vec(b[1])], restrict_indices(tail(a), tail(b))...) + elseif length(a[1]) == 1 + (a[1], restrict_indices(tail(a), b)) + elseif length(b[1]) == 1 && b[1][1] + restrict_indices(a, tail(b)) + else + throw(DimensionMismatch("this should be caught by setindex_shape_check; please submit an issue")) + end +end +# The final indices are funky - they're allowed to accumulate together. +# An easy (albeit very inefficient) fix for too many masks is to use the +# outer product to merge them. But we can do that lazily with a custom type: +function restrict_indices(a::Tuple{Any}, b::Tuple{Any, Any, Vararg{Any}}) + (vec(a[1])[vec(ProductIndices(b, map(length, b)))],) +end +# But too many indices is much harder; this requires merging the indices +# in `a` before applying the final mask in `b`. +function restrict_indices(a::Tuple{Any, Any, Vararg{Any}}, b::Tuple{Any}) + if length(a[1]) == 1 + (a[1], restrict_indices(tail(a), b)) + else + # When one mask spans multiple indices, we need to merge the indices + # together. At this point, we can just use indexing to merge them since + # there's no longer special handling of singleton dimensions + (view(MergedIndices(a, map(length, a)), b[1]),) + end +end + +immutable ProductIndices{I,N} <: AbstractArray{Bool, N} + indices::I + sz::NTuple{N,Int} +end +Base.size(P::ProductIndices) = P.sz +# This gets passed to map to avoid breaking propagation of inbounds +Base.@propagate_inbounds propagate_getindex(A, I...) = A[I...] +Base.@propagate_inbounds Base.getindex{_,N}(P::ProductIndices{_,N}, I::Vararg{Int, N}) = + Bool((&)(map(propagate_getindex, P.indices, I)...)) + +immutable MergedIndices{I,N} <: AbstractArray{CartesianIndex{N}, N} + indices::I + sz::NTuple{N,Int} +end +Base.size(M::MergedIndices) = M.sz +Base.@propagate_inbounds Base.getindex{_,N}(M::MergedIndices{_,N}, I::Vararg{Int, N}) = + CartesianIndex(map(propagate_getindex, M.indices, I)) +# Additionally, we optimize bounds checking when using MergedIndices as an +# array index since checking, e.g., A[1:500, 1:500] is *way* faster than +# checking an array of 500^2 elements of CartesianIndex{2}. This optimization +# also applies to reshapes of MergedIndices since the outer shape of the +# container doesn't affect the index elements themselves. 
We can go even +# farther and say that even restricted views of MergedIndices must be valid +# over the entire array. This is overly strict in general, but in this +# use-case all the merged indices must be valid at some point, so it's ok. +typealias ReshapedMergedIndices{T,N,M<:MergedIndices} Base.ReshapedArray{T,N,M} +typealias SubMergedIndices{T,N,M<:Union{MergedIndices, ReshapedMergedIndices}} SubArray{T,N,M} +typealias MergedIndicesOrSub Union{MergedIndices, ReshapedMergedIndices, SubMergedIndices} +import Base: checkbounds_indices +@inline checkbounds_indices(::Type{Bool}, inds::Tuple{}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = + checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) +@inline checkbounds_indices(::Type{Bool}, inds::Tuple{Any}, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = + checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) +@inline checkbounds_indices(::Type{Bool}, inds::Tuple, I::Tuple{MergedIndicesOrSub,Vararg{Any}}) = + checkbounds_indices(Bool, inds, (parent(parent(I[1])).indices..., tail(I)...)) + +# The tricky thing here is that we want to optimize the accesses into the +# distributed array, but in doing so, we lose track of which indices in I we +# should be using. +# +# I’ve come to the conclusion that the function is utterly insane. +# There are *6* flavors of indices with four different reference points: +# 1. Find the indices of each portion of the DArray. +# 2. Find the valid subset of indices for the SubArray into that portion. +# 3. Find the portion of the `I` indices that should be used when you access the +# `K` indices in the subarray. This guy is nasty. It’s totally backwards +# from all other arrays, wherein we simply iterate over the source array’s +# elements. You need to *both* know which elements in `J` were skipped +# (`indexin_mask`) and which dimensions should match up (`restrict_indices`) +# 4. If `K` doesn’t correspond to an entire chunk, reinterpret `K` in terms of +# the local portion of the source array +function Base.setindex!(a::Array, s::SubDArray, + I::Union{UnitRange{Int},Colon,Vector{Int},StepRange{Int,Int}}...) + Base.setindex_shape_check(s, Base.index_lengths(a, I...)...) + n = length(I) + d = s.parent + J = Base.decolon(d, s.indexes...) + @sync for i = 1:length(d.pids) + K_c = d.indexes[i] + K = map(intersect, J, K_c) + if !any(isempty, K) + K_mask = map(indexin_mask, J, K_c) + idxs = restrict_indices(Base.decolon(a, I...), K_mask) + if isequal(K, K_c) + # whole chunk + @async a[idxs...] = chunk(d, i) + else + # partial chunk + @async a[idxs...] = + remotecall_fetch(d.pids[i]) do + view(localpart(d), [K[j]-first(K_c[j])+1 for j=1:length(J)]...) + end end end - return a end + return a end Base.fill!(A::DArray, x) = begin @@ -1494,16 +1469,7 @@ function compute_boundaries{T}(d::DVector{T}; kwargs...) np = length(pids) sample_sz_on_wrkr = 512 - if VERSION < v"0.5.0-" - results = Array(Any,np) - @sync begin - for (i,p) in enumerate(pids) - @async results[i] = remotecall_fetch(sample_n_setup_ref, p, d, sample_sz_on_wrkr; kwargs...) - end - end - else - results = asyncmap(p -> remotecall_fetch(sample_n_setup_ref, p, d, sample_sz_on_wrkr; kwargs...), pids) - end + results = asyncmap(p -> remotecall_fetch(sample_n_setup_ref, p, d, sample_sz_on_wrkr; kwargs...), pids) samples = Array(T,0) for x in results @@ -1554,14 +1520,7 @@ function Base.sort{T}(d::DVector{T}; sample=true, kwargs...) 
elseif sample==false # Assume an uniform distribution between min and max values - if VERSION < v"0.5.0-" - minmax=Array(Tuple, np) - @sync for (i,p) in enumerate(pids) - @async minmax[i] = remotecall_fetch(d->(minimum(localpart(d)), maximum(localpart(d))), p, d) - end - else - minmax=asyncmap(p->remotecall_fetch(d->(minimum(localpart(d)), maximum(localpart(d))), p, d), pids) - end + minmax=asyncmap(p->remotecall_fetch(d->(minimum(localpart(d)), maximum(localpart(d))), p, d), pids) min_d = minimum(T[x[1] for x in minmax]) max_d = maximum(T[x[2] for x in minmax]) @@ -1602,19 +1561,10 @@ function Base.sort{T}(d::DVector{T}; sample=true, kwargs...) end local_sort_results = Array(Tuple, np) - if VERSION < v"0.5.0-" - @sync begin - for (i,p) in enumerate(pids) - @async local_sort_results[i] = - remotecall_fetch( - scatter_n_sort_localparts, p, presorted ? nothing : d, i, refs, boundaries; kwargs...) - end - end - else - Base.asyncmap!((i,p) -> remotecall_fetch( + + Base.asyncmap!((i,p) -> remotecall_fetch( scatter_n_sort_localparts, p, presorted ? nothing : d, i, refs, boundaries; kwargs...), local_sort_results, 1:np, pids) - end # Construct a new DArray from the sorted refs. Remove parts with 0-length since # the DArray constructor_from_refs does not yet support it. This implies that diff --git a/test/darray.jl b/test/darray.jl index 17fc7ef..0efb59e 100644 --- a/test/darray.jl +++ b/test/darray.jl @@ -101,20 +101,20 @@ check_leaks() close(D2) S2 = convert(Vector{Float64}, D[4, 23:176]) - @fact A[4, 23:176] --> S2 + @test A[4, 23:176] == S2 S3 = convert(Vector{Float64}, D[23:176, 197]) - @fact A[23:176, 197] --> S3 + @test A[23:176, 197] == S3 S4 = zeros(4) setindex!(S4, D[3:4, 99:100], :) - @fact S4 --> vec(D[3:4, 99:100]) - @fact S4 --> vec(A[3:4, 99:100]) - + @test S4 == vec(D[3:4, 99:100]) + @test S4 == vec(A[3:4, 99:100]) + S5 = zeros(2,2) setindex!(S5, D[1,1:4], :, 1:2) - @fact vec(S5) --> D[1, 1:4] - @fact vec(S5) --> A[1, 1:4] + @test vec(S5) == D[1, 1:4] + @test vec(S5) == A[1, 1:4] end close(D) end @@ -626,20 +626,12 @@ check_leaks() # Commented out tests that need to be enabled in due course when DArray support is more complete @testset "test mapslices" begin a = drand((5,5), workers(), [1, min(nworkers(), 5)]) - if VERSION < v"0.5.0-dev+4361" - h = mapslices(v -> hist(v,0:0.1:1)[2], a, 1) - else - h = mapslices(v -> fit(Histogram,v,0:0.1:1).weights, a, 1) - end + h = mapslices(v -> fit(Histogram,v,0:0.1:1).weights, a, 1) # H = mapslices(v -> hist(v,0:0.1:1)[2], a, 2) # s = mapslices(sort, a, [1]) # S = mapslices(sort, a, [2]) for i = 1:5 - if VERSION < v"0.5.0-dev+4361" - @test h[:,i] == hist(a[:,i],0:0.1:1)[2] - else - @test h[:,i] == fit(Histogram, a[:,i],0:0.1:1).weights - end + @test h[:,i] == fit(Histogram, a[:,i],0:0.1:1).weights # @test vec(H[i,:]) => hist(vec(a[i,:]),0:0.1:1)[2] # @test s[:,i] => sort(a[:,i]) # @test vec(S[i,:]) => sort(vec(a[i,:]))
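For the `sample==false` path above: the per-worker `(minimum, maximum)` pairs reduce to global extrema, from which evenly spaced splitter boundaries follow. A serial sketch of that arithmetic (illustrative values and a plausible splitter formula; the actual boundary computation lives elsewhere in the file):

    # One (min, max) pair per worker's localpart, as returned by asyncmap:
    minmax = [(0.10, 0.90), (0.20, 0.70), (0.05, 0.95)]
    min_d  = minimum([x[1] for x in minmax])    # 0.05
    max_d  = maximum([x[2] for x in minmax])    # 0.95
    np     = 3
    # np-1 evenly spaced splitters partition [min_d, max_d] into np buckets:
    boundaries = [min_d + k*(max_d - min_d)/np for k in 1:np-1]   # ≈ [0.35, 0.65]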