diff --git a/base/abstractarray.jl b/base/abstractarray.jl index 1f1120740e99a..383814ff38ddc 100644 --- a/base/abstractarray.jl +++ b/base/abstractarray.jl @@ -1645,28 +1645,29 @@ _cat(dims, X...) = cat_t(promote_eltypeof(X...), X...; dims=dims) return __cat(A, shape, catdims, X...) end -function __cat(A, shape::NTuple{M}, catdims, X...) where M - N = M::Int - offsets = zeros(Int, N) - inds = Vector{UnitRange{Int}}(undef, N) - concat = copyto!(zeros(Bool, N), catdims) - for x in X - for i = 1:N - if concat[i] - inds[i] = offsets[i] .+ cat_indices(x, i) - offsets[i] += cat_size(x, i) - else - inds[i] = 1:shape[i] - end - end - I::NTuple{N, UnitRange{Int}} = (inds...,) - if x isa AbstractArray - A[I...] = x - else - fill!(view(A, I...), x) - end +# Why isn't this called `__cat!`? +__cat(A, shape, catdims, X...) = __cat_offset!(A, shape, catdims, ntuple(zero, length(shape)), X...) + +function __cat_offset!(A, shape, catdims, offsets, x, X...) + # splitting the "work" on x from X... may reduce latency (fewer costly specializations) + newoffsets = __cat_offset1!(A, shape, catdims, offsets, x) + return __cat_offset!(A, shape, catdims, newoffsets, X...) +end +__cat_offset!(A, shape, catdims, offsets) = A + +function __cat_offset1!(A, shape, catdims, offsets, x) + inds = ntuple(length(offsets)) do i + (i <= length(catdims) && catdims[i]) ? offsets[i] .+ cat_indices(x, i) : 1:shape[i] + end + if x isa AbstractArray + A[inds...] = x + else + fill!(view(A, inds...), x) + end + newoffsets = ntuple(length(offsets)) do i + (i <= length(catdims) && catdims[i]) ? offsets[i] + cat_size(x, i) : offsets[i] end - return A + return newoffsets end """