diff --git a/base/missing.jl b/base/missing.jl index dd829995d98cd..6cee1182c076a 100644 --- a/base/missing.jl +++ b/base/missing.jl @@ -146,6 +146,9 @@ float(A::AbstractArray{Missing}) = A skipmissing(itr) Return an iterator over the elements in `itr` skipping [`missing`](@ref) values. +In addition to supporting any function taking iterators, the resulting object +implements reductions over dimensions (i.e. the `dims` argument to +[`mapreduce`](@ref), [`reduce`](@ref) and special functions like [`sum`](@ref)). Use [`collect`](@ref) to obtain an `Array` containing the non-`missing` values in `itr`. Note that even if `itr` is a multidimensional array, the result will always @@ -154,9 +157,6 @@ of the input. # Examples ```jldoctest -julia> sum(skipmissing([1, missing, 2])) -3 - julia> collect(skipmissing([1, missing, 2])) 2-element Array{Int64,1}: 1 @@ -166,6 +166,31 @@ julia> collect(skipmissing([1 missing; 2 missing])) 2-element Array{Int64,1}: 1 2 + +julia> sum(skipmissing([1, missing, 2])) +3 + +julia> B = [1 missing; 3 4] +2×2 Array{Union{Missing, Int64},2}: + 1 missing + 3 4 + +julia> sum(skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 4 4 + +julia> sum(skipmissing(B), dims=2) +2×1 Array{Int64,2}: + 1 + 7 + +julia> reduce(*, skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 3 4 + +julia> mapreduce(cos, +, skipmissing(B), dims=1) +1×2 Array{Float64,2}: + -0.44969 -0.653644 ``` """ skipmissing(itr) = SkipMissing(itr) @@ -192,8 +217,8 @@ end # Optimized mapreduce implementation # The generic method is faster when !(eltype(A) >: Missing) since it does not need # additional loops to identify the two first non-missing values of each block -mapreduce(f, op, itr::SkipMissing{<:AbstractArray}) = - _mapreduce(f, op, IndexStyle(itr.x), eltype(itr.x) >: Missing ? itr : itr.x) +mapreduce(f, op, itr::SkipMissing{<:AbstractArray}; dims=:, kw...) = + _mapreduce_dim(f, op, kw.data, eltype(itr.x) >: Missing ? itr : itr.x, dims) function _mapreduce(f, op, ::IndexLinear, itr::SkipMissing{<:AbstractArray}) A = itr.x @@ -280,6 +305,171 @@ mapreduce_impl(f, op, A::SkipMissing, ifirst::Integer, ilast::Integer) = end end +# mapreduce over dimensions implementation + +_mapreduce_dim(f, op, nt::NamedTuple{(:init,)}, itr::SkipMissing{<:AbstractArray}, ::Colon) = + mapfoldl(f, op, itr; nt...) + +_mapreduce_dim(f, op, ::NamedTuple{()}, itr::SkipMissing{<:AbstractArray}, ::Colon) = + _mapreduce(f, op, IndexStyle(itr.x), itr) + +_mapreduce_dim(f, op, nt::NamedTuple{(:init,)}, itr::SkipMissing{<:AbstractArray}, dims) = + mapreducedim!(f, op, reducedim_initarray(itr, dims, nt.init), itr) + +_mapreduce_dim(f, op, ::NamedTuple{()}, itr::SkipMissing{<:AbstractArray}, dims) = + mapreducedim!(f, op, reducedim_init(f, op, itr, dims), itr) + +reducedim_initarray(itr::SkipMissing{<:AbstractArray}, region, init, ::Type{R}) where {R} = + reducedim_initarray(itr.x, region, init, R) +reducedim_initarray(itr::SkipMissing{<:AbstractArray}, region, init::T) where {T} = + reducedim_initarray(itr.x, region, init, T) + +# initialization when computing minima and maxima requires a little care +for (f1, f2, initval) in ((:min, :max, :Inf), (:max, :min, :(-Inf))) + @eval function reducedim_init(f, op::typeof($f1), itr::SkipMissing{<:AbstractArray}, region) + A = itr.x + T = eltype(itr) + + # First compute the reduce indices. This will throw an ArgumentError + # if any region is invalid + ri = reduced_indices(A, region) + + # Next, throw if reduction is over a region with length zero + any(i -> isempty(axes(A, i)), region) && _empty_reduce_error() + + # Make a view of the first slice of the region + A1 = view(A, ri...) + + if isempty(A1) + # If the slice is empty just return non-view, non-missing version as the initial array + return similar(A1, eltype(itr)) + else + # otherwise use the min/max of the first slice as initial value + v0 = mapreduce(f, $f2, A1) + + R = similar(A1, typeof(v0)) + + # if any value is missing in first slice, look for first + # non-missing value in each slice + if v0 === missing + v0 = nonmissingval(f, $f2, itr, R) + R = similar(A1, typeof(v0)) + end + + # but NaNs need to be avoided as initial values + v0 = v0 != v0 ? typeof(v0)($initval) : v0 + + # equivalent to reducedim_initarray, but we need R in advance + return fill!(R, v0) + end + end +end + +# Iterate until we've encountered at least one non-missing value in each slice, +# and return the min/max non-missing value of all clices +function nonmissingval(f, op::Union{typeof(min), typeof(max)}, + itr::SkipMissing{<:AbstractArray}, R::AbstractArray) + A = itr.x + lsiz = check_reducedims(R,A) + indsAt, indsRt = safe_tail(axes(A)), safe_tail(axes(R)) # handle d=1 manually + keep, Idefault = Broadcast.shapeindexer(indsRt) + i = findfirst(!ismissing, A) + i === nothing && throw(ArgumentError("cannot reduce over array with only missing values")) + @inbounds v = A[i] + if reducedim1(R, A) + # keep track of state using a single variable when reducing along the first dimension + i1 = first(axes1(R)) + @inbounds for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + filled = false + for i in axes(A, 1) + x = A[i, IA] + if x !== missing + v = op(v, f(x)) + filled = true + break + end + end + if !filled + throw(ArgumentError("cannot reduce over slices with only missing values")) + end + end + else + filled = fill!(similar(R, Bool), false) + allfilled = false + @inbounds for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + for i in axes(A, 1) + x = A[i, IA] + if x !== missing + v = op(v, f(x)) + filled[i,IR] = true + end + end + (allfilled = all(filled)) && break + end + if !allfilled + throw(ArgumentError("cannot reduce over slices with only missing values")) + end + end + v +end + +function _mapreducedim!(f, op, R::AbstractArray, itr::SkipMissing{<:AbstractArray}) + A = itr.x + lsiz = check_reducedims(R,A) + isempty(A) && return R + + if has_fast_linear_indexing(A) && lsiz > 16 + # use mapreduce_impl, which is probably better tuned to achieve higher performance + nslices = div(length(A), lsiz) + ibase = first(LinearIndices(A))-1 + for i = 1:nslices + x = mapreduce_impl(f, op, itr, ibase+1, ibase+lsiz) + if x !== nothing + @inbounds R[i] = op(R[i], something(x)) + end + ibase += lsiz + end + return R + end + indsAt, indsRt = safe_tail(axes(A)), safe_tail(axes(R)) # handle d=1 manually + keep, Idefault = Broadcast.shapeindexer(indsRt) + if reducedim1(R, A) + # keep the accumulator as a local variable when reducing along the first dimension + i1 = first(axes1(R)) + @inbounds for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + r = R[i1,IR] + @simd for i in axes(A, 1) + x = A[i, IA] + if x !== missing + r = op(r, f(x)) + end + end + + R[i1,IR] = r + end + else + @inbounds for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + @simd for i in axes(A, 1) + x = A[i, IA] + if x !== missing + R[i,IR] = op(R[i,IR], f(x)) + end + end + end + end + return R +end + +mapreducedim!(f, op, R::AbstractArray, A::SkipMissing{<:AbstractArray}) = + (_mapreducedim!(f, op, R, A); R) + +reducedim!(op, R::AbstractArray{RT}, A::SkipMissing{<:AbstractArray}) where {RT} = + mapreducedim!(identity, op, R, A) + """ coalesce(x, y...) diff --git a/base/reducedim.jl b/base/reducedim.jl index 150b93c7e8f5a..a3682d9c9b172 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -99,10 +99,10 @@ reducedim_initarray(A::AbstractArray, region, init::T) where {T} = reducedim_ini promote_union(T::Union) = promote_type(promote_union(T.a), promote_union(T.b)) promote_union(T) = T -function reducedim_init(f, op::Union{typeof(+),typeof(add_sum)}, A::AbstractArray, region) +function reducedim_init(f, op::Union{typeof(+),typeof(add_sum)}, A, region) _reducedim_init(f, op, zero, sum, A, region) end -function reducedim_init(f, op::Union{typeof(*),typeof(mul_prod)}, A::AbstractArray, region) +function reducedim_init(f, op::Union{typeof(*),typeof(mul_prod)}, A, region) _reducedim_init(f, op, one, prod, A, region) end function _reducedim_init(f, op, fv, fop, A, region) @@ -145,11 +145,11 @@ for (f1, f2, initval) in ((:min, :max, :Inf), (:max, :min, :(-Inf))) end end end -reducedim_init(f::Union{typeof(abs),typeof(abs2)}, op::typeof(max), A::AbstractArray{T}, region) where {T} = - reducedim_initarray(A, region, zero(f(zero(T)))) +reducedim_init(f::Union{typeof(abs),typeof(abs2)}, op::typeof(max), A, region) = + reducedim_initarray(A, region, zero(f(zero(eltype(A))))) -reducedim_init(f, op::typeof(&), A::AbstractArray, region) = reducedim_initarray(A, region, true) -reducedim_init(f, op::typeof(|), A::AbstractArray, region) = reducedim_initarray(A, region, false) +reducedim_init(f, op::typeof(&), A, region) = reducedim_initarray(A, region, true) +reducedim_init(f, op::typeof(|), A, region) = reducedim_initarray(A, region, false) # specialize to make initialization more efficient for common cases @@ -288,6 +288,15 @@ julia> mapreduce(isodd, *, a, dims=1) julia> mapreduce(isodd, |, true, a, dims=1) 1×4 Array{Bool,2}: true true true true + +julia> b = [1 missing 5; 2 4 missing] +2×3 Array{Union{Missing, Int64},2}: + 1 missing 5 + 2 4 missing + +julia> mapreduce(isodd, *, skipmissing(b), dims=1) +1×3 Array{Bool,2}: + false false true ``` """ mapreduce(f, op, A::AbstractArray; dims=:, kw...) = _mapreduce_dim(f, op, kw.data, A, dims) @@ -332,6 +341,15 @@ julia> reduce(max, a, dims=2) julia> reduce(max, a, dims=1) 1×4 Array{Int64,2}: 4 8 12 16 + +julia> b = [1 missing 5; 2 4 missing] +2×3 Array{Union{Missing, Int64},2}: + 1 missing 5 + 2 4 missing + +julia> reduce(+, skipmissing(b), dims=1) +1×3 Array{Int64,2}: + 3 4 5 ``` """ reduce(op, A::AbstractArray; kw...) = mapreduce(identity, op, A; kw...) @@ -342,6 +360,12 @@ reduce(op, A::AbstractArray; kw...) = mapreduce(identity, op, A; kw...) Sum elements of an array over the given dimensions. +!!! note + If `A` contains `NaN` or [`missing`](@ref) values, they are propagated to the + corresponding result (`missing` takes precedence if a slice contains both). + Use [`skipmissing(A)`](@ref) to omit `missing` entries and compute the + sum of non-missing values. + # Examples ```jldoctest julia> A = [1 2; 3 4] @@ -357,6 +381,20 @@ julia> sum(A, dims=2) 2×1 Array{Int64,2}: 3 7 + +julia> B = [1 missing; 3 4] +2×2 Array{Union{Missing, Int64},2}: + 1 missing + 3 4 + +julia> sum(skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 4 4 + +julia> sum(skipmissing(B), dims=2) +2×1 Array{Int64,2}: + 1 + 7 ``` """ sum(A::AbstractArray; dims) @@ -390,6 +428,12 @@ sum!(r, A) Multiply elements of an array over the given dimensions. +!!! note + If `A` contains `NaN` or [`missing`](@ref) values, they are propagated to the + corresponding result (`missing` takes precedence if a slice contains both). + Use [`skipmissing(A)`](@ref) to omit `missing` entries and compute the + product of non-missing values. + # Examples ```jldoctest julia> A = [1 2; 3 4] @@ -405,6 +449,20 @@ julia> prod(A, dims=2) 2×1 Array{Int64,2}: 2 12 + +julia> B = [1 missing; 3 4] +2×2 Array{Union{Missing, Int64},2}: + 1 missing + 3 4 + +julia> prod(skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 3 4 + +julia> prod(skipmissing(B), dims=2) +2×1 Array{Int64,2}: + 1 + 12 ``` """ prod(A::AbstractArray; dims) @@ -440,6 +498,12 @@ Compute the maximum value of an array over the given dimensions. See also the [`max(a,b)`](@ref) function to take the maximum of two or more arguments, which can be applied elementwise to arrays via `max.(a,b)`. +!!! note + If `A` contains `NaN` or [`missing`](@ref) values, they are propagated to the + corresponding result (`missing` takes precedence if a slice contains both). + Use [`skipmissing(A)`](@ref) to omit `missing` entries and compute the + maximum of non-missing values. + # Examples ```jldoctest julia> A = [1 2; 3 4] @@ -455,6 +519,20 @@ julia> maximum(A, dims=2) 2×1 Array{Int64,2}: 2 4 + +julia> B = [1 missing; 3 4] +2×2 Array{Union{Missing, Int64},2}: + 1 missing + 3 4 + +julia> maximum(skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 3 4 + +julia> maximum(skipmissing(B), dims=2) +2×1 Array{Int64,2}: + 1 + 4 ``` """ maximum(A::AbstractArray; dims) @@ -490,6 +568,12 @@ Compute the minimum value of an array over the given dimensions. See also the [`min(a,b)`](@ref) function to take the minimum of two or more arguments, which can be applied elementwise to arrays via `min.(a,b)`. +!!! note + If `A` contains `NaN` or [`missing`](@ref) values, they are propagated to the + corresponding result (`missing` takes precedence if a slice contains both). + Use [`skipmissing(A)`](@ref) to omit `missing` entries and compute the + minimum of non-missing values. + # Examples ```jldoctest julia> A = [1 2; 3 4] @@ -502,6 +586,20 @@ julia> minimum(A, dims=1) 1 2 julia> minimum(A, dims=2) +2×1 Array{Int64,2}: + 1 + 3 + +julia> B = [1 missing; 3 4] +2×2 Array{Union{Missing, Int64},2}: + 1 missing + 3 4 + +julia> minimum(skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 1 4 + +julia> minimum(skipmissing(B), dims=2) 2×1 Array{Int64,2}: 1 3 @@ -635,6 +733,7 @@ for (fname, _fname, op) in [(:sum, :_sum, :add_sum), (:prod, :_prod, @eval begin # User-facing methods with keyword arguments @inline ($fname)(a::AbstractArray; dims=:) = ($_fname)(a, dims) + @inline ($fname)(a::SkipMissing{<:AbstractArray}; dims=:) = ($_fname)(a, dims) @inline ($fname)(f::Callable, a::AbstractArray; dims=:) = ($_fname)(f, a, dims) # Underlying implementations using dispatch diff --git a/doc/src/manual/missing.md b/doc/src/manual/missing.md index 67b349f57a157..3b5bfa7420730 100644 --- a/doc/src/manual/missing.md +++ b/doc/src/manual/missing.md @@ -305,7 +305,36 @@ julia> mapreduce(sqrt, +, skipmissing([3, missing, 2, 1])) 4.146264369941973 ``` -Use [`collect`](@ref) to extract non-`missing` values and store them in an array +Reduction over dimensions is also supported +```jldoctest +julia> B = [1 missing; 3 4] +2×2 Array{Union{Missing, Int64},2}: + 1 missing + 3 4 + +julia> sum(skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 4 4 + +julia> sum(skipmissing(B), dims=2) +2×1 Array{Int64,2}: + 1 + 7 + +julia> reduce(*, skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 3 4 + +julia> mapreduce(cos, +, skipmissing(B), dims=1) +1×2 Array{Float64,2}: + -0.44969 -0.653644 +``` + +However, note that the special iterator object returned by `skipmissing` is not an +`AbstractArray`: since `missing` entries of the original array are skipped, +it does not have a regular shape and cannot be indexed. +Use [`collect`](@ref) to extract non-`missing` values and store them in a vector, +losing the shape of the original array ```jldoctest julia> collect(skipmissing([3, missing, 2, 1])) 3-element Array{Int64,1}: diff --git a/test/missing.jl b/test/missing.jl index d25751322cb2e..0bae44ac7b890 100644 --- a/test/missing.jl +++ b/test/missing.jl @@ -412,6 +412,76 @@ end @test_throws ArgumentError mapreduce(x -> x/2, +, itr) end end + + @testset "mapreduce over dims" begin + nm(x) = (@test !(eltype(x) >: Missing); x) + # Vary size to test splitting blocks with several configurations of missing values + for T in (Int, Float64), + A in (rand(T, 10), rand(T, 100, 1000), rand(T, 10000, 10)), + i in (1, 2, 1:2), + f in (identity, abs, abs2, cos), + (op, redf, init) in ((+, sum, x->oftype(f(one(eltype(x))), zero(eltype(x)))), + (*, prod, x->oftype(f(one(eltype(x))), one(eltype(x)))), + (max, maximum, x->f(first(skipmissing(x)))), + (min, minimum, x->f(first(skipmissing(x))))) + if T <: Integer + @test redf(A, dims=i) == nm(redf(skipmissing(A), dims=i)) == + nm(reduce(op, skipmissing(A), dims=i)) == + nm(mapreduce(identity, op, skipmissing(A), dims=i)) + else + @test redf(A, dims=i) ≈ nm(redf(skipmissing(A), dims=i)) == + nm(reduce(op, skipmissing(A), dims=i)) == + nm(mapreduce(identity, op, skipmissing(A), dims=i)) + end + @test mapreduce(f, op, A, dims=i) ≈ + nm(mapreduce(f, op, skipmissing(A), dims=i)) + + maximum(i) > ndims(A) && (i = 1) # for mapslices + B = Array{Union{T,Missing}}(A) + replace!(x -> rand(Bool) ? x : missing, B) + # mapreduce fails for min and max if there are empty slices + if !(op in (min, max)) || !any(iszero, sum(!ismissing, B, dims=i)) + if T === Int + @test mapslices(x -> redf(skipmissing(x)), B, dims=i) == + nm(redf(skipmissing(B), dims=i)) + else + @test isapprox(mapslices(x -> redf(skipmissing(x)), B, dims=i), + nm(redf(skipmissing(B), dims=i)), rtol=sqrt(eps()), atol=1e-16) + end + @test nm(redf(skipmissing(B), dims=i)) == + nm(reduce(op, skipmissing(B), dims=i)) == + nm(mapreduce(identity, op, skipmissing(B), dims=i)) + @test isapprox(mapslices(x -> mapreduce(f, op, skipmissing(x), init=init(x)), B, dims=i), + nm(mapreduce(f, op, skipmissing(B), dims=i)), + rtol=sqrt(eps()), atol=1e-16) + elseif op in (min, max) && !(f in (abs, abs2)) + @test_throws ArgumentError mapslices(x -> redf(skipmissing(x)), B, dims=i) + @test_throws ArgumentError redf(skipmissing(B), dims=i) + @test_throws ArgumentError reduce(op, skipmissing(B), dims=i) + @test_throws ArgumentError mapreduce(identity, op, skipmissing(B), dims=i) + @test_throws ArgumentError mapreduce(f, op, skipmissing(B), dims=i) + end + + # Test block full of missing values + B[1:length(B)÷2] .= missing + if !(op in (min, max)) + if T <: Integer + @test mapslices(x -> redf(skipmissing(x)), B, dims=i) == + nm(redf(skipmissing(B), dims=i)) + else + @test isapprox(mapslices(x -> redf(skipmissing(x)), B, dims=i), + nm(redf(skipmissing(B), dims=i)), + atol=1e-16) + end + @test nm(redf(skipmissing(B), dims=i)) == + nm(reduce(op, skipmissing(B), dims=i)) == + nm(mapreduce(identity, op, skipmissing(B), dims=i)) + @test isapprox(mapslices(x -> mapreduce(f, op, skipmissing(x), init=init(x)), B, dims=i), + nm(mapreduce(f, op, skipmissing(B), dims=i)), + rtol=sqrt(eps()), atol=1e-16) + end + end + end end @testset "coalesce" begin