From 5efb69b30a219508f95af9fc122fe4f17e888001 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 16 Jul 2018 18:10:01 -0400 Subject: [PATCH] Support mapreduce over dimensions with SkipMissing Allows calling mapreduce and specialized functinos with the dims argument on SkipMissing objects. The implementation is copied on the generic methods, but missing values need to be handled directly inside functions for efficiency and because mapreduce_impl returns a Some object which needs special handling. --- base/missing.jl | 200 +++++++++++++++++++++++++++++++++++++- base/reducedim.jl | 111 +++++++++++++++++++-- doc/src/manual/missing.md | 31 +++++- test/missing.jl | 70 +++++++++++++ 4 files changed, 400 insertions(+), 12 deletions(-) diff --git a/base/missing.jl b/base/missing.jl index dd829995d98cd..6cee1182c076a 100644 --- a/base/missing.jl +++ b/base/missing.jl @@ -146,6 +146,9 @@ float(A::AbstractArray{Missing}) = A skipmissing(itr) Return an iterator over the elements in `itr` skipping [`missing`](@ref) values. +In addition to supporting any function taking iterators, the resulting object +implements reductions over dimensions (i.e. the `dims` argument to +[`mapreduce`](@ref), [`reduce`](@ref) and special functions like [`sum`](@ref)). Use [`collect`](@ref) to obtain an `Array` containing the non-`missing` values in `itr`. Note that even if `itr` is a multidimensional array, the result will always @@ -154,9 +157,6 @@ of the input. # Examples ```jldoctest -julia> sum(skipmissing([1, missing, 2])) -3 - julia> collect(skipmissing([1, missing, 2])) 2-element Array{Int64,1}: 1 @@ -166,6 +166,31 @@ julia> collect(skipmissing([1 missing; 2 missing])) 2-element Array{Int64,1}: 1 2 + +julia> sum(skipmissing([1, missing, 2])) +3 + +julia> B = [1 missing; 3 4] +2×2 Array{Union{Missing, Int64},2}: + 1 missing + 3 4 + +julia> sum(skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 4 4 + +julia> sum(skipmissing(B), dims=2) +2×1 Array{Int64,2}: + 1 + 7 + +julia> reduce(*, skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 3 4 + +julia> mapreduce(cos, +, skipmissing(B), dims=1) +1×2 Array{Float64,2}: + -0.44969 -0.653644 ``` """ skipmissing(itr) = SkipMissing(itr) @@ -192,8 +217,8 @@ end # Optimized mapreduce implementation # The generic method is faster when !(eltype(A) >: Missing) since it does not need # additional loops to identify the two first non-missing values of each block -mapreduce(f, op, itr::SkipMissing{<:AbstractArray}) = - _mapreduce(f, op, IndexStyle(itr.x), eltype(itr.x) >: Missing ? itr : itr.x) +mapreduce(f, op, itr::SkipMissing{<:AbstractArray}; dims=:, kw...) = + _mapreduce_dim(f, op, kw.data, eltype(itr.x) >: Missing ? itr : itr.x, dims) function _mapreduce(f, op, ::IndexLinear, itr::SkipMissing{<:AbstractArray}) A = itr.x @@ -280,6 +305,171 @@ mapreduce_impl(f, op, A::SkipMissing, ifirst::Integer, ilast::Integer) = end end +# mapreduce over dimensions implementation + +_mapreduce_dim(f, op, nt::NamedTuple{(:init,)}, itr::SkipMissing{<:AbstractArray}, ::Colon) = + mapfoldl(f, op, itr; nt...) + +_mapreduce_dim(f, op, ::NamedTuple{()}, itr::SkipMissing{<:AbstractArray}, ::Colon) = + _mapreduce(f, op, IndexStyle(itr.x), itr) + +_mapreduce_dim(f, op, nt::NamedTuple{(:init,)}, itr::SkipMissing{<:AbstractArray}, dims) = + mapreducedim!(f, op, reducedim_initarray(itr, dims, nt.init), itr) + +_mapreduce_dim(f, op, ::NamedTuple{()}, itr::SkipMissing{<:AbstractArray}, dims) = + mapreducedim!(f, op, reducedim_init(f, op, itr, dims), itr) + +reducedim_initarray(itr::SkipMissing{<:AbstractArray}, region, init, ::Type{R}) where {R} = + reducedim_initarray(itr.x, region, init, R) +reducedim_initarray(itr::SkipMissing{<:AbstractArray}, region, init::T) where {T} = + reducedim_initarray(itr.x, region, init, T) + +# initialization when computing minima and maxima requires a little care +for (f1, f2, initval) in ((:min, :max, :Inf), (:max, :min, :(-Inf))) + @eval function reducedim_init(f, op::typeof($f1), itr::SkipMissing{<:AbstractArray}, region) + A = itr.x + T = eltype(itr) + + # First compute the reduce indices. This will throw an ArgumentError + # if any region is invalid + ri = reduced_indices(A, region) + + # Next, throw if reduction is over a region with length zero + any(i -> isempty(axes(A, i)), region) && _empty_reduce_error() + + # Make a view of the first slice of the region + A1 = view(A, ri...) + + if isempty(A1) + # If the slice is empty just return non-view, non-missing version as the initial array + return similar(A1, eltype(itr)) + else + # otherwise use the min/max of the first slice as initial value + v0 = mapreduce(f, $f2, A1) + + R = similar(A1, typeof(v0)) + + # if any value is missing in first slice, look for first + # non-missing value in each slice + if v0 === missing + v0 = nonmissingval(f, $f2, itr, R) + R = similar(A1, typeof(v0)) + end + + # but NaNs need to be avoided as initial values + v0 = v0 != v0 ? typeof(v0)($initval) : v0 + + # equivalent to reducedim_initarray, but we need R in advance + return fill!(R, v0) + end + end +end + +# Iterate until we've encountered at least one non-missing value in each slice, +# and return the min/max non-missing value of all clices +function nonmissingval(f, op::Union{typeof(min), typeof(max)}, + itr::SkipMissing{<:AbstractArray}, R::AbstractArray) + A = itr.x + lsiz = check_reducedims(R,A) + indsAt, indsRt = safe_tail(axes(A)), safe_tail(axes(R)) # handle d=1 manually + keep, Idefault = Broadcast.shapeindexer(indsRt) + i = findfirst(!ismissing, A) + i === nothing && throw(ArgumentError("cannot reduce over array with only missing values")) + @inbounds v = A[i] + if reducedim1(R, A) + # keep track of state using a single variable when reducing along the first dimension + i1 = first(axes1(R)) + @inbounds for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + filled = false + for i in axes(A, 1) + x = A[i, IA] + if x !== missing + v = op(v, f(x)) + filled = true + break + end + end + if !filled + throw(ArgumentError("cannot reduce over slices with only missing values")) + end + end + else + filled = fill!(similar(R, Bool), false) + allfilled = false + @inbounds for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + for i in axes(A, 1) + x = A[i, IA] + if x !== missing + v = op(v, f(x)) + filled[i,IR] = true + end + end + (allfilled = all(filled)) && break + end + if !allfilled + throw(ArgumentError("cannot reduce over slices with only missing values")) + end + end + v +end + +function _mapreducedim!(f, op, R::AbstractArray, itr::SkipMissing{<:AbstractArray}) + A = itr.x + lsiz = check_reducedims(R,A) + isempty(A) && return R + + if has_fast_linear_indexing(A) && lsiz > 16 + # use mapreduce_impl, which is probably better tuned to achieve higher performance + nslices = div(length(A), lsiz) + ibase = first(LinearIndices(A))-1 + for i = 1:nslices + x = mapreduce_impl(f, op, itr, ibase+1, ibase+lsiz) + if x !== nothing + @inbounds R[i] = op(R[i], something(x)) + end + ibase += lsiz + end + return R + end + indsAt, indsRt = safe_tail(axes(A)), safe_tail(axes(R)) # handle d=1 manually + keep, Idefault = Broadcast.shapeindexer(indsRt) + if reducedim1(R, A) + # keep the accumulator as a local variable when reducing along the first dimension + i1 = first(axes1(R)) + @inbounds for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + r = R[i1,IR] + @simd for i in axes(A, 1) + x = A[i, IA] + if x !== missing + r = op(r, f(x)) + end + end + + R[i1,IR] = r + end + else + @inbounds for IA in CartesianIndices(indsAt) + IR = Broadcast.newindex(IA, keep, Idefault) + @simd for i in axes(A, 1) + x = A[i, IA] + if x !== missing + R[i,IR] = op(R[i,IR], f(x)) + end + end + end + end + return R +end + +mapreducedim!(f, op, R::AbstractArray, A::SkipMissing{<:AbstractArray}) = + (_mapreducedim!(f, op, R, A); R) + +reducedim!(op, R::AbstractArray{RT}, A::SkipMissing{<:AbstractArray}) where {RT} = + mapreducedim!(identity, op, R, A) + """ coalesce(x, y...) diff --git a/base/reducedim.jl b/base/reducedim.jl index 150b93c7e8f5a..a3682d9c9b172 100644 --- a/base/reducedim.jl +++ b/base/reducedim.jl @@ -99,10 +99,10 @@ reducedim_initarray(A::AbstractArray, region, init::T) where {T} = reducedim_ini promote_union(T::Union) = promote_type(promote_union(T.a), promote_union(T.b)) promote_union(T) = T -function reducedim_init(f, op::Union{typeof(+),typeof(add_sum)}, A::AbstractArray, region) +function reducedim_init(f, op::Union{typeof(+),typeof(add_sum)}, A, region) _reducedim_init(f, op, zero, sum, A, region) end -function reducedim_init(f, op::Union{typeof(*),typeof(mul_prod)}, A::AbstractArray, region) +function reducedim_init(f, op::Union{typeof(*),typeof(mul_prod)}, A, region) _reducedim_init(f, op, one, prod, A, region) end function _reducedim_init(f, op, fv, fop, A, region) @@ -145,11 +145,11 @@ for (f1, f2, initval) in ((:min, :max, :Inf), (:max, :min, :(-Inf))) end end end -reducedim_init(f::Union{typeof(abs),typeof(abs2)}, op::typeof(max), A::AbstractArray{T}, region) where {T} = - reducedim_initarray(A, region, zero(f(zero(T)))) +reducedim_init(f::Union{typeof(abs),typeof(abs2)}, op::typeof(max), A, region) = + reducedim_initarray(A, region, zero(f(zero(eltype(A))))) -reducedim_init(f, op::typeof(&), A::AbstractArray, region) = reducedim_initarray(A, region, true) -reducedim_init(f, op::typeof(|), A::AbstractArray, region) = reducedim_initarray(A, region, false) +reducedim_init(f, op::typeof(&), A, region) = reducedim_initarray(A, region, true) +reducedim_init(f, op::typeof(|), A, region) = reducedim_initarray(A, region, false) # specialize to make initialization more efficient for common cases @@ -288,6 +288,15 @@ julia> mapreduce(isodd, *, a, dims=1) julia> mapreduce(isodd, |, true, a, dims=1) 1×4 Array{Bool,2}: true true true true + +julia> b = [1 missing 5; 2 4 missing] +2×3 Array{Union{Missing, Int64},2}: + 1 missing 5 + 2 4 missing + +julia> mapreduce(isodd, *, skipmissing(b), dims=1) +1×3 Array{Bool,2}: + false false true ``` """ mapreduce(f, op, A::AbstractArray; dims=:, kw...) = _mapreduce_dim(f, op, kw.data, A, dims) @@ -332,6 +341,15 @@ julia> reduce(max, a, dims=2) julia> reduce(max, a, dims=1) 1×4 Array{Int64,2}: 4 8 12 16 + +julia> b = [1 missing 5; 2 4 missing] +2×3 Array{Union{Missing, Int64},2}: + 1 missing 5 + 2 4 missing + +julia> reduce(+, skipmissing(b), dims=1) +1×3 Array{Int64,2}: + 3 4 5 ``` """ reduce(op, A::AbstractArray; kw...) = mapreduce(identity, op, A; kw...) @@ -342,6 +360,12 @@ reduce(op, A::AbstractArray; kw...) = mapreduce(identity, op, A; kw...) Sum elements of an array over the given dimensions. +!!! note + If `A` contains `NaN` or [`missing`](@ref) values, they are propagated to the + corresponding result (`missing` takes precedence if a slice contains both). + Use [`skipmissing(A)`](@ref) to omit `missing` entries and compute the + sum of non-missing values. + # Examples ```jldoctest julia> A = [1 2; 3 4] @@ -357,6 +381,20 @@ julia> sum(A, dims=2) 2×1 Array{Int64,2}: 3 7 + +julia> B = [1 missing; 3 4] +2×2 Array{Union{Missing, Int64},2}: + 1 missing + 3 4 + +julia> sum(skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 4 4 + +julia> sum(skipmissing(B), dims=2) +2×1 Array{Int64,2}: + 1 + 7 ``` """ sum(A::AbstractArray; dims) @@ -390,6 +428,12 @@ sum!(r, A) Multiply elements of an array over the given dimensions. +!!! note + If `A` contains `NaN` or [`missing`](@ref) values, they are propagated to the + corresponding result (`missing` takes precedence if a slice contains both). + Use [`skipmissing(A)`](@ref) to omit `missing` entries and compute the + product of non-missing values. + # Examples ```jldoctest julia> A = [1 2; 3 4] @@ -405,6 +449,20 @@ julia> prod(A, dims=2) 2×1 Array{Int64,2}: 2 12 + +julia> B = [1 missing; 3 4] +2×2 Array{Union{Missing, Int64},2}: + 1 missing + 3 4 + +julia> prod(skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 3 4 + +julia> prod(skipmissing(B), dims=2) +2×1 Array{Int64,2}: + 1 + 12 ``` """ prod(A::AbstractArray; dims) @@ -440,6 +498,12 @@ Compute the maximum value of an array over the given dimensions. See also the [`max(a,b)`](@ref) function to take the maximum of two or more arguments, which can be applied elementwise to arrays via `max.(a,b)`. +!!! note + If `A` contains `NaN` or [`missing`](@ref) values, they are propagated to the + corresponding result (`missing` takes precedence if a slice contains both). + Use [`skipmissing(A)`](@ref) to omit `missing` entries and compute the + maximum of non-missing values. + # Examples ```jldoctest julia> A = [1 2; 3 4] @@ -455,6 +519,20 @@ julia> maximum(A, dims=2) 2×1 Array{Int64,2}: 2 4 + +julia> B = [1 missing; 3 4] +2×2 Array{Union{Missing, Int64},2}: + 1 missing + 3 4 + +julia> maximum(skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 3 4 + +julia> maximum(skipmissing(B), dims=2) +2×1 Array{Int64,2}: + 1 + 4 ``` """ maximum(A::AbstractArray; dims) @@ -490,6 +568,12 @@ Compute the minimum value of an array over the given dimensions. See also the [`min(a,b)`](@ref) function to take the minimum of two or more arguments, which can be applied elementwise to arrays via `min.(a,b)`. +!!! note + If `A` contains `NaN` or [`missing`](@ref) values, they are propagated to the + corresponding result (`missing` takes precedence if a slice contains both). + Use [`skipmissing(A)`](@ref) to omit `missing` entries and compute the + minimum of non-missing values. + # Examples ```jldoctest julia> A = [1 2; 3 4] @@ -502,6 +586,20 @@ julia> minimum(A, dims=1) 1 2 julia> minimum(A, dims=2) +2×1 Array{Int64,2}: + 1 + 3 + +julia> B = [1 missing; 3 4] +2×2 Array{Union{Missing, Int64},2}: + 1 missing + 3 4 + +julia> minimum(skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 1 4 + +julia> minimum(skipmissing(B), dims=2) 2×1 Array{Int64,2}: 1 3 @@ -635,6 +733,7 @@ for (fname, _fname, op) in [(:sum, :_sum, :add_sum), (:prod, :_prod, @eval begin # User-facing methods with keyword arguments @inline ($fname)(a::AbstractArray; dims=:) = ($_fname)(a, dims) + @inline ($fname)(a::SkipMissing{<:AbstractArray}; dims=:) = ($_fname)(a, dims) @inline ($fname)(f::Callable, a::AbstractArray; dims=:) = ($_fname)(f, a, dims) # Underlying implementations using dispatch diff --git a/doc/src/manual/missing.md b/doc/src/manual/missing.md index 67b349f57a157..3b5bfa7420730 100644 --- a/doc/src/manual/missing.md +++ b/doc/src/manual/missing.md @@ -305,7 +305,36 @@ julia> mapreduce(sqrt, +, skipmissing([3, missing, 2, 1])) 4.146264369941973 ``` -Use [`collect`](@ref) to extract non-`missing` values and store them in an array +Reduction over dimensions is also supported +```jldoctest +julia> B = [1 missing; 3 4] +2×2 Array{Union{Missing, Int64},2}: + 1 missing + 3 4 + +julia> sum(skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 4 4 + +julia> sum(skipmissing(B), dims=2) +2×1 Array{Int64,2}: + 1 + 7 + +julia> reduce(*, skipmissing(B), dims=1) +1×2 Array{Int64,2}: + 3 4 + +julia> mapreduce(cos, +, skipmissing(B), dims=1) +1×2 Array{Float64,2}: + -0.44969 -0.653644 +``` + +However, note that the special iterator object returned by `skipmissing` is not an +`AbstractArray`: since `missing` entries of the original array are skipped, +it does not have a regular shape and cannot be indexed. +Use [`collect`](@ref) to extract non-`missing` values and store them in a vector, +losing the shape of the original array ```jldoctest julia> collect(skipmissing([3, missing, 2, 1])) 3-element Array{Int64,1}: diff --git a/test/missing.jl b/test/missing.jl index d25751322cb2e..0bae44ac7b890 100644 --- a/test/missing.jl +++ b/test/missing.jl @@ -412,6 +412,76 @@ end @test_throws ArgumentError mapreduce(x -> x/2, +, itr) end end + + @testset "mapreduce over dims" begin + nm(x) = (@test !(eltype(x) >: Missing); x) + # Vary size to test splitting blocks with several configurations of missing values + for T in (Int, Float64), + A in (rand(T, 10), rand(T, 100, 1000), rand(T, 10000, 10)), + i in (1, 2, 1:2), + f in (identity, abs, abs2, cos), + (op, redf, init) in ((+, sum, x->oftype(f(one(eltype(x))), zero(eltype(x)))), + (*, prod, x->oftype(f(one(eltype(x))), one(eltype(x)))), + (max, maximum, x->f(first(skipmissing(x)))), + (min, minimum, x->f(first(skipmissing(x))))) + if T <: Integer + @test redf(A, dims=i) == nm(redf(skipmissing(A), dims=i)) == + nm(reduce(op, skipmissing(A), dims=i)) == + nm(mapreduce(identity, op, skipmissing(A), dims=i)) + else + @test redf(A, dims=i) ≈ nm(redf(skipmissing(A), dims=i)) == + nm(reduce(op, skipmissing(A), dims=i)) == + nm(mapreduce(identity, op, skipmissing(A), dims=i)) + end + @test mapreduce(f, op, A, dims=i) ≈ + nm(mapreduce(f, op, skipmissing(A), dims=i)) + + maximum(i) > ndims(A) && (i = 1) # for mapslices + B = Array{Union{T,Missing}}(A) + replace!(x -> rand(Bool) ? x : missing, B) + # mapreduce fails for min and max if there are empty slices + if !(op in (min, max)) || !any(iszero, sum(!ismissing, B, dims=i)) + if T === Int + @test mapslices(x -> redf(skipmissing(x)), B, dims=i) == + nm(redf(skipmissing(B), dims=i)) + else + @test isapprox(mapslices(x -> redf(skipmissing(x)), B, dims=i), + nm(redf(skipmissing(B), dims=i)), rtol=sqrt(eps()), atol=1e-16) + end + @test nm(redf(skipmissing(B), dims=i)) == + nm(reduce(op, skipmissing(B), dims=i)) == + nm(mapreduce(identity, op, skipmissing(B), dims=i)) + @test isapprox(mapslices(x -> mapreduce(f, op, skipmissing(x), init=init(x)), B, dims=i), + nm(mapreduce(f, op, skipmissing(B), dims=i)), + rtol=sqrt(eps()), atol=1e-16) + elseif op in (min, max) && !(f in (abs, abs2)) + @test_throws ArgumentError mapslices(x -> redf(skipmissing(x)), B, dims=i) + @test_throws ArgumentError redf(skipmissing(B), dims=i) + @test_throws ArgumentError reduce(op, skipmissing(B), dims=i) + @test_throws ArgumentError mapreduce(identity, op, skipmissing(B), dims=i) + @test_throws ArgumentError mapreduce(f, op, skipmissing(B), dims=i) + end + + # Test block full of missing values + B[1:length(B)÷2] .= missing + if !(op in (min, max)) + if T <: Integer + @test mapslices(x -> redf(skipmissing(x)), B, dims=i) == + nm(redf(skipmissing(B), dims=i)) + else + @test isapprox(mapslices(x -> redf(skipmissing(x)), B, dims=i), + nm(redf(skipmissing(B), dims=i)), + atol=1e-16) + end + @test nm(redf(skipmissing(B), dims=i)) == + nm(reduce(op, skipmissing(B), dims=i)) == + nm(mapreduce(identity, op, skipmissing(B), dims=i)) + @test isapprox(mapslices(x -> mapreduce(f, op, skipmissing(x), init=init(x)), B, dims=i), + nm(mapreduce(f, op, skipmissing(B), dims=i)), + rtol=sqrt(eps()), atol=1e-16) + end + end + end end @testset "coalesce" begin