From afe92867fd7ae74b7eafaa1577fb772df795b0aa Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 23 Jun 2018 11:05:41 +0200 Subject: [PATCH 1/3] Add optimized mapreduce implementation for SkipMissing Based on the AbstractArray methods. Allows the compiler to emit SIMD instructions, for SkipMissing{Vector{Int}}, but also improves performance for SkipMissing{Vector{Float64}}. --- base/missing.jl | 95 ++++++++++++++++++++++++++++++++++++++++++++++++- test/missing.jl | 51 ++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 1 deletion(-) diff --git a/base/missing.jl b/base/missing.jl index 6e31ee6c55bda..6e0313bdcb5e4 100644 --- a/base/missing.jl +++ b/base/missing.jl @@ -177,7 +177,7 @@ IteratorSize(::Type{<:SkipMissing}) = SizeUnknown() IteratorEltype(::Type{SkipMissing{T}}) where {T} = IteratorEltype(T) eltype(::Type{SkipMissing{T}}) where {T} = nonmissingtype(eltype(T)) -function Base.iterate(itr::SkipMissing, state...) +function iterate(itr::SkipMissing, state...) y = iterate(itr.x, state...) y === nothing && return nothing item, state = y @@ -189,6 +189,99 @@ function Base.iterate(itr::SkipMissing, state...) item, state end +# Optimized mapreduce implementation +mapreduce(f, op, itr::SkipMissing{<:AbstractArray}) = _mapreduce(f, op, IndexStyle(itr.x), itr) + +"Sentinel indicating that no non-missing value was encountered." +struct AllMissing end + +function _mapreduce(f, op, ::IndexLinear, itr::SkipMissing{<:AbstractArray}) + A = itr.x + local ai + inds = LinearIndices(A) + i = first(inds) + ilast = last(inds) + while i <= ilast + @inbounds ai = A[i] + ai === missing || break + i += 1 + end + i > ilast && return mapreduce_empty(f, op, eltype(itr)) + a1 = ai + i += 1 + while i <= ilast + @inbounds ai = A[i] + ai === missing || break + i += 1 + end + i > ilast && return mapreduce_first(f, op, a1) + # We know A contains at least two non-missing entries, therefore AllMissing() is not + # a possible return value: the check provides that information to inference + s = mapreduce_impl(f, op, itr, first(inds), last(inds)) + s isa AllMissing && error("got AllMissing for an array with non-missing values") + return s +end + +_mapreduce(f, op, ::IndexCartesian, itr::SkipMissing) = mapfoldl(f, op, itr) + +mapreduce_impl(f, op, A::SkipMissing, ifirst::Integer, ilast::Integer) = + mapreduce_impl(f, op, A, ifirst, ilast, pairwise_blocksize(f, op)) + +@noinline function mapreduce_impl(f, op, itr::SkipMissing{<:AbstractArray}, + ifirst::Integer, ilast::Integer, blksize::Int) + A = itr.x + if ifirst == ilast + @inbounds a1 = A[ifirst] + if a1 === missing + return AllMissing() + else + return mapreduce_first(f, op, a1) + end + elseif ifirst + blksize > ilast + # sequential portion + local ai + i = ifirst + while i <= ilast + @inbounds ai = A[i] + ai === missing || break + i += 1 + end + i > ilast && return AllMissing() + a1 = ai::eltype(itr) + i += 1 + while i <= ilast + @inbounds ai = A[i] + ai === missing || break + i += 1 + end + i > ilast && return mapreduce_first(f, op, a1) + a2 = ai::eltype(itr) + # Unexpectedly, the following assertion allows SIMD instructions to be emitted + A[i]::eltype(itr) + i += 1 + v = op(f(a1), f(a2)) + @simd for i = i:ilast + @inbounds ai = A[i] + if ai !== missing + v = op(v, f(ai)) + end + end + return v + else + # pairwise portion + imid = (ifirst + ilast) >> 1 + v1 = mapreduce_impl(f, op, itr, ifirst, imid, blksize) + v2 = mapreduce_impl(f, op, itr, imid+1, ilast, blksize) + if v1 isa AllMissing + return v2 + elseif v2 isa AllMissing + return v1 + else + return op(v1, v2) + end + end +end + """ coalesce(x, y...) diff --git a/test/missing.jl b/test/missing.jl index 1c37af827a985..d25751322cb2e 100644 --- a/test/missing.jl +++ b/test/missing.jl @@ -361,6 +361,57 @@ end @test eltype(x) === Any @test collect(x) == [1, 2, 4] @test collect(x) isa Vector{Int} + + @testset "mapreduce" begin + # Vary size to test splitting blocks with several configurations of missing values + for T in (Int, Float64), + A in (rand(T, 10), rand(T, 1000), rand(T, 10000)) + if T === Int + @test sum(A) === sum(skipmissing(A)) === + reduce(+, skipmissing(A)) === mapreduce(identity, +, skipmissing(A)) + else + @test sum(A) ≈ sum(skipmissing(A)) === + reduce(+, skipmissing(A)) === mapreduce(identity, +, skipmissing(A)) + end + @test mapreduce(cos, *, A) ≈ mapreduce(cos, *, skipmissing(A)) + + B = Vector{Union{T,Missing}}(A) + replace!(x -> rand(Bool) ? x : missing, B) + if T === Int + @test sum(collect(skipmissing(B))) === sum(skipmissing(B)) === + reduce(+, skipmissing(B)) === mapreduce(identity, +, skipmissing(B)) + else + @test sum(collect(skipmissing(B))) ≈ sum(skipmissing(B)) === + reduce(+, skipmissing(B)) === mapreduce(identity, +, skipmissing(B)) + end + @test mapreduce(cos, *, collect(skipmissing(A))) ≈ mapreduce(cos, *, skipmissing(A)) + + # Test block full of missing values + B[1:length(B)÷2] .= missing + if T === Int + @test sum(collect(skipmissing(B))) == sum(skipmissing(B)) == + reduce(+, skipmissing(B)) == mapreduce(identity, +, skipmissing(B)) + else + @test sum(collect(skipmissing(B))) ≈ sum(skipmissing(B)) == + reduce(+, skipmissing(B)) == mapreduce(identity, +, skipmissing(B)) + end + + @test mapreduce(cos, *, collect(skipmissing(A))) ≈ mapreduce(cos, *, skipmissing(A)) + end + + # Patterns that exercize code paths for inputs with 1 or 2 non-missing values + @test sum(skipmissing([1, missing, missing, missing])) === 1 + @test sum(skipmissing([missing, missing, missing, 1])) === 1 + @test sum(skipmissing([1, missing, missing, missing, 2])) === 3 + @test sum(skipmissing([missing, missing, missing, 1, 2])) === 3 + + for n in 0:3 + itr = skipmissing(Vector{Union{Int,Missing}}(fill(missing, n))) + @test sum(itr) == reduce(+, itr) == mapreduce(identity, +, itr) === 0 + @test_throws ArgumentError reduce(x -> x/2, itr) + @test_throws ArgumentError mapreduce(x -> x/2, +, itr) + end + end end @testset "coalesce" begin From 058ba81c648786320ba881a172d5eec43e0b21de Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Mon, 2 Jul 2018 17:39:02 +0200 Subject: [PATCH 2/3] Use nothing/Some instead of custom AllMissing singleton --- base/missing.jl | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/base/missing.jl b/base/missing.jl index 6e0313bdcb5e4..4a13586e3654d 100644 --- a/base/missing.jl +++ b/base/missing.jl @@ -192,9 +192,6 @@ end # Optimized mapreduce implementation mapreduce(f, op, itr::SkipMissing{<:AbstractArray}) = _mapreduce(f, op, IndexStyle(itr.x), itr) -"Sentinel indicating that no non-missing value was encountered." -struct AllMissing end - function _mapreduce(f, op, ::IndexLinear, itr::SkipMissing{<:AbstractArray}) A = itr.x local ai @@ -215,11 +212,8 @@ function _mapreduce(f, op, ::IndexLinear, itr::SkipMissing{<:AbstractArray}) i += 1 end i > ilast && return mapreduce_first(f, op, a1) - # We know A contains at least two non-missing entries, therefore AllMissing() is not - # a possible return value: the check provides that information to inference - s = mapreduce_impl(f, op, itr, first(inds), last(inds)) - s isa AllMissing && error("got AllMissing for an array with non-missing values") - return s + # We know A contains at least two non-missing entries: the result cannot be nothing + something(mapreduce_impl(f, op, itr, first(inds), last(inds))) end _mapreduce(f, op, ::IndexCartesian, itr::SkipMissing) = mapfoldl(f, op, itr) @@ -227,15 +221,16 @@ _mapreduce(f, op, ::IndexCartesian, itr::SkipMissing) = mapfoldl(f, op, itr) mapreduce_impl(f, op, A::SkipMissing, ifirst::Integer, ilast::Integer) = mapreduce_impl(f, op, A, ifirst, ilast, pairwise_blocksize(f, op)) +# Returns nothing when the input contains only missing values, and Some(x) otherwise @noinline function mapreduce_impl(f, op, itr::SkipMissing{<:AbstractArray}, ifirst::Integer, ilast::Integer, blksize::Int) A = itr.x if ifirst == ilast @inbounds a1 = A[ifirst] if a1 === missing - return AllMissing() + return nothing else - return mapreduce_first(f, op, a1) + return Some(mapreduce_first(f, op, a1)) end elseif ifirst + blksize > ilast # sequential portion @@ -246,7 +241,7 @@ mapreduce_impl(f, op, A::SkipMissing, ifirst::Integer, ilast::Integer) = ai === missing || break i += 1 end - i > ilast && return AllMissing() + i > ilast && return nothing a1 = ai::eltype(itr) i += 1 while i <= ilast @@ -254,7 +249,7 @@ mapreduce_impl(f, op, A::SkipMissing, ifirst::Integer, ilast::Integer) = ai === missing || break i += 1 end - i > ilast && return mapreduce_first(f, op, a1) + i > ilast && return Some(mapreduce_first(f, op, a1)) a2 = ai::eltype(itr) # Unexpectedly, the following assertion allows SIMD instructions to be emitted A[i]::eltype(itr) @@ -266,18 +261,20 @@ mapreduce_impl(f, op, A::SkipMissing, ifirst::Integer, ilast::Integer) = v = op(v, f(ai)) end end - return v + return Some(v) else # pairwise portion imid = (ifirst + ilast) >> 1 v1 = mapreduce_impl(f, op, itr, ifirst, imid, blksize) v2 = mapreduce_impl(f, op, itr, imid+1, ilast, blksize) - if v1 isa AllMissing + if v1 === nothing && v2 === nothing + return nothing + elseif v1 === nothing return v2 - elseif v2 isa AllMissing + elseif v2 === nothing return v1 else - return op(v1, v2) + return Some(op(something(v1), something(v2))) end end end From 8f73286275508a89cf643f58ec4fc4273e8c88f4 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Thu, 5 Jul 2018 20:35:47 +0200 Subject: [PATCH 3/3] Use generic fallback for arrays with !(eltype >: Missing) --- base/missing.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/base/missing.jl b/base/missing.jl index 4a13586e3654d..38322bee65b1e 100644 --- a/base/missing.jl +++ b/base/missing.jl @@ -190,7 +190,10 @@ function iterate(itr::SkipMissing, state...) end # Optimized mapreduce implementation -mapreduce(f, op, itr::SkipMissing{<:AbstractArray}) = _mapreduce(f, op, IndexStyle(itr.x), itr) +# The generic method is faster when !(eltype(A) >: Missing) since it does not need +# additional loops to identify the two first non-missing values of each block +mapreduce(f, op, itr::SkipMissing{<:AbstractArray}) = + _mapreduce(f, op, IndexStyle(itr.x), eltype(itr.x) >: Missing ? itr : itr.x) function _mapreduce(f, op, ::IndexLinear, itr::SkipMissing{<:AbstractArray}) A = itr.x