diff --git a/perf/mapreduce.jl b/perf/mapreduce.jl index 93e35fd..82739c0 100644 --- a/perf/mapreduce.jl +++ b/perf/mapreduce.jl @@ -15,30 +15,213 @@ f{T<:Number}(x::Nullable{T}) = Nullable(5 * x.value, x.isnull) #-----------------------------------------------------------------------------# -function profile_mapreduce(A, X, D) +function profile_reduce_methods() + A = rand(5_000_000) + B = rand(Bool, 5_000_000) + X = NullableArray(A) + Y = NullableArray(A, B) + D = DataArray(A) + E = DataArray(A, B) + + profile_mapreduce(A, X, Y, D, E) + println() + profile_reduce(A, X, Y, D, E) + println() + + for method in ( + sum, + prod, + minimum, + maximum, + ) + (method)(A) + (method)(X) + (method)(D) + println("Method: $method(A) (0 missing entries)") + print(" for Array{Float64}: ") + @time((method)(A)) + print(" for NullableArray{Float64}: ") + @time((method)(X)) + print(" for DataArray{Float64}: ") + @time((method)(D)) + + (method)(f, A) + (method)(f, X) + (method)(f, D) + println("Method: $method(f, A) (0 missing entries)") + print(" for Array{Float64}: ") + @time((method)(f, A)) + print(" for NullableArray{Float64}: ") + @time((method)(f, X)) + print(" for DataArray{Float64}: ") + @time((method)(f, D)) + end + println() + + for method in ( + sum, + prod, + minimum, + maximum, + ) + (method)(Y) + println("Method: $method(A) (~half missing entries, skip=false)") + print(" for NullableArray{Float64}: ") + @time((method)(Y)) + (method)(E) + print(" for DataArray{Float64}: ") + @time((method)(E)) + + (method)(f, Y) + println("Method: $method(f, A) (~half missing entries, skip=false)") + print(" for NullableArray{Float64}: ") + @time((method)(f, Y)) + if in(method, (sum, prod)) + (method)(f, E) + print(" for DataArray{Float64}: ") + @time((method)(f, E)) + else + println(" $method(f, A::DataArray) currently incurs error") + end + end + println() + + for method in ( + sum, + prod, + minimum, + maximum, + ) + (method)(Y, skipnull=true) + println("Method: $method(A) (~half missing entries, skip=true)") + print(" for NullableArray{Float64}: ") + @time((method)(Y, skipnull=true)) + (method)(E, skipna=true) + print(" for DataArray{Float64}: ") + @time((method)(E, skipna=true)) + + (method)(f, Y, skipnull=true) + println("Method: $method(f, A) (~half missing entries, skip=true)") + print(" for NullableArray{Float64}: ") + @time((method)(f, Y, skipnull=true)) + (method)(f, E, skipna=true) + print(" for DataArray{Float64}: ") + @time((method)(f, E, skipna=true)) + end + println() + + for method in ( + sumabs, + sumabs2 + ) + (method)(A) + (method)(X) + (method)(D) + println("Method: $method(A) (0 missing entries)") + print(" for Array{Float64}: ") + @time((method)(A)) + print(" for NullableArray{Float64}: ") + @time((method)(X)) + print(" for DataArray{Float64}: ") + @time((method)(D)) + + (method)(f, A) + (method)(f, X) + (method)(f, D) + println("Method: $method(f, A) (0 missing entries)") + print(" for Array{Float64}: ") + @time((method)(f, A)) + print(" for NullableArray{Float64}: ") + @time((method)(f, X)) + print(" for DataArray{Float64}: ") + @time((method)(f, D)) + end + + for method in ( + mean, + var, + ) + (method)(A) + (method)(X) + (method)(D) + println("Method: $method(A) (0 missing entries)") + print(" for Array{Float64}: ") + @time((method)(A)) + print(" for NullableArray{Float64}: ") + @time((method)(X)) + print(" for DataArray{Float64}: ") + @time((method)(D)) + + (method)(f, A) + (method)(f, X) + (method)(f, D) + println("Method: $method(f, A) (0 missing entries)") + print(" for Array{Float64}: ") + @time((method)(f, A)) + print(" for NullableArray{Float64}: ") + @time((method)(f, X)) + print(" for DataArray{Float64}: ") + @time((method)(f, D)) + end +end + + +function profile_mapreduce(A, X, Y, D, E) + println("Method: mapreduce(f, op, A) (0 missing entries)") mapreduce(f, Base.(:+), A) - mapreduce(f, Base.(:+), X) - mapreduce(f, Base.(:+), D) - println("Method: mapreduce(f, op, A)") print(" for Array{Float64}: ") @time(mapreduce(f, Base.(:+), A)) + mapreduce(f, Base.(:+), X) print(" for NullableArray{Float64}: ") @time(mapreduce(f, Base.(:+), X)) + mapreduce(f, Base.(:+), D) print(" for DataArray{Float64}: ") @time(mapreduce(f, Base.(:+), D)) + + println("Method: mapreduce(f, op, A) (~half missing entries, skip=false)") + mapreduce(f, Base.(:+), Y) + print(" for NullableArray{Float64}: ") + @time(mapreduce(f, Base.(:+), Y)) + mapreduce(f, Base.(:+), E) + print(" for DataArray{Float64}: ") + @time(mapreduce(f, Base.(:+), E)) + + println("Method: mapreduce(f, op, A) (~half missing entries, skip=true)") + mapreduce(f, Base.(:+), Y, skipnull=true) + print(" for NullableArray{Float64}: ") + @time(mapreduce(f, Base.(:+), Y, skipnull=true)) + mapreduce(f, Base.(:+), E, skipna=true) + print(" for DataArray{Float64}: ") + @time(mapreduce(f, Base.(:+), E, skipna=true)) end -function profile_reduce(A, X, D) +function profile_reduce(A, X, Y, D, E) + println("Method: reduce(f, op, A) (0 missing entries)") reduce(Base.(:+), A) - reduce(Base.(:+), X) - reduce(Base.(:+), D) - println("Method: reduce(op, A)") print(" for Array{Float64}: ") @time(reduce(Base.(:+), A)) + reduce(Base.(:+), X) print(" for NullableArray{Float64}: ") @time(reduce(Base.(:+), X)) + reduce(Base.(:+), D) print(" for DataArray{Float64}: ") @time(reduce(Base.(:+), D)) + + println("Method: reduce(f, op, A) (~half missing entries, skip=false)") + reduce(Base.(:+), Y) + print(" for NullableArray{Float64}: ") + @time(reduce(Base.(:+), Y)) + reduce(Base.(:+), E) + print(" for DataArray{Float64}: ") + @time(reduce(Base.(:+), E)) + + println("Method: reduce(f, op, A) (~half missing entries, skip=true)") + reduce(Base.(:+), Y, skipnull=true) + print(" for NullableArray{Float64}: ") + @time(reduce(Base.(:+), Y, skipnull=true)) + reduce(Base.(:+), E, skipna=true) + print(" for DataArray{Float64}: ") + @time(reduce(Base.(:+), E, skipna=true)) end function profile_sum1(A, X, D) diff --git a/perf/statistics.jl b/perf/statistics.jl new file mode 100644 index 0000000..66c776d --- /dev/null +++ b/perf/statistics.jl @@ -0,0 +1,119 @@ +using NullableArrays +using DataArrays +using StatsBase + +srand(1) +N = 5_000_000 + +function profile_stats_methods() + A = rand(N) + B = rand(Bool, N) + X = NullableArray(A) + Y = NullableArray(A, B) + D = DataArray(A) + E = DataArray(A, B) + + profile_mean(A, X, D, Y, E) + profile_var(A, X, D, Y, E) + nothing +end + +function profile_mean(A, X, D, Y, E) + W = WeightVec(rand(N)) + + mean(A) + println("Method: mean(A) (0 missing entries)") + print(" for Array{Float64}: ") + @time(mean(A)) + mean(X) + print(" for NullableArray{Float64}: ") + @time(mean(X)) + mean(D) + print(" for DataArray{Float64}: ") + @time(mean(D)) + println() + + mean(Y, skipnull=false) + println("Method: mean(A) (~half missing entries, skip=false)") + print(" for NullableArray{Float64}: ") + @time(mean(Y, skipnull=false)) + mean(E, skipna=false) + print(" for DataArray{Float64}: ") + @time(mean(E, skipna=false)) + println() + + mean(Y, skipnull=true) + println("Method: mean(A) (~half missing entries, skip=true)") + print(" for NullableArray{Float64}: ") + @time(mean(Y, skipnull=true)) + mean(E, skipna=true) + print(" for DataArray{Float64}: ") + @time(mean(E, skipna=true)) + println() + + mean(A, W) + println("Method: mean(A, w::WeightVec{W, V}) (0 missing entries, V<:Array)") + print(" for Array{Float64}: ") + @time(mean(A, W)) + mean(X, W) + print(" for NullableArray{Float64}: ") + @time(mean(X, W)) + mean(D, W) + print(" for DataArray{Float64}: ") + @time(mean(D, W)) + println() + + println("Method: mean(A, W::WeightVec) (~half missing entries, skip=false)") + mean(Y, W, skipnull=false) + print(" for NullableArray{Float64}: ") + @time(mean(Y, W, skipnull=false)) + mean(E, W, skipna=false) + print(" for DataArray{Float64}: ") + @time(mean(E, W, skipna=false)) + println() + + println("Method: mean(A, W::WeightVec) (~half missing entries, skip=true)") + mean(Y, W, skipnull=true) + print(" for NullableArray{Float64}: ") + @time(mean(Y, W, skipnull=true)) + mean(E, W, skipna=true) + print(" for DataArray{Float64}: ") + @time(mean(E, W, skipna=true)) + println() +end + +function profile_var(A, X, D, Y, E) + mu = mean(A) + mu2 = mean(X, skipnull=true) + + varm(A, mu) + println("Method: varm(A, mu) (0 missing entries)") + print(" for Array{Float64}: ") + @time(varm(A, mu)) + println(" ", varm(A, mu)) + varm(X, mu) + print(" for NullableArray{Float64}: ") + @time(varm(X, mu)) + varm(D, mu) + print(" for DataArray{Float64}: ") + @time(varm(D, mu)) + println() + + varm(Y, mu; skipnull=false) + println("Method: varm(A, mu) (~half missing entries, skip=false)") + print(" for NullableArray{Float64}: ") + @time(varm(Y, mu; skipnull=false)) + varm(E, mu; skipna=false) + print(" for DataArray{Float64}: ") + @time(varm(E, mu; skipna=false)) + println() + + varm(Y, mu; skipnull=true) + println("Method: varm(A, mu) (~half missing entries, skip=true)") + print(" for NullableArray{Float64}: ") + @time(varm(Y, mu; skipnull=true)) + varm(E, mu; skipna=true) + print(" for DataArray{Float64}: ") + @time(varm(E, mu; skipna=true)) + println() +end diff --git a/src/NullableArrays.jl b/src/NullableArrays.jl index 4e8a0b2..e3606fe 100644 --- a/src/NullableArrays.jl +++ b/src/NullableArrays.jl @@ -27,4 +27,5 @@ module NullableArrays include("operators.jl") include("broadcast.jl") include("mapreduce.jl") + include("statistics.jl") end diff --git a/src/mapreduce.jl b/src/mapreduce.jl index 2e80482..5d09aa4 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -27,7 +27,7 @@ function mapreduce_seq_impl_skipnull(f, op, X::NullableArray, @inbounds entry = X.values[i] v = op(v, f(entry)) end - return v + return Nullable(v) end # pairwise map-reduce @@ -36,11 +36,11 @@ function mapreduce_pairwise_impl_skipnull{T}(f, op, X::NullableArray{T}, # n_notnull::Int, blksize::Int) blksize::Int) if ifirst + blksize > ilast - # # fall back to Base implementation if no nulls in block - # if ilast - ifirst + 1 == n_notnull - # return Base.mapreduce_seq_impl(f, op, X, ifirst, ilast) + # fall back to Base implementation if no nulls in block + # if anynull(slice(X, ifirst:ilast)) + return mapreduce_seq_impl_skipnull(f, op, X, ifirst, ilast) # else - mapreduce_seq_impl_skipnull(f, op, X, ifirst, ilast) + # Nullable(Base.mapreduce_seq_impl(f, op, X.values, ifirst, ilast)) # end else imid = (ifirst + ilast) >>> 1 @@ -55,35 +55,49 @@ function mapreduce_pairwise_impl_skipnull{T}(f, op, X::NullableArray{T}, end mapreduce_impl_skipnull{T}(f, op, X::NullableArray{T}) = - mapreduce_seq_impl_skipnull(f, op, T, X, 1, length(X.values)) + mapreduce_seq_impl_skipnull(f, op, X, 1, length(X.values)) mapreduce_impl_skipnull(f, op::Base.AddFun, X::NullableArray) = mapreduce_pairwise_impl_skipnull(f, op, X, 1, length(X.values), max(128, Base.sum_pairwise_blocksize(f))) ## general mapreduce interface -function _mapreduce_skipnull{T}(f, op, X::NullableArray{T}) +function _mapreduce_skipnull{T}(f, op, X::NullableArray{T}, missingdata::Bool) n = length(X) + !missingdata && return Nullable(Base.mapreduce_impl(f, op, X.values, 1, n)) nnull = countnz(X.isnull) nnull == n && return Base.mr_empty(f, op, T) nnull == n - 1 && return Base.r_promote(op, f(X.values[findnext(x -> x == false), X, 1])) - nnull == 0 && return Base.mapreduce_impl(f, op, X, 1, n) + # nnull == 0 && return Base.mapreduce_impl(f, op, X, 1, n) return mapreduce_impl_skipnull(f, op, X) end +function Base._mapreduce(f, op, X::NullableArray, missingdata) + missingdata && return Base._mapreduce(f, op, X) + Nullable(Base._mapreduce(f, op, X.values)) +end + function Base.mapreduce(f, op::Function, X::NullableArray; skipnull::Bool = false) + missingdata = anynull(X) if skipnull - return _mapreduce_skipnull(f, Base.specialized_binary(op), X) + return _mapreduce_skipnull(f, Base.specialized_binary(op), + X, missingdata) else - return Base._mapreduce(f, Base.specialized_binary(op), X) + return Base._mapreduce(f, Base.specialized_binary(op), X, missingdata) end end -Base.mapreduce(f, op, X::NullableArray; skipnull::Bool = false) = - skipnull ? _mapreduce_skipnull(f, op, X) : Base._mapreduce(f, op, X) +function Base.mapreduce(f, op, X::NullableArray; skipnull::Bool = false) + missingdata = anynull(X) + if skipnull + return _mapreduce_skipnull(f, op, X, missingdata) + else + return Base._mapreduce(f, op, X, missingdata) + end +end Base.reduce(op, X::NullableArray; skipnull::Bool = false) = mapreduce(Base.IdFun(), op, X; skipnull = skipnull) @@ -113,8 +127,18 @@ end #----- Base.minimum / Base.maximum -------------------------------------------# # internal methods +for Op in (:(Base.MinFun), :(Base.MaxFun)) + @eval begin + function Base._mapreduce{T}(::Base.IdFun, ::$Op, + X::NullableArray{T}, missingdata) + missingdata && return Nullable{T}() + Nullable(Base._mapreduce(Base.IdFun(), $Op(), X.values)) + end + end +end -function Base.mapreduce_impl{T}(f, op::Base.MinFun, X::NullableArray{T}, first::Int, last::Int) +function Base.mapreduce_impl{T}(f, op::Base.MinFun, X::NullableArray{T}, + first::Int, last::Int) # locate the first non-null entry i = first while X.isnull[i] && i <= last @@ -135,7 +159,8 @@ function Base.mapreduce_impl{T}(f, op::Base.MinFun, X::NullableArray{T}, first:: return v end -function Base.mapreduce_impl{T}(f, op::Base.MaxFun, X::NullableArray{T}, first::Int, last::Int) +function Base.mapreduce_impl{T}(f, op::Base.MaxFun, X::NullableArray{T}, + first::Int, last::Int) # locate the first non-null entry i = first while X.isnull[i] && i <= last @@ -155,9 +180,3 @@ function Base.mapreduce_impl{T}(f, op::Base.MaxFun, X::NullableArray{T}, first:: end return v end - -#----- Base.mean -------------------------------------------------------------# - -Base.mean(X::NullableArray; skipnull::Bool = false) = - sum(X; skipnull = skipnull) / - Nullable(length(X.isnull) - (skipnull * countnz(X.isnull))) diff --git a/src/statistics.jl b/src/statistics.jl new file mode 100644 index 0000000..b65d1e7 --- /dev/null +++ b/src/statistics.jl @@ -0,0 +1,91 @@ +using StatsBase + +Base.mean(X::NullableArray; skipnull::Bool = false) = + sum(X; skipnull = skipnull) / + Nullable(length(X.isnull) - (skipnull * countnz(X.isnull))) + +function Base.mean{T, W, V}(X::NullableArray{T}, w::WeightVec{W, V}; + skipnull::Bool=false) + if skipnull + _X = NullableArray(X.values .* w.values, X.isnull) + _w = NullableArray(w.values, X.isnull) + return sum(_X; skipnull=true) / sum(_w; skipnull=true) + else + anynull(X) ? Nullable{T}() : Nullable(mean(X.values, w)) + end +end + +function Base.mean{T, W, V<:NullableArray}(X::NullableArray{T}, + w::WeightVec{W, V}; + skipnull::Bool=false) + if skipnull + _X = X .* w.values + _w = NullableArray(w.values, _X.isnull) + return sum(_X; skipnull=true) / sum(_w; skipnull=true) + else + anynull(X) || anynull(w) ? Nullable{T}() : + Nullable(mean(X.values, w.values.values)) + end +end + + +function Base.varm{T}(X::NullableArray{T}, m::Number; corrected::Bool=true, + skipnull::Bool=false) + if skipnull + n = length(X) + + nnull = countnz(X.isnull) + nnull == n && return Nullable(convert(Base.momenttype(T), NaN)) + nnull == n-1 && return Nullable( + convert(Base.momenttype(T), + abs2(X.values[Base.findnextnot(X.isnull, 1)] - m)/(1 - Int(corrected)) + ) + ) + /(nnull == 0 ? Base.centralize_sumabs2(X.values, m, 1, n) : + mapreduce_impl_skipnull(Base.CentralizedAbs2Fun(m), + Base.AddFun(), X), + Nullable(n - nnull - Int(corrected)) + ) + else + any(X.isnull) && return Nullable{T}() + Nullable(Base.varm(X.values, m; corrected=corrected)) + end +end + +function Base.varm{T, U<:Number}(X::NullableArray{T}, m::Nullable{U}; + corrected::Bool=true, skipnull::Bool=false) + m.isnull && throw(NullException()) + return varm(X, m.value; corrected=corrected, skipnull=skipnull) +end + +function Base.varzm{T}(X::NullableArray{T}; corrected::Bool=true, + skipnull::Bool=false) + n = length(X) + nnull = skipnull ? countnz(X.isnull) : 0 + (n == 0 || n == nnull) && return Nullable(convert(Base.momenttype(T), NaN)) + return sumabs2(X; skipnull=skipnull) / (n - nnull - Int(corrected)) +end + +function Base.var(X::NullableArray; corrected::Bool=true, mean=nothing, + skipnull::Bool=false) + if mean == 0 || (isa(mean, Nullable) && get(mean == Nullable(0))) + return Base.varzm(X; corrected=corrected, skipnull=skupnull) + elseif mean == nothing + return varm(X, Base.mean(X; skipnull=skupnull); corrected=corrected, + skipnull=skipnull) + elseif isa(mean, Union{Number, Nullable{Number}}) + return varm(X, mean; corrected=corrected, skipnull=skipnull) + else + error() + end +end + +function Base.stdm{T<:Number}(X::NullableArray, m::Union{T, Nullable{T}}; + corrected::Bool=true, skipnull::Bool=true) + return sqrt(varm(X, m; corrected=corrected, skipnull=skipnull)) +end + +function Base.stdm(X::NullableArray; corrected::Bool=true, + mean=nothing, skipnull::Bool=true) + return sqrt(varm(X; corrected=corrected, mean=mean, skipnull=skipnull)) +end