diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 6ba58fced..cebd7cf37 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -18,11 +18,15 @@ module StatsBase ## weights AbstractWeights, # the abstract type to represent any weight vector - Weights, # the default type for representing a weight vector + AnalyticWeights, # the default type for representing a analytic/precision/reliability weight vectors FrequencyWeights, # the type for representing a frequency weight vectors - weights, # construct a weights vector - frequency, # construct a frequency weights vector - exponential, # construct a weights vector using a exponential smoothing schema + ProbabilityWeights,# the type for representing a probability/sampling weight vectors + ExponentialWeights,# the type for representing exponential weights + weights, # alias for aweights + aweights, # construct an AnalyticWeights vector + fweights, # construct a FrequencyWeights vector + pweights, # construct a ProbabilityWeights vector + eweights, # construct an ExponentialWeights vector wsum, # weighted sum with vector as second argument wsum!, # weighted sum across dimensions with provided storage wmean, # weighted mean diff --git a/src/cov.jl b/src/cov.jl index 83faeda84..fc68c12c5 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -81,16 +81,17 @@ function mean_and_cov end end ## weighted cov - Base.cov(x::DenseMatrix, wv::AbstractWeights; mean=nothing, vardim::Int=1) = - scale!(scattermat(x, wv; mean=mean, vardim=vardim), inv(bias(wv))) + function Base.cov(x::DenseMatrix, wv::AbstractWeights; mean=nothing, vardim::Int=1, corrected=true) + scale!(scattermat(x, wv; mean=mean, vardim=vardim), bias(wv, corrected)) + end - function mean_and_cov(x::DenseMatrix; vardim::Int=1) + function mean_and_cov(x::DenseMatrix; vardim::Int=1, corrected=true) m = mean(x, vardim) - return m, Base.covm(x, m; vardim=vardim) + return m, Base.covm(x, m; vardim=vardim, corrected=corrected) end - function mean_and_cov(x::DenseMatrix, wv::AbstractWeights; vardim::Int=1) + function mean_and_cov(x::DenseMatrix, wv::AbstractWeights; vardim::Int=1, corrected=true) m = mean(x, wv, vardim) - return m, Base.cov(x, wv; mean=m, vardim=vardim) + return m, Base.cov(x, wv; mean=m, vardim=vardim, corrected=corrected) end else scattermatm(x::DenseMatrix, mean, vardim::Int=1) = @@ -106,18 +107,21 @@ else scattermatm(x, Base.mean(x, wv, vardim), wv, vardim) ## weighted cov - Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1) = - scale!(scattermatm(x, mean, wv, vardim), inv(bias(wv))) + function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1, corrected::Bool=true) + scale!(scattermatm(x, mean, wv, vardim), bias(wv, corrected)) + end - Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = - Base.covm(x, Base.mean(x, wv, vardim), wv, vardim) + function Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=true) + Base.covm(x, Base.mean(x, wv, vardim), wv, vardim, corrected) + end - function mean_and_cov(x::DenseMatrix, vardim::Int=1) + function mean_and_cov(x::DenseMatrix, vardim::Int=1; corrected=true) m = mean(x, vardim) - return m, Base.covm(x, m, vardim) + return m, Base.covm(x, m, vardim, corrected) end - function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) + + function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=true) m = mean(x, wv, vardim) - return m, Base.cov(x, wv, vardim) + return m, Base.cov(x, wv, vardim; corrected=corrected) end end diff --git a/src/deprecates.jl b/src/deprecates.jl index a75707fd0..6f3905e65 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -43,3 +43,5 @@ findat(a::AbstractArray, b::AbstractArray) = findat!(Array{Int}(size(b)), a, b) @deprecate df(obj::StatisticalModel) dof(obj) @deprecate df_residual(obj::StatisticalModel) dof_residual(obj) + +@deprecate WeightVec{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) AnalyticWeights(vs, s) diff --git a/src/moments.jl b/src/moments.jl index 0c5b83f07..2ea437d9c 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -15,7 +15,9 @@ whereas it's `length(x)-1` in `Base.varm`. The impact is that this is not a weighted estimate of the population variance based on the sample; it's the weighted variance of the sample. """ -Base.varm(v::RealArray, wv::AbstractWeights, m::Real) = _moment2(v, wv, m) +function Base.varm(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) + _moment2(v, wv, m, corrected=corrected) +end """ var(x, wv::AbstractWeights, [dim]; mean=nothing) @@ -30,23 +32,32 @@ whereas it's `length(x)-1` in `Base.var`. The impact is that this is not a weighted estimate of the population variance based on the sample; it's the weighted variance of the sample. """ -function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing) - mean == 0 ? Base.varm(v, wv, 0) : - mean == nothing ? varm(v, wv, Base.mean(v, wv)) : - varm(v, wv, mean) +function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=true) + mean == 0 ? varm(v, wv, 0; corrected=corrected) : + mean == nothing ? varm(v, wv, Base.mean(v, wv); corrected=corrected) : + varm(v, wv, mean; corrected=corrected) end ## var along dim -Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int) = - scale!(_wsum_centralize!(R, @functorize(abs2), A, values(wv), M, dim, true), inv(bias(wv))) +function Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=true) + scale!( + _wsum_centralize!(R, @functorize(abs2), A, values(wv), M, dim, true), + bias(wv, corrected) + ) +end -function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) +function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) if mean == 0 - Base.varm!(R, A, wv, - Base.reducedim_initarray(A, dim, 0, eltype(R)), dim) + Base.varm!( + R, A, wv, Base.reducedim_initarray(A, dim, 0, eltype(R)), dim; + corrected=corrected + ) elseif mean == nothing - Base.varm!(R, A, wv, Base.mean(A, wv, dim), dim) + Base.varm!( + R, A, wv, Base.mean(A, wv, dim), dim; + corrected=corrected + ) else # check size of mean for i = 1:ndims(A) @@ -58,23 +69,37 @@ function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mea dM == dA || throw(DimensionMismatch("Incorrect size of mean.")) end end - Base.varm!(R, A, wv, mean, dim) + Base.varm!(R, A, wv, mean, dim; corrected=corrected) end end -Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int) = +function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=true) @static if VERSION < v"0.6.0-dev.1121" - Base.varm!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, M, dim) + return Base.varm!( + similar(A, Float64, Base.reduced_dims(size(A), dim)), + A, wv, M, dim; corrected=corrected + ) else - Base.varm!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, M, dim) + return Base.varm!( + similar(A, Float64, Base.reduced_indices(indices(A), dim)), + A, wv, M, dim; corrected=corrected + ) end +end -Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) = +function Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) @static if VERSION < v"0.6.0-dev.1121" - var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, dim; mean=mean) + return var!( + similar(A, Float64, Base.reduced_dims(size(A), dim)), + A, wv, dim; mean=mean, corrected=corrected + ) else - var!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, dim; mean=mean) + return var!( + similar(A, Float64, Base.reduced_indices(indices(A), dim)), + A, wv, dim; mean=mean, corrected=corrected + ) end +end ## std """ @@ -84,7 +109,9 @@ Return the standard deviation of a real-valued array `v` with a known mean `m`, optionally over a dimension `dim`. The weighting vector `wv` specifies frequency weights (also called case weights) for the estimate. """ -Base.stdm(v::RealArray, wv::AbstractWeights, m::Real) = sqrt(varm(v, wv, m)) +function Base.stdm(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) + sqrt(varm(v, wv, m; corrected=corrected)) +end """ std(v, wv::AbstractWeights, [dim]; mean=nothing) @@ -93,12 +120,21 @@ Return the standard deviation of a real-valued array `v`, optionally over a dimension `dim`. The weighting vector `wv` specifies frequency weights (also called case weights) for the estimate. """ -Base.std(v::RealArray, wv::AbstractWeights; mean=nothing) = sqrt.(var(v, wv; mean=mean)) +function Base.std(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=true) + sqrt.(var(v, wv; mean=mean, corrected=corrected)) +end -Base.stdm(v::RealArray, m::RealArray, dim::Int) = Base.sqrt!(varm(v, m, dim)) -Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int) = sqrt.(varm(v, wv, m, dim)) -Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) = sqrt.(var(v, wv, dim; mean=mean)) +function Base.stdm(v::RealArray, m::RealArray, dim::Int; corrected=true) + Base.sqrt!(varm(v, m, dim; corrected=corrected)) +end + +function Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int; corrected=true) + sqrt.(varm(v, wv, m, dim; corrected=corrected)) +end +function Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) + sqrt.(var(v, wv, dim; mean=mean, corrected=corrected)) +end ##### Fused statistics """ @@ -108,7 +144,11 @@ Return the mean and variance of a real-valued array `x`, optionally over a dimen `dim`, as a tuple. A weighting vector `wv` can be specified to weight the estimates. The weights are assumed to be frequency weights, also called case weights. """ -mean_and_var(A::RealArray) = (m = mean(A); (m, varm(A, m))) +function mean_and_var(A::RealArray; corrected=true) + m = mean(A) + v = varm(A, m; corrected=corrected) + m, v +end """ mean_and_std(x, [wv::AbstractWeights], [dim]) -> (mean, std) @@ -118,31 +158,61 @@ over a dimension `dim`, as a tuple. A weighting vector `wv` can be specified to weight the estimates. The weights are assumed to be frequency weights, also called case weights. """ -mean_and_std(A::RealArray) = (m = mean(A); (m, stdm(A, m))) +function mean_and_std(A::RealArray; corrected=true) + m = mean(A) + s = stdm(A, m; corrected=corrected) + m, s +end -mean_and_var(A::RealArray, wv::AbstractWeights) = (m = mean(A, wv); (m, varm(A, wv, m))) -mean_and_std(A::RealArray, wv::AbstractWeights) = (m = mean(A, wv); (m, stdm(A, wv, m))) +function mean_and_var(A::RealArray, wv::AbstractWeights; corrected=true) + m = mean(A, wv) + v = varm(A, wv, m; corrected=corrected) + m, v +end + +function mean_and_std(A::RealArray, wv::AbstractWeights; corrected=true) + m = mean(A, wv) + s = stdm(A, wv, m; corrected=corrected) + m, s +end -mean_and_var(A::RealArray, dim::Int) = (m = mean(A, dim); (m, varm(A, m, dim))) -mean_and_std(A::RealArray, dim::Int) = (m = mean(A, dim); (m, stdm(A, m, dim))) +function mean_and_var(A::RealArray, dim::Int; corrected=true) + m = mean(A, dim) + v = varm(A, m, dim; corrected=corrected) + m, v +end + +function mean_and_std(A::RealArray, dim::Int; corrected=true) + m = mean(A, dim) + s = stdm(A, m, dim; corrected=corrected) + m, s +end -mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int) = (m = mean(A, wv, dim); (m, varm(A, wv, m, dim))) -mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int) = (m = mean(A, wv, dim); (m, stdm(A, wv, m, dim))) +function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int; corrected=true) + m = mean(A, wv, dim) + v = varm(A, wv, m, dim; corrected=corrected) + m, v +end +function mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int; corrected=true) + m = mean(A, wv, dim) + s = stdm(A, wv, m, dim; corrected=corrected) + m, s +end ##### General central moment -function _moment2(v::RealArray, m::Real) +function _moment2(v::RealArray, m::Real; corrected=true) n = length(v) s = 0.0 for i = 1:n @inbounds z = v[i] - m s += z * z end - s / n + s * bias(n, corrected) end -function _moment2(v::RealArray, wv::AbstractWeights, m::Real) +function _moment2(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) n = length(v) s = 0.0 w = values(wv) @@ -150,20 +220,22 @@ function _moment2(v::RealArray, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m @inbounds s += (z * z) * w[i] end - s / sum(wv) + + result = s * bias(wv, corrected) + return result end -function _moment3(v::RealArray, m::Real) +function _moment3(v::RealArray, m::Real; corrected=true) n = length(v) s = 0.0 for i = 1:n @inbounds z = v[i] - m s += z * z * z end - s / n + s * bias(n, corrected) end -function _moment3(v::RealArray, wv::AbstractWeights, m::Real) +function _moment3(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) n = length(v) s = 0.0 w = values(wv) @@ -171,20 +243,20 @@ function _moment3(v::RealArray, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m @inbounds s += (z * z * z) * w[i] end - s / sum(wv) + s * bias(wv, corrected) end -function _moment4(v::RealArray, m::Real) +function _moment4(v::RealArray, m::Real; corrected=true) n = length(v) s = 0.0 for i = 1:n @inbounds z = v[i] - m s += abs2(z * z) end - s / n + s * bias(n, corrected) end -function _moment4(v::RealArray, wv::AbstractWeights, m::Real) +function _moment4(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) n = length(v) s = 0.0 w = values(wv) @@ -192,20 +264,20 @@ function _moment4(v::RealArray, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m @inbounds s += abs2(z * z) * w[i] end - s / sum(wv) + s * bias(wv, corrected) end -function _momentk(v::RealArray, k::Int, m::Real) +function _momentk(v::RealArray, k::Int, m::Real; corrected=true) n = length(v) s = 0.0 for i = 1:n @inbounds z = v[i] - m s += (z ^ k) end - s / n + s * bias(n, corrected) end -function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real) +function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected=true) n = length(v) s = 0.0 w = values(wv) @@ -213,7 +285,7 @@ function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m @inbounds s += (z ^ k) * w[i] end - s / sum(wv) + s * bias(wv, corrected) end @@ -223,22 +295,24 @@ end Return the `k`th order central moment of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. """ -function moment(v::RealArray, k::Int, m::Real) - k == 2 ? _moment2(v, m) : - k == 3 ? _moment3(v, m) : - k == 4 ? _moment4(v, m) : - _momentk(v, k, m) +function moment(v::RealArray, k::Int, m::Real; corrected=true) + k == 2 ? _moment2(v, m; corrected=corrected) : + k == 3 ? _moment3(v, m; corrected=corrected) : + k == 4 ? _moment4(v, m; corrected=corrected) : + _momentk(v, k, m; corrected=corrected) end -function moment(v::RealArray, k::Int, wv::AbstractWeights, m::Real) - k == 2 ? _moment2(v, wv, m) : - k == 3 ? _moment3(v, wv, m) : - k == 4 ? _moment4(v, wv, m) : - _momentk(v, k, wv, m) +function moment(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected=true) + k == 2 ? _moment2(v, wv, m; corrected=corrected) : + k == 3 ? _moment3(v, wv, m; corrected=corrected) : + k == 4 ? _moment4(v, wv, m; corrected=corrected) : + _momentk(v, k, wv, m; corrected=corrected) end -moment(v::RealArray, k::Int) = moment(v, k, mean(v)) -moment(v::RealArray, k::Int, wv::AbstractWeights) = moment(v, k, wv, mean(v, wv)) +moment(v::RealArray, k::Int; corrected=true) = moment(v, k, mean(v); corrected=corrected) +function moment(v::RealArray, k::Int, wv::AbstractWeights; corrected=true) + moment(v, k, wv, mean(v, wv); corrected=corrected) +end ##### Skewness and Kurtosis @@ -251,7 +325,7 @@ moment(v::RealArray, k::Int, wv::AbstractWeights) = moment(v, k, wv, mean(v, wv) Compute the standardized skewness of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. """ -function skewness(v::RealArray, m::Real) +function skewness(v::RealArray, m::Real; corrected=true) n = length(v) cm2 = 0.0 # empirical 2nd centered moment (variance) cm3 = 0.0 # empirical 3rd centered moment @@ -262,12 +336,13 @@ function skewness(v::RealArray, m::Real) cm2 += z2 cm3 += z2 * z end - cm3 /= n - cm2 /= n + b = bias(n, corrected) + cm3 *= b + cm2 *= b return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 end -function skewness(v::RealArray, wv::AbstractWeights, m::Real) +function skewness(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) n = length(v) length(wv) == n || throw(DimensionMismatch("Inconsistent array lengths.")) cm2 = 0.0 # empirical 2nd centered moment (variance) @@ -282,14 +357,16 @@ function skewness(v::RealArray, wv::AbstractWeights, m::Real) cm2 += z2w cm3 += z2w * z end - sw = sum(wv) - cm3 /= sw - cm2 /= sw + b = bias(wv, corrected) + cm3 *= b + cm2 *= b return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 end -skewness(v::RealArray) = skewness(v, mean(v)) -skewness(v::RealArray, wv::AbstractWeights) = skewness(v, wv, mean(v, wv)) +skewness(v::RealArray; corrected=true) = skewness(v, mean(v); corrected=corrected) +function skewness(v::RealArray, wv::AbstractWeights; corrected=true) + skewness(v, wv, mean(v, wv); corrected=corrected) +end # (excessive) Kurtosis # This is Type 1 definition according to Joanes and Gill (1998) @@ -299,7 +376,7 @@ skewness(v::RealArray, wv::AbstractWeights) = skewness(v, wv, mean(v, wv)) Compute the excess kurtosis of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. """ -function kurtosis(v::RealArray, m::Real) +function kurtosis(v::RealArray, m::Real; corrected=true) n = length(v) cm2 = 0.0 # empirical 2nd centered moment (variance) cm4 = 0.0 # empirical 4th centered moment @@ -309,12 +386,13 @@ function kurtosis(v::RealArray, m::Real) cm2 += z2 cm4 += z2 * z2 end - cm4 /= n - cm2 /= n + b = bias(n, corrected) + cm4 *= b + cm2 *= b return (cm4 / (cm2 * cm2)) - 3.0 end -function kurtosis(v::RealArray, wv::AbstractWeights, m::Real) +function kurtosis(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) n = length(v) length(wv) == n || throw(DimensionMismatch("Inconsistent array lengths.")) cm2 = 0.0 # empirical 2nd centered moment (variance) @@ -330,11 +408,13 @@ function kurtosis(v::RealArray, wv::AbstractWeights, m::Real) cm2 += z2w cm4 += z2w * z2 end - sw = sum(wv) - cm4 /= sw - cm2 /= sw + b = bias(wv, corrected) + cm4 *= b + cm2 *= b return (cm4 / (cm2 * cm2)) - 3.0 end -kurtosis(v::RealArray) = kurtosis(v, mean(v)) -kurtosis(v::RealArray, wv::AbstractWeights) = kurtosis(v, wv, mean(v, wv)) +kurtosis(v::RealArray; corrected=true) = kurtosis(v, mean(v); corrected=corrected) +function kurtosis(v::RealArray, wv::AbstractWeights; corrected=true) + kurtosis(v, wv, mean(v, wv); corrected=corrected) +end diff --git a/src/weights.jl b/src/weights.jl index 3ab1a0253..ac4f60add 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -3,113 +3,181 @@ if VERSION < v"0.6.0-dev.2123" abstract AbstractWeights{S<:Real, T<:Real, V<:RealVector} <: RealVector{T} +else + abstract AbstractWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractVector{T} +end - immutable Weights{S<:Real, T<:Real, V<:RealVector} <: AbstractWeights{S, T, V} - values::V - sum::S - bias::Real - end +""" + `@weights name` - immutable FrequencyWeights{S<:Integer, T<:Integer, V<:IntegerVector} <: AbstractWeights{S, T, V} - values::V - sum::S - bias::Int +Generates a new generic weight type with specified `name`, which subtypes `AbstractWeights` +and stores the `values` (`V<:RealVector`) and `sum` (`S<:Real`). +""" +macro weights(name) + return quote + if VERSION < v"0.6.0-dev.2123" + immutable $name{S<:Real, T<:Real, V<:RealVector} <: AbstractWeights{S, T, V} + values::V + sum::S + end + else + immutable $name{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} + values::V + sum::S + end + end end +end - immutable ProbabilityWeights{S<:Real, T<:Real, V<:RealVector} <: AbstractWeights{S, T, V} - values::V - sum::S - bias::Real - end -else - abstract AbstractWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractVector{T} +eltype(wv::AbstractWeights) = eltype(wv.values) +length(wv::AbstractWeights) = length(wv.values) +values(wv::AbstractWeights) = wv.values +sum(wv::AbstractWeights) = wv.sum +isempty(wv::AbstractWeights) = isempty(wv.values) - immutable Weights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} - values::V - sum::S - bias::Real - end +Base.getindex(wv::AbstractWeights, i) = getindex(wv.values, i) +Base.size(wv::AbstractWeights) = size(wv.values) - immutable FrequencyWeights{S<:Integer, T<:Integer, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} - values::V - sum::S - bias::Int - end +""" + bias(n::Integer, [corrected]) - immutable ProbabilityWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} - values::V - sum::S - bias::Real - end -end +Computes the corrected (default) or uncorrected bias for any `n` observations. +```math +\fraction{1}{n - 1} +``` """ - Weights(vs, [wsum]) +bias(n::Integer, corrected=true) = inv(n - Int(corrected)) -Construct a `Weights` with weight values `vs` and sum of weights `wsum`. -If omitted, `wsum` is computed. """ -function Weights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs); corrected::Bool=true) - if isempty(vs) || !corrected - return Weights{typeof(s), eltype(vs), V}(vs, s, s) + bias(w::AbstractWeights, [corrected]) + +Computes the corrected (default) or uncorrected bias for any weight vector. +The default equation assumes analytic/precision/reliability weights and determines the +bias as: + +```math +\fraction{1}{∑w × (1 - ∑(w'²))} +``` +where w' represents the normalized weights +""" +function bias(w::AbstractWeights, corrected=true) + s = sum(w) + if corrected + return inv(s * (1 - sum(normalize(values(w), 1) .^ 2))) else - return Weights{S, eltype(vs), V}(vs, s, s * (1 - sum(normalize(vs, 1) .^ 2))) + return inv(s) end end -function FrequencyWeights{S<:Integer, V<:IntegerVector}(vs::V, s::S=sum(vs); corrected::Bool=true) - return FrequencyWeights{S, eltype(vs), V}(vs, s, s - Int(corrected)) +@weights AnalyticWeights + +""" + AnalyticWeights(vs, [wsum]) + +Construct a `AnalyticWeights` with weight values `vs` and sum of weights `wsum`. +If omitted, `wsum` is computed. +""" +function AnalyticWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) + return AnalyticWeights{S, eltype(vs), V}(vs, s) end -# TODO: constructor for ProbabilityWeights, but I'm not familiar with how bias correction works with these -# types of weights or if bias correction even makes sense. -# https://en.wikipedia.org/wiki/Inverse_probability_weighting +""" + aweights(vs) + +Construct a `AnalyticWeights` type from a given array. +""" +aweights(vs::RealVector) = AnalyticWeights(vs) +aweights(vs::RealArray) = AnalyticWeights(vec(vs)) """ weights(vs) -Construct a `Weights` type from a given array. +Alias for aweights(vs) """ -weights(vs::RealVector, corrected=true) = Weights(vs; corrected=corrected) -weights(vs::RealArray, corrected=true) = Weights(vec(vs); corrected=corrected) +weights(vs) = aweights(vs) + +@weights FrequencyWeights + +function FrequencyWeights{S<:Integer, V<:IntegerVector}(vs::V, s::S=sum(vs)) + return FrequencyWeights{S, eltype(vs), V}(vs, s) +end """ - frequency(vs) + fweights(vs) Construct a `FrequencyWeights` type from a given array. """ -frequency(vs::RealVector, corrected=true) = FrequencyWeights(vs; corrected=corrected) -frequency(vs::RealArray, corrected=true) = FrequencyWeights(vec(vs); corrected=corrected) +fweights(vs::IntegerVector) = FrequencyWeights(vs) +fweights(vs::IntegerArray) = FrequencyWeights(vec(vs)) + +""" + bias(w::FrequencyWeights, [corrected]) + +```math +\fraction{1}{∑w - 1} +``` +""" +bias(w::FrequencyWeights, corrected=true) = inv(sum(w) - Int(corrected)) + +@weights ProbabilityWeights + +function ProbabilityWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) + return ProbabilityWeights{S, eltype(vs), V}(vs, s) +end + +""" + pweights(vs) + +Construct a `ProbabilityWeights` type from a given array. +""" +pweights(vs::RealVector) = ProbabilityWeights(vs) +pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) + +""" + bias(w::ProbabilityWeights, [corrected]) + +```math +\fraction{n}{∑w × (n - 1)} +``` +""" +function bias(w::ProbabilityWeights, corrected=true) + s = sum(w) + + if corrected + n = length(values(w)) + return n / (s * (n - 1)) + else + return inv(s) + end +end + +@weights ExponentialWeights + +function ExponentialWeights{V<:RealVector}(vs::V) + s = sum(vs) + return ExponentialWeights{typeof(s), eltype(vs), V}(vs, s) +end """ - exponential(n, [λ]) + eweights(n, [λ]) -Constructs a `Weights` type with a desired length `n` and smoothing factor `λ`, +Constructs a `ExponentialWeights` type with a desired length `n` and smoothing factor `λ`, where each element is set to `λ * (1 - λ)^(1 - i)`. # Arguments * `n::Integer`: the desired length of the `Weights` -* `λ::Real`: is a smoothing factor or rate paremeter between 0 .. 1. - As this value approaches 0 the resulting weights will be almost equal(), +* `λ::Real`: a smoothing factor or rate parameter between 0 and 1. + As this value approaches 0 the resulting weights will be almost equal, while values closer to 1 will put higher weight on the end elements of the vector. """ -function exponential(n::Integer, λ::Real=0.99) - @assert 0 <= λ <= 1 && n > 0 +function eweights(n::Integer, λ::Real=0.99) + n > 0 || throw(ArgumentError("cannot construct weights of length < 1")) + 0 <= λ <= 1 || throw(ArgumentError("smoothing factor must be between 0 and 1")) w0 = map(i -> λ * (1 - λ)^(1 - i), 1:n) return weights(w0) end -eltype(wv::AbstractWeights) = eltype(wv.values) -length(wv::AbstractWeights) = length(wv.values) -values(wv::AbstractWeights) = wv.values -sum(wv::AbstractWeights) = wv.sum -bias(wv::AbstractWeights) = wv.bias -isempty(wv::AbstractWeights) = isempty(wv.values) - -Base.getindex(wv::AbstractWeights, i) = getindex(wv.values, i) -Base.size(wv::AbstractWeights) = size(wv.values) - - ##### Weighted sum ##### ## weighted sum over vectors @@ -339,7 +407,7 @@ Compute the weighted mean of an array `v` with weights `w`. """ function wmean{T<:Number}(v::AbstractArray{T}, w::AbstractVector) Base.depwarn("wmean is deprecated, use mean(v, weights(w)) instead.", :wmean) - mean(v, weights(w, false)) + mean(v, weights(w)) end Base.mean(v::AbstractArray, w::AbstractWeights) = sum(v, w) / sum(w) @@ -416,7 +484,7 @@ end Compute the weighted median of an array `v` with weights `w`, given as either a vector or `AbstractWeights`. """ -wmedian(v::RealVector, w::RealVector) = median(v, weights(w, false)) +wmedian(v::RealVector, w::RealVector) = median(v, weights(w)) wmedian{W<:Real}(v::RealVector, w::AbstractWeights{W}) = median(v, w) ###### Weighted quantile ##### @@ -510,5 +578,5 @@ or a `AbstractWeights`. """ wquantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::RealVector) = quantile(v, w, p) wquantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::Number) = quantile(v, w, [p])[1] -wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, weights(w, false), p) -wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, weights(w, false), [p])[1] +wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, weights(w), p) +wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, weights(w), [p])[1] diff --git a/test/counts.jl b/test/counts.jl index d763817d3..2fae64eed 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -6,7 +6,7 @@ n = 5000 # 1D integer counts x = rand(1:5, n) -w = weights(rand(n), false) +w = weights(rand(n)) c = counts(x, 5) @test size(c) == (5,) @@ -40,7 +40,7 @@ c0 = Float64[sum(w.values[x .== i]) for i in 1 : 5] x = rand(1:4, n) y = rand(1:5, n) -w = weights(rand(n), false) +w = weights(rand(n)) c = counts(x, y, (4, 5)) @test size(c) == (4, 5) @@ -85,11 +85,11 @@ pm = proportionmap(x) @test pm["b"] ≈ (1/3) @test pm["c"] ≈ (1/6) -cm = countmap(x, weights(w, false)) +cm = countmap(x, weights(w)) @test cm["a"] == 5.5 @test cm["b"] == 4.5 @test cm["c"] == 3.5 -pm = proportionmap(x, weights(w, false)) +pm = proportionmap(x, weights(w)) @test pm["a"] ≈ (5.5 / 13.5) @test pm["b"] ≈ (4.5 / 13.5) @test pm["c"] ≈ (3.5 / 13.5) diff --git a/test/cov.jl b/test/cov.jl index 7523cb5b7..2cbd1d049 100644 --- a/test/cov.jl +++ b/test/cov.jl @@ -9,8 +9,8 @@ Z2 = X .- mean(X, 2) w1 = rand(3) w2 = rand(8) -wv1 = weights(w1, false) -wv2 = weights(w2, false) +wv1 = weights(w1) +wv2 = weights(w2) Z1w = X .- mean(X, wv1, 1) Z2w = X .- mean(X, wv2, 2) @@ -88,62 +88,62 @@ end # weighted covariance if VERSION < v"0.5.0-dev+679" - @test cov(X, wv1) ≈ S1w ./ sum(wv1) - @test cov(X, wv2; vardim=2) ≈ S2w ./ sum(wv2) + @test cov(X, wv1; corrected=false) ≈ S1w ./ sum(wv1) + @test cov(X, wv2; vardim=2, corrected=false) ≈ S2w ./ sum(wv2) - @test cov(X, wv1; mean=0) ≈ Sz1w ./ sum(wv1) - @test cov(X, wv2; mean=0, vardim=2) ≈ Sz2w ./ sum(wv2) + @test cov(X, wv1; mean=0, corrected=false) ≈ Sz1w ./ sum(wv1) + @test cov(X, wv2; mean=0, vardim=2, corrected=false) ≈ Sz2w ./ sum(wv2) - @test cov(X, wv1; mean=mean(X, wv1, 1)) ≈ S1w ./ sum(wv1) - @test cov(X, wv2; mean=mean(X, wv2, 2), vardim=2) ≈ S2w ./ sum(wv2) + @test cov(X, wv1; mean=mean(X, wv1, 1), corrected=false) ≈ S1w ./ sum(wv1) + @test cov(X, wv2; mean=mean(X, wv2, 2), vardim=2, corrected=false) ≈ S2w ./ sum(wv2) - @test cov(X, wv1; mean=zeros(1,8)) ≈ Sz1w ./ sum(wv1) - @test cov(X, wv2; mean=zeros(3), vardim=2) ≈ Sz2w ./ sum(wv2) + @test cov(X, wv1; mean=zeros(1,8), corrected=false) ≈ Sz1w ./ sum(wv1) + @test cov(X, wv2; mean=zeros(3), vardim=2, corrected=false) ≈ Sz2w ./ sum(wv2) else - @test cov(X, wv1) ≈ S1w ./ sum(wv1) - @test cov(X, wv2, 2) ≈ S2w ./ sum(wv2) + @test cov(X, wv1; corrected=false) ≈ S1w ./ sum(wv1) + @test cov(X, wv2, 2; corrected=false) ≈ S2w ./ sum(wv2) - @test Base.covm(X, 0, wv1) ≈ Sz1w ./ sum(wv1) - @test Base.covm(X, 0, wv2, 2) ≈ Sz2w ./ sum(wv2) + @test Base.covm(X, 0, wv1, 1, false) ≈ Sz1w ./ sum(wv1) + @test Base.covm(X, 0, wv2, 2, false) ≈ Sz2w ./ sum(wv2) - @test Base.covm(X, mean(X, wv1, 1), wv1) ≈ S1w ./ sum(wv1) - @test Base.covm(X, mean(X, wv2, 2), wv2, 2) ≈ S2w ./ sum(wv2) + @test Base.covm(X, mean(X, wv1, 1), wv1, 1, false) ≈ S1w ./ sum(wv1) + @test Base.covm(X, mean(X, wv2, 2), wv2, 2, false) ≈ S2w ./ sum(wv2) - @test Base.covm(X, zeros(1,8), wv1) ≈ Sz1w ./ sum(wv1) - @test Base.covm(X, zeros(3), wv2, 2) ≈ Sz2w ./ sum(wv2) + @test Base.covm(X, zeros(1,8), wv1, 1, false) ≈ Sz1w ./ sum(wv1) + @test Base.covm(X, zeros(3), wv2, 2, false) ≈ Sz2w ./ sum(wv2) end # mean_and_cov if VERSION < v"0.5.0-dev+679" - (m, C) = mean_and_cov(X; vardim=1) + (m, C) = mean_and_cov(X; vardim=1, corrected=false) @test m == mean(X, 1) - @test C == cov(X; vardim=1) + @test C == cov(X, vardim=1, corrected=false) - (m, C) = mean_and_cov(X; vardim=2) + (m, C) = mean_and_cov(X; vardim=2, corrected=false) @test m == mean(X, 2) - @test C == cov(X; vardim=2) + @test C == cov(X; vardim=2, corrected=false) - (m, C) = mean_and_cov(X, wv1; vardim=1) + (m, C) = mean_and_cov(X, wv1; vardim=1, corrected=false) @test m == mean(X, wv1, 1) - @test C == cov(X, wv1; vardim=1) + @test C == cov(X, wv1; vardim=1, corrected=false) - (m, C) = mean_and_cov(X, wv2; vardim=2) + (m, C) = mean_and_cov(X, wv2; vardim=2, corrected=false) @test m == mean(X, wv2, 2) - @test C == cov(X, wv2; vardim=2) + @test C == cov(X, wv2; vardim=2, corrected=false) else - (m, C) = mean_and_cov(X, 1) + (m, C) = mean_and_cov(X, 1; corrected=false) @test m == mean(X, 1) - @test C == cov(X, 1) + @test C == cov(X, 1, false) - (m, C) = mean_and_cov(X, 2) + (m, C) = mean_and_cov(X, 2; corrected=false) @test m == mean(X, 2) - @test C == cov(X, 2) + @test C == cov(X, 2, false) - (m, C) = mean_and_cov(X, wv1, 1) + (m, C) = mean_and_cov(X, wv1, 1; corrected=false) @test m == mean(X, wv1, 1) - @test C == cov(X, wv1, 1) + @test C == cov(X, wv1, 1; corrected=false) - (m, C) = mean_and_cov(X, wv2, 2) + (m, C) = mean_and_cov(X, wv2, 2; corrected=false) @test m == mean(X, wv2, 2) - @test C == cov(X, wv2, 2) + @test C == cov(X, wv2, 2; corrected=false) end diff --git a/test/hist.jl b/test/hist.jl index 7e4f551db..cad5641ce 100644 --- a/test/hist.jl +++ b/test/hist.jl @@ -23,12 +23,12 @@ using Base.Test @test fit(Histogram,(0:99,0:99),nbins=(5,5), closed=:left).weights == diagm([20,20,20,20,20]) # FIXME: closed (all lines in this block): -@test fit(Histogram,0:99,weights(ones(100), false),nbins=5, closed=:left).weights == [20,20,20,20,20] -@test fit(Histogram,0:99,weights(2*ones(100), false),nbins=5, closed=:left).weights == [40,40,40,40,40] +@test fit(Histogram,0:99,weights(ones(100)),nbins=5, closed=:left).weights == [20,20,20,20,20] +@test fit(Histogram,0:99,weights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] # FIXME: closed (all lines in this block): -@test eltype(fit(Histogram,1:100,weights(ones(Int,100), false),nbins=5, closed=:left).weights) == Int -@test eltype(fit(Histogram,1:100,weights(ones(Float64,100), false),nbins=5, closed=:left).weights) == Float64 +@test eltype(fit(Histogram,1:100,weights(ones(Int,100)),nbins=5, closed=:left).weights) == Int +@test eltype(fit(Histogram,1:100,weights(ones(Float64,100)),nbins=5, closed=:left).weights) == Float64 # histrange # Note: atm histrange must be qualified diff --git a/test/moments.jl b/test/moments.jl index 17c956c71..c1f3e2234 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -4,119 +4,150 @@ using Base.Test ##### weighted var & std x = rand(10) -wv = weights(rand(10), false) +wv = weights(rand(10)) m = mean(x, wv) -@test var(x, wv) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) -@test var(x, wv; mean=0) ≈ sum(abs2.(x), wv) ./ sum(wv) -@test var(x, wv; mean=1.0) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) +@test var(x, wv; corrected=false) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) +@test var(x, wv; mean=0, corrected=false) ≈ sum(abs2.(x), wv) ./ sum(wv) +@test var(x, wv; mean=1.0, corrected=false) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) -@test std(x, wv) ≈ sqrt(var(x, wv)) -@test std(x, wv; mean=0) ≈ sqrt(var(x, wv; mean=0)) -@test std(x, wv; mean=1.0) ≈ sqrt(var(x, wv; mean=1.0)) +@test std(x, wv; corrected=false) ≈ sqrt(var(x, wv; corrected=false)) +@test std(x, wv; mean=0, corrected=false) ≈ sqrt(var(x, wv; mean=0, corrected=false)) +@test std(x, wv; mean=1.0, corrected=false) ≈ sqrt(var(x, wv; mean=1.0, corrected=false)) -(m, v) = mean_and_var(x) +(m, v) = mean_and_var(x; corrected=false) @test m == mean(x) -@test v == var(x) +@test v == var(x; corrected=false) -(m, s) = mean_and_std(x) +(m, s) = mean_and_std(x; corrected=false) @test m == mean(x) -@test s == std(x) +@test s == std(x; corrected=false) -(m, v) = mean_and_var(x, wv) +(m, v) = mean_and_var(x, wv; corrected=false) @test m == mean(x, wv) -@test v == var(x, wv) +@test v == var(x, wv; corrected=false) -(m, s) = mean_and_std(x, wv) +(m, s) = mean_and_std(x, wv; corrected=false) @test m == mean(x, wv) -@test s == std(x, wv) +@test s == std(x, wv; corrected=false) x = rand(5, 6) w1 = rand(5) w2 = rand(6) -wv1 = weights(w1, false) -wv2 = weights(w2, false) +wv1 = weights(w1) +wv2 = weights(w2) m1 = mean(x, wv1, 1) m2 = mean(x, wv2, 2) -@test var(x, wv1, 1; mean=0) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) -@test var(x, wv2, 2; mean=0) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) +@test var(x, wv1, 1; mean=0, corrected=false) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) +@test var(x, wv2, 2; mean=0, corrected=false) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) -@test var(x, wv1, 1; mean=m1) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) -@test var(x, wv2, 2; mean=m2) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) +@test var(x, wv1, 1; mean=m1, corrected=false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) +@test var(x, wv2, 2; mean=m2, corrected=false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) -@test var(x, wv1, 1) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) -@test var(x, wv2, 2) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) +@test var(x, wv1, 1; corrected=false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) +@test var(x, wv2, 2; corrected=false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) -@test std(x, wv1, 1) ≈ sqrt.(var(x, wv1, 1)) -@test std(x, wv2, 2) ≈ sqrt.(var(x, wv2, 2)) -@test std(x, wv1, 1; mean=0) ≈ sqrt.(var(x, wv1, 1; mean=0)) -@test std(x, wv2, 2; mean=0) ≈ sqrt.(var(x, wv2, 2; mean=0)) -@test std(x, wv1, 1; mean=m1) ≈ sqrt.(var(x, wv1, 1; mean=m1)) -@test std(x, wv2, 2; mean=m2) ≈ sqrt.(var(x, wv2, 2; mean=m2)) +@test std(x, wv1, 1; corrected=false) ≈ sqrt.(var(x, wv1, 1; corrected=false)) +@test std(x, wv2, 2; corrected=false) ≈ sqrt.(var(x, wv2, 2; corrected=false)) +@test std(x, wv1, 1; mean=0, corrected=false) ≈ sqrt.(var(x, wv1, 1; mean=0, corrected=false)) +@test std(x, wv2, 2; mean=0, corrected=false) ≈ sqrt.(var(x, wv2, 2; mean=0, corrected=false)) +@test std(x, wv1, 1; mean=m1, corrected=false) ≈ sqrt.(var(x, wv1, 1; mean=m1, corrected=false)) +@test std(x, wv2, 2; mean=m2, corrected=false) ≈ sqrt.(var(x, wv2, 2; mean=m2, corrected=false)) for d in 1:2 - (m, v) = mean_and_var(x, d) + (m, v) = mean_and_var(x, d; corrected=false) @test m == mean(x, d) - @test v == var(x, d) + @test v == var(x, d; corrected=false) - (m, s) = mean_and_std(x, d) + (m, s) = mean_and_std(x, d; corrected=false) @test m == mean(x, d) - @test s == std(x, d) + @test s == std(x, d; corrected=false) end -(m, v) = mean_and_var(x, wv1, 1) +(m, v) = mean_and_var(x, wv1, 1; corrected=false) @test m == mean(x, wv1, 1) -@test v == var(x, wv1, 1) +@test v == var(x, wv1, 1; corrected=false) -(m, v) = mean_and_var(x, wv2, 2) +(m, v) = mean_and_var(x, wv2, 2; corrected=false) @test m == mean(x, wv2, 2) -@test v == var(x, wv2, 2) +@test v == var(x, wv2, 2; corrected=false) -(m, s) = mean_and_std(x, wv1, 1) +(m, s) = mean_and_std(x, wv1, 1; corrected=false) @test m == mean(x, wv1, 1) -@test s == std(x, wv1, 1) +@test s == std(x, wv1, 1; corrected=false) -(m, s) = mean_and_std(x, wv2, 2) +(m, s) = mean_and_std(x, wv2, 2; corrected=false) @test m == mean(x, wv2, 2) -@test s == std(x, wv2, 2) +@test s == std(x, wv2, 2; corrected=false) ##### skewness & kurtosis -wv = weights(ones(5) * 2.0, false) +wv = weights(ones(5) * 2.0) -@test skewness(1:5) ≈ 0.0 -@test skewness([1, 2, 3, 4, 5]) ≈ 0.0 -@test skewness([1, 2, 2, 2, 5]) ≈ 1.1731251294063556 -@test skewness([1, 4, 4, 4, 5]) ≈ -1.1731251294063556 +@test skewness(1:5; corrected=false) ≈ 0.0 +@test skewness([1, 2, 3, 4, 5]; corrected=false) ≈ 0.0 +@test skewness([1, 2, 2, 2, 5]; corrected=false) ≈ 1.1731251294063556 +@test skewness([1, 4, 4, 4, 5]; corrected=false) ≈ -1.1731251294063556 -@test skewness([1, 2, 2, 2, 5], wv) ≈ 1.1731251294063556 +@test skewness([1, 2, 2, 2, 5], wv; corrected=false) ≈ 1.1731251294063556 -@test kurtosis(1:5) ≈ -1.3 -@test kurtosis([1, 2, 3, 4, 5]) ≈ -1.3 -@test kurtosis([1, 2, 3, 3, 2]) ≈ -1.1530612244897953 +@test kurtosis(1:5; corrected=false) ≈ -1.3 +@test kurtosis([1, 2, 3, 4, 5]; corrected=false) ≈ -1.3 +@test kurtosis([1, 2, 3, 3, 2]; corrected=false) ≈ -1.1530612244897953 -@test kurtosis([1, 2, 3, 4, 5], wv) ≈ -1.3 +@test kurtosis([1, 2, 3, 4, 5], wv; corrected=false) ≈ -1.3 ##### general moments x = collect(2.0:8.0) -@test moment(x, 2) ≈ sum((x .- 5).^2) / length(x) -@test moment(x, 3) ≈ sum((x .- 5).^3) / length(x) -@test moment(x, 4) ≈ sum((x .- 5).^4) / length(x) -@test moment(x, 5) ≈ sum((x .- 5).^5) / length(x) +@test moment(x, 2; corrected=false) ≈ sum((x .- 5).^2) / length(x) +@test moment(x, 3; corrected=false) ≈ sum((x .- 5).^3) / length(x) +@test moment(x, 4; corrected=false) ≈ sum((x .- 5).^4) / length(x) +@test moment(x, 5; corrected=false) ≈ sum((x .- 5).^5) / length(x) -@test moment(x, 2, 4.0) ≈ sum((x .- 4).^2) / length(x) -@test moment(x, 3, 4.0) ≈ sum((x .- 4).^3) / length(x) -@test moment(x, 4, 4.0) ≈ sum((x .- 4).^4) / length(x) -@test moment(x, 5, 4.0) ≈ sum((x .- 4).^5) / length(x) +@test moment(x, 2, 4.0; corrected=false) ≈ sum((x .- 4).^2) / length(x) +@test moment(x, 3, 4.0; corrected=false) ≈ sum((x .- 4).^3) / length(x) +@test moment(x, 4, 4.0; corrected=false) ≈ sum((x .- 4).^4) / length(x) +@test moment(x, 5, 4.0; corrected=false) ≈ sum((x .- 4).^5) / length(x) w = weights([1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) x2 = collect(2.0:6.0) -@test moment(x, 2, w) ≈ sum((x2 .- 4).^2) / 5 -@test moment(x, 3, w) ≈ sum((x2 .- 4).^3) / 5 -@test moment(x, 4, w) ≈ sum((x2 .- 4).^4) / 5 -@test moment(x, 5, w) ≈ sum((x2 .- 4).^5) / 5 +@test moment(x, 2, w; corrected=false) ≈ sum((x2 .- 4).^2) / 5 +@test moment(x, 3, w; corrected=false) ≈ sum((x2 .- 4).^3) / 5 +@test moment(x, 4, w; corrected=false) ≈ sum((x2 .- 4).^4) / 5 +@test moment(x, 5, w; corrected=false) ≈ sum((x2 .- 4).^5) / 5 + +# Test corrected cases (this will be cleaner in testsets) +x = rand(10) + +# AnalyticWeights +@test var(x, aweights(ones(10))) ≈ var(x) + +w = aweights(rand(10)) +n = length(w) # Could be count(!iszero, w) instead +w = aweights(w .* (n / sum(w))) +sw = sum(w) # This is now equal to n, but maybe we should support non-normalized weights? +xbar = sum(w .* x) ./ sw +expected = sum(w .* (x .- xbar).^2)/(sw - sum(w.^2)/sw) +@test var(x, w) ≈ expected + +# FrequencyWeights +@test var(x, fweights(ones(Int, 10))) ≈ var(x) +w = fweights(rand(UInt, 10)) +sw = sum(w) +xbar = sum(w .* x) / sw +expected = sum(w .* (x .- xbar).^2) ./ (sum(w) - 1) +@test var(x, w) ≈ expected + +# ProbabilityWeights +@test var(x, pweights(ones(10))) ≈ var(x) +w = pweights(rand(10)) +n = count(!iszero, w) +sw = sum(w) +xbar = sum(w .* x)/sw +expected = sum(w .* (x .- xbar).^2)/sw * n/(n - 1) +@test var(x, w) ≈ expected diff --git a/test/weights.jl b/test/weights.jl index e27e3e74f..941fcc2c3 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -3,35 +3,25 @@ using Base.Test using Compat import Compat: view -@test isa(weights([1, 2, 3]), Weights{Int}) -@test isa(weights([1., 2., 3.]), Weights{Float64}) -@test isa(weights([1 2 3; 4 5 6]), Weights{Int}) +@test isa(weights([1, 2, 3]), AbstractWeights{Int}) +@test isa(weights([1., 2., 3.]), AbstractWeights{Float64}) +@test isa(weights([1 2 3; 4 5 6]), AbstractWeights{Int}) -@test isa(frequency([1, 2, 3]), FrequencyWeights) -@test isa(frequency([1 2 3; 4 5 6]), FrequencyWeights) -@test isa(FrequencyWeights([1, 2, 3], 6; corrected=false), FrequencyWeights) +@test isa(AnalyticWeights([1, 2, 3], 6), AbstractWeights{Int}) @test isempty(weights(Float64[])) @test size(weights([1, 2, 3])) == (3,) w = [1., 2., 3.] -wv = weights(w, false) +wv = weights(w) @test eltype(wv) === Float64 @test length(wv) === 3 @test values(wv) === w @test sum(wv) === 6.0 @test !isempty(wv) -fw = [1, 2, 3] -fwv = frequency(fw) -@test eltype(fwv) === Int -@test length(fwv) === 3 -@test values(fwv) === fw -@test sum(fwv) === 6 -@test !isempty(wv) - b = trues(3) -bv = frequency(b) +bv = weights(b) @test eltype(bv) === Bool @test length(bv) === 3 @test values(bv) === b @@ -41,8 +31,8 @@ bv = frequency(b) ba = BitArray([true, false, true]) sa = sparsevec([1., 0., 2.]) -@test sum(ba, fwv) === 4 -@test sum(sa, fwv) === 7.0 +@test sum(ba, wv) === 4.0 +@test sum(sa, wv) === 7.0 ## wsum @@ -161,20 +151,21 @@ r = ones(8, 6) ## the sum and mean syntax -@test sum([1.0, 2.0, 3.0], weights([1.0, 0.5, 0.5], false)) ≈ 3.5 -@test sum(1:3, weights([1.0, 1.0, 0.5], false)) ≈ 4.5 -@test mean([1:3;], weights([1.0, 1.0, 0.5], false)) ≈ 1.8 -@test mean(1:3, weights([1.0, 1.0, 0.5], false)) ≈ 1.8 +@test sum([1.0, 2.0, 3.0], weights([1.0, 0.5, 0.5])) ≈ 3.5 +@test sum(1:3, weights([1.0, 1.0, 0.5])) ≈ 4.5 + +@test mean([1:3;], weights([1.0, 1.0, 0.5])) ≈ 1.8 +@test mean(1:3, weights([1.0, 1.0, 0.5])) ≈ 1.8 a = reshape(1.0:27.0, 3, 3, 3) for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) - @test sum(a, weights(wt, false), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) - @test sum(a, weights(wt, false), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) - @test sum(a, weights(wt, false), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) - @test mean(a, weights(wt, false), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) - @test mean(a, weights(wt, false), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) - @test mean(a, weights(wt, false), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) + @test sum(a, weights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) + @test sum(a, weights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) + @test sum(a, weights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) + @test mean(a, weights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) + @test mean(a, weights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) + @test mean(a, weights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) @test_throws ErrorException mean(a, weights(wt), 4) end @@ -235,16 +226,15 @@ median_answers = (7.0, 4.0, 8.5, num_tests = length(data) for i = 1:num_tests @test wmedian(data[i], wt[i]) == median_answers[i] - @test wmedian(data[i], weights(wt[i], false)) == median_answers[i] - @test median(data[i], weights(wt[i], false)) == median_answers[i] + @test wmedian(data[i], weights(wt[i])) == median_answers[i] + @test median(data[i], weights(wt[i])) == median_answers[i] for j = 1:100 # Make sure the weighted median does not change if the data # and weights are reordered. reorder = sortperm(rand(length(data[i]))) - @test median(data[i][reorder], weights(wt[i][reorder], false)) == median_answers[i] + @test median(data[i][reorder], weights(wt[i][reorder])) == median_answers[i] end end - data = [4, 3, 2, 1] wt = [0, 0, 0, 0] @test_throws MethodError wmedian(data[1]) @@ -266,6 +256,7 @@ wt = [-1, -1, -1, -1, -1] wt = [-1, -1, -1, 0, 0] @test_throws ErrorException median(data, weights(wt)) + # Weighted quantile tests data = ( [7, 1, 2, 4, 10], @@ -289,25 +280,25 @@ data = ( [-10, 1, 1, -10, -10], ) wt = ( - weights([1, 1/3, 1/3, 1/3, 1], false), - weights([1, 1, 1, 1, 1], false), - weights([1, 1/3, 1/3, 1/3, 1, 1], false), - weights([1/3, 1/3, 1/3, 1, 1, 1], false), - weights([30, 191, 9, 0], false), - weights([10, 1, 1, 1, 9], false), - weights([10, 1, 1, 1, 900], false), - weights([1, 3, 5, 4, 2], false), - weights([2, 2, 5, 1, 2, 2, 1, 6], false), - weights([0.1, 0.1, 0.8], false), - weights([5, 5, 4, 1], false), - weights([30, 56, 144, 24, 55, 43, 67], false), - weights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], false), - weights([12], false), - weights([7, 1, 1, 1, 6], false), - weights([1, 0, 0, 0, 2], false), - weights([1, 2, 3, 4, 5], false), - weights([0.1, 0.2, 0.3, 0.2, 0.1], false), - weights([1, 1, 1, 1, 1], false), + weights([1, 1/3, 1/3, 1/3, 1]), + weights([1, 1, 1, 1, 1]), + weights([1, 1/3, 1/3, 1/3, 1, 1]), + weights([1/3, 1/3, 1/3, 1, 1, 1]), + weights([30, 191, 9, 0]), + weights([10, 1, 1, 1, 9]), + weights([10, 1, 1, 1, 900]), + weights([1, 3, 5, 4, 2]), + weights([2, 2, 5, 1, 2, 2, 1, 6]), + weights([0.1, 0.1, 0.8]), + weights([5, 5, 4, 1]), + weights([30, 56, 144, 24, 55, 43, 67]), + weights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]), + weights([12]), + weights([7, 1, 1, 1, 6]), + weights([1, 0, 0, 0, 2]), + weights([1, 2, 3, 4, 5]), + weights([0.1, 0.2, 0.3, 0.2, 0.1]), + weights([1, 1, 1, 1, 1]), ) quantile_answers = ( [1.0,3.6000000000000005,6.181818181818182,8.2,10.0], @@ -343,15 +334,15 @@ for i = 1:length(data) for j = 1:10 # order of w does not matter reorder = sortperm(rand(length(data[i]))) - @test quantile(data[i][reorder], weights(wt[i][reorder], false), p) ≈ quantile_answers[i] + @test quantile(data[i][reorder], weights(wt[i][reorder]), p) ≈ quantile_answers[i] end end # w = 1 corresponds to base quantile for i = 1:length(data) - @test quantile(data[i], weights(ones(Int64, length(data[i])), false), p) ≈ quantile(data[i], p) + @test quantile(data[i], weights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) for j = 1:10 prandom = rand(4) - @test quantile(data[i], weights(ones(Int64, length(data[i])), false), prandom) ≈ quantile(data[i], prandom) + @test quantile(data[i], weights(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) end end @@ -359,8 +350,8 @@ end v = [7, 1, 2, 4, 10] w = [1, 1/3, 1/3, 1/3, 1] answer = 6.181818181818182 -@test quantile(data[1], weights(w, false), 0.5) ≈ answer -@test wquantile(data[1], weights(w, false), [0.5]) ≈ [answer] -@test wquantile(data[1], weights(w, false), 0.5) ≈ answer +@test quantile(data[1], weights(w), 0.5) ≈ answer +@test wquantile(data[1], weights(w), [0.5]) ≈ [answer] +@test wquantile(data[1], weights(w), 0.5) ≈ answer @test wquantile(data[1], w, [0.5]) ≈ [answer] @test wquantile(data[1], w, 0.5) ≈ answer diff --git a/test/wsampling.jl b/test/wsampling.jl index f4a924828..8357df6a2 100644 --- a/test/wsampling.jl +++ b/test/wsampling.jl @@ -35,7 +35,7 @@ end import StatsBase: direct_sample!, alias_sample! n = 10^5 -wv = weights([0.2, 0.8, 0.4, 0.6], false) +wv = weights([0.2, 0.8, 0.4, 0.6]) a = direct_sample!(4:7, wv, zeros(Int, n, 3)) check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=false) @@ -78,7 +78,7 @@ import StatsBase: naive_wsample_norep!, efraimidis_a_wsample_norep!, efraimidis_ares_wsample_norep!, efraimidis_aexpj_wsample_norep! n = 10^5 -wv = weights([0.2, 0.8, 0.4, 0.6], false) +wv = weights([0.2, 0.8, 0.4, 0.6]) a = zeros(Int, 3, n) for j = 1:n