From a323fc9ab06bd1a3429f88a58044d413ac5633ef Mon Sep 17 00:00:00 2001 From: Rory-Finnegan Date: Tue, 25 Apr 2017 09:52:31 -0500 Subject: [PATCH 01/50] Initial work on adding weight vector types with bias correction. NOTE: This broke a lot of things, so for now I'm keeping the existing tests without correction * WeightVec -> AbstractWeights * Weights (aka default reliability weights) * FrequencyWeights (limited to vectors of integers) * All weights take a corrected argument which defaults to true. * Added an exponential function which creates a set of exponential `Weights`. * Updated existing test cases to work with the appropriate types and added false to `weights` calls to avoid breaking existing test cases. --- src/StatsBase.jl | 8 ++- src/counts.jl | 48 +++++++-------- src/cov.jl | 28 ++++----- src/deprecates.jl | 20 +++---- src/hist.jl | 19 +++--- src/moments.jl | 69 +++++++++++----------- src/sampling.jl | 54 ++++++++--------- src/weights.jl | 145 ++++++++++++++++++++++++++++++++++------------ test/counts.jl | 8 +-- test/cov.jl | 4 +- test/moments.jl | 8 +-- test/sampling.jl | 5 +- test/weights.jl | 105 ++++++++++++++++++--------------- test/wsampling.jl | 8 +-- 14 files changed, 304 insertions(+), 225 deletions(-) diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 7b9934d3e..6ba58fced 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -17,8 +17,12 @@ module StatsBase export ## weights - WeightVec, # the type to represent a weight vector - weights, # construct a weight vector + AbstractWeights, # the abstract type to represent any weight vector + Weights, # the default type for representing a weight vector + FrequencyWeights, # the type for representing a frequency weights vector + weights, # construct a weights vector + frequency, # construct a frequency weights vector + exponential, # construct a weights vector using an exponential smoothing scheme wsum, # weighted sum with vector as second argument wsum!, # weighted sum across dimensions with 
provided storage wmean, # weighted mean diff --git a/src/counts.jl b/src/counts.jl index 5535b7066..853fc9382 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -35,7 +35,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange) return r end -function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv::WeightVec) +function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) k = length(levels) length(r) == k || throw(DimensionMismatch()) @@ -55,7 +55,7 @@ end """ - counts(x, levels=span(x), [wv::WeightVec]) + counts(x, levels=span(x), [wv::AbstractWeights]) Count the number of times that values in the range `levels` occur in `x`. The output is a vector of length `length(levels)`. If a weighting @@ -64,40 +64,40 @@ raw counts. """ counts(x::IntegerArray, levels::IntUnitRange) = addcounts!(zeros(Int, length(levels)), x, levels) -counts(x::IntegerArray, levels::IntUnitRange, wv::WeightVec) = +counts(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) = addcounts!(zeros(eltype(wv), length(levels)), x, levels, wv) """ - counts(x, k::Integer, [wv::WeightVec]) + counts(x, k::Integer, [wv::AbstractWeights]) Count the number of times integers in the range 1 to `k` occur in `x`. """ counts(x::IntegerArray, k::Integer) = counts(x, 1:k) -counts(x::IntegerArray, k::Integer, wv::WeightVec) = counts(x, 1:k, wv) +counts(x::IntegerArray, k::Integer, wv::AbstractWeights) = counts(x, 1:k, wv) counts(x::IntegerArray) = counts(x, span(x)) -counts(x::IntegerArray, wv::WeightVec) = counts(x, span(x), wv) +counts(x::IntegerArray, wv::AbstractWeights) = counts(x, span(x), wv) """ - proportions(x, levels=span(x), [wv::WeightVec]) + proportions(x, levels=span(x), [wv::AbstractWeights]) Return the proportion of values in the range `levels` that occur in `x`. Equivalent to `counts(x, levels) / length(x)`. If a weighting vector `wv` is specified, the sum of the weights is used rather than the raw counts. 
""" proportions(x::IntegerArray, levels::IntUnitRange) = counts(x, levels) .* inv(length(x)) -proportions(x::IntegerArray, levels::IntUnitRange, wv::WeightVec) = +proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) = counts(x, levels, wv) .* inv(sum(wv)) """ - proportions(x, k::Integer, [wv::WeightVec]) + proportions(x, k::Integer, [wv::AbstractWeights]) Return the proportion of integers in 1 to `k` that occur in `x`. """ proportions(x::IntegerArray, k::Integer) = proportions(x, 1:k) -proportions(x::IntegerArray, k::Integer, wv::WeightVec) = proportions(x, 1:k, wv) +proportions(x::IntegerArray, k::Integer, wv::AbstractWeights) = proportions(x, 1:k, wv) proportions(x::IntegerArray) = proportions(x, span(x)) -proportions(x::IntegerArray, wv::WeightVec) = proportions(x, span(x), wv) +proportions(x::IntegerArray, wv::AbstractWeights) = proportions(x, span(x), wv) #### functions for counting a single list of integers (2D) @@ -132,7 +132,7 @@ function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels:: end function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, - levels::NTuple{2,IntUnitRange}, wv::WeightVec) + levels::NTuple{2,IntUnitRange}, wv::AbstractWeights) # add counts of integers from x to r n = length(x) @@ -169,39 +169,39 @@ function counts(x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange} addcounts!(zeros(Int, length(levels[1]), length(levels[2])), x, y, levels) end -function counts(x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::WeightVec) +function counts(x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::AbstractWeights) addcounts!(zeros(eltype(wv), length(levels[1]), length(levels[2])), x, y, levels, wv) end counts(x::IntegerArray, y::IntegerArray, levels::IntUnitRange) = counts(x, y, (levels, levels)) -counts(x::IntegerArray, y::IntegerArray, levels::IntUnitRange, wv::WeightVec) = +counts(x::IntegerArray, y::IntegerArray, levels::IntUnitRange, 
wv::AbstractWeights) = counts(x, y, (levels, levels), wv) counts(x::IntegerArray, y::IntegerArray, ks::NTuple{2,Integer}) = counts(x, y, (1:ks[1], 1:ks[2])) -counts(x::IntegerArray, y::IntegerArray, ks::NTuple{2,Integer}, wv::WeightVec) = +counts(x::IntegerArray, y::IntegerArray, ks::NTuple{2,Integer}, wv::AbstractWeights) = counts(x, y, (1:ks[1], 1:ks[2]), wv) counts(x::IntegerArray, y::IntegerArray, k::Integer) = counts(x, y, (1:k, 1:k)) -counts(x::IntegerArray, y::IntegerArray, k::Integer, wv::WeightVec) = +counts(x::IntegerArray, y::IntegerArray, k::Integer, wv::AbstractWeights) = counts(x, y, (1:k, 1:k), wv) counts(x::IntegerArray, y::IntegerArray) = counts(x, y, (span(x), span(y))) -counts(x::IntegerArray, y::IntegerArray, wv::WeightVec) = counts(x, y, (span(x), span(y)), wv) +counts(x::IntegerArray, y::IntegerArray, wv::AbstractWeights) = counts(x, y, (span(x), span(y)), wv) proportions(x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}) = counts(x, y, levels) .* inv(length(x)) -proportions(x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::WeightVec) = +proportions(x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::AbstractWeights) = counts(x, y, levels, wv) .* inv(sum(wv)) proportions(x::IntegerArray, y::IntegerArray, ks::NTuple{2,Integer}) = proportions(x, y, (1:ks[1], 1:ks[2])) -proportions(x::IntegerArray, y::IntegerArray, ks::NTuple{2,Integer}, wv::WeightVec) = +proportions(x::IntegerArray, y::IntegerArray, ks::NTuple{2,Integer}, wv::AbstractWeights) = proportions(x, y, (1:ks[1], 1:ks[2]), wv) proportions(x::IntegerArray, y::IntegerArray, k::Integer) = proportions(x, y, (1:k, 1:k)) -proportions(x::IntegerArray, y::IntegerArray, k::Integer, wv::WeightVec) = +proportions(x::IntegerArray, y::IntegerArray, k::Integer, wv::AbstractWeights) = proportions(x, y, (1:k, 1:k), wv) proportions(x::IntegerArray, y::IntegerArray) = proportions(x, y, (span(x), span(y))) -proportions(x::IntegerArray, y::IntegerArray, 
wv::WeightVec) = +proportions(x::IntegerArray, y::IntegerArray, wv::AbstractWeights) = proportions(x, y, (span(x), span(y)), wv) @@ -233,7 +233,7 @@ function addcounts!{T}(cm::Dict{T}, x::AbstractArray{T}) return cm end -function addcounts!{T,W}(cm::Dict{T}, x::AbstractArray{T}, wv::WeightVec{W}) +function addcounts!{T,W}(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractWeights{W}) n = length(x) length(wv) == n || throw(DimensionMismatch()) w = values(wv) @@ -255,7 +255,7 @@ Return a dictionary mapping each unique value in `x` to its number of occurrences. """ countmap{T}(x::AbstractArray{T}) = addcounts!(Dict{T,Int}(), x) -countmap{T,W}(x::AbstractArray{T}, wv::WeightVec{W}) = addcounts!(Dict{T,W}(), x, wv) +countmap{T,W}(x::AbstractArray{T}, wv::AbstractWeights{W}) = addcounts!(Dict{T,W}(), x, wv) """ @@ -265,4 +265,4 @@ Return a dictionary mapping each unique value in `x` to its proportion in `x`. """ proportionmap(x::AbstractArray) = _normalize_countmap(countmap(x), length(x)) -proportionmap(x::AbstractArray, wv::WeightVec) = _normalize_countmap(countmap(x, wv), sum(wv)) +proportionmap(x::AbstractArray, wv::AbstractWeights) = _normalize_countmap(countmap(x, wv), sum(wv)) diff --git a/src/cov.jl b/src/cov.jl index 1715b236f..83faeda84 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -25,12 +25,12 @@ end scattermat_zm(x::DenseMatrix, vardim::Int) = Base.unscaled_covzm(x, vardim) -scattermat_zm(x::DenseMatrix, wv::WeightVec, vardim::Int) = +scattermat_zm(x::DenseMatrix, wv::AbstractWeights, vardim::Int) = _symmetrize!(Base.unscaled_covzm(x, _scalevars(x, values(wv), vardim), vardim)) """ - scattermat(X, [wv::WeightVec]; mean=nothing, vardim=1) + scattermat(X, [wv::AbstractWeights]; mean=nothing, vardim=1) Compute the scatter matrix, which is an unnormalized covariance matrix. 
A weighting vector `wv` can be specified to weight @@ -48,7 +48,7 @@ function scattermat end """ - cov(X, wv::WeightVec; mean=nothing, vardim=1) + cov(X, wv::AbstractWeights; mean=nothing, vardim=1) Compute the weighted covariance matrix. By default, the covariance matrix is normalized by the sum of the weights. That is, `cov(X, wv)` @@ -58,7 +58,7 @@ cov """ - mean_and_cov(x, [wv::WeightVec]; vardim=1) -> (mean, cov) + mean_and_cov(x, [wv::AbstractWeights]; vardim=1) -> (mean, cov) Return the mean and covariance matrix as a tuple. A weighting vector `wv` can be specified. `vardim` that designates whether @@ -74,21 +74,21 @@ function mean_and_cov end scattermat_zm(x .- mean, vardim) end - function scattermat(x::DenseMatrix, wv::WeightVec; mean=nothing, vardim::Int=1) + function scattermat(x::DenseMatrix, wv::AbstractWeights; mean=nothing, vardim::Int=1) mean == 0 ? scattermat_zm(x, wv, vardim) : mean == nothing ? scattermat_zm(x .- Base.mean(x, wv, vardim), wv, vardim) : scattermat_zm(x .- mean, wv, vardim) end ## weighted cov - Base.cov(x::DenseMatrix, wv::WeightVec; mean=nothing, vardim::Int=1) = - scale!(scattermat(x, wv; mean=mean, vardim=vardim), inv(sum(wv))) + Base.cov(x::DenseMatrix, wv::AbstractWeights; mean=nothing, vardim::Int=1) = + scale!(scattermat(x, wv; mean=mean, vardim=vardim), inv(bias(wv))) function mean_and_cov(x::DenseMatrix; vardim::Int=1) m = mean(x, vardim) return m, Base.covm(x, m; vardim=vardim) end - function mean_and_cov(x::DenseMatrix, wv::WeightVec; vardim::Int=1) + function mean_and_cov(x::DenseMatrix, wv::AbstractWeights; vardim::Int=1) m = mean(x, wv, vardim) return m, Base.cov(x, wv; mean=m, vardim=vardim) end @@ -96,27 +96,27 @@ else scattermatm(x::DenseMatrix, mean, vardim::Int=1) = scattermat_zm(x .- mean, vardim) - scattermatm(x::DenseMatrix, mean, wv::WeightVec, vardim::Int=1) = + scattermatm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1) = scattermat_zm(x .- mean, wv, vardim) scattermat(x::DenseMatrix, 
vardim::Int=1) = scattermatm(x, Base.mean(x, vardim), vardim) - scattermat(x::DenseMatrix, wv::WeightVec, vardim::Int=1) = + scattermat(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = scattermatm(x, Base.mean(x, wv, vardim), wv, vardim) ## weighted cov - Base.covm(x::DenseMatrix, mean, wv::WeightVec, vardim::Int=1) = - scale!(scattermatm(x, mean, wv, vardim), inv(sum(wv))) + Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1) = + scale!(scattermatm(x, mean, wv, vardim), inv(bias(wv))) - Base.cov(x::DenseMatrix, wv::WeightVec, vardim::Int=1) = + Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = Base.covm(x, Base.mean(x, wv, vardim), wv, vardim) function mean_and_cov(x::DenseMatrix, vardim::Int=1) m = mean(x, vardim) return m, Base.covm(x, m, vardim) end - function mean_and_cov(x::DenseMatrix, wv::WeightVec, vardim::Int=1) + function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) m = mean(x, wv, vardim) return m, Base.cov(x, wv, vardim) end diff --git a/src/deprecates.jl b/src/deprecates.jl index 92cb047de..a75707fd0 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -2,16 +2,16 @@ import Base.@deprecate import Base.depwarn import Base.varm, Base.stdm -@deprecate varm(v::RealArray, m::Real, wv::WeightVec) varm(v, wv, m) -@deprecate varm(A::RealArray, M::RealArray, wv::WeightVec, dim::Int) varm(v, wv, m, dim) -@deprecate stdm(v::RealArray, m::Real, wv::WeightVec) stdm(v, wv, m) -@deprecate stdm(v::RealArray, m::RealArray, wv::WeightVec, dim::Int) stdm(v, wv, m, dim) - -@deprecate _moment2(v::RealArray, m::Real, wv::WeightVec) _moment2(v, wv, m) -@deprecate _moment3(v::RealArray, m::Real, wv::WeightVec) _moment3(v, wv, m) -@deprecate _moment4(v::RealArray, m::Real, wv::WeightVec) _moment4(v, wv, m) -@deprecate _momentk(v::RealArray, k::Int, m::Real, wv::WeightVec) _momentk(v, k, wv, m) -@deprecate moment(v::RealArray, k::Int, m::Real, wv::WeightVec) moment(v, k, wv, m) +@deprecate varm(v::RealArray, m::Real, 
wv::AbstractWeights) varm(v, wv, m) +@deprecate varm(A::RealArray, M::RealArray, wv::AbstractWeights, dim::Int) varm(v, wv, m, dim) +@deprecate stdm(v::RealArray, m::Real, wv::AbstractWeights) stdm(v, wv, m) +@deprecate stdm(v::RealArray, m::RealArray, wv::AbstractWeights, dim::Int) stdm(v, wv, m, dim) + +@deprecate _moment2(v::RealArray, m::Real, wv::AbstractWeights) _moment2(v, wv, m) +@deprecate _moment3(v::RealArray, m::Real, wv::AbstractWeights) _moment3(v, wv, m) +@deprecate _moment4(v::RealArray, m::Real, wv::AbstractWeights) _moment4(v, wv, m) +@deprecate _momentk(v::RealArray, k::Int, m::Real, wv::AbstractWeights) _momentk(v, k, wv, m) +@deprecate moment(v::RealArray, k::Int, m::Real, wv::AbstractWeights) moment(v, k, wv, m) @deprecate AIC(obj::StatisticalModel) aic(obj) @deprecate AICc(obj::StatisticalModel) aicc(obj) diff --git a/src/hist.jl b/src/hist.jl index 73b6a1f37..f4a65f6aa 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -218,20 +218,18 @@ Histogram(edge::AbstractVector, closed::Symbol=:default_left, isdensity::Bool=fa push!{T,E}(h::AbstractHistogram{T,1,E}, x::Real, w::Real) = push!(h, (x,), w) push!{T,E}(h::AbstractHistogram{T,1,E}, x::Real) = push!(h,x,one(T)) append!{T}(h::AbstractHistogram{T,1}, v::AbstractVector) = append!(h, (v,)) -append!{T}(h::AbstractHistogram{T,1}, v::AbstractVector, wv::Union{AbstractVector,WeightVec}) = append!(h, (v,), wv) - +append!{T}(h::AbstractHistogram{T,1}, v::AbstractVector, wv::Union{AbstractVector,AbstractWeights}) = append!(h, (v,), wv) fit{T}(::Type{Histogram{T}},v::AbstractVector, edg::AbstractVector; closed::Symbol=:default_left) = fit(Histogram{T},(v,), (edg,), closed=closed) fit{T}(::Type{Histogram{T}},v::AbstractVector; closed::Symbol=:default_left, nbins=sturges(length(v))) = fit(Histogram{T},(v,); closed=closed, nbins=nbins) -fit{T}(::Type{Histogram{T}},v::AbstractVector, wv::WeightVec, edg::AbstractVector; closed::Symbol=:default_left) = +fit{T}(::Type{Histogram{T}},v::AbstractVector, 
wv::AbstractWeights, edg::AbstractVector; closed::Symbol=:default_left) = fit(Histogram{T},(v,), wv, (edg,), closed=closed) -fit{T}(::Type{Histogram{T}},v::AbstractVector, wv::WeightVec; closed::Symbol=:default_left, nbins=sturges(length(v))) = +fit{T}(::Type{Histogram{T}},v::AbstractVector, wv::AbstractWeights; closed::Symbol=:default_left, nbins=sturges(length(v))) = fit(Histogram{T}, (v,), wv; closed=closed, nbins=nbins) -fit{W}(::Type{Histogram}, v::AbstractVector, wv::WeightVec{W}, args...; kwargs...) = fit(Histogram{W}, v, wv, args...; kwargs...) - +fit{W}(::Type{Histogram}, v::AbstractVector, wv::AbstractWeights{W}, args...; kwargs...) = fit(Histogram{W}, v, wv, args...; kwargs...) # N-dimensional @@ -262,6 +260,7 @@ function append!{T,N}(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}) end h end + function append!{T,N}(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}, wv::AbstractVector) @inbounds for i in eachindex(wv, vs...) xs = _multi_getindex(i, vs...) @@ -269,7 +268,7 @@ function append!{T,N}(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}, w end h end -append!{T,N}(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}, wv::WeightVec) = append!(h, vs, values(wv)) +append!{T,N}(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}, wv::AbstractWeights) = append!(h, vs, values(wv)) # Turn kwargs nbins into a type-stable tuple of integers: @@ -299,16 +298,16 @@ fit{T,N}(::Type{Histogram{T}}, vs::NTuple{N,AbstractVector}; closed::Symbol=:def fit(Histogram{T}, vs, histrange(vs,_nbins_tuple(vs, nbins),closed); closed=closed) end -fit{T,N,W}(::Type{Histogram{T}}, vs::NTuple{N,AbstractVector}, wv::WeightVec{W}, edges::NTuple{N,AbstractVector}; closed::Symbol=:default_left) = +fit{T,N,W}(::Type{Histogram{T}}, vs::NTuple{N,AbstractVector}, wv::AbstractWeights{W}, edges::NTuple{N,AbstractVector}; closed::Symbol=:default_left) = append!(Histogram(edges, T, _check_closed_arg(closed,:fit), false), vs, wv) -fit{T,N}(::Type{Histogram{T}}, 
vs::NTuple{N,AbstractVector}, wv::WeightVec; closed::Symbol=:default_left, nbins=sturges(length(vs[1]))) = begin +fit{T,N}(::Type{Histogram{T}}, vs::NTuple{N,AbstractVector}, wv::AbstractWeights; closed::Symbol=:default_left, nbins=sturges(length(vs[1]))) = begin closed = _check_closed_arg(closed,:fit) fit(Histogram{T}, vs, wv, histrange(vs,_nbins_tuple(vs, nbins),closed); closed=closed) end fit(::Type{Histogram}, args...; kwargs...) = fit(Histogram{Int}, args...; kwargs...) -fit{N,W}(::Type{Histogram}, vs::NTuple{N,AbstractVector}, wv::WeightVec{W}, args...; kwargs...) = fit(Histogram{W}, vs, wv, args...; kwargs...) +fit{N,W}(::Type{Histogram}, vs::NTuple{N,AbstractVector}, wv::AbstractWeights{W}, args...; kwargs...) = fit(Histogram{W}, vs, wv, args...; kwargs...) # Get a suitable high-precision type for the norm of a histogram. diff --git a/src/moments.jl b/src/moments.jl index f18b1fd02..0c5b83f07 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -3,7 +3,7 @@ ## var """ - varm(x, wv::WeightVec, m, [dim]) + varm(x, wv::AbstractWeights, m, [dim]) Return the variance of a real-valued array `x` with a known mean `m`, optionally over a dimension `dim`. The weighting vector `wv` specifies frequency weights @@ -15,10 +15,10 @@ whereas it's `length(x)-1` in `Base.varm`. The impact is that this is not a weighted estimate of the population variance based on the sample; it's the weighted variance of the sample. """ -Base.varm(v::RealArray, wv::WeightVec, m::Real) = _moment2(v, wv, m) +Base.varm(v::RealArray, wv::AbstractWeights, m::Real) = _moment2(v, wv, m) """ - var(x, wv::WeightVec, [dim]; mean=nothing) + var(x, wv::AbstractWeights, [dim]; mean=nothing) Return the variance of a real-valued array `x`, optionally over a dimension `dim`. The weighting vector `wv` specifies frequency weights (also called case weights) @@ -30,7 +30,7 @@ whereas it's `length(x)-1` in `Base.var`. 
The impact is that this is not a weighted estimate of the population variance based on the sample; it's the weighted variance of the sample. """ -function Base.var(v::RealArray, wv::WeightVec; mean=nothing) +function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing) mean == 0 ? Base.varm(v, wv, 0) : mean == nothing ? varm(v, wv, Base.mean(v, wv)) : varm(v, wv, mean) @@ -38,10 +38,10 @@ end ## var along dim -Base.varm!(R::AbstractArray, A::RealArray, wv::WeightVec, M::RealArray, dim::Int) = - scale!(_wsum_centralize!(R, @functorize(abs2), A, values(wv), M, dim, true), inv(sum(wv))) +Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int) = + scale!(_wsum_centralize!(R, @functorize(abs2), A, values(wv), M, dim, true), inv(bias(wv))) -function var!(R::AbstractArray, A::RealArray, wv::WeightVec, dim::Int; mean=nothing) +function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) if mean == 0 Base.varm!(R, A, wv, Base.reducedim_initarray(A, dim, 0, eltype(R)), dim) @@ -62,14 +62,14 @@ function var!(R::AbstractArray, A::RealArray, wv::WeightVec, dim::Int; mean=noth end end -Base.varm(A::RealArray, wv::WeightVec, M::RealArray, dim::Int) = +Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int) = @static if VERSION < v"0.6.0-dev.1121" Base.varm!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, M, dim) else Base.varm!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, M, dim) end -Base.var(A::RealArray, wv::WeightVec, dim::Int; mean=nothing) = +Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) = @static if VERSION < v"0.6.0-dev.1121" var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, dim; mean=mean) else @@ -78,31 +78,31 @@ Base.var(A::RealArray, wv::WeightVec, dim::Int; mean=nothing) = ## std """ - stdm(v, wv::WeightVec, m, [dim]) + stdm(v, wv::AbstractWeights, m, [dim]) Return the standard deviation of a real-valued array `v` 
with a known mean `m`, optionally over a dimension `dim`. The weighting vector `wv` specifies frequency weights (also called case weights) for the estimate. """ -Base.stdm(v::RealArray, wv::WeightVec, m::Real) = sqrt(varm(v, wv, m)) +Base.stdm(v::RealArray, wv::AbstractWeights, m::Real) = sqrt(varm(v, wv, m)) """ - std(v, wv::WeightVec, [dim]; mean=nothing) + std(v, wv::AbstractWeights, [dim]; mean=nothing) Return the standard deviation of a real-valued array `v`, optionally over a dimension `dim`. The weighting vector `wv` specifies frequency weights (also called case weights) for the estimate. """ -Base.std(v::RealArray, wv::WeightVec; mean=nothing) = sqrt.(var(v, wv; mean=mean)) +Base.std(v::RealArray, wv::AbstractWeights; mean=nothing) = sqrt.(var(v, wv; mean=mean)) Base.stdm(v::RealArray, m::RealArray, dim::Int) = Base.sqrt!(varm(v, m, dim)) -Base.stdm(v::RealArray, wv::WeightVec, m::RealArray, dim::Int) = sqrt.(varm(v, wv, m, dim)) -Base.std(v::RealArray, wv::WeightVec, dim::Int; mean=nothing) = sqrt.(var(v, wv, dim; mean=mean)) +Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int) = sqrt.(varm(v, wv, m, dim)) +Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) = sqrt.(var(v, wv, dim; mean=mean)) ##### Fused statistics """ - mean_and_var(x, [wv::WeightVec], [dim]) -> (mean, var) + mean_and_var(x, [wv::AbstractWeights], [dim]) -> (mean, var) Return the mean and variance of a real-valued array `x`, optionally over a dimension `dim`, as a tuple. A weighting vector `wv` can be specified to weight the estimates. @@ -111,7 +111,7 @@ The weights are assumed to be frequency weights, also called case weights. mean_and_var(A::RealArray) = (m = mean(A); (m, varm(A, m))) """ - mean_and_std(x, [wv::WeightVec], [dim]) -> (mean, std) + mean_and_std(x, [wv::AbstractWeights], [dim]) -> (mean, std) Return the mean and standard deviation of a real-valued array `x`, optionally over a dimension `dim`, as a tuple. 
A weighting vector `wv` can be specified @@ -120,14 +120,14 @@ called case weights. """ mean_and_std(A::RealArray) = (m = mean(A); (m, stdm(A, m))) -mean_and_var(A::RealArray, wv::WeightVec) = (m = mean(A, wv); (m, varm(A, wv, m))) -mean_and_std(A::RealArray, wv::WeightVec) = (m = mean(A, wv); (m, stdm(A, wv, m))) +mean_and_var(A::RealArray, wv::AbstractWeights) = (m = mean(A, wv); (m, varm(A, wv, m))) +mean_and_std(A::RealArray, wv::AbstractWeights) = (m = mean(A, wv); (m, stdm(A, wv, m))) mean_and_var(A::RealArray, dim::Int) = (m = mean(A, dim); (m, varm(A, m, dim))) mean_and_std(A::RealArray, dim::Int) = (m = mean(A, dim); (m, stdm(A, m, dim))) -mean_and_var(A::RealArray, wv::WeightVec, dim::Int) = (m = mean(A, wv, dim); (m, varm(A, wv, m, dim))) -mean_and_std(A::RealArray, wv::WeightVec, dim::Int) = (m = mean(A, wv, dim); (m, stdm(A, wv, m, dim))) +mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int) = (m = mean(A, wv, dim); (m, varm(A, wv, m, dim))) +mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int) = (m = mean(A, wv, dim); (m, stdm(A, wv, m, dim))) ##### General central moment @@ -142,7 +142,7 @@ function _moment2(v::RealArray, m::Real) s / n end -function _moment2(v::RealArray, wv::WeightVec, m::Real) +function _moment2(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) s = 0.0 w = values(wv) @@ -163,7 +163,7 @@ function _moment3(v::RealArray, m::Real) s / n end -function _moment3(v::RealArray, wv::WeightVec, m::Real) +function _moment3(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) s = 0.0 w = values(wv) @@ -184,7 +184,7 @@ function _moment4(v::RealArray, m::Real) s / n end -function _moment4(v::RealArray, wv::WeightVec, m::Real) +function _moment4(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) s = 0.0 w = values(wv) @@ -205,7 +205,7 @@ function _momentk(v::RealArray, k::Int, m::Real) s / n end -function _momentk(v::RealArray, k::Int, wv::WeightVec, m::Real) +function _momentk(v::RealArray, k::Int, 
wv::AbstractWeights, m::Real) n = length(v) s = 0.0 w = values(wv) @@ -218,7 +218,7 @@ end """ - moment(v, k, [wv::WeightVec], m=mean(v)) + moment(v, k, [wv::AbstractWeights], m=mean(v)) Return the `k`th order central moment of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. @@ -230,7 +230,7 @@ function moment(v::RealArray, k::Int, m::Real) _momentk(v, k, m) end -function moment(v::RealArray, k::Int, wv::WeightVec, m::Real) +function moment(v::RealArray, k::Int, wv::AbstractWeights, m::Real) k == 2 ? _moment2(v, wv, m) : k == 3 ? _moment3(v, wv, m) : k == 4 ? _moment4(v, wv, m) : @@ -238,7 +238,7 @@ function moment(v::RealArray, k::Int, wv::WeightVec, m::Real) end moment(v::RealArray, k::Int) = moment(v, k, mean(v)) -moment(v::RealArray, k::Int, wv::WeightVec) = moment(v, k, wv, mean(v, wv)) +moment(v::RealArray, k::Int, wv::AbstractWeights) = moment(v, k, wv, mean(v, wv)) ##### Skewness and Kurtosis @@ -246,7 +246,7 @@ moment(v::RealArray, k::Int, wv::WeightVec) = moment(v, k, wv, mean(v, wv)) # Skewness # This is Type 1 definition according to Joanes and Gill (1998) """ - skewness(v, [wv::WeightVec], m=mean(v)) + skewness(v, [wv::AbstractWeights], m=mean(v)) Compute the standardized skewness of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. 
@@ -267,7 +267,7 @@ function skewness(v::RealArray, m::Real) return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 end -function skewness(v::RealArray, wv::WeightVec, m::Real) +function skewness(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) length(wv) == n || throw(DimensionMismatch("Inconsistent array lengths.")) cm2 = 0.0 # empirical 2nd centered moment (variance) @@ -289,12 +289,12 @@ function skewness(v::RealArray, wv::WeightVec, m::Real) end skewness(v::RealArray) = skewness(v, mean(v)) -skewness(v::RealArray, wv::WeightVec) = skewness(v, wv, mean(v, wv)) +skewness(v::RealArray, wv::AbstractWeights) = skewness(v, wv, mean(v, wv)) # (excessive) Kurtosis # This is Type 1 definition according to Joanes and Gill (1998) """ - kurtosis(v, [wv::WeightVec], m=mean(v)) + kurtosis(v, [wv::AbstractWeights], m=mean(v)) Compute the excess kurtosis of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. @@ -314,7 +314,7 @@ function kurtosis(v::RealArray, m::Real) return (cm4 / (cm2 * cm2)) - 3.0 end -function kurtosis(v::RealArray, wv::WeightVec, m::Real) +function kurtosis(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) length(wv) == n || throw(DimensionMismatch("Inconsistent array lengths.")) cm2 = 0.0 # empirical 2nd centered moment (variance) @@ -337,5 +337,4 @@ function kurtosis(v::RealArray, wv::WeightVec, m::Real) end kurtosis(v::RealArray) = kurtosis(v, mean(v)) -kurtosis(v::RealArray, wv::WeightVec) = kurtosis(v, wv, mean(v, wv)) - +kurtosis(v::RealArray, wv::AbstractWeights) = kurtosis(v, wv, mean(v, wv)) diff --git a/src/sampling.jl b/src/sampling.jl index 4a134472a..c2eacb2c0 100644 --- a/src/sampling.jl +++ b/src/sampling.jl @@ -266,7 +266,7 @@ seqsample_c!(a::AbstractArray, x::AbstractArray) = seqsample_c!(Base.GLOBAL_RNG, ### Interface functions (poly-algorithms) """ - sample([rng], a, [wv::WeightVec]) + sample([rng], a, [wv::AbstractWeights]) Select a single random element of `a`. 
Sampling probabilities are proportional to the weights given in `wv`, if provided. @@ -279,7 +279,7 @@ sample(a::AbstractArray) = sample(Base.GLOBAL_RNG, a) """ - sample!([rng], a, [wv::WeightVec], x; replace=true, ordered=false) + sample!([rng], a, [wv::AbstractWeights], x; replace=true, ordered=false) Draw a random sample of `length(x)` elements from an array `a` and store the result in `x`. A polyalgorithm is used for sampling. @@ -332,7 +332,7 @@ sample!(a::AbstractArray, x::AbstractArray; replace::Bool=true, ordered::Bool=fa """ - sample([rng], a, [wv::WeightVec], n::Integer; replace=true, ordered=false) + sample([rng], a, [wv::AbstractWeights], n::Integer; replace=true, ordered=false) Select a random, optionally weighted sample of size `n` from an array `a` using a polyalgorithm. Sampling probabilities are proportional to the weights @@ -352,7 +352,7 @@ sample(a::AbstractArray, n::Integer; replace::Bool=true, ordered::Bool=false) = """ - sample([rng], a, [wv::WeightVec], dims::Dims; replace=true, ordered=false) + sample([rng], a, [wv::AbstractWeights], dims::Dims; replace=true, ordered=false) Select a random, optionally weighted sample from an array `a` specifying the dimensions `dims` of the output array. Sampling probabilities are @@ -377,7 +377,7 @@ sample(a::AbstractArray, dims::Dims; replace::Bool=true, ordered::Bool=false) = ################################################################ """ - sample([rng], wv::WeightVec) + sample([rng], wv::AbstractWeights) Select a single random integer in `1:length(wv)` with probabilities proportional to the weights given in `wv`. @@ -385,7 +385,7 @@ proportional to the weights given in `wv`. Optionally specify a random number generator ``rng`` as the first argument (defaults to ``Base.GLOBAL_RNG``). 
""" -function sample(rng::AbstractRNG, wv::WeightVec) +function sample(rng::AbstractRNG, wv::AbstractWeights) t = rand(rng) * sum(wv) w = values(wv) n = length(w) @@ -397,13 +397,13 @@ function sample(rng::AbstractRNG, wv::WeightVec) end return i end -sample(wv::WeightVec) = sample(Base.GLOBAL_RNG, wv) +sample(wv::AbstractWeights) = sample(Base.GLOBAL_RNG, wv) -sample(rng::AbstractRNG, a::AbstractArray, wv::WeightVec) = a[sample(rng, wv)] -sample(a::AbstractArray, wv::WeightVec) = sample(Base.GLOBAL_RNG, a, wv) +sample(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights) = a[sample(rng, wv)] +sample(a::AbstractArray, wv::AbstractWeights) = sample(Base.GLOBAL_RNG, a, wv) function direct_sample!(rng::AbstractRNG, a::AbstractArray, - wv::WeightVec, x::AbstractArray) + wv::AbstractWeights, x::AbstractArray) n = length(a) length(wv) == n || throw(DimensionMismatch("Inconsistent lengths.")) for i = 1:length(x) @@ -411,7 +411,7 @@ function direct_sample!(rng::AbstractRNG, a::AbstractArray, end return x end -direct_sample!(a::AbstractArray, wv::WeightVec, x::AbstractArray) = +direct_sample!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray) = direct_sample!(Base.GLOBAL_RNG, a, wv, x) function make_alias_table!(w::AbstractVector{Float64}, wsum::Float64, @@ -473,7 +473,7 @@ function make_alias_table!(w::AbstractVector{Float64}, wsum::Float64, nothing end -function alias_sample!(rng::AbstractRNG, a::AbstractArray, wv::WeightVec, x::AbstractArray) +function alias_sample!(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights, x::AbstractArray) n = length(a) length(wv) == n || throw(DimensionMismatch("Inconsistent lengths.")) @@ -490,11 +490,11 @@ function alias_sample!(rng::AbstractRNG, a::AbstractArray, wv::WeightVec, x::Abs end return x end -alias_sample!(a::AbstractArray, wv::WeightVec, x::AbstractArray) = +alias_sample!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray) = alias_sample!(Base.GLOBAL_RNG, a, wv, x) function 
naive_wsample_norep!(rng::AbstractRNG, a::AbstractArray, - wv::WeightVec, x::AbstractArray) + wv::AbstractWeights, x::AbstractArray) n = length(a) length(wv) == n || throw(DimensionMismatch("Inconsistent lengths.")) k = length(x) @@ -517,7 +517,7 @@ function naive_wsample_norep!(rng::AbstractRNG, a::AbstractArray, end return x end -naive_wsample_norep!(a::AbstractArray, wv::WeightVec, x::AbstractArray) = +naive_wsample_norep!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray) = naive_wsample_norep!(Base.GLOBAL_RNG, a, wv, x) # Weighted sampling without replacement @@ -530,7 +530,7 @@ naive_wsample_norep!(a::AbstractArray, wv::WeightVec, x::AbstractArray) = # # Instead of keys u^(1/w) where u = random(0,1) keys w/v where v = randexp(1) are used. function efraimidis_a_wsample_norep!(rng::AbstractRNG, a::AbstractArray, - wv::WeightVec, x::AbstractArray) + wv::AbstractWeights, x::AbstractArray) n = length(a) length(wv) == n || throw(DimensionMismatch("a and wv must be of same length (got $n and $(length(wv))).")) k = length(x) @@ -548,7 +548,7 @@ function efraimidis_a_wsample_norep!(rng::AbstractRNG, a::AbstractArray, end return x end -efraimidis_a_wsample_norep!(a::AbstractArray, wv::WeightVec, x::AbstractArray) = +efraimidis_a_wsample_norep!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray) = efraimidis_a_wsample_norep!(Base.GLOBAL_RNG, a, wv, x) # Weighted sampling without replacement @@ -561,7 +561,7 @@ efraimidis_a_wsample_norep!(a::AbstractArray, wv::WeightVec, x::AbstractArray) = # # Instead of keys u^(1/w) where u = random(0,1) keys w/v where v = randexp(1) are used. 
function efraimidis_ares_wsample_norep!(rng::AbstractRNG, a::AbstractArray, - wv::WeightVec, x::AbstractArray) + wv::AbstractWeights, x::AbstractArray) n = length(a) length(wv) == n || throw(DimensionMismatch("a and wv must be of same length (got $n and $(length(wv))).")) k = length(x) @@ -609,7 +609,7 @@ function efraimidis_ares_wsample_norep!(rng::AbstractRNG, a::AbstractArray, end return x end -efraimidis_ares_wsample_norep!(a::AbstractArray, wv::WeightVec, x::AbstractArray) = +efraimidis_ares_wsample_norep!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray) = efraimidis_ares_wsample_norep!(Base.GLOBAL_RNG, a, wv, x) # Weighted sampling without replacement @@ -622,7 +622,7 @@ efraimidis_ares_wsample_norep!(a::AbstractArray, wv::WeightVec, x::AbstractArray # # Instead of keys u^(1/w) where u = random(0,1) keys w/v where v = randexp(1) are used. function efraimidis_aexpj_wsample_norep!(rng::AbstractRNG, a::AbstractArray, - wv::WeightVec, x::AbstractArray) + wv::AbstractWeights, x::AbstractArray) n = length(a) length(wv) == n || throw(DimensionMismatch("a and wv must be of same length (got $n and $(length(wv))).")) k = length(x) @@ -671,10 +671,10 @@ function efraimidis_aexpj_wsample_norep!(rng::AbstractRNG, a::AbstractArray, end return x end -efraimidis_aexpj_wsample_norep!(a::AbstractArray, wv::WeightVec, x::AbstractArray) = +efraimidis_aexpj_wsample_norep!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray) = efraimidis_aexpj_wsample_norep!(Base.GLOBAL_RNG, a, wv, x) -function sample!(rng::AbstractRNG, a::AbstractArray, wv::WeightVec, x::AbstractArray; +function sample!(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights, x::AbstractArray; replace::Bool=true, ordered::Bool=false) n = length(a) k = length(x) @@ -704,20 +704,20 @@ function sample!(rng::AbstractRNG, a::AbstractArray, wv::WeightVec, x::AbstractA end return x end -sample!(a::AbstractArray, wv::WeightVec, x::AbstractArray) = +sample!(a::AbstractArray, wv::AbstractWeights, 
x::AbstractArray) = sample!(Base.GLOBAL_RNG, a, wv, x) -sample{T}(rng::AbstractRNG, a::AbstractArray{T}, wv::WeightVec, n::Integer; +sample{T}(rng::AbstractRNG, a::AbstractArray{T}, wv::AbstractWeights, n::Integer; replace::Bool=true, ordered::Bool=false) = sample!(rng, a, wv, Vector{T}(n); replace=replace, ordered=ordered) -sample(a::AbstractArray, wv::WeightVec, n::Integer; +sample(a::AbstractArray, wv::AbstractWeights, n::Integer; replace::Bool=true, ordered::Bool=false) = sample(Base.GLOBAL_RNG, a, wv, n; replace=replace, ordered=ordered) -sample{T}(rng::AbstractRNG, a::AbstractArray{T}, wv::WeightVec, dims::Dims; +sample{T}(rng::AbstractRNG, a::AbstractArray{T}, wv::AbstractWeights, dims::Dims; replace::Bool=true, ordered::Bool=false) = sample!(rng, a, wv, Array{T}(dims); replace=replace, ordered=ordered) -sample(a::AbstractArray, wv::WeightVec, dims::Dims; +sample(a::AbstractArray, wv::AbstractWeights, dims::Dims; replace::Bool=true, ordered::Bool=false) = sample!(Base.GLOBAL_RNG, a, wv, Array{T}(dims); replace=replace, ordered=ordered) diff --git a/src/weights.jl b/src/weights.jl index b542942b3..3ab1a0253 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -2,43 +2,112 @@ ###### Weight vector ##### if VERSION < v"0.6.0-dev.2123" - immutable WeightVec{S<:Real, T<:Real, V<:RealVector} <: RealVector{T} + abstract AbstractWeights{S<:Real, T<:Real, V<:RealVector} <: RealVector{T} + + immutable Weights{S<:Real, T<:Real, V<:RealVector} <: AbstractWeights{S, T, V} + values::V + sum::S + bias::Real + end + + immutable FrequencyWeights{S<:Integer, T<:Integer, V<:IntegerVector} <: AbstractWeights{S, T, V} + values::V + sum::S + bias::Int + end + + immutable ProbabilityWeights{S<:Real, T<:Real, V<:RealVector} <: AbstractWeights{S, T, V} values::V sum::S + bias::Real end else - immutable WeightVec{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractVector{T} + abstract AbstractWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractVector{T} + + immutable 
Weights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} + values::V + sum::S + bias::Real + end + + immutable FrequencyWeights{S<:Integer, T<:Integer, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} values::V sum::S + bias::Int + end + + immutable ProbabilityWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} + values::V + sum::S + bias::Real end end """ - WeightVec(vs, [wsum]) + Weights(vs, [wsum]) -Construct a `WeightVec` with weight values `vs` and sum of weights `wsum`. +Construct a `Weights` with weight values `vs` and sum of weights `wsum`. If omitted, `wsum` is computed. """ -function WeightVec{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) - return WeightVec{S, eltype(vs), V}(vs, s) +function Weights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs); corrected::Bool=true) + if isempty(vs) || !corrected + return Weights{typeof(s), eltype(vs), V}(vs, s, s) + else + return Weights{S, eltype(vs), V}(vs, s, s * (1 - sum(normalize(vs, 1) .^ 2))) + end +end + +function FrequencyWeights{S<:Integer, V<:IntegerVector}(vs::V, s::S=sum(vs); corrected::Bool=true) + return FrequencyWeights{S, eltype(vs), V}(vs, s, s - Int(corrected)) end +# TODO: constructor for ProbabilityWeights, but I'm not familiar with how bias correction works with these +# types of weights or if bias correction even makes sense. +# https://en.wikipedia.org/wiki/Inverse_probability_weighting + """ weights(vs) -Construct a `WeightVec` from a given array. +Construct a `Weights` type from a given array. +""" +weights(vs::RealVector, corrected=true) = Weights(vs; corrected=corrected) +weights(vs::RealArray, corrected=true) = Weights(vec(vs); corrected=corrected) + """ -weights(vs::RealVector) = WeightVec(vs) -weights(vs::RealArray) = WeightVec(vec(vs)) + frequency(vs) + +Construct a `FrequencyWeights` type from a given array. 
+""" +frequency(vs::RealVector, corrected=true) = FrequencyWeights(vs; corrected=corrected) +frequency(vs::RealArray, corrected=true) = FrequencyWeights(vec(vs); corrected=corrected) + +""" + exponential(n, [λ]) + +Constructs a `Weights` type with a desired length `n` and smoothing factor `λ`, +where each element is set to `λ * (1 - λ)^(1 - i)`. + +# Arguments +* `n::Integer`: the desired length of the `Weights` +* `λ::Real`: is a smoothing factor or rate paremeter between 0 .. 1. + As this value approaches 0 the resulting weights will be almost equal(), + while values closer to 1 will put higher weight on the end elements of the vector. +""" +function exponential(n::Integer, λ::Real=0.99) + @assert 0 <= λ <= 1 && n > 0 + w0 = map(i -> λ * (1 - λ)^(1 - i), 1:n) + return weights(w0) +end -eltype(wv::WeightVec) = eltype(wv.values) -length(wv::WeightVec) = length(wv.values) -values(wv::WeightVec) = wv.values -sum(wv::WeightVec) = wv.sum -isempty(wv::WeightVec) = isempty(wv.values) +eltype(wv::AbstractWeights) = eltype(wv.values) +length(wv::AbstractWeights) = length(wv.values) +values(wv::AbstractWeights) = wv.values +sum(wv::AbstractWeights) = wv.sum +bias(wv::AbstractWeights) = wv.bias +isempty(wv::AbstractWeights) = isempty(wv.values) -Base.getindex(wv::WeightVec, i) = getindex(wv.values, i) -Base.size(wv::WeightVec) = size(wv.values) +Base.getindex(wv::AbstractWeights, i) = getindex(wv.values, i) +Base.size(wv::AbstractWeights) = size(wv.values) ##### Weighted sum ##### @@ -54,9 +123,9 @@ wsum(v::AbstractVector, w::AbstractVector) = dot(v, w) wsum(v::AbstractArray, w::AbstractVector) = dot(vec(v), w) # Note: the methods for BitArray and SparseMatrixCSC are to avoid ambiguities -Base.sum(v::BitArray, w::WeightVec) = wsum(v, values(w)) -Base.sum(v::SparseMatrixCSC, w::WeightVec) = wsum(v, values(w)) -Base.sum(v::AbstractArray, w::WeightVec) = dot(v, values(w)) +Base.sum(v::BitArray, w::AbstractWeights) = wsum(v, values(w)) +Base.sum(v::SparseMatrixCSC, 
w::AbstractWeights) = wsum(v, values(w)) +Base.sum(v::AbstractArray, w::AbstractWeights) = dot(v, values(w)) ## wsum along dimension # @@ -255,10 +324,10 @@ end # extended sum! and wsum -Base.sum!{W<:Real}(R::AbstractArray, A::AbstractArray, w::WeightVec{W}, dim::Int; init::Bool=true) = +Base.sum!{W<:Real}(R::AbstractArray, A::AbstractArray, w::AbstractWeights{W}, dim::Int; init::Bool=true) = wsum!(R, A, values(w), dim; init=init) -Base.sum{T<:Number,W<:Real}(A::AbstractArray{T}, w::WeightVec{W}, dim::Int) = wsum(A, values(w), dim) +Base.sum{T<:Number,W<:Real}(A::AbstractArray{T}, w::AbstractWeights{W}, dim::Int) = wsum(A, values(w), dim) ###### Weighted means ##### @@ -270,18 +339,18 @@ Compute the weighted mean of an array `v` with weights `w`. """ function wmean{T<:Number}(v::AbstractArray{T}, w::AbstractVector) Base.depwarn("wmean is deprecated, use mean(v, weights(w)) instead.", :wmean) - mean(v, weights(w)) + mean(v, weights(w, false)) end -Base.mean(v::AbstractArray, w::WeightVec) = sum(v, w) / sum(w) +Base.mean(v::AbstractArray, w::AbstractWeights) = sum(v, w) / sum(w) -Base.mean!(R::AbstractArray, A::AbstractArray, w::WeightVec, dim::Int) = +Base.mean!(R::AbstractArray, A::AbstractArray, w::AbstractWeights, dim::Int) = scale!(Base.sum!(R, A, w, dim), inv(sum(w))) wmeantype{T,W}(::Type{T}, ::Type{W}) = typeof((zero(T)*zero(W) + zero(T)*zero(W)) / one(W)) wmeantype{T<:BlasReal}(::Type{T}, ::Type{T}) = T -Base.mean{T<:Number,W<:Real}(A::AbstractArray{T}, w::WeightVec{W}, dim::Int) = +Base.mean{T<:Number,W<:Real}(A::AbstractArray{T}, w::AbstractWeights{W}, dim::Int) = @static if VERSION < v"0.6.0-dev.1121" mean!(similar(A, wmeantype(T, W), Base.reduced_dims(size(A), dim)), A, w, dim) else @@ -290,11 +359,11 @@ Base.mean{T<:Number,W<:Real}(A::AbstractArray{T}, w::WeightVec{W}, dim::Int) = ###### Weighted median ##### -function Base.median(v::AbstractArray, w::WeightVec) +function Base.median(v::AbstractArray, w::AbstractWeights) throw(MethodError(median, (v, 
w))) end -function Base.median{W<:Real}(v::RealVector, w::WeightVec{W}) +function Base.median{W<:Real}(v::RealVector, w::AbstractWeights{W}) isempty(v) && error("median of an empty array is undefined") if length(v) != length(w) error("data and weight vectors must be the same size") @@ -345,10 +414,10 @@ end wmedian(v, w) Compute the weighted median of an array `v` with weights `w`, given as either a -vector or `WeightVec`. +vector or `AbstractWeights`. """ -wmedian(v::RealVector, w::RealVector) = median(v, weights(w)) -wmedian{W<:Real}(v::RealVector, w::WeightVec{W}) = median(v, w) +wmedian(v::RealVector, w::RealVector) = median(v, weights(w, false)) +wmedian{W<:Real}(v::RealVector, w::AbstractWeights{W}) = median(v, w) ###### Weighted quantile ##### @@ -358,11 +427,11 @@ wmedian{W<:Real}(v::RealVector, w::WeightVec{W}) = median(v, w) # Here there is a supplementary function from index to weighted index k -> Sk """ - quantile(v, w::WeightVec, p) + quantile(v, w::AbstractWeights, p) Compute `p`th quantile(s) of `v` with weights `w`. """ -function quantile{V, W <: Real}(v::RealVector{V}, w::WeightVec{W}, p::RealVector) +function quantile{V, W <: Real}(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) # checks isempty(v) && error("quantile of an empty array is undefined") @@ -430,16 +499,16 @@ function bound_quantiles{T <: Real}(qs::AbstractVector{T}) T[min(one(T), max(zero(T), q)) for q = qs] end -quantile{W <: Real}(v::RealVector, w::WeightVec{W}, p::Number) = quantile(v, w, [p])[1] +quantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::Number) = quantile(v, w, [p])[1] """ wquantile(v, w, p) Compute the `p`th quantile(s) of `v` with weights `w`, given as either a vector -or a `WeightVec`. +or a `AbstractWeights`. 
""" -wquantile{W <: Real}(v::RealVector, w::WeightVec{W}, p::RealVector) = quantile(v, w, p) -wquantile{W <: Real}(v::RealVector, w::WeightVec{W}, p::Number) = quantile(v, w, [p])[1] -wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, weights(w), p) -wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, weights(w), [p])[1] +wquantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::RealVector) = quantile(v, w, p) +wquantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::Number) = quantile(v, w, [p])[1] +wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, weights(w, false), p) +wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, weights(w, false), [p])[1] diff --git a/test/counts.jl b/test/counts.jl index 2fae64eed..d763817d3 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -6,7 +6,7 @@ n = 5000 # 1D integer counts x = rand(1:5, n) -w = weights(rand(n)) +w = weights(rand(n), false) c = counts(x, 5) @test size(c) == (5,) @@ -40,7 +40,7 @@ c0 = Float64[sum(w.values[x .== i]) for i in 1 : 5] x = rand(1:4, n) y = rand(1:5, n) -w = weights(rand(n)) +w = weights(rand(n), false) c = counts(x, y, (4, 5)) @test size(c) == (4, 5) @@ -85,11 +85,11 @@ pm = proportionmap(x) @test pm["b"] ≈ (1/3) @test pm["c"] ≈ (1/6) -cm = countmap(x, weights(w)) +cm = countmap(x, weights(w, false)) @test cm["a"] == 5.5 @test cm["b"] == 4.5 @test cm["c"] == 3.5 -pm = proportionmap(x, weights(w)) +pm = proportionmap(x, weights(w, false)) @test pm["a"] ≈ (5.5 / 13.5) @test pm["b"] ≈ (4.5 / 13.5) @test pm["c"] ≈ (3.5 / 13.5) diff --git a/test/cov.jl b/test/cov.jl index de02651ea..7523cb5b7 100644 --- a/test/cov.jl +++ b/test/cov.jl @@ -9,8 +9,8 @@ Z2 = X .- mean(X, 2) w1 = rand(3) w2 = rand(8) -wv1 = weights(w1) -wv2 = weights(w2) +wv1 = weights(w1, false) +wv2 = weights(w2, false) Z1w = X .- mean(X, wv1, 1) Z2w = X .- mean(X, wv2, 2) diff --git a/test/moments.jl b/test/moments.jl index 014a3a1a5..17c956c71 100644 --- 
a/test/moments.jl +++ b/test/moments.jl @@ -4,7 +4,7 @@ using Base.Test ##### weighted var & std x = rand(10) -wv = weights(rand(10)) +wv = weights(rand(10), false) m = mean(x, wv) @test var(x, wv) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) @@ -34,8 +34,8 @@ m = mean(x, wv) x = rand(5, 6) w1 = rand(5) w2 = rand(6) -wv1 = weights(w1) -wv2 = weights(w2) +wv1 = weights(w1, false) +wv2 = weights(w2, false) m1 = mean(x, wv1, 1) m2 = mean(x, wv2, 2) @@ -85,7 +85,7 @@ end ##### skewness & kurtosis -wv = weights(ones(5) * 2.0) +wv = weights(ones(5) * 2.0, false) @test skewness(1:5) ≈ 0.0 @test skewness([1, 2, 3, 4, 5]) ≈ 0.0 diff --git a/test/sampling.jl b/test/sampling.jl index b0e9ce8cf..453996f46 100644 --- a/test/sampling.jl +++ b/test/sampling.jl @@ -187,7 +187,7 @@ check_sample_norep(a, (3, 12), 0; ordered=true) # test of weighted sampling without replacement a = [1:10;] -wv = WeightVec([zeros(6); 1:4]) +wv = Weights([zeros(6); 1:4]) x = vcat([sample(a, wv, 1, replace=false) for j in 1:100000]...) @test minimum(x) == 7 @test maximum(x) == 10 @@ -206,6 +206,5 @@ x = vcat([sample(a, wv, 4, replace=false) for j in 1:10000]...) 
@test_throws DimensionMismatch sample(a, wv, 5, replace=false) -wv = WeightVec([zeros(5); 1:4; -1]) +wv = Weights([zeros(5); 1:4; -1]) @test_throws ErrorException sample(a, wv, 1, replace=false) - diff --git a/test/weights.jl b/test/weights.jl index f712dc01f..e27e3e74f 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -3,25 +3,35 @@ using Base.Test using Compat import Compat: view -@test isa(weights([1, 2, 3]), WeightVec{Int}) -@test isa(weights([1., 2., 3.]), WeightVec{Float64}) -@test isa(weights([1 2 3; 4 5 6]), WeightVec{Int}) +@test isa(weights([1, 2, 3]), Weights{Int}) +@test isa(weights([1., 2., 3.]), Weights{Float64}) +@test isa(weights([1 2 3; 4 5 6]), Weights{Int}) -@test isa(WeightVec([1, 2, 3], 6), WeightVec{Int}) +@test isa(frequency([1, 2, 3]), FrequencyWeights) +@test isa(frequency([1 2 3; 4 5 6]), FrequencyWeights) +@test isa(FrequencyWeights([1, 2, 3], 6; corrected=false), FrequencyWeights) @test isempty(weights(Float64[])) @test size(weights([1, 2, 3])) == (3,) w = [1., 2., 3.] 
-wv = weights(w) +wv = weights(w, false) @test eltype(wv) === Float64 @test length(wv) === 3 @test values(wv) === w @test sum(wv) === 6.0 @test !isempty(wv) +fw = [1, 2, 3] +fwv = frequency(fw) +@test eltype(fwv) === Int +@test length(fwv) === 3 +@test values(fwv) === fw +@test sum(fwv) === 6 +@test !isempty(wv) + b = trues(3) -bv = weights(b) +bv = frequency(b) @test eltype(bv) === Bool @test length(bv) === 3 @test values(bv) === b @@ -31,8 +41,8 @@ bv = weights(b) ba = BitArray([true, false, true]) sa = sparsevec([1., 0., 2.]) -@test sum(ba, wv) === 4.0 -@test sum(sa, wv) === 7.0 +@test sum(ba, fwv) === 4 +@test sum(sa, fwv) === 7.0 ## wsum @@ -151,21 +161,20 @@ r = ones(8, 6) ## the sum and mean syntax +@test sum([1.0, 2.0, 3.0], weights([1.0, 0.5, 0.5], false)) ≈ 3.5 +@test sum(1:3, weights([1.0, 1.0, 0.5], false)) ≈ 4.5 -@test sum([1.0, 2.0, 3.0], weights([1.0, 0.5, 0.5])) ≈ 3.5 -@test sum(1:3, weights([1.0, 1.0, 0.5])) ≈ 4.5 - -@test mean([1:3;], weights([1.0, 1.0, 0.5])) ≈ 1.8 -@test mean(1:3, weights([1.0, 1.0, 0.5])) ≈ 1.8 +@test mean([1:3;], weights([1.0, 1.0, 0.5], false)) ≈ 1.8 +@test mean(1:3, weights([1.0, 1.0, 0.5], false)) ≈ 1.8 a = reshape(1.0:27.0, 3, 3, 3) for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) - @test sum(a, weights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) - @test sum(a, weights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) - @test sum(a, weights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) - @test mean(a, weights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) - @test mean(a, weights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) - @test mean(a, weights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) + @test sum(a, weights(wt, false), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) + @test sum(a, weights(wt, false), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) + @test sum(a, weights(wt, false), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) + @test mean(a, weights(wt, 
false), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) + @test mean(a, weights(wt, false), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) + @test mean(a, weights(wt, false), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) @test_throws ErrorException mean(a, weights(wt), 4) end @@ -226,15 +235,16 @@ median_answers = (7.0, 4.0, 8.5, num_tests = length(data) for i = 1:num_tests @test wmedian(data[i], wt[i]) == median_answers[i] - @test wmedian(data[i], weights(wt[i])) == median_answers[i] - @test median(data[i], weights(wt[i])) == median_answers[i] + @test wmedian(data[i], weights(wt[i], false)) == median_answers[i] + @test median(data[i], weights(wt[i], false)) == median_answers[i] for j = 1:100 # Make sure the weighted median does not change if the data # and weights are reordered. reorder = sortperm(rand(length(data[i]))) - @test median(data[i][reorder], weights(wt[i][reorder])) == median_answers[i] + @test median(data[i][reorder], weights(wt[i][reorder], false)) == median_answers[i] end end + data = [4, 3, 2, 1] wt = [0, 0, 0, 0] @test_throws MethodError wmedian(data[1]) @@ -256,7 +266,6 @@ wt = [-1, -1, -1, -1, -1] wt = [-1, -1, -1, 0, 0] @test_throws ErrorException median(data, weights(wt)) - # Weighted quantile tests data = ( [7, 1, 2, 4, 10], @@ -280,25 +289,25 @@ data = ( [-10, 1, 1, -10, -10], ) wt = ( - weights([1, 1/3, 1/3, 1/3, 1]), - weights([1, 1, 1, 1, 1]), - weights([1, 1/3, 1/3, 1/3, 1, 1]), - weights([1/3, 1/3, 1/3, 1, 1, 1]), - weights([30, 191, 9, 0]), - weights([10, 1, 1, 1, 9]), - weights([10, 1, 1, 1, 900]), - weights([1, 3, 5, 4, 2]), - weights([2, 2, 5, 1, 2, 2, 1, 6]), - weights([0.1, 0.1, 0.8]), - weights([5, 5, 4, 1]), - weights([30, 56, 144, 24, 55, 43, 67]), - weights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]), - weights([12]), - weights([7, 1, 1, 1, 6]), - weights([1, 0, 0, 0, 2]), - weights([1, 2, 3, 4, 5]), - weights([0.1, 0.2, 0.3, 0.2, 0.1]), - weights([1, 1, 1, 1, 1]), + weights([1, 1/3, 1/3, 1/3, 1], false), + weights([1, 
1, 1, 1, 1], false), + weights([1, 1/3, 1/3, 1/3, 1, 1], false), + weights([1/3, 1/3, 1/3, 1, 1, 1], false), + weights([30, 191, 9, 0], false), + weights([10, 1, 1, 1, 9], false), + weights([10, 1, 1, 1, 900], false), + weights([1, 3, 5, 4, 2], false), + weights([2, 2, 5, 1, 2, 2, 1, 6], false), + weights([0.1, 0.1, 0.8], false), + weights([5, 5, 4, 1], false), + weights([30, 56, 144, 24, 55, 43, 67], false), + weights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], false), + weights([12], false), + weights([7, 1, 1, 1, 6], false), + weights([1, 0, 0, 0, 2], false), + weights([1, 2, 3, 4, 5], false), + weights([0.1, 0.2, 0.3, 0.2, 0.1], false), + weights([1, 1, 1, 1, 1], false), ) quantile_answers = ( [1.0,3.6000000000000005,6.181818181818182,8.2,10.0], @@ -334,15 +343,15 @@ for i = 1:length(data) for j = 1:10 # order of w does not matter reorder = sortperm(rand(length(data[i]))) - @test quantile(data[i][reorder], weights(wt[i][reorder]), p) ≈ quantile_answers[i] + @test quantile(data[i][reorder], weights(wt[i][reorder], false), p) ≈ quantile_answers[i] end end # w = 1 corresponds to base quantile for i = 1:length(data) - @test quantile(data[i], weights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) + @test quantile(data[i], weights(ones(Int64, length(data[i])), false), p) ≈ quantile(data[i], p) for j = 1:10 prandom = rand(4) - @test quantile(data[i], weights(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) + @test quantile(data[i], weights(ones(Int64, length(data[i])), false), prandom) ≈ quantile(data[i], prandom) end end @@ -350,8 +359,8 @@ end v = [7, 1, 2, 4, 10] w = [1, 1/3, 1/3, 1/3, 1] answer = 6.181818181818182 -@test quantile(data[1], weights(w), 0.5) ≈ answer -@test wquantile(data[1], weights(w), [0.5]) ≈ [answer] -@test wquantile(data[1], weights(w), 0.5) ≈ answer +@test quantile(data[1], weights(w, false), 0.5) ≈ answer +@test wquantile(data[1], weights(w, false), [0.5]) ≈ [answer] +@test wquantile(data[1], weights(w, false), 0.5) ≈ answer 
@test wquantile(data[1], w, [0.5]) ≈ [answer] @test wquantile(data[1], w, 0.5) ≈ answer diff --git a/test/wsampling.jl b/test/wsampling.jl index a0091382d..b29f902e5 100644 --- a/test/wsampling.jl +++ b/test/wsampling.jl @@ -7,7 +7,7 @@ srand(1234) #### weighted sample with replacement -function check_wsample_wrep(a::AbstractArray, vrgn, wv::WeightVec, ptol::Real; ordered::Bool=false) +function check_wsample_wrep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol::Real; ordered::Bool=false) K = length(wv) (vmin, vmax) = vrgn (amin, amax) = extrema(a) @@ -35,7 +35,7 @@ end import StatsBase: direct_sample!, alias_sample! n = 10^5 -wv = weights([0.2, 0.8, 0.4, 0.6]) +wv = weights([0.2, 0.8, 0.4, 0.6], false) a = direct_sample!(4:7, wv, zeros(Int, n, 3)) check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=false) @@ -53,7 +53,7 @@ check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=true) #### weighted sampling without replacement -function check_wsample_norep(a::AbstractArray, vrgn, wv::WeightVec, ptol::Real; ordered::Bool=false) +function check_wsample_norep(a::AbstractArray, vrgn, wv::AbstractWeights, ptol::Real; ordered::Bool=false) # each column of a for one run vmin, vmax = vrgn @@ -79,7 +79,7 @@ import StatsBase: naive_wsample_norep!, efraimidis_a_wsample_norep!, efraimidis_ares_wsample_norep!, efraimidis_aexpj_wsample_norep! n = 10^5 -wv = weights([0.2, 0.8, 0.4, 0.6]) +wv = weights([0.2, 0.8, 0.4, 0.6], false) a = zeros(Int, 3, n) for j = 1:n From 457f919c802b4d616e8f30208f692b1f4753d5e8 Mon Sep 17 00:00:00 2001 From: Rory-Finnegan Date: Wed, 26 Apr 2017 16:27:21 -0500 Subject: [PATCH 02/50] Added corrected option to many stats methods. * Reverts many test changes from the last commit * Changed a bunch of test cases to use `corrected=false` for now * This included a lot of little changes to method definition and some resulting formatting changes where necessary * Added a few test cases for the corrected variances * Added bias correction for ProbabilityWeights.
* Deprecated `WeightVec` * Added a macro for easier creation of weight types. * Renamed weight creation function to `fweights`, `pweights`, etc. --- src/StatsBase.jl | 12 ++- src/cov.jl | 32 ++++--- src/deprecates.jl | 2 + src/moments.jl | 234 +++++++++++++++++++++++++++++++--------------- src/weights.jl | 210 +++++++++++++++++++++++++++-------------- test/counts.jl | 8 +- test/cov.jl | 68 +++++++------- test/moments.jl | 157 ++++++++++++++++++------------- test/weights.jl | 105 ++++++++++----------- test/wsampling.jl | 4 +- 10 files changed, 506 insertions(+), 326 deletions(-) diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 6ba58fced..cebd7cf37 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -18,11 +18,15 @@ module StatsBase ## weights AbstractWeights, # the abstract type to represent any weight vector - Weights, # the default type for representing a weight vector + AnalyticWeights, # the default type for representing a analytic/precision/reliability weight vectors FrequencyWeights, # the type for representing a frequency weight vectors - weights, # construct a weights vector - frequency, # construct a frequency weights vector - exponential, # construct a weights vector using a exponential smoothing schema + ProbabilityWeights,# the type for representing a probability/sampling weight vectors + ExponentialWeights,# the type for representing exponential weights + weights, # alias for aweights + aweights, # construct an AnalyticWeights vector + fweights, # construct a FrequencyWeights vector + pweights, # construct a ProbabilityWeights vector + eweights, # construct an ExponentialWeights vector wsum, # weighted sum with vector as second argument wsum!, # weighted sum across dimensions with provided storage wmean, # weighted mean diff --git a/src/cov.jl b/src/cov.jl index 83faeda84..fc68c12c5 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -81,16 +81,17 @@ function mean_and_cov end end ## weighted cov - Base.cov(x::DenseMatrix, wv::AbstractWeights; 
mean=nothing, vardim::Int=1) = - scale!(scattermat(x, wv; mean=mean, vardim=vardim), inv(bias(wv))) + function Base.cov(x::DenseMatrix, wv::AbstractWeights; mean=nothing, vardim::Int=1, corrected=true) + scale!(scattermat(x, wv; mean=mean, vardim=vardim), bias(wv, corrected)) + end - function mean_and_cov(x::DenseMatrix; vardim::Int=1) + function mean_and_cov(x::DenseMatrix; vardim::Int=1, corrected=true) m = mean(x, vardim) - return m, Base.covm(x, m; vardim=vardim) + return m, Base.covm(x, m; vardim=vardim, corrected=corrected) end - function mean_and_cov(x::DenseMatrix, wv::AbstractWeights; vardim::Int=1) + function mean_and_cov(x::DenseMatrix, wv::AbstractWeights; vardim::Int=1, corrected=true) m = mean(x, wv, vardim) - return m, Base.cov(x, wv; mean=m, vardim=vardim) + return m, Base.cov(x, wv; mean=m, vardim=vardim, corrected=corrected) end else scattermatm(x::DenseMatrix, mean, vardim::Int=1) = @@ -106,18 +107,21 @@ else scattermatm(x, Base.mean(x, wv, vardim), wv, vardim) ## weighted cov - Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1) = - scale!(scattermatm(x, mean, wv, vardim), inv(bias(wv))) + function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1, corrected::Bool=true) + scale!(scattermatm(x, mean, wv, vardim), bias(wv, corrected)) + end - Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = - Base.covm(x, Base.mean(x, wv, vardim), wv, vardim) + function Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=true) + Base.covm(x, Base.mean(x, wv, vardim), wv, vardim, corrected) + end - function mean_and_cov(x::DenseMatrix, vardim::Int=1) + function mean_and_cov(x::DenseMatrix, vardim::Int=1; corrected=true) m = mean(x, vardim) - return m, Base.covm(x, m, vardim) + return m, Base.covm(x, m, vardim, corrected) end - function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) + + function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=true) m = 
mean(x, wv, vardim) - return m, Base.cov(x, wv, vardim) + return m, Base.cov(x, wv, vardim; corrected=corrected) end end diff --git a/src/deprecates.jl b/src/deprecates.jl index a75707fd0..6f3905e65 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -43,3 +43,5 @@ findat(a::AbstractArray, b::AbstractArray) = findat!(Array{Int}(size(b)), a, b) @deprecate df(obj::StatisticalModel) dof(obj) @deprecate df_residual(obj::StatisticalModel) dof_residual(obj) + +@deprecate WeightVec{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) AnalyticWeights(vs, s) diff --git a/src/moments.jl b/src/moments.jl index 0c5b83f07..2ea437d9c 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -15,7 +15,9 @@ whereas it's `length(x)-1` in `Base.varm`. The impact is that this is not a weighted estimate of the population variance based on the sample; it's the weighted variance of the sample. """ -Base.varm(v::RealArray, wv::AbstractWeights, m::Real) = _moment2(v, wv, m) +function Base.varm(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) + _moment2(v, wv, m, corrected=corrected) +end """ var(x, wv::AbstractWeights, [dim]; mean=nothing) @@ -30,23 +32,32 @@ whereas it's `length(x)-1` in `Base.var`. The impact is that this is not a weighted estimate of the population variance based on the sample; it's the weighted variance of the sample. """ -function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing) - mean == 0 ? Base.varm(v, wv, 0) : - mean == nothing ? varm(v, wv, Base.mean(v, wv)) : - varm(v, wv, mean) +function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=true) + mean == 0 ? varm(v, wv, 0; corrected=corrected) : + mean == nothing ? 
varm(v, wv, Base.mean(v, wv); corrected=corrected) : + varm(v, wv, mean; corrected=corrected) end ## var along dim -Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int) = - scale!(_wsum_centralize!(R, @functorize(abs2), A, values(wv), M, dim, true), inv(bias(wv))) +function Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=true) + scale!( + _wsum_centralize!(R, @functorize(abs2), A, values(wv), M, dim, true), + bias(wv, corrected) + ) +end -function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) +function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) if mean == 0 - Base.varm!(R, A, wv, - Base.reducedim_initarray(A, dim, 0, eltype(R)), dim) + Base.varm!( + R, A, wv, Base.reducedim_initarray(A, dim, 0, eltype(R)), dim; + corrected=corrected + ) elseif mean == nothing - Base.varm!(R, A, wv, Base.mean(A, wv, dim), dim) + Base.varm!( + R, A, wv, Base.mean(A, wv, dim), dim; + corrected=corrected + ) else # check size of mean for i = 1:ndims(A) @@ -58,23 +69,37 @@ function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mea dM == dA || throw(DimensionMismatch("Incorrect size of mean.")) end end - Base.varm!(R, A, wv, mean, dim) + Base.varm!(R, A, wv, mean, dim; corrected=corrected) end end -Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int) = +function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=true) @static if VERSION < v"0.6.0-dev.1121" - Base.varm!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, M, dim) + return Base.varm!( + similar(A, Float64, Base.reduced_dims(size(A), dim)), + A, wv, M, dim; corrected=corrected + ) else - Base.varm!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, M, dim) + return Base.varm!( + similar(A, Float64, Base.reduced_indices(indices(A), dim)), + A, wv, M, dim; 
corrected=corrected + ) end +end -Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) = +function Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) @static if VERSION < v"0.6.0-dev.1121" - var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, dim; mean=mean) + return var!( + similar(A, Float64, Base.reduced_dims(size(A), dim)), + A, wv, dim; mean=mean, corrected=corrected + ) else - var!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, dim; mean=mean) + return var!( + similar(A, Float64, Base.reduced_indices(indices(A), dim)), + A, wv, dim; mean=mean, corrected=corrected + ) end +end ## std """ @@ -84,7 +109,9 @@ Return the standard deviation of a real-valued array `v` with a known mean `m`, optionally over a dimension `dim`. The weighting vector `wv` specifies frequency weights (also called case weights) for the estimate. """ -Base.stdm(v::RealArray, wv::AbstractWeights, m::Real) = sqrt(varm(v, wv, m)) +function Base.stdm(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) + sqrt(varm(v, wv, m; corrected=corrected)) +end """ std(v, wv::AbstractWeights, [dim]; mean=nothing) @@ -93,12 +120,21 @@ Return the standard deviation of a real-valued array `v`, optionally over a dimension `dim`. The weighting vector `wv` specifies frequency weights (also called case weights) for the estimate. 
""" -Base.std(v::RealArray, wv::AbstractWeights; mean=nothing) = sqrt.(var(v, wv; mean=mean)) +function Base.std(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=true) + sqrt.(var(v, wv; mean=mean, corrected=corrected)) +end -Base.stdm(v::RealArray, m::RealArray, dim::Int) = Base.sqrt!(varm(v, m, dim)) -Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int) = sqrt.(varm(v, wv, m, dim)) -Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) = sqrt.(var(v, wv, dim; mean=mean)) +function Base.stdm(v::RealArray, m::RealArray, dim::Int; corrected=true) + Base.sqrt!(varm(v, m, dim; corrected=corrected)) +end + +function Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int; corrected=true) + sqrt.(varm(v, wv, m, dim; corrected=corrected)) +end +function Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) + sqrt.(var(v, wv, dim; mean=mean, corrected=corrected)) +end ##### Fused statistics """ @@ -108,7 +144,11 @@ Return the mean and variance of a real-valued array `x`, optionally over a dimen `dim`, as a tuple. A weighting vector `wv` can be specified to weight the estimates. The weights are assumed to be frequency weights, also called case weights. """ -mean_and_var(A::RealArray) = (m = mean(A); (m, varm(A, m))) +function mean_and_var(A::RealArray; corrected=true) + m = mean(A) + v = varm(A, m; corrected=corrected) + m, v +end """ mean_and_std(x, [wv::AbstractWeights], [dim]) -> (mean, std) @@ -118,31 +158,61 @@ over a dimension `dim`, as a tuple. A weighting vector `wv` can be specified to weight the estimates. The weights are assumed to be frequency weights, also called case weights. 
""" -mean_and_std(A::RealArray) = (m = mean(A); (m, stdm(A, m))) +function mean_and_std(A::RealArray; corrected=true) + m = mean(A) + s = stdm(A, m; corrected=corrected) + m, s +end -mean_and_var(A::RealArray, wv::AbstractWeights) = (m = mean(A, wv); (m, varm(A, wv, m))) -mean_and_std(A::RealArray, wv::AbstractWeights) = (m = mean(A, wv); (m, stdm(A, wv, m))) +function mean_and_var(A::RealArray, wv::AbstractWeights; corrected=true) + m = mean(A, wv) + v = varm(A, wv, m; corrected=corrected) + m, v +end + +function mean_and_std(A::RealArray, wv::AbstractWeights; corrected=true) + m = mean(A, wv) + s = stdm(A, wv, m; corrected=corrected) + m, s +end -mean_and_var(A::RealArray, dim::Int) = (m = mean(A, dim); (m, varm(A, m, dim))) -mean_and_std(A::RealArray, dim::Int) = (m = mean(A, dim); (m, stdm(A, m, dim))) +function mean_and_var(A::RealArray, dim::Int; corrected=true) + m = mean(A, dim) + v = varm(A, m, dim; corrected=corrected) + m, v +end + +function mean_and_std(A::RealArray, dim::Int; corrected=true) + m = mean(A, dim) + s = stdm(A, m, dim; corrected=corrected) + m, s +end -mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int) = (m = mean(A, wv, dim); (m, varm(A, wv, m, dim))) -mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int) = (m = mean(A, wv, dim); (m, stdm(A, wv, m, dim))) +function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int; corrected=true) + m = mean(A, wv, dim) + v = varm(A, wv, m, dim; corrected=corrected) + m, v +end +function mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int; corrected=true) + m = mean(A, wv, dim) + s = stdm(A, wv, m, dim; corrected=corrected) + m, s +end ##### General central moment -function _moment2(v::RealArray, m::Real) +function _moment2(v::RealArray, m::Real; corrected=true) n = length(v) s = 0.0 for i = 1:n @inbounds z = v[i] - m s += z * z end - s / n + s * bias(n, corrected) end -function _moment2(v::RealArray, wv::AbstractWeights, m::Real) +function _moment2(v::RealArray, 
wv::AbstractWeights, m::Real; corrected=true) n = length(v) s = 0.0 w = values(wv) @@ -150,20 +220,22 @@ function _moment2(v::RealArray, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m @inbounds s += (z * z) * w[i] end - s / sum(wv) + + result = s * bias(wv, corrected) + return result end -function _moment3(v::RealArray, m::Real) +function _moment3(v::RealArray, m::Real; corrected=true) n = length(v) s = 0.0 for i = 1:n @inbounds z = v[i] - m s += z * z * z end - s / n + s * bias(n, corrected) end -function _moment3(v::RealArray, wv::AbstractWeights, m::Real) +function _moment3(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) n = length(v) s = 0.0 w = values(wv) @@ -171,20 +243,20 @@ function _moment3(v::RealArray, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m @inbounds s += (z * z * z) * w[i] end - s / sum(wv) + s * bias(wv, corrected) end -function _moment4(v::RealArray, m::Real) +function _moment4(v::RealArray, m::Real; corrected=true) n = length(v) s = 0.0 for i = 1:n @inbounds z = v[i] - m s += abs2(z * z) end - s / n + s * bias(n, corrected) end -function _moment4(v::RealArray, wv::AbstractWeights, m::Real) +function _moment4(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) n = length(v) s = 0.0 w = values(wv) @@ -192,20 +264,20 @@ function _moment4(v::RealArray, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m @inbounds s += abs2(z * z) * w[i] end - s / sum(wv) + s * bias(wv, corrected) end -function _momentk(v::RealArray, k::Int, m::Real) +function _momentk(v::RealArray, k::Int, m::Real; corrected=true) n = length(v) s = 0.0 for i = 1:n @inbounds z = v[i] - m s += (z ^ k) end - s / n + s * bias(n, corrected) end -function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real) +function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected=true) n = length(v) s = 0.0 w = values(wv) @@ -213,7 +285,7 @@ function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m 
@inbounds s += (z ^ k) * w[i] end - s / sum(wv) + s * bias(wv, corrected) end @@ -223,22 +295,24 @@ end Return the `k`th order central moment of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. """ -function moment(v::RealArray, k::Int, m::Real) - k == 2 ? _moment2(v, m) : - k == 3 ? _moment3(v, m) : - k == 4 ? _moment4(v, m) : - _momentk(v, k, m) +function moment(v::RealArray, k::Int, m::Real; corrected=true) + k == 2 ? _moment2(v, m; corrected=corrected) : + k == 3 ? _moment3(v, m; corrected=corrected) : + k == 4 ? _moment4(v, m; corrected=corrected) : + _momentk(v, k, m; corrected=corrected) end -function moment(v::RealArray, k::Int, wv::AbstractWeights, m::Real) - k == 2 ? _moment2(v, wv, m) : - k == 3 ? _moment3(v, wv, m) : - k == 4 ? _moment4(v, wv, m) : - _momentk(v, k, wv, m) +function moment(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected=true) + k == 2 ? _moment2(v, wv, m; corrected=corrected) : + k == 3 ? _moment3(v, wv, m; corrected=corrected) : + k == 4 ? _moment4(v, wv, m; corrected=corrected) : + _momentk(v, k, wv, m; corrected=corrected) end -moment(v::RealArray, k::Int) = moment(v, k, mean(v)) -moment(v::RealArray, k::Int, wv::AbstractWeights) = moment(v, k, wv, mean(v, wv)) +moment(v::RealArray, k::Int; corrected=true) = moment(v, k, mean(v); corrected=corrected) +function moment(v::RealArray, k::Int, wv::AbstractWeights; corrected=true) + moment(v, k, wv, mean(v, wv); corrected=corrected) +end ##### Skewness and Kurtosis @@ -251,7 +325,7 @@ moment(v::RealArray, k::Int, wv::AbstractWeights) = moment(v, k, wv, mean(v, wv) Compute the standardized skewness of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. 
""" -function skewness(v::RealArray, m::Real) +function skewness(v::RealArray, m::Real; corrected=true) n = length(v) cm2 = 0.0 # empirical 2nd centered moment (variance) cm3 = 0.0 # empirical 3rd centered moment @@ -262,12 +336,13 @@ function skewness(v::RealArray, m::Real) cm2 += z2 cm3 += z2 * z end - cm3 /= n - cm2 /= n + b = bias(n, corrected) + cm3 *= b + cm2 *= b return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 end -function skewness(v::RealArray, wv::AbstractWeights, m::Real) +function skewness(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) n = length(v) length(wv) == n || throw(DimensionMismatch("Inconsistent array lengths.")) cm2 = 0.0 # empirical 2nd centered moment (variance) @@ -282,14 +357,16 @@ function skewness(v::RealArray, wv::AbstractWeights, m::Real) cm2 += z2w cm3 += z2w * z end - sw = sum(wv) - cm3 /= sw - cm2 /= sw + b = bias(wv, corrected) + cm3 *= b + cm2 *= b return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 end -skewness(v::RealArray) = skewness(v, mean(v)) -skewness(v::RealArray, wv::AbstractWeights) = skewness(v, wv, mean(v, wv)) +skewness(v::RealArray; corrected=true) = skewness(v, mean(v); corrected=corrected) +function skewness(v::RealArray, wv::AbstractWeights; corrected=true) + skewness(v, wv, mean(v, wv); corrected=corrected) +end # (excessive) Kurtosis # This is Type 1 definition according to Joanes and Gill (1998) @@ -299,7 +376,7 @@ skewness(v::RealArray, wv::AbstractWeights) = skewness(v, wv, mean(v, wv)) Compute the excess kurtosis of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. 
""" -function kurtosis(v::RealArray, m::Real) +function kurtosis(v::RealArray, m::Real; corrected=true) n = length(v) cm2 = 0.0 # empirical 2nd centered moment (variance) cm4 = 0.0 # empirical 4th centered moment @@ -309,12 +386,13 @@ function kurtosis(v::RealArray, m::Real) cm2 += z2 cm4 += z2 * z2 end - cm4 /= n - cm2 /= n + b = bias(n, corrected) + cm4 *= b + cm2 *= b return (cm4 / (cm2 * cm2)) - 3.0 end -function kurtosis(v::RealArray, wv::AbstractWeights, m::Real) +function kurtosis(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) n = length(v) length(wv) == n || throw(DimensionMismatch("Inconsistent array lengths.")) cm2 = 0.0 # empirical 2nd centered moment (variance) @@ -330,11 +408,13 @@ function kurtosis(v::RealArray, wv::AbstractWeights, m::Real) cm2 += z2w cm4 += z2w * z2 end - sw = sum(wv) - cm4 /= sw - cm2 /= sw + b = bias(wv, corrected) + cm4 *= b + cm2 *= b return (cm4 / (cm2 * cm2)) - 3.0 end -kurtosis(v::RealArray) = kurtosis(v, mean(v)) -kurtosis(v::RealArray, wv::AbstractWeights) = kurtosis(v, wv, mean(v, wv)) +kurtosis(v::RealArray; corrected=true) = kurtosis(v, mean(v); corrected=corrected) +function kurtosis(v::RealArray, wv::AbstractWeights; corrected=true) + kurtosis(v, wv, mean(v, wv); corrected=corrected) +end diff --git a/src/weights.jl b/src/weights.jl index 3ab1a0253..ac4f60add 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -3,113 +3,181 @@ if VERSION < v"0.6.0-dev.2123" abstract AbstractWeights{S<:Real, T<:Real, V<:RealVector} <: RealVector{T} +else + abstract AbstractWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractVector{T} +end - immutable Weights{S<:Real, T<:Real, V<:RealVector} <: AbstractWeights{S, T, V} - values::V - sum::S - bias::Real - end +""" + `@weights name` - immutable FrequencyWeights{S<:Integer, T<:Integer, V<:IntegerVector} <: AbstractWeights{S, T, V} - values::V - sum::S - bias::Int +Generates a new generic weight type with specified `name`, which subtypes `AbstractWeights` +and stores the 
`values` (`V<:RealVector`) and `sum` (`S<:Real`). +""" +macro weights(name) + return quote + if VERSION < v"0.6.0-dev.2123" + immutable $name{S<:Real, T<:Real, V<:RealVector} <: AbstractWeights{S, T, V} + values::V + sum::S + end + else + immutable $name{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} + values::V + sum::S + end + end end +end - immutable ProbabilityWeights{S<:Real, T<:Real, V<:RealVector} <: AbstractWeights{S, T, V} - values::V - sum::S - bias::Real - end -else - abstract AbstractWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractVector{T} +eltype(wv::AbstractWeights) = eltype(wv.values) +length(wv::AbstractWeights) = length(wv.values) +values(wv::AbstractWeights) = wv.values +sum(wv::AbstractWeights) = wv.sum +isempty(wv::AbstractWeights) = isempty(wv.values) - immutable Weights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} - values::V - sum::S - bias::Real - end +Base.getindex(wv::AbstractWeights, i) = getindex(wv.values, i) +Base.size(wv::AbstractWeights) = size(wv.values) - immutable FrequencyWeights{S<:Integer, T<:Integer, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} - values::V - sum::S - bias::Int - end +""" + bias(n::Integer, [corrected]) - immutable ProbabilityWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractWeights{S, T, V} - values::V - sum::S - bias::Real - end -end +Computes the corrected (default) or uncorrected bias for any `n` observations. +```math +\fraction{1}{n - 1} +``` """ - Weights(vs, [wsum]) +bias(n::Integer, corrected=true) = inv(n - Int(corrected)) -Construct a `Weights` with weight values `vs` and sum of weights `wsum`. -If omitted, `wsum` is computed. """ -function Weights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs); corrected::Bool=true) - if isempty(vs) || !corrected - return Weights{typeof(s), eltype(vs), V}(vs, s, s) + bias(w::AbstractWeights, [corrected]) + +Computes the corrected (default) or uncorrected bias for any weight vector. 
+The default equation assumes analytic/precision/reliability weights and determines the +bias as: + +```math +\fraction{1}{∑w × (1 - ∑(w'²))} +``` +where w' represents the normalized weights +""" +function bias(w::AbstractWeights, corrected=true) + s = sum(w) + if corrected + return inv(s * (1 - sum(normalize(values(w), 1) .^ 2))) else - return Weights{S, eltype(vs), V}(vs, s, s * (1 - sum(normalize(vs, 1) .^ 2))) + return inv(s) end end -function FrequencyWeights{S<:Integer, V<:IntegerVector}(vs::V, s::S=sum(vs); corrected::Bool=true) - return FrequencyWeights{S, eltype(vs), V}(vs, s, s - Int(corrected)) +@weights AnalyticWeights + +""" + AnalyticWeights(vs, [wsum]) + +Construct a `AnalyticWeights` with weight values `vs` and sum of weights `wsum`. +If omitted, `wsum` is computed. +""" +function AnalyticWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) + return AnalyticWeights{S, eltype(vs), V}(vs, s) end -# TODO: constructor for ProbabilityWeights, but I'm not familiar with how bias correction works with these -# types of weights or if bias correction even makes sense. -# https://en.wikipedia.org/wiki/Inverse_probability_weighting +""" + aweights(vs) + +Construct a `AnalyticWeights` type from a given array. +""" +aweights(vs::RealVector) = AnalyticWeights(vs) +aweights(vs::RealArray) = AnalyticWeights(vec(vs)) """ weights(vs) -Construct a `Weights` type from a given array. +Alias for aweights(vs) """ -weights(vs::RealVector, corrected=true) = Weights(vs; corrected=corrected) -weights(vs::RealArray, corrected=true) = Weights(vec(vs); corrected=corrected) +weights(vs) = aweights(vs) + +@weights FrequencyWeights + +function FrequencyWeights{S<:Integer, V<:IntegerVector}(vs::V, s::S=sum(vs)) + return FrequencyWeights{S, eltype(vs), V}(vs, s) +end """ - frequency(vs) + fweights(vs) Construct a `FrequencyWeights` type from a given array. 
""" -frequency(vs::RealVector, corrected=true) = FrequencyWeights(vs; corrected=corrected) -frequency(vs::RealArray, corrected=true) = FrequencyWeights(vec(vs); corrected=corrected) +fweights(vs::IntegerVector) = FrequencyWeights(vs) +fweights(vs::IntegerArray) = FrequencyWeights(vec(vs)) + +""" + bias(w::FrequencyWeights, [corrected]) + +```math +\fraction{1}{∑w - 1} +``` +""" +bias(w::FrequencyWeights, corrected=true) = inv(sum(w) - Int(corrected)) + +@weights ProbabilityWeights + +function ProbabilityWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) + return ProbabilityWeights{S, eltype(vs), V}(vs, s) +end + +""" + pweights(vs) + +Construct a `ProbabilityWeights` type from a given array. +""" +pweights(vs::RealVector) = ProbabilityWeights(vs) +pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) + +""" + bias(w::ProbabilityWeights, [corrected]) + +```math +\fraction{n}{∑w × (n - 1)} +``` +""" +function bias(w::ProbabilityWeights, corrected=true) + s = sum(w) + + if corrected + n = length(values(w)) + return n / (s * (n - 1)) + else + return inv(s) + end +end + +@weights ExponentialWeights + +function ExponentialWeights{V<:RealVector}(vs::V) + s = sum(vs) + return ExponentialWeights{typeof(s), eltype(vs), V}(vs, s) +end """ - exponential(n, [λ]) + eweights(n, [λ]) -Constructs a `Weights` type with a desired length `n` and smoothing factor `λ`, +Constructs a `ExponentialWeights` type with a desired length `n` and smoothing factor `λ`, where each element is set to `λ * (1 - λ)^(1 - i)`. # Arguments * `n::Integer`: the desired length of the `Weights` -* `λ::Real`: is a smoothing factor or rate paremeter between 0 .. 1. - As this value approaches 0 the resulting weights will be almost equal(), +* `λ::Real`: a smoothing factor or rate parameter between 0 and 1. + As this value approaches 0 the resulting weights will be almost equal, while values closer to 1 will put higher weight on the end elements of the vector. 
""" -function exponential(n::Integer, λ::Real=0.99) - @assert 0 <= λ <= 1 && n > 0 +function eweights(n::Integer, λ::Real=0.99) + n > 0 || throw(ArgumentError("cannot construct weights of length < 1")) + 0 <= λ <= 1 || throw(ArgumentError("smoothing factor must be between 0 and 1")) w0 = map(i -> λ * (1 - λ)^(1 - i), 1:n) return weights(w0) end -eltype(wv::AbstractWeights) = eltype(wv.values) -length(wv::AbstractWeights) = length(wv.values) -values(wv::AbstractWeights) = wv.values -sum(wv::AbstractWeights) = wv.sum -bias(wv::AbstractWeights) = wv.bias -isempty(wv::AbstractWeights) = isempty(wv.values) - -Base.getindex(wv::AbstractWeights, i) = getindex(wv.values, i) -Base.size(wv::AbstractWeights) = size(wv.values) - - ##### Weighted sum ##### ## weighted sum over vectors @@ -339,7 +407,7 @@ Compute the weighted mean of an array `v` with weights `w`. """ function wmean{T<:Number}(v::AbstractArray{T}, w::AbstractVector) Base.depwarn("wmean is deprecated, use mean(v, weights(w)) instead.", :wmean) - mean(v, weights(w, false)) + mean(v, weights(w)) end Base.mean(v::AbstractArray, w::AbstractWeights) = sum(v, w) / sum(w) @@ -416,7 +484,7 @@ end Compute the weighted median of an array `v` with weights `w`, given as either a vector or `AbstractWeights`. """ -wmedian(v::RealVector, w::RealVector) = median(v, weights(w, false)) +wmedian(v::RealVector, w::RealVector) = median(v, weights(w)) wmedian{W<:Real}(v::RealVector, w::AbstractWeights{W}) = median(v, w) ###### Weighted quantile ##### @@ -510,5 +578,5 @@ or a `AbstractWeights`. 
""" wquantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::RealVector) = quantile(v, w, p) wquantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::Number) = quantile(v, w, [p])[1] -wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, weights(w, false), p) -wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, weights(w, false), [p])[1] +wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, weights(w), p) +wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, weights(w), [p])[1] diff --git a/test/counts.jl b/test/counts.jl index d763817d3..2fae64eed 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -6,7 +6,7 @@ n = 5000 # 1D integer counts x = rand(1:5, n) -w = weights(rand(n), false) +w = weights(rand(n)) c = counts(x, 5) @test size(c) == (5,) @@ -40,7 +40,7 @@ c0 = Float64[sum(w.values[x .== i]) for i in 1 : 5] x = rand(1:4, n) y = rand(1:5, n) -w = weights(rand(n), false) +w = weights(rand(n)) c = counts(x, y, (4, 5)) @test size(c) == (4, 5) @@ -85,11 +85,11 @@ pm = proportionmap(x) @test pm["b"] ≈ (1/3) @test pm["c"] ≈ (1/6) -cm = countmap(x, weights(w, false)) +cm = countmap(x, weights(w)) @test cm["a"] == 5.5 @test cm["b"] == 4.5 @test cm["c"] == 3.5 -pm = proportionmap(x, weights(w, false)) +pm = proportionmap(x, weights(w)) @test pm["a"] ≈ (5.5 / 13.5) @test pm["b"] ≈ (4.5 / 13.5) @test pm["c"] ≈ (3.5 / 13.5) diff --git a/test/cov.jl b/test/cov.jl index 7523cb5b7..2cbd1d049 100644 --- a/test/cov.jl +++ b/test/cov.jl @@ -9,8 +9,8 @@ Z2 = X .- mean(X, 2) w1 = rand(3) w2 = rand(8) -wv1 = weights(w1, false) -wv2 = weights(w2, false) +wv1 = weights(w1) +wv2 = weights(w2) Z1w = X .- mean(X, wv1, 1) Z2w = X .- mean(X, wv2, 2) @@ -88,62 +88,62 @@ end # weighted covariance if VERSION < v"0.5.0-dev+679" - @test cov(X, wv1) ≈ S1w ./ sum(wv1) - @test cov(X, wv2; vardim=2) ≈ S2w ./ sum(wv2) + @test cov(X, wv1; corrected=false) ≈ S1w ./ sum(wv1) + @test cov(X, wv2; vardim=2, corrected=false) ≈ S2w ./ 
sum(wv2) - @test cov(X, wv1; mean=0) ≈ Sz1w ./ sum(wv1) - @test cov(X, wv2; mean=0, vardim=2) ≈ Sz2w ./ sum(wv2) + @test cov(X, wv1; mean=0, corrected=false) ≈ Sz1w ./ sum(wv1) + @test cov(X, wv2; mean=0, vardim=2, corrected=false) ≈ Sz2w ./ sum(wv2) - @test cov(X, wv1; mean=mean(X, wv1, 1)) ≈ S1w ./ sum(wv1) - @test cov(X, wv2; mean=mean(X, wv2, 2), vardim=2) ≈ S2w ./ sum(wv2) + @test cov(X, wv1; mean=mean(X, wv1, 1), corrected=false) ≈ S1w ./ sum(wv1) + @test cov(X, wv2; mean=mean(X, wv2, 2), vardim=2, corrected=false) ≈ S2w ./ sum(wv2) - @test cov(X, wv1; mean=zeros(1,8)) ≈ Sz1w ./ sum(wv1) - @test cov(X, wv2; mean=zeros(3), vardim=2) ≈ Sz2w ./ sum(wv2) + @test cov(X, wv1; mean=zeros(1,8), corrected=false) ≈ Sz1w ./ sum(wv1) + @test cov(X, wv2; mean=zeros(3), vardim=2, corrected=false) ≈ Sz2w ./ sum(wv2) else - @test cov(X, wv1) ≈ S1w ./ sum(wv1) - @test cov(X, wv2, 2) ≈ S2w ./ sum(wv2) + @test cov(X, wv1; corrected=false) ≈ S1w ./ sum(wv1) + @test cov(X, wv2, 2; corrected=false) ≈ S2w ./ sum(wv2) - @test Base.covm(X, 0, wv1) ≈ Sz1w ./ sum(wv1) - @test Base.covm(X, 0, wv2, 2) ≈ Sz2w ./ sum(wv2) + @test Base.covm(X, 0, wv1, 1, false) ≈ Sz1w ./ sum(wv1) + @test Base.covm(X, 0, wv2, 2, false) ≈ Sz2w ./ sum(wv2) - @test Base.covm(X, mean(X, wv1, 1), wv1) ≈ S1w ./ sum(wv1) - @test Base.covm(X, mean(X, wv2, 2), wv2, 2) ≈ S2w ./ sum(wv2) + @test Base.covm(X, mean(X, wv1, 1), wv1, 1, false) ≈ S1w ./ sum(wv1) + @test Base.covm(X, mean(X, wv2, 2), wv2, 2, false) ≈ S2w ./ sum(wv2) - @test Base.covm(X, zeros(1,8), wv1) ≈ Sz1w ./ sum(wv1) - @test Base.covm(X, zeros(3), wv2, 2) ≈ Sz2w ./ sum(wv2) + @test Base.covm(X, zeros(1,8), wv1, 1, false) ≈ Sz1w ./ sum(wv1) + @test Base.covm(X, zeros(3), wv2, 2, false) ≈ Sz2w ./ sum(wv2) end # mean_and_cov if VERSION < v"0.5.0-dev+679" - (m, C) = mean_and_cov(X; vardim=1) + (m, C) = mean_and_cov(X; vardim=1, corrected=false) @test m == mean(X, 1) - @test C == cov(X; vardim=1) + @test C == cov(X, vardim=1, corrected=false) - (m, C) = 
mean_and_cov(X; vardim=2) + (m, C) = mean_and_cov(X; vardim=2, corrected=false) @test m == mean(X, 2) - @test C == cov(X; vardim=2) + @test C == cov(X; vardim=2, corrected=false) - (m, C) = mean_and_cov(X, wv1; vardim=1) + (m, C) = mean_and_cov(X, wv1; vardim=1, corrected=false) @test m == mean(X, wv1, 1) - @test C == cov(X, wv1; vardim=1) + @test C == cov(X, wv1; vardim=1, corrected=false) - (m, C) = mean_and_cov(X, wv2; vardim=2) + (m, C) = mean_and_cov(X, wv2; vardim=2, corrected=false) @test m == mean(X, wv2, 2) - @test C == cov(X, wv2; vardim=2) + @test C == cov(X, wv2; vardim=2, corrected=false) else - (m, C) = mean_and_cov(X, 1) + (m, C) = mean_and_cov(X, 1; corrected=false) @test m == mean(X, 1) - @test C == cov(X, 1) + @test C == cov(X, 1, false) - (m, C) = mean_and_cov(X, 2) + (m, C) = mean_and_cov(X, 2; corrected=false) @test m == mean(X, 2) - @test C == cov(X, 2) + @test C == cov(X, 2, false) - (m, C) = mean_and_cov(X, wv1, 1) + (m, C) = mean_and_cov(X, wv1, 1; corrected=false) @test m == mean(X, wv1, 1) - @test C == cov(X, wv1, 1) + @test C == cov(X, wv1, 1; corrected=false) - (m, C) = mean_and_cov(X, wv2, 2) + (m, C) = mean_and_cov(X, wv2, 2; corrected=false) @test m == mean(X, wv2, 2) - @test C == cov(X, wv2, 2) + @test C == cov(X, wv2, 2; corrected=false) end diff --git a/test/moments.jl b/test/moments.jl index 17c956c71..c1f3e2234 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -4,119 +4,150 @@ using Base.Test ##### weighted var & std x = rand(10) -wv = weights(rand(10), false) +wv = weights(rand(10)) m = mean(x, wv) -@test var(x, wv) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) -@test var(x, wv; mean=0) ≈ sum(abs2.(x), wv) ./ sum(wv) -@test var(x, wv; mean=1.0) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) +@test var(x, wv; corrected=false) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) +@test var(x, wv; mean=0, corrected=false) ≈ sum(abs2.(x), wv) ./ sum(wv) +@test var(x, wv; mean=1.0, corrected=false) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) -@test std(x, wv) ≈ 
sqrt(var(x, wv)) -@test std(x, wv; mean=0) ≈ sqrt(var(x, wv; mean=0)) -@test std(x, wv; mean=1.0) ≈ sqrt(var(x, wv; mean=1.0)) +@test std(x, wv; corrected=false) ≈ sqrt(var(x, wv; corrected=false)) +@test std(x, wv; mean=0, corrected=false) ≈ sqrt(var(x, wv; mean=0, corrected=false)) +@test std(x, wv; mean=1.0, corrected=false) ≈ sqrt(var(x, wv; mean=1.0, corrected=false)) -(m, v) = mean_and_var(x) +(m, v) = mean_and_var(x; corrected=false) @test m == mean(x) -@test v == var(x) +@test v == var(x; corrected=false) -(m, s) = mean_and_std(x) +(m, s) = mean_and_std(x; corrected=false) @test m == mean(x) -@test s == std(x) +@test s == std(x; corrected=false) -(m, v) = mean_and_var(x, wv) +(m, v) = mean_and_var(x, wv; corrected=false) @test m == mean(x, wv) -@test v == var(x, wv) +@test v == var(x, wv; corrected=false) -(m, s) = mean_and_std(x, wv) +(m, s) = mean_and_std(x, wv; corrected=false) @test m == mean(x, wv) -@test s == std(x, wv) +@test s == std(x, wv; corrected=false) x = rand(5, 6) w1 = rand(5) w2 = rand(6) -wv1 = weights(w1, false) -wv2 = weights(w2, false) +wv1 = weights(w1) +wv2 = weights(w2) m1 = mean(x, wv1, 1) m2 = mean(x, wv2, 2) -@test var(x, wv1, 1; mean=0) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) -@test var(x, wv2, 2; mean=0) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) +@test var(x, wv1, 1; mean=0, corrected=false) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) +@test var(x, wv2, 2; mean=0, corrected=false) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) -@test var(x, wv1, 1; mean=m1) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) -@test var(x, wv2, 2; mean=m2) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) +@test var(x, wv1, 1; mean=m1, corrected=false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) +@test var(x, wv2, 2; mean=m2, corrected=false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) -@test var(x, wv1, 1) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) -@test var(x, wv2, 2) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) +@test var(x, wv1, 1; corrected=false) ≈ sum(abs2.(x .- m1) .* w1, 1) 
./ sum(wv1) +@test var(x, wv2, 2; corrected=false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) -@test std(x, wv1, 1) ≈ sqrt.(var(x, wv1, 1)) -@test std(x, wv2, 2) ≈ sqrt.(var(x, wv2, 2)) -@test std(x, wv1, 1; mean=0) ≈ sqrt.(var(x, wv1, 1; mean=0)) -@test std(x, wv2, 2; mean=0) ≈ sqrt.(var(x, wv2, 2; mean=0)) -@test std(x, wv1, 1; mean=m1) ≈ sqrt.(var(x, wv1, 1; mean=m1)) -@test std(x, wv2, 2; mean=m2) ≈ sqrt.(var(x, wv2, 2; mean=m2)) +@test std(x, wv1, 1; corrected=false) ≈ sqrt.(var(x, wv1, 1; corrected=false)) +@test std(x, wv2, 2; corrected=false) ≈ sqrt.(var(x, wv2, 2; corrected=false)) +@test std(x, wv1, 1; mean=0, corrected=false) ≈ sqrt.(var(x, wv1, 1; mean=0, corrected=false)) +@test std(x, wv2, 2; mean=0, corrected=false) ≈ sqrt.(var(x, wv2, 2; mean=0, corrected=false)) +@test std(x, wv1, 1; mean=m1, corrected=false) ≈ sqrt.(var(x, wv1, 1; mean=m1, corrected=false)) +@test std(x, wv2, 2; mean=m2, corrected=false) ≈ sqrt.(var(x, wv2, 2; mean=m2, corrected=false)) for d in 1:2 - (m, v) = mean_and_var(x, d) + (m, v) = mean_and_var(x, d; corrected=false) @test m == mean(x, d) - @test v == var(x, d) + @test v == var(x, d; corrected=false) - (m, s) = mean_and_std(x, d) + (m, s) = mean_and_std(x, d; corrected=false) @test m == mean(x, d) - @test s == std(x, d) + @test s == std(x, d; corrected=false) end -(m, v) = mean_and_var(x, wv1, 1) +(m, v) = mean_and_var(x, wv1, 1; corrected=false) @test m == mean(x, wv1, 1) -@test v == var(x, wv1, 1) +@test v == var(x, wv1, 1; corrected=false) -(m, v) = mean_and_var(x, wv2, 2) +(m, v) = mean_and_var(x, wv2, 2; corrected=false) @test m == mean(x, wv2, 2) -@test v == var(x, wv2, 2) +@test v == var(x, wv2, 2; corrected=false) -(m, s) = mean_and_std(x, wv1, 1) +(m, s) = mean_and_std(x, wv1, 1; corrected=false) @test m == mean(x, wv1, 1) -@test s == std(x, wv1, 1) +@test s == std(x, wv1, 1; corrected=false) -(m, s) = mean_and_std(x, wv2, 2) +(m, s) = mean_and_std(x, wv2, 2; corrected=false) @test m == mean(x, wv2, 2) -@test s 
== std(x, wv2, 2) +@test s == std(x, wv2, 2; corrected=false) ##### skewness & kurtosis -wv = weights(ones(5) * 2.0, false) +wv = weights(ones(5) * 2.0) -@test skewness(1:5) ≈ 0.0 -@test skewness([1, 2, 3, 4, 5]) ≈ 0.0 -@test skewness([1, 2, 2, 2, 5]) ≈ 1.1731251294063556 -@test skewness([1, 4, 4, 4, 5]) ≈ -1.1731251294063556 +@test skewness(1:5; corrected=false) ≈ 0.0 +@test skewness([1, 2, 3, 4, 5]; corrected=false) ≈ 0.0 +@test skewness([1, 2, 2, 2, 5]; corrected=false) ≈ 1.1731251294063556 +@test skewness([1, 4, 4, 4, 5]; corrected=false) ≈ -1.1731251294063556 -@test skewness([1, 2, 2, 2, 5], wv) ≈ 1.1731251294063556 +@test skewness([1, 2, 2, 2, 5], wv; corrected=false) ≈ 1.1731251294063556 -@test kurtosis(1:5) ≈ -1.3 -@test kurtosis([1, 2, 3, 4, 5]) ≈ -1.3 -@test kurtosis([1, 2, 3, 3, 2]) ≈ -1.1530612244897953 +@test kurtosis(1:5; corrected=false) ≈ -1.3 +@test kurtosis([1, 2, 3, 4, 5]; corrected=false) ≈ -1.3 +@test kurtosis([1, 2, 3, 3, 2]; corrected=false) ≈ -1.1530612244897953 -@test kurtosis([1, 2, 3, 4, 5], wv) ≈ -1.3 +@test kurtosis([1, 2, 3, 4, 5], wv; corrected=false) ≈ -1.3 ##### general moments x = collect(2.0:8.0) -@test moment(x, 2) ≈ sum((x .- 5).^2) / length(x) -@test moment(x, 3) ≈ sum((x .- 5).^3) / length(x) -@test moment(x, 4) ≈ sum((x .- 5).^4) / length(x) -@test moment(x, 5) ≈ sum((x .- 5).^5) / length(x) +@test moment(x, 2; corrected=false) ≈ sum((x .- 5).^2) / length(x) +@test moment(x, 3; corrected=false) ≈ sum((x .- 5).^3) / length(x) +@test moment(x, 4; corrected=false) ≈ sum((x .- 5).^4) / length(x) +@test moment(x, 5; corrected=false) ≈ sum((x .- 5).^5) / length(x) -@test moment(x, 2, 4.0) ≈ sum((x .- 4).^2) / length(x) -@test moment(x, 3, 4.0) ≈ sum((x .- 4).^3) / length(x) -@test moment(x, 4, 4.0) ≈ sum((x .- 4).^4) / length(x) -@test moment(x, 5, 4.0) ≈ sum((x .- 4).^5) / length(x) +@test moment(x, 2, 4.0; corrected=false) ≈ sum((x .- 4).^2) / length(x) +@test moment(x, 3, 4.0; corrected=false) ≈ sum((x .- 4).^3) / length(x) 
+@test moment(x, 4, 4.0; corrected=false) ≈ sum((x .- 4).^4) / length(x) +@test moment(x, 5, 4.0; corrected=false) ≈ sum((x .- 4).^5) / length(x) w = weights([1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) x2 = collect(2.0:6.0) -@test moment(x, 2, w) ≈ sum((x2 .- 4).^2) / 5 -@test moment(x, 3, w) ≈ sum((x2 .- 4).^3) / 5 -@test moment(x, 4, w) ≈ sum((x2 .- 4).^4) / 5 -@test moment(x, 5, w) ≈ sum((x2 .- 4).^5) / 5 +@test moment(x, 2, w; corrected=false) ≈ sum((x2 .- 4).^2) / 5 +@test moment(x, 3, w; corrected=false) ≈ sum((x2 .- 4).^3) / 5 +@test moment(x, 4, w; corrected=false) ≈ sum((x2 .- 4).^4) / 5 +@test moment(x, 5, w; corrected=false) ≈ sum((x2 .- 4).^5) / 5 + +# Test corrected cases (this will be cleaner in testsets) +x = rand(10) + +# AnalyticWeights +@test var(x, aweights(ones(10))) ≈ var(x) + +w = aweights(rand(10)) +n = length(w) # Could be count(!iszero, w) instead +w = aweights(w .* (n / sum(w))) +sw = sum(w) # This is now equal to n, but maybe we should support non-normalized weights? 
+xbar = sum(w .* x) ./ sw +expected = sum(w .* (x .- xbar).^2)/(sw - sum(w.^2)/sw) +@test var(x, w) ≈ expected + +# FrequencyWeights +@test var(x, fweights(ones(Int, 10))) ≈ var(x) +w = fweights(rand(UInt, 10)) +sw = sum(w) +xbar = sum(w .* x) / sw +expected = sum(w .* (x .- xbar).^2) ./ (sum(w) - 1) +@test var(x, w) ≈ expected + +# ProbabilityWeights +@test var(x, pweights(ones(10))) ≈ var(x) +w = pweights(rand(10)) +n = count(!iszero, w) +sw = sum(w) +xbar = sum(w .* x)/sw +expected = sum(w .* (x .- xbar).^2)/sw * n/(n - 1) +@test var(x, w) ≈ expected diff --git a/test/weights.jl b/test/weights.jl index e27e3e74f..941fcc2c3 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -3,35 +3,25 @@ using Base.Test using Compat import Compat: view -@test isa(weights([1, 2, 3]), Weights{Int}) -@test isa(weights([1., 2., 3.]), Weights{Float64}) -@test isa(weights([1 2 3; 4 5 6]), Weights{Int}) +@test isa(weights([1, 2, 3]), AbstractWeights{Int}) +@test isa(weights([1., 2., 3.]), AbstractWeights{Float64}) +@test isa(weights([1 2 3; 4 5 6]), AbstractWeights{Int}) -@test isa(frequency([1, 2, 3]), FrequencyWeights) -@test isa(frequency([1 2 3; 4 5 6]), FrequencyWeights) -@test isa(FrequencyWeights([1, 2, 3], 6; corrected=false), FrequencyWeights) +@test isa(AnalyticWeights([1, 2, 3], 6), AbstractWeights{Int}) @test isempty(weights(Float64[])) @test size(weights([1, 2, 3])) == (3,) w = [1., 2., 3.] 
-wv = weights(w, false) +wv = weights(w) @test eltype(wv) === Float64 @test length(wv) === 3 @test values(wv) === w @test sum(wv) === 6.0 @test !isempty(wv) -fw = [1, 2, 3] -fwv = frequency(fw) -@test eltype(fwv) === Int -@test length(fwv) === 3 -@test values(fwv) === fw -@test sum(fwv) === 6 -@test !isempty(wv) - b = trues(3) -bv = frequency(b) +bv = weights(b) @test eltype(bv) === Bool @test length(bv) === 3 @test values(bv) === b @@ -41,8 +31,8 @@ bv = frequency(b) ba = BitArray([true, false, true]) sa = sparsevec([1., 0., 2.]) -@test sum(ba, fwv) === 4 -@test sum(sa, fwv) === 7.0 +@test sum(ba, wv) === 4.0 +@test sum(sa, wv) === 7.0 ## wsum @@ -161,20 +151,21 @@ r = ones(8, 6) ## the sum and mean syntax -@test sum([1.0, 2.0, 3.0], weights([1.0, 0.5, 0.5], false)) ≈ 3.5 -@test sum(1:3, weights([1.0, 1.0, 0.5], false)) ≈ 4.5 -@test mean([1:3;], weights([1.0, 1.0, 0.5], false)) ≈ 1.8 -@test mean(1:3, weights([1.0, 1.0, 0.5], false)) ≈ 1.8 +@test sum([1.0, 2.0, 3.0], weights([1.0, 0.5, 0.5])) ≈ 3.5 +@test sum(1:3, weights([1.0, 1.0, 0.5])) ≈ 4.5 + +@test mean([1:3;], weights([1.0, 1.0, 0.5])) ≈ 1.8 +@test mean(1:3, weights([1.0, 1.0, 0.5])) ≈ 1.8 a = reshape(1.0:27.0, 3, 3, 3) for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) - @test sum(a, weights(wt, false), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) - @test sum(a, weights(wt, false), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) - @test sum(a, weights(wt, false), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) - @test mean(a, weights(wt, false), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) - @test mean(a, weights(wt, false), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) - @test mean(a, weights(wt, false), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) + @test sum(a, weights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) + @test sum(a, weights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) + @test sum(a, weights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) + 
@test mean(a, weights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) + @test mean(a, weights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) + @test mean(a, weights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) @test_throws ErrorException mean(a, weights(wt), 4) end @@ -235,16 +226,15 @@ median_answers = (7.0, 4.0, 8.5, num_tests = length(data) for i = 1:num_tests @test wmedian(data[i], wt[i]) == median_answers[i] - @test wmedian(data[i], weights(wt[i], false)) == median_answers[i] - @test median(data[i], weights(wt[i], false)) == median_answers[i] + @test wmedian(data[i], weights(wt[i])) == median_answers[i] + @test median(data[i], weights(wt[i])) == median_answers[i] for j = 1:100 # Make sure the weighted median does not change if the data # and weights are reordered. reorder = sortperm(rand(length(data[i]))) - @test median(data[i][reorder], weights(wt[i][reorder], false)) == median_answers[i] + @test median(data[i][reorder], weights(wt[i][reorder])) == median_answers[i] end end - data = [4, 3, 2, 1] wt = [0, 0, 0, 0] @test_throws MethodError wmedian(data[1]) @@ -266,6 +256,7 @@ wt = [-1, -1, -1, -1, -1] wt = [-1, -1, -1, 0, 0] @test_throws ErrorException median(data, weights(wt)) + # Weighted quantile tests data = ( [7, 1, 2, 4, 10], @@ -289,25 +280,25 @@ data = ( [-10, 1, 1, -10, -10], ) wt = ( - weights([1, 1/3, 1/3, 1/3, 1], false), - weights([1, 1, 1, 1, 1], false), - weights([1, 1/3, 1/3, 1/3, 1, 1], false), - weights([1/3, 1/3, 1/3, 1, 1, 1], false), - weights([30, 191, 9, 0], false), - weights([10, 1, 1, 1, 9], false), - weights([10, 1, 1, 1, 900], false), - weights([1, 3, 5, 4, 2], false), - weights([2, 2, 5, 1, 2, 2, 1, 6], false), - weights([0.1, 0.1, 0.8], false), - weights([5, 5, 4, 1], false), - weights([30, 56, 144, 24, 55, 43, 67], false), - weights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], false), - weights([12], false), - weights([7, 1, 1, 1, 6], false), - weights([1, 0, 0, 0, 2], false), - weights([1, 2, 3, 4, 5], 
false), - weights([0.1, 0.2, 0.3, 0.2, 0.1], false), - weights([1, 1, 1, 1, 1], false), + weights([1, 1/3, 1/3, 1/3, 1]), + weights([1, 1, 1, 1, 1]), + weights([1, 1/3, 1/3, 1/3, 1, 1]), + weights([1/3, 1/3, 1/3, 1, 1, 1]), + weights([30, 191, 9, 0]), + weights([10, 1, 1, 1, 9]), + weights([10, 1, 1, 1, 900]), + weights([1, 3, 5, 4, 2]), + weights([2, 2, 5, 1, 2, 2, 1, 6]), + weights([0.1, 0.1, 0.8]), + weights([5, 5, 4, 1]), + weights([30, 56, 144, 24, 55, 43, 67]), + weights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]), + weights([12]), + weights([7, 1, 1, 1, 6]), + weights([1, 0, 0, 0, 2]), + weights([1, 2, 3, 4, 5]), + weights([0.1, 0.2, 0.3, 0.2, 0.1]), + weights([1, 1, 1, 1, 1]), ) quantile_answers = ( [1.0,3.6000000000000005,6.181818181818182,8.2,10.0], @@ -343,15 +334,15 @@ for i = 1:length(data) for j = 1:10 # order of w does not matter reorder = sortperm(rand(length(data[i]))) - @test quantile(data[i][reorder], weights(wt[i][reorder], false), p) ≈ quantile_answers[i] + @test quantile(data[i][reorder], weights(wt[i][reorder]), p) ≈ quantile_answers[i] end end # w = 1 corresponds to base quantile for i = 1:length(data) - @test quantile(data[i], weights(ones(Int64, length(data[i])), false), p) ≈ quantile(data[i], p) + @test quantile(data[i], weights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) for j = 1:10 prandom = rand(4) - @test quantile(data[i], weights(ones(Int64, length(data[i])), false), prandom) ≈ quantile(data[i], prandom) + @test quantile(data[i], weights(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) end end @@ -359,8 +350,8 @@ end v = [7, 1, 2, 4, 10] w = [1, 1/3, 1/3, 1/3, 1] answer = 6.181818181818182 -@test quantile(data[1], weights(w, false), 0.5) ≈ answer -@test wquantile(data[1], weights(w, false), [0.5]) ≈ [answer] -@test wquantile(data[1], weights(w, false), 0.5) ≈ answer +@test quantile(data[1], weights(w), 0.5) ≈ answer +@test wquantile(data[1], weights(w), [0.5]) ≈ [answer] +@test wquantile(data[1], weights(w), 
0.5) ≈ answer @test wquantile(data[1], w, [0.5]) ≈ [answer] @test wquantile(data[1], w, 0.5) ≈ answer diff --git a/test/wsampling.jl b/test/wsampling.jl index b29f902e5..31012438a 100644 --- a/test/wsampling.jl +++ b/test/wsampling.jl @@ -35,7 +35,7 @@ end import StatsBase: direct_sample!, alias_sample! n = 10^5 -wv = weights([0.2, 0.8, 0.4, 0.6], false) +wv = weights([0.2, 0.8, 0.4, 0.6]) a = direct_sample!(4:7, wv, zeros(Int, n, 3)) check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=false) @@ -79,7 +79,7 @@ import StatsBase: naive_wsample_norep!, efraimidis_a_wsample_norep!, efraimidis_ares_wsample_norep!, efraimidis_aexpj_wsample_norep! n = 10^5 -wv = weights([0.2, 0.8, 0.4, 0.6], false) +wv = weights([0.2, 0.8, 0.4, 0.6]) a = zeros(Int, 3, n) for j = 1:n From 7cd959ddcc0444c989d60df923b3353a0b8f4e1d Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 27 Apr 2017 12:51:51 -0500 Subject: [PATCH 03/50] Fixed style issues. --- src/StatsBase.jl | 32 ++++++++++++++++---------------- src/moments.jl | 41 +++++++++++++---------------------------- src/weights.jl | 31 ++++++++++++++----------------- 3 files changed, 43 insertions(+), 61 deletions(-) diff --git a/src/StatsBase.jl b/src/StatsBase.jl index cebd7cf37..598b38329 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -17,22 +17,22 @@ module StatsBase export ## weights - AbstractWeights, # the abstract type to represent any weight vector - AnalyticWeights, # the default type for representing a analytic/precision/reliability weight vectors - FrequencyWeights, # the type for representing a frequency weight vectors - ProbabilityWeights,# the type for representing a probability/sampling weight vectors - ExponentialWeights,# the type for representing exponential weights - weights, # alias for aweights - aweights, # construct an AnalyticWeights vector - fweights, # construct a FrequencyWeights vector - pweights, # construct a ProbabilityWeights vector - eweights, # construct an ExponentialWeights vector - wsum, # 
weighted sum with vector as second argument - wsum!, # weighted sum across dimensions with provided storage - wmean, # weighted mean - wmean!, # weighted mean across dimensions with provided storage - wmedian, # weighted median - wquantile, # weighted quantile + AbstractWeights, # the abstract type to represent any weight vector + AnalyticWeights, # the default type for representing a analytic/precision/reliability weight vectors + FrequencyWeights, # the type for representing a frequency weight vectors + ProbabilityWeights, # the type for representing a probability/sampling weight vectors + ExponentialWeights, # the type for representing exponential weights + weights, # alias for aweights + aweights, # construct an AnalyticWeights vector + fweights, # construct a FrequencyWeights vector + pweights, # construct a ProbabilityWeights vector + eweights, # construct an ExponentialWeights vector + wsum, # weighted sum with vector as second argument + wsum!, # weighted sum across dimensions with provided storage + wmean, # weighted mean + wmean!, # weighted mean across dimensions with provided storage + wmedian, # weighted median + wquantile, # weighted quantile ## moments skewness, # (standardized) skewness diff --git a/src/moments.jl b/src/moments.jl index 2ea437d9c..25901a25f 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -15,9 +15,8 @@ whereas it's `length(x)-1` in `Base.varm`. The impact is that this is not a weighted estimate of the population variance based on the sample; it's the weighted variance of the sample. 
""" -function Base.varm(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) +Base.varm(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) = _moment2(v, wv, m, corrected=corrected) -end """ var(x, wv::AbstractWeights, [dim]; mean=nothing) @@ -40,12 +39,8 @@ end ## var along dim -function Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=true) - scale!( - _wsum_centralize!(R, @functorize(abs2), A, values(wv), M, dim, true), - bias(wv, corrected) - ) -end +Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=true) = + scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), bias(wv, corrected)) function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) if mean == 0 @@ -89,15 +84,11 @@ end function Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) @static if VERSION < v"0.6.0-dev.1121" - return var!( - similar(A, Float64, Base.reduced_dims(size(A), dim)), - A, wv, dim; mean=mean, corrected=corrected - ) + return var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, dim; + mean=mean, corrected=corrected) else - return var!( - similar(A, Float64, Base.reduced_indices(indices(A), dim)), - A, wv, dim; mean=mean, corrected=corrected - ) + return var!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, dim; + mean=mean, corrected=corrected) end end @@ -109,9 +100,8 @@ Return the standard deviation of a real-valued array `v` with a known mean `m`, optionally over a dimension `dim`. The weighting vector `wv` specifies frequency weights (also called case weights) for the estimate. 
""" -function Base.stdm(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) +Base.stdm(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) = sqrt(varm(v, wv, m; corrected=corrected)) -end """ std(v, wv::AbstractWeights, [dim]; mean=nothing) @@ -120,21 +110,17 @@ Return the standard deviation of a real-valued array `v`, optionally over a dimension `dim`. The weighting vector `wv` specifies frequency weights (also called case weights) for the estimate. """ -function Base.std(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=true) +Base.std(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=true) = sqrt.(var(v, wv; mean=mean, corrected=corrected)) -end -function Base.stdm(v::RealArray, m::RealArray, dim::Int; corrected=true) +Base.stdm(v::RealArray, m::RealArray, dim::Int; corrected=true) = Base.sqrt!(varm(v, m, dim; corrected=corrected)) -end -function Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int; corrected=true) +Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int; corrected=true) = sqrt.(varm(v, wv, m, dim; corrected=corrected)) -end -function Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) +Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) = sqrt.(var(v, wv, dim; mean=mean, corrected=corrected)) -end ##### Fused statistics """ @@ -221,8 +207,7 @@ function _moment2(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) @inbounds s += (z * z) * w[i] end - result = s * bias(wv, corrected) - return result + s * bias(wv, corrected) end function _moment3(v::RealArray, m::Real; corrected=true) diff --git a/src/weights.jl b/src/weights.jl index ac4f60add..bda052f00 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -8,9 +8,9 @@ else end """ - `@weights name` + @weights name -Generates a new generic weight type with specified `name`, which subtypes `AbstractWeights` +generates a new generic weight type with specified `name`, 
which subtypes `AbstractWeights` and stores the `values` (`V<:RealVector`) and `sum` (`S<:Real`). """ macro weights(name) @@ -39,7 +39,7 @@ Base.getindex(wv::AbstractWeights, i) = getindex(wv.values, i) Base.size(wv::AbstractWeights) = size(wv.values) """ - bias(n::Integer, [corrected]) + bias(n::Integer, corrected=true) Computes the corrected (default) or uncorrected bias for any `n` observations. @@ -50,7 +50,7 @@ Computes the corrected (default) or uncorrected bias for any `n` observations. bias(n::Integer, corrected=true) = inv(n - Int(corrected)) """ - bias(w::AbstractWeights, [corrected]) + bias(w::AbstractWeights, corrected=true) Computes the corrected (default) or uncorrected bias for any weight vector. The default equation assumes analytic/precision/reliability weights and determines the @@ -78,9 +78,8 @@ end Construct a `AnalyticWeights` with weight values `vs` and sum of weights `wsum`. If omitted, `wsum` is computed. """ -function AnalyticWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) - return AnalyticWeights{S, eltype(vs), V}(vs, s) -end +AnalyticWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = + AnalyticWeights{S, eltype(vs), V}(vs, s) """ aweights(vs) @@ -99,9 +98,8 @@ weights(vs) = aweights(vs) @weights FrequencyWeights -function FrequencyWeights{S<:Integer, V<:IntegerVector}(vs::V, s::S=sum(vs)) - return FrequencyWeights{S, eltype(vs), V}(vs, s) -end +FrequencyWeights{S<:Integer, V<:IntegerVector}(vs::V, s::S=sum(vs)) = + FrequencyWeights{S, eltype(vs), V}(vs, s) """ fweights(vs) @@ -112,7 +110,7 @@ fweights(vs::IntegerVector) = FrequencyWeights(vs) fweights(vs::IntegerArray) = FrequencyWeights(vec(vs)) """ - bias(w::FrequencyWeights, [corrected]) + bias(w::FrequencyWeights, corrected=true) ```math \fraction{1}{∑w - 1} @@ -122,9 +120,8 @@ bias(w::FrequencyWeights, corrected=true) = inv(sum(w) - Int(corrected)) @weights ProbabilityWeights -function ProbabilityWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) - return 
ProbabilityWeights{S, eltype(vs), V}(vs, s) -end +ProbabilityWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = + ProbabilityWeights{S, eltype(vs), V}(vs, s) """ pweights(vs) @@ -135,7 +132,7 @@ pweights(vs::RealVector) = ProbabilityWeights(vs) pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) """ - bias(w::ProbabilityWeights, [corrected]) + bias(w::ProbabilityWeights, corrected=true) ```math \fraction{n}{∑w × (n - 1)} @@ -156,7 +153,7 @@ end function ExponentialWeights{V<:RealVector}(vs::V) s = sum(vs) - return ExponentialWeights{typeof(s), eltype(vs), V}(vs, s) + ExponentialWeights{typeof(s), eltype(vs), V}(vs, s) end """ @@ -175,7 +172,7 @@ function eweights(n::Integer, λ::Real=0.99) n > 0 || throw(ArgumentError("cannot construct weights of length < 1")) 0 <= λ <= 1 || throw(ArgumentError("smoothing factor must be between 0 and 1")) w0 = map(i -> λ * (1 - λ)^(1 - i), 1:n) - return weights(w0) + weights(w0) end ##### Weighted sum ##### From d0653dbcba72d2444768560338338f784140ae22 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 27 Apr 2017 14:00:38 -0500 Subject: [PATCH 04/50] Re-added `WeightVec` only as a deprecated type and deprecated `weights`. --- src/deprecates.jl | 25 ++++++++++++++++++++++++- src/weights.jl | 7 ------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/deprecates.jl b/src/deprecates.jl index 6f3905e65..67bc6be85 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -44,4 +44,27 @@ findat(a::AbstractArray, b::AbstractArray) = findat!(Array{Int}(size(b)), a, b) @deprecate df(obj::StatisticalModel) dof(obj) @deprecate df_residual(obj::StatisticalModel) dof_residual(obj) -@deprecate WeightVec{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) AnalyticWeights(vs, s) +@weights WeightVec + +""" + WeightVec(vs, wsum=sum(vs)) + +Construct a `WeightVec` with weight values `vs` and sum of weights `wsum`. 
+""" +function WeightVec{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) + new_types = "AnalyticWeights, FrequencyWeights or ProbabilityWeights" + Base.depwarn("WeightVec is deprecated, use $new_types instead", :WeightVec) + WeightVec{S, eltype(vs), V}(vs, s) +end + +""" + weights(vs) + +Construct a `WeightVec` from a given array. +""" +function weights(vs::RealArray) + Base.depwarn("weights is deprecated, use aweights, fweights or pweights instead", :weights) + v = vec(vs) + s = sum(v) + WeightVec{typeof(s), eltype(v), typeof(v)}(v, s) +end diff --git a/src/weights.jl b/src/weights.jl index bda052f00..a945ebc0b 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -89,13 +89,6 @@ Construct a `AnalyticWeights` type from a given array. aweights(vs::RealVector) = AnalyticWeights(vs) aweights(vs::RealArray) = AnalyticWeights(vec(vs)) -""" - weights(vs) - -Alias for aweights(vs) -""" -weights(vs) = aweights(vs) - @weights FrequencyWeights FrequencyWeights{S<:Integer, V<:IntegerVector}(vs::V, s::S=sum(vs)) = From e8860665a422ea4c5b78eda50877f9d91fea20c5 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 27 Apr 2017 15:37:30 -0500 Subject: [PATCH 05/50] Cleaned up docstrings for weight bias methods (mostly latex equations). --- src/weights.jl | 48 ++++++++++++++++++------------------------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index a945ebc0b..12bab2f6f 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -43,9 +43,7 @@ Base.size(wv::AbstractWeights) = size(wv.values) Computes the corrected (default) or uncorrected bias for any `n` observations. -```math -\fraction{1}{n - 1} -``` +``\\frac{1}{n - 1}`` """ bias(n::Integer, corrected=true) = inv(n - Int(corrected)) @@ -56,10 +54,7 @@ Computes the corrected (default) or uncorrected bias for any weight vector. 
The default equation assumes analytic/precision/reliability weights and determines the bias as: -```math -\fraction{1}{∑w × (1 - ∑(w'²))} -``` -where w' represents the normalized weights +``\\frac{1}{\sum w - \sum w / \sum {w^2}}`` """ function bias(w::AbstractWeights, corrected=true) s = sum(w) @@ -73,10 +68,9 @@ end @weights AnalyticWeights """ - AnalyticWeights(vs, [wsum]) + AnalyticWeights(vs, wsum=sum(vs)) -Construct a `AnalyticWeights` with weight values `vs` and sum of weights `wsum`. -If omitted, `wsum` is computed. +Construct an `AnalyticWeights` vector with weight values `vs` and sum of weights `wsum`. """ AnalyticWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = AnalyticWeights{S, eltype(vs), V}(vs, s) @@ -84,20 +78,25 @@ AnalyticWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = """ aweights(vs) -Construct a `AnalyticWeights` type from a given array. +Construct an `AnalyticWeights` vector from a given array. """ aweights(vs::RealVector) = AnalyticWeights(vs) aweights(vs::RealArray) = AnalyticWeights(vec(vs)) @weights FrequencyWeights +""" + FrequencyWeights(vs, wsum=sum(vs)) + +Construct a `FrequencyWeights` vector with weight values `vs` and sum of weights `wsum`. +""" FrequencyWeights{S<:Integer, V<:IntegerVector}(vs::V, s::S=sum(vs)) = FrequencyWeights{S, eltype(vs), V}(vs, s) """ fweights(vs) -Construct a `FrequencyWeights` type from a given array. +Construct a `FrequencyWeights` vector from a given array. """ fweights(vs::IntegerVector) = FrequencyWeights(vs) fweights(vs::IntegerArray) = FrequencyWeights(vec(vs)) @@ -105,9 +104,7 @@ fweights(vs::IntegerArray) = FrequencyWeights(vec(vs)) """ bias(w::FrequencyWeights, corrected=true) -```math -\fraction{1}{∑w - 1} -``` +``\\frac{1}{\sum{w} - 1}`` """ bias(w::FrequencyWeights, corrected=true) = inv(sum(w) - Int(corrected)) @@ -119,7 +116,7 @@ ProbabilityWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = """ pweights(vs) -Construct a `ProbabilityWeights` type from a given array. 
+Construct a `ProbabilityWeights` vector from a given array. """ pweights(vs::RealVector) = ProbabilityWeights(vs) pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) @@ -127,33 +124,24 @@ pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) """ bias(w::ProbabilityWeights, corrected=true) -```math -\fraction{n}{∑w × (n - 1)} -``` +``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ function bias(w::ProbabilityWeights, corrected=true) s = sum(w) if corrected - n = length(values(w)) + n = length(w) return n / (s * (n - 1)) else return inv(s) end end -@weights ExponentialWeights - -function ExponentialWeights{V<:RealVector}(vs::V) - s = sum(vs) - ExponentialWeights{typeof(s), eltype(vs), V}(vs, s) -end - """ eweights(n, [λ]) -Constructs a `ExponentialWeights` type with a desired length `n` and smoothing factor `λ`, -where each element is set to `λ * (1 - λ)^(1 - i)`. +Constructs an `AnalyticWeights` vector with a desired length `n` and smoothing factor `λ`, +where each element is set to ``λ * (1 - λ)^(1 - i)``. # Arguments * `n::Integer`: the desired length of the `Weights` @@ -165,7 +153,7 @@ function eweights(n::Integer, λ::Real=0.99) n > 0 || throw(ArgumentError("cannot construct weights of length < 1")) 0 <= λ <= 1 || throw(ArgumentError("smoothing factor must be between 0 and 1")) w0 = map(i -> λ * (1 - λ)^(1 - i), 1:n) - weights(w0) + aweights(w0) end ##### Weighted sum ##### From 6710bcb33012b46ecceb0c3dd6f29b21fb9e0429 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 27 Apr 2017 15:58:13 -0500 Subject: [PATCH 06/50] Updated the "default" bias correction code to iteratively compute the sum squared norm vs using vectorized operations. 
--- src/weights.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/weights.jl b/src/weights.jl index 12bab2f6f..d3b3fbd13 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -59,7 +59,13 @@ bias as: function bias(w::AbstractWeights, corrected=true) s = sum(w) if corrected - return inv(s * (1 - sum(normalize(values(w), 1) .^ 2))) + # sum square norm + sum_sn = 0.0 + for x in w + sum_sn += (x / s) ^ 2 + end + + return inv(s * (1 - sum_sn)) else return inv(s) end From e662b9d035ebbc2af2731b5982c884f8c8492701 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 27 Apr 2017 16:01:44 -0500 Subject: [PATCH 07/50] Updated `FrequencyWeights` to accept `Real`s --- src/weights.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index d3b3fbd13..bc7de60b3 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -96,7 +96,7 @@ aweights(vs::RealArray) = AnalyticWeights(vec(vs)) Construct a `FrequencyWeights` vector with weight values `vs` and sum of weights `wsum`. """ -FrequencyWeights{S<:Integer, V<:IntegerVector}(vs::V, s::S=sum(vs)) = +FrequencyWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = FrequencyWeights{S, eltype(vs), V}(vs, s) """ @@ -104,8 +104,8 @@ FrequencyWeights{S<:Integer, V<:IntegerVector}(vs::V, s::S=sum(vs)) = Construct a `FrequencyWeights` vector from a given array. """ -fweights(vs::IntegerVector) = FrequencyWeights(vs) -fweights(vs::IntegerArray) = FrequencyWeights(vec(vs)) +fweights(vs::RealVector) = FrequencyWeights(vs) +fweights(vs::RealArray) = FrequencyWeights(vec(vs)) """ bias(w::FrequencyWeights, corrected=true) From e7f2ce9cb0b879143c6ef27d56583622ee2144a3 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 27 Apr 2017 16:12:31 -0500 Subject: [PATCH 08/50] Removed some unnecessary v0.5 pre-release code and version checks. 
--- src/cov.jl | 73 +++++++++------------------- test/cov.jl | 136 +++++++++++++++------------------------------------- 2 files changed, 61 insertions(+), 148 deletions(-) diff --git a/src/cov.jl b/src/cov.jl index fc68c12c5..dc82b6278 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -66,62 +66,33 @@ the variables are columns in the matrix (`1`) or rows (`2`). """ function mean_and_cov end +scattermatm(x::DenseMatrix, mean, vardim::Int=1) = + scattermat_zm(x .- mean, vardim) -@static if VERSION < v"0.5.0-dev+679" - function scattermat(x::DenseMatrix; mean=nothing, vardim::Int=1) - mean == 0 ? scattermat_zm(x, vardim) : - mean == nothing ? scattermat_zm(x .- Base.mean(x, vardim), vardim) : - scattermat_zm(x .- mean, vardim) - end - - function scattermat(x::DenseMatrix, wv::AbstractWeights; mean=nothing, vardim::Int=1) - mean == 0 ? scattermat_zm(x, wv, vardim) : - mean == nothing ? scattermat_zm(x .- Base.mean(x, wv, vardim), wv, vardim) : - scattermat_zm(x .- mean, wv, vardim) - end - - ## weighted cov - function Base.cov(x::DenseMatrix, wv::AbstractWeights; mean=nothing, vardim::Int=1, corrected=true) - scale!(scattermat(x, wv; mean=mean, vardim=vardim), bias(wv, corrected)) - end - - function mean_and_cov(x::DenseMatrix; vardim::Int=1, corrected=true) - m = mean(x, vardim) - return m, Base.covm(x, m; vardim=vardim, corrected=corrected) - end - function mean_and_cov(x::DenseMatrix, wv::AbstractWeights; vardim::Int=1, corrected=true) - m = mean(x, wv, vardim) - return m, Base.cov(x, wv; mean=m, vardim=vardim, corrected=corrected) - end -else - scattermatm(x::DenseMatrix, mean, vardim::Int=1) = - scattermat_zm(x .- mean, vardim) +scattermatm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1) = + scattermat_zm(x .- mean, wv, vardim) - scattermatm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1) = - scattermat_zm(x .- mean, wv, vardim) +scattermat(x::DenseMatrix, vardim::Int=1) = + scattermatm(x, Base.mean(x, vardim), vardim) - scattermat(x::DenseMatrix, 
vardim::Int=1) = - scattermatm(x, Base.mean(x, vardim), vardim) +scattermat(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = + scattermatm(x, Base.mean(x, wv, vardim), wv, vardim) - scattermat(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = - scattermatm(x, Base.mean(x, wv, vardim), wv, vardim) - - ## weighted cov - function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1, corrected::Bool=true) - scale!(scattermatm(x, mean, wv, vardim), bias(wv, corrected)) - end +## weighted cov +function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1, corrected::Bool=true) + scale!(scattermatm(x, mean, wv, vardim), bias(wv, corrected)) +end - function Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=true) - Base.covm(x, Base.mean(x, wv, vardim), wv, vardim, corrected) - end +function Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=true) + Base.covm(x, Base.mean(x, wv, vardim), wv, vardim, corrected) +end - function mean_and_cov(x::DenseMatrix, vardim::Int=1; corrected=true) - m = mean(x, vardim) - return m, Base.covm(x, m, vardim, corrected) - end +function mean_and_cov(x::DenseMatrix, vardim::Int=1; corrected=true) + m = mean(x, vardim) + return m, Base.covm(x, m, vardim, corrected) +end - function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=true) - m = mean(x, wv, vardim) - return m, Base.cov(x, wv, vardim; corrected=corrected) - end +function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=true) + m = mean(x, wv, vardim) + return m, Base.cov(x, wv, vardim; corrected=corrected) end diff --git a/test/cov.jl b/test/cov.jl index 2cbd1d049..83755f35d 100644 --- a/test/cov.jl +++ b/test/cov.jl @@ -31,119 +31,61 @@ Sz2w = X * diagm(w2) * X' ## scattermat -if VERSION < v"0.5.0-dev+679" - @test scattermat(X) ≈ S1 - @test scattermat(X; vardim=2) ≈ S2 - @test scattermat(X; mean=0) ≈ Sz1 - @test scattermat(X; mean=0, vardim=2) ≈ Sz2 
+@test scattermat(X) ≈ S1 +@test scattermat(X, 2) ≈ S2 - @test scattermat(X; mean=mean(X,1)) ≈ S1 - @test scattermat(X; mean=mean(X,2), vardim=2) ≈ S2 +@test StatsBase.scattermatm(X, 0) ≈ Sz1 +@test StatsBase.scattermatm(X, 0, 2) ≈ Sz2 - @test scattermat(X; mean=zeros(1,8)) ≈ Sz1 - @test scattermat(X; mean=zeros(3), vardim=2) ≈ Sz2 +@test StatsBase.scattermatm(X, mean(X,1)) ≈ S1 +@test StatsBase.scattermatm(X, mean(X,2), 2) ≈ S2 - ## weighted scatter mat +@test StatsBase.scattermatm(X, zeros(1,8)) ≈ Sz1 +@test StatsBase.scattermatm(X, zeros(3), 2) ≈ Sz2 - @test scattermat(X, wv1) ≈ S1w - @test scattermat(X, wv2; vardim=2) ≈ S2w +## weighted scatter mat - @test scattermat(X, wv1; mean=0) ≈ Sz1w - @test scattermat(X, wv2; mean=0, vardim=2) ≈ Sz2w +@test scattermat(X, wv1) ≈ S1w +@test scattermat(X, wv2, 2) ≈ S2w - @test scattermat(X, wv1; mean=mean(X, wv1, 1)) ≈ S1w - @test scattermat(X, wv2; mean=mean(X, wv2, 2), vardim=2) ≈ S2w +@test StatsBase.scattermatm(X, 0, wv1) ≈ Sz1w +@test StatsBase.scattermatm(X, 0, wv2, 2) ≈ Sz2w - @test scattermat(X, wv1; mean=zeros(1,8)) ≈ Sz1w - @test scattermat(X, wv2; mean=zeros(3), vardim=2) ≈ Sz2w -else - @test scattermat(X) ≈ S1 - @test scattermat(X, 2) ≈ S2 +@test StatsBase.scattermatm(X, mean(X, wv1, 1), wv1) ≈ S1w +@test StatsBase.scattermatm(X, mean(X, wv2, 2), wv2, 2) ≈ S2w - @test StatsBase.scattermatm(X, 0) ≈ Sz1 - @test StatsBase.scattermatm(X, 0, 2) ≈ Sz2 - - @test StatsBase.scattermatm(X, mean(X,1)) ≈ S1 - @test StatsBase.scattermatm(X, mean(X,2), 2) ≈ S2 - - @test StatsBase.scattermatm(X, zeros(1,8)) ≈ Sz1 - @test StatsBase.scattermatm(X, zeros(3), 2) ≈ Sz2 - - ## weighted scatter mat - - @test scattermat(X, wv1) ≈ S1w - @test scattermat(X, wv2, 2) ≈ S2w - - @test StatsBase.scattermatm(X, 0, wv1) ≈ Sz1w - @test StatsBase.scattermatm(X, 0, wv2, 2) ≈ Sz2w - - @test StatsBase.scattermatm(X, mean(X, wv1, 1), wv1) ≈ S1w - @test StatsBase.scattermatm(X, mean(X, wv2, 2), wv2, 2) ≈ S2w - - @test StatsBase.scattermatm(X, 
zeros(1,8), wv1) ≈ Sz1w - @test StatsBase.scattermatm(X, zeros(3), wv2, 2) ≈ Sz2w -end +@test StatsBase.scattermatm(X, zeros(1,8), wv1) ≈ Sz1w +@test StatsBase.scattermatm(X, zeros(3), wv2, 2) ≈ Sz2w # weighted covariance -if VERSION < v"0.5.0-dev+679" - @test cov(X, wv1; corrected=false) ≈ S1w ./ sum(wv1) - @test cov(X, wv2; vardim=2, corrected=false) ≈ S2w ./ sum(wv2) +@test cov(X, wv1; corrected=false) ≈ S1w ./ sum(wv1) +@test cov(X, wv2, 2; corrected=false) ≈ S2w ./ sum(wv2) - @test cov(X, wv1; mean=0, corrected=false) ≈ Sz1w ./ sum(wv1) - @test cov(X, wv2; mean=0, vardim=2, corrected=false) ≈ Sz2w ./ sum(wv2) +@test Base.covm(X, 0, wv1, 1, false) ≈ Sz1w ./ sum(wv1) +@test Base.covm(X, 0, wv2, 2, false) ≈ Sz2w ./ sum(wv2) - @test cov(X, wv1; mean=mean(X, wv1, 1), corrected=false) ≈ S1w ./ sum(wv1) - @test cov(X, wv2; mean=mean(X, wv2, 2), vardim=2, corrected=false) ≈ S2w ./ sum(wv2) +@test Base.covm(X, mean(X, wv1, 1), wv1, 1, false) ≈ S1w ./ sum(wv1) +@test Base.covm(X, mean(X, wv2, 2), wv2, 2, false) ≈ S2w ./ sum(wv2) - @test cov(X, wv1; mean=zeros(1,8), corrected=false) ≈ Sz1w ./ sum(wv1) - @test cov(X, wv2; mean=zeros(3), vardim=2, corrected=false) ≈ Sz2w ./ sum(wv2) -else - @test cov(X, wv1; corrected=false) ≈ S1w ./ sum(wv1) - @test cov(X, wv2, 2; corrected=false) ≈ S2w ./ sum(wv2) +@test Base.covm(X, zeros(1,8), wv1, 1, false) ≈ Sz1w ./ sum(wv1) +@test Base.covm(X, zeros(3), wv2, 2, false) ≈ Sz2w ./ sum(wv2) - @test Base.covm(X, 0, wv1, 1, false) ≈ Sz1w ./ sum(wv1) - @test Base.covm(X, 0, wv2, 2, false) ≈ Sz2w ./ sum(wv2) +# mean_and_cov - @test Base.covm(X, mean(X, wv1, 1), wv1, 1, false) ≈ S1w ./ sum(wv1) - @test Base.covm(X, mean(X, wv2, 2), wv2, 2, false) ≈ S2w ./ sum(wv2) +(m, C) = mean_and_cov(X, 1; corrected=false) +@test m == mean(X, 1) +@test C == cov(X, 1, false) - @test Base.covm(X, zeros(1,8), wv1, 1, false) ≈ Sz1w ./ sum(wv1) - @test Base.covm(X, zeros(3), wv2, 2, false) ≈ Sz2w ./ sum(wv2) -end +(m, C) = mean_and_cov(X, 2; corrected=false) 
+@test m == mean(X, 2) +@test C == cov(X, 2, false) -# mean_and_cov -if VERSION < v"0.5.0-dev+679" - (m, C) = mean_and_cov(X; vardim=1, corrected=false) - @test m == mean(X, 1) - @test C == cov(X, vardim=1, corrected=false) - - (m, C) = mean_and_cov(X; vardim=2, corrected=false) - @test m == mean(X, 2) - @test C == cov(X; vardim=2, corrected=false) - - (m, C) = mean_and_cov(X, wv1; vardim=1, corrected=false) - @test m == mean(X, wv1, 1) - @test C == cov(X, wv1; vardim=1, corrected=false) - - (m, C) = mean_and_cov(X, wv2; vardim=2, corrected=false) - @test m == mean(X, wv2, 2) - @test C == cov(X, wv2; vardim=2, corrected=false) -else - (m, C) = mean_and_cov(X, 1; corrected=false) - @test m == mean(X, 1) - @test C == cov(X, 1, false) - - (m, C) = mean_and_cov(X, 2; corrected=false) - @test m == mean(X, 2) - @test C == cov(X, 2, false) - - (m, C) = mean_and_cov(X, wv1, 1; corrected=false) - @test m == mean(X, wv1, 1) - @test C == cov(X, wv1, 1; corrected=false) - - (m, C) = mean_and_cov(X, wv2, 2; corrected=false) - @test m == mean(X, wv2, 2) - @test C == cov(X, wv2, 2; corrected=false) -end +(m, C) = mean_and_cov(X, wv1, 1; corrected=false) +@test m == mean(X, wv1, 1) +@test C == cov(X, wv1, 1; corrected=false) + +(m, C) = mean_and_cov(X, wv2, 2; corrected=false) +@test m == mean(X, wv2, 2) +@test C == cov(X, wv2, 2; corrected=false) From 15d81003a38f8352a9869fb6b76e332540234d29 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 27 Apr 2017 16:35:19 -0500 Subject: [PATCH 09/50] Converted a ternary to an if statement to help with readability. --- src/moments.jl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/moments.jl b/src/moments.jl index 25901a25f..ef617e2b7 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -32,9 +32,13 @@ weighted estimate of the population variance based on the sample; it's the weigh variance of the sample. """ function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=true) - mean == 0 ? 
varm(v, wv, 0; corrected=corrected) : - mean == nothing ? varm(v, wv, Base.mean(v, wv); corrected=corrected) : - varm(v, wv, mean; corrected=corrected) + if mean == 0 + return varm(v, wv, 0; corrected=corrected) + elseif mean == nothing + return varm(v, wv, Base.mean(v, wv); corrected=corrected) : + else + return varm(v, wv, mean; corrected=corrected) + end end ## var along dim From 520c06176c01cd8dc353e3523ac8c67683a4ad36 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 27 Apr 2017 17:27:05 -0500 Subject: [PATCH 10/50] Replaced all internal (and test) calls to `weights(...)` with `fweights(...)`. --- src/moments.jl | 2 +- src/weights.jl | 10 ++--- test/counts.jl | 8 ++-- test/cov.jl | 4 +- test/hist.jl | 16 +++---- test/moments.jl | 10 ++--- test/weights.jl | 112 +++++++++++++++++++++++----------------------- test/wsampling.jl | 4 +- 8 files changed, 83 insertions(+), 83 deletions(-) diff --git a/src/moments.jl b/src/moments.jl index ef617e2b7..3460f9cd0 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -35,7 +35,7 @@ function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=tru if mean == 0 return varm(v, wv, 0; corrected=corrected) elseif mean == nothing - return varm(v, wv, Base.mean(v, wv); corrected=corrected) : + return varm(v, wv, Base.mean(v, wv); corrected=corrected) else return varm(v, wv, mean; corrected=corrected) end diff --git a/src/weights.jl b/src/weights.jl index bc7de60b3..acc50893b 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -390,8 +390,8 @@ Base.sum{T<:Number,W<:Real}(A::AbstractArray{T}, w::AbstractWeights{W}, dim::Int Compute the weighted mean of an array `v` with weights `w`. 
""" function wmean{T<:Number}(v::AbstractArray{T}, w::AbstractVector) - Base.depwarn("wmean is deprecated, use mean(v, weights(w)) instead.", :wmean) - mean(v, weights(w)) + Base.depwarn("wmean is deprecated, use mean(v, fweights(w)) instead.", :wmean) + mean(v, fweights(w)) end Base.mean(v::AbstractArray, w::AbstractWeights) = sum(v, w) / sum(w) @@ -468,7 +468,7 @@ end Compute the weighted median of an array `v` with weights `w`, given as either a vector or `AbstractWeights`. """ -wmedian(v::RealVector, w::RealVector) = median(v, weights(w)) +wmedian(v::RealVector, w::RealVector) = median(v, fweights(w)) wmedian{W<:Real}(v::RealVector, w::AbstractWeights{W}) = median(v, w) ###### Weighted quantile ##### @@ -562,5 +562,5 @@ or a `AbstractWeights`. """ wquantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::RealVector) = quantile(v, w, p) wquantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::Number) = quantile(v, w, [p])[1] -wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, weights(w), p) -wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, weights(w), [p])[1] +wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, fweights(w), p) +wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, fweights(w), [p])[1] diff --git a/test/counts.jl b/test/counts.jl index 2fae64eed..b6b7f632d 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -6,7 +6,7 @@ n = 5000 # 1D integer counts x = rand(1:5, n) -w = weights(rand(n)) +w = fweights(rand(n)) c = counts(x, 5) @test size(c) == (5,) @@ -40,7 +40,7 @@ c0 = Float64[sum(w.values[x .== i]) for i in 1 : 5] x = rand(1:4, n) y = rand(1:5, n) -w = weights(rand(n)) +w = fweights(rand(n)) c = counts(x, y, (4, 5)) @test size(c) == (4, 5) @@ -85,11 +85,11 @@ pm = proportionmap(x) @test pm["b"] ≈ (1/3) @test pm["c"] ≈ (1/6) -cm = countmap(x, weights(w)) +cm = countmap(x, fweights(w)) @test cm["a"] == 5.5 @test cm["b"] == 4.5 @test cm["c"] == 3.5 -pm = proportionmap(x, 
weights(w)) +pm = proportionmap(x, fweights(w)) @test pm["a"] ≈ (5.5 / 13.5) @test pm["b"] ≈ (4.5 / 13.5) @test pm["c"] ≈ (3.5 / 13.5) diff --git a/test/cov.jl b/test/cov.jl index 83755f35d..9cc501d82 100644 --- a/test/cov.jl +++ b/test/cov.jl @@ -9,8 +9,8 @@ Z2 = X .- mean(X, 2) w1 = rand(3) w2 = rand(8) -wv1 = weights(w1) -wv2 = weights(w2) +wv1 = fweights(w1) +wv2 = fweights(w2) Z1w = X .- mean(X, wv1, 1) Z2w = X .- mean(X, wv2, 2) diff --git a/test/hist.jl b/test/hist.jl index 60403a68d..3de1ce302 100644 --- a/test/hist.jl +++ b/test/hist.jl @@ -57,19 +57,19 @@ end @test fit(Histogram,(0:99,0:99),nbins=(5,5), closed=:left).weights == diagm([20,20,20,20,20]) # FIXME: closed (all lines in this block): - @test fit(Histogram,0:99,weights(ones(100)),nbins=5, closed=:left).weights == [20,20,20,20,20] - @test fit(Histogram,0:99,weights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] - @test fit(Histogram{Int32},0:99,weights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] - @test fit(Histogram{Float32},0:99,weights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] + @test fit(Histogram,0:99,fweights(ones(100)),nbins=5, closed=:left).weights == [20,20,20,20,20] + @test fit(Histogram,0:99,fweights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] + @test fit(Histogram{Int32},0:99,fweights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] + @test fit(Histogram{Float32},0:99,fweights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] end @testset "Histogram element type" begin # FIXME: closed (all lines in this block): - @test eltype(@inferred(fit(Histogram,1:100,weights(ones(Int,100)),nbins=5, closed=:left)).weights) == Int - @test eltype(@inferred(fit(Histogram{Float32},1:100,weights(ones(Int,100)),nbins=5, closed=:left)).weights) == Float32 - @test eltype(@inferred(fit(Histogram,1:100,weights(ones(Float64,100)),nbins=5, closed=:left)).weights) == Float64 - @test 
eltype(@inferred(fit(Histogram{Float32},1:100,weights(ones(Float64,100)),nbins=5, closed=:left)).weights) == Float32 + @test eltype(@inferred(fit(Histogram,1:100,fweights(ones(Int,100)),nbins=5, closed=:left)).weights) == Int + @test eltype(@inferred(fit(Histogram{Float32},1:100,fweights(ones(Int,100)),nbins=5, closed=:left)).weights) == Float32 + @test eltype(@inferred(fit(Histogram,1:100,fweights(ones(Float64,100)),nbins=5, closed=:left)).weights) == Float64 + @test eltype(@inferred(fit(Histogram{Float32},1:100,fweights(ones(Float64,100)),nbins=5, closed=:left)).weights) == Float32 end diff --git a/test/moments.jl b/test/moments.jl index c1f3e2234..f706e9f35 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -4,7 +4,7 @@ using Base.Test ##### weighted var & std x = rand(10) -wv = weights(rand(10)) +wv = fweights(rand(10)) m = mean(x, wv) @test var(x, wv; corrected=false) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) @@ -34,8 +34,8 @@ m = mean(x, wv) x = rand(5, 6) w1 = rand(5) w2 = rand(6) -wv1 = weights(w1) -wv2 = weights(w2) +wv1 = fweights(w1) +wv2 = fweights(w2) m1 = mean(x, wv1, 1) m2 = mean(x, wv2, 2) @@ -85,7 +85,7 @@ end ##### skewness & kurtosis -wv = weights(ones(5) * 2.0) +wv = fweights(ones(5) * 2.0) @test skewness(1:5; corrected=false) ≈ 0.0 @test skewness([1, 2, 3, 4, 5]; corrected=false) ≈ 0.0 @@ -114,7 +114,7 @@ x = collect(2.0:8.0) @test moment(x, 4, 4.0; corrected=false) ≈ sum((x .- 4).^4) / length(x) @test moment(x, 5, 4.0; corrected=false) ≈ sum((x .- 4).^5) / length(x) -w = weights([1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) +w = fweights([1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) x2 = collect(2.0:6.0) @test moment(x, 2, w; corrected=false) ≈ sum((x2 .- 4).^2) / 5 @test moment(x, 3, w; corrected=false) ≈ sum((x2 .- 4).^3) / 5 diff --git a/test/weights.jl b/test/weights.jl index 941fcc2c3..3df3e418a 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -3,17 +3,17 @@ using Base.Test using Compat import Compat: view -@test isa(weights([1, 2, 3]), 
AbstractWeights{Int}) -@test isa(weights([1., 2., 3.]), AbstractWeights{Float64}) -@test isa(weights([1 2 3; 4 5 6]), AbstractWeights{Int}) +@test isa(fweights([1, 2, 3]), AbstractWeights{Int}) +@test isa(fweights([1., 2., 3.]), AbstractWeights{Float64}) +@test isa(fweights([1 2 3; 4 5 6]), AbstractWeights{Int}) @test isa(AnalyticWeights([1, 2, 3], 6), AbstractWeights{Int}) -@test isempty(weights(Float64[])) -@test size(weights([1, 2, 3])) == (3,) +@test isempty(fweights(Float64[])) +@test size(fweights([1, 2, 3])) == (3,) w = [1., 2., 3.] -wv = weights(w) +wv = fweights(w) @test eltype(wv) === Float64 @test length(wv) === 3 @test values(wv) === w @@ -21,7 +21,7 @@ wv = weights(w) @test !isempty(wv) b = trues(3) -bv = weights(b) +bv = fweights(b) @test eltype(bv) === Bool @test length(bv) === 3 @test values(bv) === b @@ -152,21 +152,21 @@ r = ones(8, 6) ## the sum and mean syntax -@test sum([1.0, 2.0, 3.0], weights([1.0, 0.5, 0.5])) ≈ 3.5 -@test sum(1:3, weights([1.0, 1.0, 0.5])) ≈ 4.5 +@test sum([1.0, 2.0, 3.0], fweights([1.0, 0.5, 0.5])) ≈ 3.5 +@test sum(1:3, fweights([1.0, 1.0, 0.5])) ≈ 4.5 -@test mean([1:3;], weights([1.0, 1.0, 0.5])) ≈ 1.8 -@test mean(1:3, weights([1.0, 1.0, 0.5])) ≈ 1.8 +@test mean([1:3;], fweights([1.0, 1.0, 0.5])) ≈ 1.8 +@test mean(1:3, fweights([1.0, 1.0, 0.5])) ≈ 1.8 a = reshape(1.0:27.0, 3, 3, 3) for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) - @test sum(a, weights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) - @test sum(a, weights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) - @test sum(a, weights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) - @test mean(a, weights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) - @test mean(a, weights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) - @test mean(a, weights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) - @test_throws ErrorException mean(a, weights(wt), 4) + @test sum(a, fweights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 
1, 1), 1) + @test sum(a, fweights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) + @test sum(a, fweights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) + @test mean(a, fweights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) + @test mean(a, fweights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) + @test mean(a, fweights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) + @test_throws ErrorException mean(a, fweights(wt), 4) end # Weighted median tests @@ -226,35 +226,35 @@ median_answers = (7.0, 4.0, 8.5, num_tests = length(data) for i = 1:num_tests @test wmedian(data[i], wt[i]) == median_answers[i] - @test wmedian(data[i], weights(wt[i])) == median_answers[i] - @test median(data[i], weights(wt[i])) == median_answers[i] + @test wmedian(data[i], fweights(wt[i])) == median_answers[i] + @test median(data[i], fweights(wt[i])) == median_answers[i] for j = 1:100 # Make sure the weighted median does not change if the data # and weights are reordered. reorder = sortperm(rand(length(data[i]))) - @test median(data[i][reorder], weights(wt[i][reorder])) == median_answers[i] + @test median(data[i][reorder], fweights(wt[i][reorder])) == median_answers[i] end end data = [4, 3, 2, 1] wt = [0, 0, 0, 0] @test_throws MethodError wmedian(data[1]) -@test_throws ErrorException median(data, weights(wt)) +@test_throws ErrorException median(data, fweights(wt)) @test_throws ErrorException wmedian(data, wt) -@test_throws ErrorException median((Float64)[], weights((Float64)[])) +@test_throws ErrorException median((Float64)[], fweights((Float64)[])) wt = [1, 2, 3, 4, 5] -@test_throws ErrorException median(data, weights(wt)) -@test_throws MethodError median([4 3 2 1 0], weights(wt)) -@test_throws MethodError median([[1 2];[4 5];[7 8];[10 11];[13 14]], weights(wt)) +@test_throws ErrorException median(data, fweights(wt)) +@test_throws MethodError median([4 3 2 1 0], fweights(wt)) +@test_throws MethodError median([[1 2];[4 5];[7 8];[10 11];[13 14]], 
fweights(wt)) data = [1, 3, 2, NaN, 2] -@test isnan(median(data, weights(wt))) +@test isnan(median(data, fweights(wt))) wt = [1, 2, NaN, 4, 5] -@test_throws ErrorException median(data, weights(wt)) +@test_throws ErrorException median(data, fweights(wt)) data = [1, 3, 2, 1, 2] -@test_throws ErrorException median(data, weights(wt)) +@test_throws ErrorException median(data, fweights(wt)) wt = [-1, -1, -1, -1, -1] -@test_throws ErrorException median(data, weights(wt)) +@test_throws ErrorException median(data, fweights(wt)) wt = [-1, -1, -1, 0, 0] -@test_throws ErrorException median(data, weights(wt)) +@test_throws ErrorException median(data, fweights(wt)) # Weighted quantile tests @@ -280,25 +280,25 @@ data = ( [-10, 1, 1, -10, -10], ) wt = ( - weights([1, 1/3, 1/3, 1/3, 1]), - weights([1, 1, 1, 1, 1]), - weights([1, 1/3, 1/3, 1/3, 1, 1]), - weights([1/3, 1/3, 1/3, 1, 1, 1]), - weights([30, 191, 9, 0]), - weights([10, 1, 1, 1, 9]), - weights([10, 1, 1, 1, 900]), - weights([1, 3, 5, 4, 2]), - weights([2, 2, 5, 1, 2, 2, 1, 6]), - weights([0.1, 0.1, 0.8]), - weights([5, 5, 4, 1]), - weights([30, 56, 144, 24, 55, 43, 67]), - weights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]), - weights([12]), - weights([7, 1, 1, 1, 6]), - weights([1, 0, 0, 0, 2]), - weights([1, 2, 3, 4, 5]), - weights([0.1, 0.2, 0.3, 0.2, 0.1]), - weights([1, 1, 1, 1, 1]), + fweights([1, 1/3, 1/3, 1/3, 1]), + fweights([1, 1, 1, 1, 1]), + fweights([1, 1/3, 1/3, 1/3, 1, 1]), + fweights([1/3, 1/3, 1/3, 1, 1, 1]), + fweights([30, 191, 9, 0]), + fweights([10, 1, 1, 1, 9]), + fweights([10, 1, 1, 1, 900]), + fweights([1, 3, 5, 4, 2]), + fweights([2, 2, 5, 1, 2, 2, 1, 6]), + fweights([0.1, 0.1, 0.8]), + fweights([5, 5, 4, 1]), + fweights([30, 56, 144, 24, 55, 43, 67]), + fweights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]), + fweights([12]), + fweights([7, 1, 1, 1, 6]), + fweights([1, 0, 0, 0, 2]), + fweights([1, 2, 3, 4, 5]), + fweights([0.1, 0.2, 0.3, 0.2, 0.1]), + fweights([1, 1, 1, 1, 1]), ) quantile_answers = ( 
[1.0,3.6000000000000005,6.181818181818182,8.2,10.0], @@ -334,15 +334,15 @@ for i = 1:length(data) for j = 1:10 # order of w does not matter reorder = sortperm(rand(length(data[i]))) - @test quantile(data[i][reorder], weights(wt[i][reorder]), p) ≈ quantile_answers[i] + @test quantile(data[i][reorder], fweights(wt[i][reorder]), p) ≈ quantile_answers[i] end end # w = 1 corresponds to base quantile for i = 1:length(data) - @test quantile(data[i], weights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) + @test quantile(data[i], fweights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) for j = 1:10 prandom = rand(4) - @test quantile(data[i], weights(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) + @test quantile(data[i], fweights(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) end end @@ -350,8 +350,8 @@ end v = [7, 1, 2, 4, 10] w = [1, 1/3, 1/3, 1/3, 1] answer = 6.181818181818182 -@test quantile(data[1], weights(w), 0.5) ≈ answer -@test wquantile(data[1], weights(w), [0.5]) ≈ [answer] -@test wquantile(data[1], weights(w), 0.5) ≈ answer +@test quantile(data[1], fweights(w), 0.5) ≈ answer +@test wquantile(data[1], fweights(w), [0.5]) ≈ [answer] +@test wquantile(data[1], fweights(w), 0.5) ≈ answer @test wquantile(data[1], w, [0.5]) ≈ [answer] @test wquantile(data[1], w, 0.5) ≈ answer diff --git a/test/wsampling.jl b/test/wsampling.jl index 31012438a..d605a868e 100644 --- a/test/wsampling.jl +++ b/test/wsampling.jl @@ -35,7 +35,7 @@ end import StatsBase: direct_sample!, alias_sample! n = 10^5 -wv = weights([0.2, 0.8, 0.4, 0.6]) +wv = fweights([0.2, 0.8, 0.4, 0.6]) a = direct_sample!(4:7, wv, zeros(Int, n, 3)) check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=false) @@ -79,7 +79,7 @@ import StatsBase: naive_wsample_norep!, efraimidis_a_wsample_norep!, efraimidis_ares_wsample_norep!, efraimidis_aexpj_wsample_norep! 
n = 10^5 -wv = weights([0.2, 0.8, 0.4, 0.6]) +wv = fweights([0.2, 0.8, 0.4, 0.6]) a = zeros(Int, 3, n) for j = 1:n From a23b8cb2c0a53712391f23e14edf6f087f3d9b11 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 27 Apr 2017 22:22:23 -0500 Subject: [PATCH 11/50] Changed function name `bias` -> `cfactor`. * Started dispatching on corrected value rather than repeating the same corrected check for each weights type. * Added extra documentation for the different weights types, including documenting the alternative names for each type. * `cfactor(::AbstractWeights, true)` now throws an argument error to avoid returning an incorrect result for weight types that don't provide bias correction. --- src/cov.jl | 2 +- src/moments.jl | 26 +++++++-------- src/weights.jl | 90 ++++++++++++++++++++++++++++++++------------------ 3 files changed, 71 insertions(+), 47 deletions(-) diff --git a/src/cov.jl b/src/cov.jl index dc82b6278..89012cbcf 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -80,7 +80,7 @@ scattermat(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = ## weighted cov function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1, corrected::Bool=true) - scale!(scattermatm(x, mean, wv, vardim), bias(wv, corrected)) + scale!(scattermatm(x, mean, wv, vardim), cfactor(wv, corrected)) end function Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=true) diff --git a/src/moments.jl b/src/moments.jl index 3460f9cd0..03a83b0c4 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -44,7 +44,7 @@ end ## var along dim Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=true) = - scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), bias(wv, corrected)) + scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), cfactor(wv, corrected)) function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) if mean == 0 @@ -199,7 +199,7 @@ function 
_moment2(v::RealArray, m::Real; corrected=true) @inbounds z = v[i] - m s += z * z end - s * bias(n, corrected) + cfactor(n, corrected) * s end function _moment2(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) @@ -211,7 +211,7 @@ function _moment2(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) @inbounds s += (z * z) * w[i] end - s * bias(wv, corrected) + cfactor(wv, corrected) * s end function _moment3(v::RealArray, m::Real; corrected=true) @@ -221,7 +221,7 @@ function _moment3(v::RealArray, m::Real; corrected=true) @inbounds z = v[i] - m s += z * z * z end - s * bias(n, corrected) + cfactor(n, corrected) * s end function _moment3(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) @@ -232,7 +232,7 @@ function _moment3(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) @inbounds z = v[i] - m @inbounds s += (z * z * z) * w[i] end - s * bias(wv, corrected) + cfactor(wv, corrected) * s end function _moment4(v::RealArray, m::Real; corrected=true) @@ -242,7 +242,7 @@ function _moment4(v::RealArray, m::Real; corrected=true) @inbounds z = v[i] - m s += abs2(z * z) end - s * bias(n, corrected) + cfactor(n, corrected) * s end function _moment4(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) @@ -253,7 +253,7 @@ function _moment4(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) @inbounds z = v[i] - m @inbounds s += abs2(z * z) * w[i] end - s * bias(wv, corrected) + cfactor(wv, corrected) * s end function _momentk(v::RealArray, k::Int, m::Real; corrected=true) @@ -263,7 +263,7 @@ function _momentk(v::RealArray, k::Int, m::Real; corrected=true) @inbounds z = v[i] - m s += (z ^ k) end - s * bias(n, corrected) + cfactor(n, corrected) * s end function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected=true) @@ -274,7 +274,7 @@ function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected= @inbounds z = v[i] - m @inbounds s += (z ^ k) * w[i] end - s * bias(wv, corrected) + 
cfactor(wv, corrected) * s end @@ -325,7 +325,7 @@ function skewness(v::RealArray, m::Real; corrected=true) cm2 += z2 cm3 += z2 * z end - b = bias(n, corrected) + b = cfactor(n, corrected) cm3 *= b cm2 *= b return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 @@ -346,7 +346,7 @@ function skewness(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) cm2 += z2w cm3 += z2w * z end - b = bias(wv, corrected) + b = cfactor(wv, corrected) cm3 *= b cm2 *= b return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 @@ -375,7 +375,7 @@ function kurtosis(v::RealArray, m::Real; corrected=true) cm2 += z2 cm4 += z2 * z2 end - b = bias(n, corrected) + b = cfactor(n, corrected) cm4 *= b cm2 *= b return (cm4 / (cm2 * cm2)) - 3.0 @@ -397,7 +397,7 @@ function kurtosis(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) cm2 += z2w cm4 += z2w * z2 end - b = bias(wv, corrected) + b = cfactor(wv, corrected) cm4 *= b cm2 *= b return (cm4 / (cm2 * cm2)) - 3.0 diff --git a/src/weights.jl b/src/weights.jl index acc50893b..ae99ad66b 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -39,37 +39,32 @@ Base.getindex(wv::AbstractWeights, i) = getindex(wv.values, i) Base.size(wv::AbstractWeights) = size(wv.values) """ - bias(n::Integer, corrected=true) + cfactor(n::Integer, corrected=true) -Computes the corrected (default) or uncorrected bias for any `n` observations. +Computes a correction factor for calculating `var`, `std` and `cov` with `n` observations. +If `corrected=true` this will return ``\\frac{1}{n - 1}`` +(ie: [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), +otherwise it will return ``\\frac{1}{n}``. +""" +cfactor(n::Integer, corrected=true) = 1 / (n - Int(corrected)) -``\\frac{1}{n - 1}`` """ -bias(n::Integer, corrected=true) = inv(n - Int(corrected)) + cfactor(wv::AbstractWeights, corrected=true) +Computes a correction factor for calculating `var`, `std` and `cov` with a set of +weights `wv`. 
""" - bias(w::AbstractWeights, corrected=true) +cfactor(wv::AbstractWeights, corrected=true) = cfactor(wv, Val{corrected}) -Computes the corrected (default) or uncorrected bias for any weight vector. -The default equation assumes analytic/precision/reliability weights and determines the -bias as: +""" + cfactor(wv::AbstractWeights, false) -``\\frac{1}{\sum w - \sum w / \sum {w^2}}`` +``\\frac{1}{\sum w}`` """ -function bias(w::AbstractWeights, corrected=true) - s = sum(w) - if corrected - # sum square norm - sum_sn = 0.0 - for x in w - sum_sn += (x / s) ^ 2 - end +cfactor(wv::AbstractWeights, ::Type{Val{false}}) = 1 / sum(wv) +cfactor(wv::AbstractWeights, ::Type{Val{true}}) = + throw(ArgumentError("$(typeof(wv)) does not support bias correction.")) - return inv(s * (1 - sum_sn)) - else - return inv(s) - end -end @weights AnalyticWeights @@ -77,6 +72,10 @@ end AnalyticWeights(vs, wsum=sum(vs)) Construct an `AnalyticWeights` vector with weight values `vs` and sum of weights `wsum`. + +Analytic weights describe a non-random relative importance (usually between 0 and 1) +for each observation. These weights may also be referred to as reliability weights, +precision weights or inverse variance weights. """ AnalyticWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = AnalyticWeights{S, eltype(vs), V}(vs, s) @@ -85,16 +84,35 @@ AnalyticWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = aweights(vs) Construct an `AnalyticWeights` vector from a given array. +See the documentation for `AnalyticWeights` for more details. 
""" aweights(vs::RealVector) = AnalyticWeights(vs) aweights(vs::RealArray) = AnalyticWeights(vec(vs)) +""" + cfactor(w::AnalyticWeights, true) + +``\\frac{1}{\sum w - \sum w / \sum {w^2}}`` +""" +function cfactor(w::AnalyticWeights, ::Type{Val{true}}) + s = sum(w) + sum_sn = 0.0 + for x in w + sum_sn += (x / s) ^ 2 + end + + 1 / (s * (1 - sum_sn)) +end + @weights FrequencyWeights """ FrequencyWeights(vs, wsum=sum(vs)) Construct a `FrequencyWeights` vector with weight values `vs` and sum of weights `wsum`. + +Frequency weights describe the number of cases (or frequency) in which each observation +was observed. These weight may also be referred to as case weights or repeat weights. """ FrequencyWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = FrequencyWeights{S, eltype(vs), V}(vs, s) @@ -103,19 +121,29 @@ FrequencyWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = fweights(vs) Construct a `FrequencyWeights` vector from a given array. +See the documentation for `FrequencyWeights` for more details. """ fweights(vs::RealVector) = FrequencyWeights(vs) fweights(vs::RealArray) = FrequencyWeights(vec(vs)) """ - bias(w::FrequencyWeights, corrected=true) + cfactor(w::FrequencyWeights, true) ``\\frac{1}{\sum{w} - 1}`` """ -bias(w::FrequencyWeights, corrected=true) = inv(sum(w) - Int(corrected)) +cfactor(w::FrequencyWeights, ::Type{Val{true}}) = 1 / (sum(w) - 1) @weights ProbabilityWeights +""" + ProbabilityWeights(vs, wsum=sum(vs)) + +Construct a `ProbabilityWeights` vector with weight values `vs` and sum of weights `wsum`. + +Probability weights represent the inverse of the sampling probability for each observation, +providing a correction mechanism for under- or over-sampling certain population groups. +These weight may also be referred to as sampling weights. 
+""" ProbabilityWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = ProbabilityWeights{S, eltype(vs), V}(vs, s) @@ -123,24 +151,20 @@ ProbabilityWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = pweights(vs) Construct a `ProbabilityWeights` vector from a given array. +See the documentation for `ProbabilityWeights` for more details. """ pweights(vs::RealVector) = ProbabilityWeights(vs) pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) """ - bias(w::ProbabilityWeights, corrected=true) + cfactor(w::ProbabilityWeights, true) ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ -function bias(w::ProbabilityWeights, corrected=true) +function cfactor(w::ProbabilityWeights, ::Type{Val{true}}) s = sum(w) - - if corrected - n = length(w) - return n / (s * (n - 1)) - else - return inv(s) - end + n = length(w) + return n / (s * (n - 1)) end """ From 25f69f312b2411025ab366ad1be6a361fef6a73b Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 27 Apr 2017 23:08:50 -0500 Subject: [PATCH 12/50] Removed corrected flag from `skewness` and `kurtosis` and set default behaviour to corrected=false. 
--- src/cov.jl | 8 ++-- src/moments.jl | 105 +++++++++++++++++++++----------------------- src/weights.jl | 8 ++-- test/moments.jl | 30 ++++++------- test/scalarstats.jl | 6 +-- 5 files changed, 76 insertions(+), 81 deletions(-) diff --git a/src/cov.jl b/src/cov.jl index 89012cbcf..ba64d6a88 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -79,20 +79,20 @@ scattermat(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = scattermatm(x, Base.mean(x, wv, vardim), wv, vardim) ## weighted cov -function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1, corrected::Bool=true) +function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1, corrected::Bool=false) scale!(scattermatm(x, mean, wv, vardim), cfactor(wv, corrected)) end -function Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=true) +function Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=false) Base.covm(x, Base.mean(x, wv, vardim), wv, vardim, corrected) end -function mean_and_cov(x::DenseMatrix, vardim::Int=1; corrected=true) +function mean_and_cov(x::DenseMatrix, vardim::Int=1; corrected=false) m = mean(x, vardim) return m, Base.covm(x, m, vardim, corrected) end -function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=true) +function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=false) m = mean(x, wv, vardim) return m, Base.cov(x, wv, vardim; corrected=corrected) end diff --git a/src/moments.jl b/src/moments.jl index 03a83b0c4..bbe37873e 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -15,7 +15,7 @@ whereas it's `length(x)-1` in `Base.varm`. The impact is that this is not a weighted estimate of the population variance based on the sample; it's the weighted variance of the sample. 
""" -Base.varm(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) = +Base.varm(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) = _moment2(v, wv, m, corrected=corrected) """ @@ -31,7 +31,7 @@ whereas it's `length(x)-1` in `Base.var`. The impact is that this is not a weighted estimate of the population variance based on the sample; it's the weighted variance of the sample. """ -function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=true) +function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=false) if mean == 0 return varm(v, wv, 0; corrected=corrected) elseif mean == nothing @@ -43,10 +43,10 @@ end ## var along dim -Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=true) = +Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=false) = scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), cfactor(wv, corrected)) -function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) +function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=false) if mean == 0 Base.varm!( R, A, wv, Base.reducedim_initarray(A, dim, 0, eltype(R)), dim; @@ -72,7 +72,7 @@ function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mea end end -function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=true) +function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=false) @static if VERSION < v"0.6.0-dev.1121" return Base.varm!( similar(A, Float64, Base.reduced_dims(size(A), dim)), @@ -86,7 +86,7 @@ function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; co end end -function Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) +function Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, 
corrected=false) @static if VERSION < v"0.6.0-dev.1121" return var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, dim; mean=mean, corrected=corrected) @@ -104,7 +104,7 @@ Return the standard deviation of a real-valued array `v` with a known mean `m`, optionally over a dimension `dim`. The weighting vector `wv` specifies frequency weights (also called case weights) for the estimate. """ -Base.stdm(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) = +Base.stdm(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) = sqrt(varm(v, wv, m; corrected=corrected)) """ @@ -114,16 +114,16 @@ Return the standard deviation of a real-valued array `v`, optionally over a dimension `dim`. The weighting vector `wv` specifies frequency weights (also called case weights) for the estimate. """ -Base.std(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=true) = +Base.std(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=false) = sqrt.(var(v, wv; mean=mean, corrected=corrected)) -Base.stdm(v::RealArray, m::RealArray, dim::Int; corrected=true) = +Base.stdm(v::RealArray, m::RealArray, dim::Int; corrected=false) = Base.sqrt!(varm(v, m, dim; corrected=corrected)) -Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int; corrected=true) = +Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int; corrected=false) = sqrt.(varm(v, wv, m, dim; corrected=corrected)) -Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=true) = +Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=false) = sqrt.(var(v, wv, dim; mean=mean, corrected=corrected)) ##### Fused statistics @@ -134,7 +134,7 @@ Return the mean and variance of a real-valued array `x`, optionally over a dimen `dim`, as a tuple. A weighting vector `wv` can be specified to weight the estimates. The weights are assumed to be frequency weights, also called case weights. 
""" -function mean_and_var(A::RealArray; corrected=true) +function mean_and_var(A::RealArray; corrected=false) m = mean(A) v = varm(A, m; corrected=corrected) m, v @@ -148,51 +148,50 @@ over a dimension `dim`, as a tuple. A weighting vector `wv` can be specified to weight the estimates. The weights are assumed to be frequency weights, also called case weights. """ -function mean_and_std(A::RealArray; corrected=true) +function mean_and_std(A::RealArray; corrected=false) m = mean(A) s = stdm(A, m; corrected=corrected) m, s end -function mean_and_var(A::RealArray, wv::AbstractWeights; corrected=true) +function mean_and_var(A::RealArray, wv::AbstractWeights; corrected=false) m = mean(A, wv) v = varm(A, wv, m; corrected=corrected) m, v end -function mean_and_std(A::RealArray, wv::AbstractWeights; corrected=true) +function mean_and_std(A::RealArray, wv::AbstractWeights; corrected=false) m = mean(A, wv) s = stdm(A, wv, m; corrected=corrected) m, s end -function mean_and_var(A::RealArray, dim::Int; corrected=true) +function mean_and_var(A::RealArray, dim::Int; corrected=false) m = mean(A, dim) v = varm(A, m, dim; corrected=corrected) m, v end -function mean_and_std(A::RealArray, dim::Int; corrected=true) +function mean_and_std(A::RealArray, dim::Int; corrected=false) m = mean(A, dim) s = stdm(A, m, dim; corrected=corrected) m, s end -function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int; corrected=true) +function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int; corrected=false) m = mean(A, wv, dim) v = varm(A, wv, m, dim; corrected=corrected) m, v end -function mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int; corrected=true) +function mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int; corrected=false) m = mean(A, wv, dim) s = stdm(A, wv, m, dim; corrected=corrected) m, s end ##### General central moment - -function _moment2(v::RealArray, m::Real; corrected=true) +function _moment2(v::RealArray, m::Real; corrected=false) n = length(v) s 
= 0.0 for i = 1:n @@ -202,7 +201,7 @@ function _moment2(v::RealArray, m::Real; corrected=true) cfactor(n, corrected) * s end -function _moment2(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) +function _moment2(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) n = length(v) s = 0.0 w = values(wv) @@ -214,7 +213,7 @@ function _moment2(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) cfactor(wv, corrected) * s end -function _moment3(v::RealArray, m::Real; corrected=true) +function _moment3(v::RealArray, m::Real; corrected=false) n = length(v) s = 0.0 for i = 1:n @@ -224,7 +223,7 @@ function _moment3(v::RealArray, m::Real; corrected=true) cfactor(n, corrected) * s end -function _moment3(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) +function _moment3(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) n = length(v) s = 0.0 w = values(wv) @@ -235,7 +234,7 @@ function _moment3(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) cfactor(wv, corrected) * s end -function _moment4(v::RealArray, m::Real; corrected=true) +function _moment4(v::RealArray, m::Real; corrected=false) n = length(v) s = 0.0 for i = 1:n @@ -245,7 +244,7 @@ function _moment4(v::RealArray, m::Real; corrected=true) cfactor(n, corrected) * s end -function _moment4(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) +function _moment4(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) n = length(v) s = 0.0 w = values(wv) @@ -256,7 +255,7 @@ function _moment4(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) cfactor(wv, corrected) * s end -function _momentk(v::RealArray, k::Int, m::Real; corrected=true) +function _momentk(v::RealArray, k::Int, m::Real; corrected=false) n = length(v) s = 0.0 for i = 1:n @@ -266,7 +265,7 @@ function _momentk(v::RealArray, k::Int, m::Real; corrected=true) cfactor(n, corrected) * s end -function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected=true) +function 
_momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected=false) n = length(v) s = 0.0 w = values(wv) @@ -284,14 +283,14 @@ end Return the `k`th order central moment of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. """ -function moment(v::RealArray, k::Int, m::Real; corrected=true) +function moment(v::RealArray, k::Int, m::Real; corrected=false) k == 2 ? _moment2(v, m; corrected=corrected) : k == 3 ? _moment3(v, m; corrected=corrected) : k == 4 ? _moment4(v, m; corrected=corrected) : _momentk(v, k, m; corrected=corrected) end -function moment(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected=true) +function moment(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected=false) k == 2 ? _moment2(v, wv, m; corrected=corrected) : k == 3 ? _moment3(v, wv, m; corrected=corrected) : k == 4 ? _moment4(v, wv, m; corrected=corrected) : @@ -299,7 +298,7 @@ function moment(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected=tr end moment(v::RealArray, k::Int; corrected=true) = moment(v, k, mean(v); corrected=corrected) -function moment(v::RealArray, k::Int, wv::AbstractWeights; corrected=true) +function moment(v::RealArray, k::Int, wv::AbstractWeights; corrected=false) moment(v, k, wv, mean(v, wv); corrected=corrected) end @@ -314,7 +313,7 @@ end Compute the standardized skewness of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. 
""" -function skewness(v::RealArray, m::Real; corrected=true) +function skewness(v::RealArray, m::Real) n = length(v) cm2 = 0.0 # empirical 2nd centered moment (variance) cm3 = 0.0 # empirical 3rd centered moment @@ -325,13 +324,13 @@ function skewness(v::RealArray, m::Real; corrected=true) cm2 += z2 cm3 += z2 * z end - b = cfactor(n, corrected) - cm3 *= b - cm2 *= b + cf = cfactor(n, false) + cm3 *= cf + cm2 *= cf return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 end -function skewness(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) +function skewness(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) length(wv) == n || throw(DimensionMismatch("Inconsistent array lengths.")) cm2 = 0.0 # empirical 2nd centered moment (variance) @@ -346,16 +345,14 @@ function skewness(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) cm2 += z2w cm3 += z2w * z end - b = cfactor(wv, corrected) - cm3 *= b - cm2 *= b + cf = cfactor(wv, false) + cm3 *= cf + cm2 *= cf return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 end -skewness(v::RealArray; corrected=true) = skewness(v, mean(v); corrected=corrected) -function skewness(v::RealArray, wv::AbstractWeights; corrected=true) - skewness(v, wv, mean(v, wv); corrected=corrected) -end +skewness(v::RealArray) = skewness(v, mean(v)) +skewness(v::RealArray, wv::AbstractWeights) = skewness(v, wv, mean(v, wv)) # (excessive) Kurtosis # This is Type 1 definition according to Joanes and Gill (1998) @@ -365,7 +362,7 @@ end Compute the excess kurtosis of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. 
""" -function kurtosis(v::RealArray, m::Real; corrected=true) +function kurtosis(v::RealArray, m::Real) n = length(v) cm2 = 0.0 # empirical 2nd centered moment (variance) cm4 = 0.0 # empirical 4th centered moment @@ -375,13 +372,13 @@ function kurtosis(v::RealArray, m::Real; corrected=true) cm2 += z2 cm4 += z2 * z2 end - b = cfactor(n, corrected) - cm4 *= b - cm2 *= b + cf = cfactor(n, false) + cm4 *= cf + cm2 *= cf return (cm4 / (cm2 * cm2)) - 3.0 end -function kurtosis(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) +function kurtosis(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) length(wv) == n || throw(DimensionMismatch("Inconsistent array lengths.")) cm2 = 0.0 # empirical 2nd centered moment (variance) @@ -397,13 +394,11 @@ function kurtosis(v::RealArray, wv::AbstractWeights, m::Real; corrected=true) cm2 += z2w cm4 += z2w * z2 end - b = cfactor(wv, corrected) - cm4 *= b - cm2 *= b + cf = cfactor(wv, false) + cm4 *= cf + cm2 *= cf return (cm4 / (cm2 * cm2)) - 3.0 end -kurtosis(v::RealArray; corrected=true) = kurtosis(v, mean(v); corrected=corrected) -function kurtosis(v::RealArray, wv::AbstractWeights; corrected=true) - kurtosis(v, wv, mean(v, wv); corrected=corrected) -end +kurtosis(v::RealArray) = kurtosis(v, mean(v)) +kurtosis(v::RealArray, wv::AbstractWeights) = kurtosis(v, wv, mean(v, wv)) diff --git a/src/weights.jl b/src/weights.jl index ae99ad66b..a48e995b9 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -39,22 +39,22 @@ Base.getindex(wv::AbstractWeights, i) = getindex(wv.values, i) Base.size(wv::AbstractWeights) = size(wv.values) """ - cfactor(n::Integer, corrected=true) + cfactor(n::Integer, corrected=false) Computes a correction factor for calculating `var`, `std` and `cov` with `n` observations. If `corrected=true` this will return ``\\frac{1}{n - 1}`` (ie: [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), otherwise it will return ``\\frac{1}{n}``. 
""" -cfactor(n::Integer, corrected=true) = 1 / (n - Int(corrected)) +cfactor(n::Integer, corrected=false) = 1 / (n - Int(corrected)) """ - cfactor(wv::AbstractWeights, corrected=true) + cfactor(wv::AbstractWeights, corrected=false) Computes a correction factor for calculating `var`, `std` and `cov` with a set of weights `wv`. """ -cfactor(wv::AbstractWeights, corrected=true) = cfactor(wv, Val{corrected}) +cfactor(wv::AbstractWeights, corrected=false) = cfactor(wv, Val{corrected}) """ cfactor(wv::AbstractWeights, false) diff --git a/test/moments.jl b/test/moments.jl index f706e9f35..045474773 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -87,18 +87,18 @@ end wv = fweights(ones(5) * 2.0) -@test skewness(1:5; corrected=false) ≈ 0.0 -@test skewness([1, 2, 3, 4, 5]; corrected=false) ≈ 0.0 -@test skewness([1, 2, 2, 2, 5]; corrected=false) ≈ 1.1731251294063556 -@test skewness([1, 4, 4, 4, 5]; corrected=false) ≈ -1.1731251294063556 +@test skewness(1:5) ≈ 0.0 +@test skewness([1, 2, 3, 4, 5]) ≈ 0.0 +@test skewness([1, 2, 2, 2, 5]) ≈ 1.1731251294063556 +@test skewness([1, 4, 4, 4, 5]) ≈ -1.1731251294063556 -@test skewness([1, 2, 2, 2, 5], wv; corrected=false) ≈ 1.1731251294063556 +@test skewness([1, 2, 2, 2, 5], wv) ≈ 1.1731251294063556 -@test kurtosis(1:5; corrected=false) ≈ -1.3 -@test kurtosis([1, 2, 3, 4, 5]; corrected=false) ≈ -1.3 -@test kurtosis([1, 2, 3, 3, 2]; corrected=false) ≈ -1.1530612244897953 +@test kurtosis(1:5) ≈ -1.3 +@test kurtosis([1, 2, 3, 4, 5]) ≈ -1.3 +@test kurtosis([1, 2, 3, 3, 2]) ≈ -1.1530612244897953 -@test kurtosis([1, 2, 3, 4, 5], wv; corrected=false) ≈ -1.3 +@test kurtosis([1, 2, 3, 4, 5], wv) ≈ -1.3 ##### general moments @@ -125,7 +125,7 @@ x2 = collect(2.0:6.0) x = rand(10) # AnalyticWeights -@test var(x, aweights(ones(10))) ≈ var(x) +@test var(x, aweights(ones(10)); corrected=true) ≈ var(x) w = aweights(rand(10)) n = length(w) # Could be count(!iszero, w) instead @@ -133,21 +133,21 @@ w = aweights(w .* (n / sum(w))) sw = sum(w) # This 
is now equal to n, but maybe we should support non-normalized weights? xbar = sum(w .* x) ./ sw expected = sum(w .* (x .- xbar).^2)/(sw - sum(w.^2)/sw) -@test var(x, w) ≈ expected +@test var(x, w; corrected=true) ≈ expected # FrequencyWeights -@test var(x, fweights(ones(Int, 10))) ≈ var(x) +@test var(x, fweights(ones(Int, 10)); corrected=true) ≈ var(x) w = fweights(rand(UInt, 10)) sw = sum(w) xbar = sum(w .* x) / sw expected = sum(w .* (x .- xbar).^2) ./ (sum(w) - 1) -@test var(x, w) ≈ expected +@test var(x, w; corrected=true) ≈ expected # ProbabilityWeights -@test var(x, pweights(ones(10))) ≈ var(x) +@test var(x, pweights(ones(10)); corrected=true) ≈ var(x) w = pweights(rand(10)) n = count(!iszero, w) sw = sum(w) xbar = sum(w .* x)/sw expected = sum(w .* (x .- xbar).^2)/sw * n/(n - 1) -@test var(x, w) ≈ expected +@test var(x, w; corrected=true) ≈ expected diff --git a/test/scalarstats.jl b/test/scalarstats.jl index 51d96d10a..e741226f2 100755 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -66,9 +66,9 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] @test zscore!(zeros(size(a)), a, [1, 2, 3], [0.5, 1.0, 2.0]) ≈ z1 @test zscore!(zeros(size(a)), a, [1 3 2 4], [0.25 0.5 1.0 2.0]) ≈ z2 -@test zscore(a) ≈ zscore(a, mean(a), std(a)) -@test zscore(a, 1) ≈ zscore(a, mean(a,1), std(a,1)) -@test zscore(a, 2) ≈ zscore(a, mean(a,2), std(a,2)) +@test zscore(a) ≈ zscore(a, mean(a), std(a; corrected=false)) +@test zscore(a, 1) ≈ zscore(a, mean(a,1), std(a,1; corrected=false)) +@test zscore(a, 2) ≈ zscore(a, mean(a,2), std(a,2; corrected=false)) ###### quantile & friends From 096548c213d81072bbc8b2f173b3f564f6620787 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 27 Apr 2017 23:32:44 -0500 Subject: [PATCH 13/50] Fixed 0.6 `abstract type ... end ` deprecation warnings using compat. 
--- src/weights.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index a48e995b9..76302be8d 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -2,9 +2,9 @@ ###### Weight vector ##### if VERSION < v"0.6.0-dev.2123" - abstract AbstractWeights{S<:Real, T<:Real, V<:RealVector} <: RealVector{T} + @compat abstract type AbstractWeights{S<:Real, T<:Real, V<:RealVector} <: RealVector{T} end else - abstract AbstractWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractVector{T} + @compat abstract type AbstractWeights{S<:Real, T<:Real, V<:AbstractVector{T}} <: AbstractVector{T} end end """ From 616408de667ce1c32d5986cd64669bc1c4eff0c5 Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 28 Apr 2017 15:00:21 -0500 Subject: [PATCH 14/50] Removed `corrected` option from `moment`s, but kept it for `_moment2` for `varm` to call internally. --- src/moments.jl | 48 ++++++++++++++++++++++++------------------------ test/moments.jl | 24 ++++++++++++------------ 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/moments.jl b/src/moments.jl index bbe37873e..311519c2c 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -213,17 +213,17 @@ function _moment2(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) cfactor(wv, corrected) * s end -function _moment3(v::RealArray, m::Real; corrected=false) +function _moment3(v::RealArray, m::Real) n = length(v) s = 0.0 for i = 1:n @inbounds z = v[i] - m s += z * z * z end - cfactor(n, corrected) * s + cfactor(n, false) * s end -function _moment3(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) +function _moment3(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) s = 0.0 w = values(wv) @@ -231,7 +231,7 @@ function _moment3(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) @inbounds z = v[i] - m @inbounds s += (z * z * z) * w[i] end - cfactor(wv, corrected) * s + cfactor(wv, false) * s end function _moment4(v::RealArray, m::Real; 
corrected=false) @@ -241,10 +241,10 @@ function _moment4(v::RealArray, m::Real; corrected=false) @inbounds z = v[i] - m s += abs2(z * z) end - cfactor(n, corrected) * s + cfactor(n, false) * s end -function _moment4(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) +function _moment4(v::RealArray, wv::AbstractWeights, m::Real) n = length(v) s = 0.0 w = values(wv) @@ -252,20 +252,20 @@ function _moment4(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) @inbounds z = v[i] - m @inbounds s += abs2(z * z) * w[i] end - cfactor(wv, corrected) * s + cfactor(wv, false) * s end -function _momentk(v::RealArray, k::Int, m::Real; corrected=false) +function _momentk(v::RealArray, k::Int, m::Real) n = length(v) s = 0.0 for i = 1:n @inbounds z = v[i] - m s += (z ^ k) end - cfactor(n, corrected) * s + cfactor(n, false) * s end -function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected=false) +function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real) n = length(v) s = 0.0 w = values(wv) @@ -273,7 +273,7 @@ function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected= @inbounds z = v[i] - m @inbounds s += (z ^ k) * w[i] end - cfactor(wv, corrected) * s + cfactor(wv, false) * s end @@ -283,23 +283,23 @@ end Return the `k`th order central moment of a real-valued array `v`, optionally specifying a weighting vector `wv` and a center `m`. """ -function moment(v::RealArray, k::Int, m::Real; corrected=false) - k == 2 ? _moment2(v, m; corrected=corrected) : - k == 3 ? _moment3(v, m; corrected=corrected) : - k == 4 ? _moment4(v, m; corrected=corrected) : - _momentk(v, k, m; corrected=corrected) +function moment(v::RealArray, k::Int, m::Real) + k == 2 ? _moment2(v, m) : + k == 3 ? _moment3(v, m) : + k == 4 ? _moment4(v, m) : + _momentk(v, k, m) end -function moment(v::RealArray, k::Int, wv::AbstractWeights, m::Real; corrected=false) - k == 2 ? _moment2(v, wv, m; corrected=corrected) : - k == 3 ? 
_moment3(v, wv, m; corrected=corrected) : - k == 4 ? _moment4(v, wv, m; corrected=corrected) : - _momentk(v, k, wv, m; corrected=corrected) +function moment(v::RealArray, k::Int, wv::AbstractWeights, m::Real) + k == 2 ? _moment2(v, wv, m) : + k == 3 ? _moment3(v, wv, m) : + k == 4 ? _moment4(v, wv, m) : + _momentk(v, k, wv, m) end -moment(v::RealArray, k::Int; corrected=true) = moment(v, k, mean(v); corrected=corrected) -function moment(v::RealArray, k::Int, wv::AbstractWeights; corrected=false) - moment(v, k, wv, mean(v, wv); corrected=corrected) +moment(v::RealArray, k::Int; corrected=true) = moment(v, k, mean(v)) +function moment(v::RealArray, k::Int, wv::AbstractWeights) + moment(v, k, wv, mean(v, wv)) end diff --git a/test/moments.jl b/test/moments.jl index 045474773..befbd49fb 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -104,22 +104,22 @@ wv = fweights(ones(5) * 2.0) ##### general moments x = collect(2.0:8.0) -@test moment(x, 2; corrected=false) ≈ sum((x .- 5).^2) / length(x) -@test moment(x, 3; corrected=false) ≈ sum((x .- 5).^3) / length(x) -@test moment(x, 4; corrected=false) ≈ sum((x .- 5).^4) / length(x) -@test moment(x, 5; corrected=false) ≈ sum((x .- 5).^5) / length(x) +@test moment(x, 2) ≈ sum((x .- 5).^2) / length(x) +@test moment(x, 3) ≈ sum((x .- 5).^3) / length(x) +@test moment(x, 4) ≈ sum((x .- 5).^4) / length(x) +@test moment(x, 5) ≈ sum((x .- 5).^5) / length(x) -@test moment(x, 2, 4.0; corrected=false) ≈ sum((x .- 4).^2) / length(x) -@test moment(x, 3, 4.0; corrected=false) ≈ sum((x .- 4).^3) / length(x) -@test moment(x, 4, 4.0; corrected=false) ≈ sum((x .- 4).^4) / length(x) -@test moment(x, 5, 4.0; corrected=false) ≈ sum((x .- 4).^5) / length(x) +@test moment(x, 2, 4.0) ≈ sum((x .- 4).^2) / length(x) +@test moment(x, 3, 4.0) ≈ sum((x .- 4).^3) / length(x) +@test moment(x, 4, 4.0) ≈ sum((x .- 4).^4) / length(x) +@test moment(x, 5, 4.0) ≈ sum((x .- 4).^5) / length(x) w = fweights([1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) x2 = 
collect(2.0:6.0) -@test moment(x, 2, w; corrected=false) ≈ sum((x2 .- 4).^2) / 5 -@test moment(x, 3, w; corrected=false) ≈ sum((x2 .- 4).^3) / 5 -@test moment(x, 4, w; corrected=false) ≈ sum((x2 .- 4).^4) / 5 -@test moment(x, 5, w; corrected=false) ≈ sum((x2 .- 4).^5) / 5 +@test moment(x, 2, w) ≈ sum((x2 .- 4).^2) / 5 +@test moment(x, 3, w) ≈ sum((x2 .- 4).^3) / 5 +@test moment(x, 4, w) ≈ sum((x2 .- 4).^4) / 5 +@test moment(x, 5, w) ≈ sum((x2 .- 4).^5) / 5 # Test corrected cases (this will be cleaner in testsets) x = rand(10) From b73a00a735481d92b1a697a0463f8ac1c651f7b7 Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 28 Apr 2017 15:09:42 -0500 Subject: [PATCH 15/50] Renamed cfactor -> varcorrection --- src/cov.jl | 2 +- src/moments.jl | 26 +++++++++++++------------- src/weights.jl | 26 +++++++++++++------------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/cov.jl b/src/cov.jl index ba64d6a88..024728fbe 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -80,7 +80,7 @@ scattermat(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = ## weighted cov function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1, corrected::Bool=false) - scale!(scattermatm(x, mean, wv, vardim), cfactor(wv, corrected)) + scale!(scattermatm(x, mean, wv, vardim), varcorrection(wv, corrected)) end function Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=false) diff --git a/src/moments.jl b/src/moments.jl index 311519c2c..65eff1056 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -44,7 +44,7 @@ end ## var along dim Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=false) = - scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), cfactor(wv, corrected)) + scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), varcorrection(wv, corrected)) function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=false) if mean == 0 @@ -198,7 
+198,7 @@ function _moment2(v::RealArray, m::Real; corrected=false) @inbounds z = v[i] - m s += z * z end - cfactor(n, corrected) * s + varcorrection(n, corrected) * s end function _moment2(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) @@ -210,7 +210,7 @@ function _moment2(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) @inbounds s += (z * z) * w[i] end - cfactor(wv, corrected) * s + varcorrection(wv, corrected) * s end function _moment3(v::RealArray, m::Real) @@ -220,7 +220,7 @@ function _moment3(v::RealArray, m::Real) @inbounds z = v[i] - m s += z * z * z end - cfactor(n, false) * s + varcorrection(n, false) * s end function _moment3(v::RealArray, wv::AbstractWeights, m::Real) @@ -231,7 +231,7 @@ function _moment3(v::RealArray, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m @inbounds s += (z * z * z) * w[i] end - cfactor(wv, false) * s + varcorrection(wv, false) * s end function _moment4(v::RealArray, m::Real; corrected=false) @@ -241,7 +241,7 @@ function _moment4(v::RealArray, m::Real; corrected=false) @inbounds z = v[i] - m s += abs2(z * z) end - cfactor(n, false) * s + varcorrection(n, false) * s end function _moment4(v::RealArray, wv::AbstractWeights, m::Real) @@ -252,7 +252,7 @@ function _moment4(v::RealArray, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m @inbounds s += abs2(z * z) * w[i] end - cfactor(wv, false) * s + varcorrection(wv, false) * s end function _momentk(v::RealArray, k::Int, m::Real) @@ -262,7 +262,7 @@ function _momentk(v::RealArray, k::Int, m::Real) @inbounds z = v[i] - m s += (z ^ k) end - cfactor(n, false) * s + varcorrection(n, false) * s end function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real) @@ -273,7 +273,7 @@ function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m @inbounds s += (z ^ k) * w[i] end - cfactor(wv, false) * s + varcorrection(wv, false) * s end @@ -324,7 +324,7 @@ function skewness(v::RealArray, m::Real) cm2 += z2 cm3 += z2 * z 
end - cf = cfactor(n, false) + cf = varcorrection(n, false) cm3 *= cf cm2 *= cf return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 @@ -345,7 +345,7 @@ function skewness(v::RealArray, wv::AbstractWeights, m::Real) cm2 += z2w cm3 += z2w * z end - cf = cfactor(wv, false) + cf = varcorrection(wv, false) cm3 *= cf cm2 *= cf return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 @@ -372,7 +372,7 @@ function kurtosis(v::RealArray, m::Real) cm2 += z2 cm4 += z2 * z2 end - cf = cfactor(n, false) + cf = varcorrection(n, false) cm4 *= cf cm2 *= cf return (cm4 / (cm2 * cm2)) - 3.0 @@ -394,7 +394,7 @@ function kurtosis(v::RealArray, wv::AbstractWeights, m::Real) cm2 += z2w cm4 += z2w * z2 end - cf = cfactor(wv, false) + cf = varcorrection(wv, false) cm4 *= cf cm2 *= cf return (cm4 / (cm2 * cm2)) - 3.0 diff --git a/src/weights.jl b/src/weights.jl index 76302be8d..1ab920779 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -39,30 +39,30 @@ Base.getindex(wv::AbstractWeights, i) = getindex(wv.values, i) Base.size(wv::AbstractWeights) = size(wv.values) """ - cfactor(n::Integer, corrected=false) + varcorrection(n::Integer, corrected=false) Computes a correction factor for calculating `var`, `std` and `cov` with `n` observations. If `corrected=true` this will return ``\\frac{1}{n - 1}`` (ie: [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), otherwise it will return ``\\frac{1}{n}``. """ -cfactor(n::Integer, corrected=false) = 1 / (n - Int(corrected)) +varcorrection(n::Integer, corrected=false) = 1 / (n - Int(corrected)) """ - cfactor(wv::AbstractWeights, corrected=false) + varcorrection(wv::AbstractWeights, corrected=false) Computes a correction factor for calculating `var`, `std` and `cov` with a set of weights `wv`. 
""" -cfactor(wv::AbstractWeights, corrected=false) = cfactor(wv, Val{corrected}) +varcorrection(wv::AbstractWeights, corrected=false) = varcorrection(wv, Val{corrected}) """ - cfactor(wv::AbstractWeights, false) + varcorrection(wv::AbstractWeights, false) ``\\frac{1}{\sum w}`` """ -cfactor(wv::AbstractWeights, ::Type{Val{false}}) = 1 / sum(wv) -cfactor(wv::AbstractWeights, ::Type{Val{true}}) = +varcorrection(wv::AbstractWeights, ::Type{Val{false}}) = 1 / sum(wv) +varcorrection(wv::AbstractWeights, ::Type{Val{true}}) = throw(ArgumentError("$(typeof(wv)) does not support bias correction.")) @@ -90,11 +90,11 @@ aweights(vs::RealVector) = AnalyticWeights(vs) aweights(vs::RealArray) = AnalyticWeights(vec(vs)) """ - cfactor(w::AnalyticWeights, true) + varcorrection(w::AnalyticWeights, true) ``\\frac{1}{\sum w - \sum w / \sum {w^2}}`` """ -function cfactor(w::AnalyticWeights, ::Type{Val{true}}) +function varcorrection(w::AnalyticWeights, ::Type{Val{true}}) s = sum(w) sum_sn = 0.0 for x in w @@ -127,11 +127,11 @@ fweights(vs::RealVector) = FrequencyWeights(vs) fweights(vs::RealArray) = FrequencyWeights(vec(vs)) """ - cfactor(w::FrequencyWeights, true) + varcorrection(w::FrequencyWeights, true) ``\\frac{1}{\sum{w} - 1}`` """ -cfactor(w::FrequencyWeights, ::Type{Val{true}}) = 1 / (sum(w) - 1) +varcorrection(w::FrequencyWeights, ::Type{Val{true}}) = 1 / (sum(w) - 1) @weights ProbabilityWeights @@ -157,11 +157,11 @@ pweights(vs::RealVector) = ProbabilityWeights(vs) pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) """ - cfactor(w::ProbabilityWeights, true) + varcorrection(w::ProbabilityWeights, true) ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ -function cfactor(w::ProbabilityWeights, ::Type{Val{true}}) +function varcorrection(w::ProbabilityWeights, ::Type{Val{true}}) s = sum(w) n = length(w) return n / (s * (n - 1)) From a9485db2f34af4fdc392bbd0a85998b2dd7fe25b Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 28 Apr 2017 15:19:22 -0500 Subject: [PATCH 16/50] 
Fixed comments in export. --- src/StatsBase.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 598b38329..87112a130 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -17,12 +17,12 @@ module StatsBase export ## weights - AbstractWeights, # the abstract type to represent any weight vector - AnalyticWeights, # the default type for representing a analytic/precision/reliability weight vectors - FrequencyWeights, # the type for representing a frequency weight vectors - ProbabilityWeights, # the type for representing a probability/sampling weight vectors - ExponentialWeights, # the type for representing exponential weights - weights, # alias for aweights + AbstractWeights, # abstract type to represent any weight vector + WeightVec, # deprecated type to represent any weight vector + AnalyticWeights, # to represent an analytic/precision/reliability weight vector + FrequencyWeights, # to represent a frequency/case/repeat weight vector + ProbabilityWeights, # to represent a probability/sampling weight vector + weights, # deprecated function for constructing a WeightVec vector aweights, # construct an AnalyticWeights vector fweights, # construct a FrequencyWeights vector pweights, # construct a ProbabilityWeights vector From 7f179ef4c89b44a24eb66eeea91ead1b532b373c Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 28 Apr 2017 15:54:02 -0500 Subject: [PATCH 17/50] Fixed more style issues.
--- src/cov.jl | 6 ++---- src/hist.jl | 1 - src/moments.jl | 49 +++++++++++++++++++++++-------------------------- 3 files changed, 25 insertions(+), 31 deletions(-) diff --git a/src/cov.jl b/src/cov.jl index 024728fbe..2e25e6376 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -79,13 +79,11 @@ scattermat(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = scattermatm(x, Base.mean(x, wv, vardim), wv, vardim) ## weighted cov -function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1, corrected::Bool=false) +Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1, corrected::Bool=false) = scale!(scattermatm(x, mean, wv, vardim), varcorrection(wv, corrected)) -end -function Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=false) +Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=false) = Base.covm(x, Base.mean(x, wv, vardim), wv, vardim, corrected) -end function mean_and_cov(x::DenseMatrix, vardim::Int=1; corrected=false) m = mean(x, vardim) diff --git a/src/hist.jl b/src/hist.jl index f4a65f6aa..22fcb3a2f 100644 --- a/src/hist.jl +++ b/src/hist.jl @@ -260,7 +260,6 @@ function append!{T,N}(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}) end h end - function append!{T,N}(h::AbstractHistogram{T,N}, vs::NTuple{N,AbstractVector}, wv::AbstractVector) @inbounds for i in eachindex(wv, vs...) xs = _multi_getindex(i, vs...) diff --git a/src/moments.jl b/src/moments.jl index 65eff1056..540583105 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -33,30 +33,29 @@ variance of the sample. 
""" function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=false) if mean == 0 - return varm(v, wv, 0; corrected=corrected) + varm(v, wv, 0; corrected=corrected) elseif mean == nothing - return varm(v, wv, Base.mean(v, wv); corrected=corrected) + varm(v, wv, Base.mean(v, wv); corrected=corrected) else - return varm(v, wv, mean; corrected=corrected) + varm(v, wv, mean; corrected=corrected) end end ## var along dim -Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=false) = - scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), varcorrection(wv, corrected)) +function Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, + dim::Int; corrected=false) + scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), + varcorrection(wv, corrected)) +end -function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=false) +function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, + corrected=false) if mean == 0 - Base.varm!( - R, A, wv, Base.reducedim_initarray(A, dim, 0, eltype(R)), dim; - corrected=corrected - ) + Base.varm!(R, A, wv, Base.reducedim_initarray(A, dim, 0, eltype(R)), dim; + corrected=corrected) elseif mean == nothing - Base.varm!( - R, A, wv, Base.mean(A, wv, dim), dim; - corrected=corrected - ) + Base.varm!(R, A, wv, Base.mean(A, wv, dim), dim; corrected=corrected) else # check size of mean for i = 1:ndims(A) @@ -72,26 +71,24 @@ function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mea end end -function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; corrected=false) +function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; + corrected=false) @static if VERSION < v"0.6.0-dev.1121" - return Base.varm!( - similar(A, Float64, Base.reduced_dims(size(A), dim)), - A, wv, M, dim; corrected=corrected - ) + 
Base.varm!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, M, dim; + corrected=corrected) else - return Base.varm!( - similar(A, Float64, Base.reduced_indices(indices(A), dim)), - A, wv, M, dim; corrected=corrected - ) + Base.varm!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, M, + dim; corrected=corrected) end end -function Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=false) +function Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, + corrected=false) @static if VERSION < v"0.6.0-dev.1121" - return var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, dim; + var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, dim; mean=mean, corrected=corrected) else - return var!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, dim; + var!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, dim; mean=mean, corrected=corrected) end end From c9437ab26f6e66b94ae18695b457382f275221fe Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 28 Apr 2017 17:03:21 -0500 Subject: [PATCH 18/50] Reverted dispatching on corrected value. --- src/deprecates.jl | 11 ++++++++ src/weights.jl | 69 ++++++++++++++++++++++++----------------------- 2 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/deprecates.jl b/src/deprecates.jl index 67bc6be85..401a6b8ac 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -68,3 +68,14 @@ function weights(vs::RealArray) s = sum(v) WeightVec{typeof(s), eltype(v), typeof(v)}(v, s) end + +""" + varcorrection(w::WeightVec, corrected=false) + +Returns ``\\frac{1}{\sum w}`` when `corrected` is false and throws an `ArgumentError` +when `corrected` is true.
+""" +function varcorrections(w::WeightVec, corrected::Bool=false) + corrected && throw(ArgumentError("WeightVec does not support bias correction.")) + 1 / w.sum +end diff --git a/src/weights.jl b/src/weights.jl index 1ab920779..7899f38f4 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -46,24 +46,7 @@ If `corrected=true` this will return ``\\frac{1}{n - 1}`` (ie: [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), otherwise it will return ``\\frac{1}{n}``. """ -varcorrection(n::Integer, corrected=false) = 1 / (n - Int(corrected)) - -""" - varcorrection(wv::AbstractWeights, corrected=false) - -Computes a correction factor for calculating `var`, `std` and `cov` with a set of -weights `wv`. -""" -varcorrection(wv::AbstractWeights, corrected=false) = varcorrection(wv, Val{corrected}) - -""" - varcorrection(wv::AbstractWeights, false) - -``\\frac{1}{\sum w}`` -""" -varcorrection(wv::AbstractWeights, ::Type{Val{false}}) = 1 / sum(wv) -varcorrection(wv::AbstractWeights, ::Type{Val{true}}) = - throw(ArgumentError("$(typeof(wv)) does not support bias correction.")) +varcorrection(n::Integer, corrected::Bool=false) = 1 / (n - Int(corrected)) @weights AnalyticWeights @@ -90,18 +73,23 @@ aweights(vs::RealVector) = AnalyticWeights(vs) aweights(vs::RealArray) = AnalyticWeights(vec(vs)) """ - varcorrection(w::AnalyticWeights, true) + varcorrection(w::AnalyticWeights, corrected=false) -``\\frac{1}{\sum w - \sum w / \sum {w^2}}`` +``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` """ -function varcorrection(w::AnalyticWeights, ::Type{Val{true}}) - s = sum(w) - sum_sn = 0.0 - for x in w - sum_sn += (x / s) ^ 2 - end +function varcorrection(w::AnalyticWeights, corrected::Bool=false) + s = w.sum + + if corrected + sum_sn = 0.0 + for x in w + sum_sn += (x / s) ^ 2 + end - 1 / (s * (1 - sum_sn)) + 1 / (s * (1 - sum_sn)) + else + 1 / s + end end @weights FrequencyWeights @@ -127,11 +115,19 @@ fweights(vs::RealVector) = FrequencyWeights(vs) fweights(vs::RealArray)
= FrequencyWeights(vec(vs)) """ - varcorrection(w::FrequencyWeights, true) + varcorrection(w::FrequencyWeights, corrected=false) ``\\frac{1}{\sum{w} - 1}`` """ -varcorrection(w::FrequencyWeights, ::Type{Val{true}}) = 1 / (sum(w) - 1) +function varcorrection(w::FrequencyWeights, corrected::Bool=false) + s = w.sum + + if corrected + 1 / (s - 1) + else + 1 / s + end +end @weights ProbabilityWeights @@ -157,14 +153,19 @@ pweights(vs::RealVector) = ProbabilityWeights(vs) pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) """ - varcorrection(w::ProbabilityWeights, true) + varcorrection(w::ProbabilityWeights, corrected=false) ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ -function varcorrection(w::ProbabilityWeights, ::Type{Val{true}}) - s = sum(w) - n = length(w) - return n / (s * (n - 1)) +function varcorrection(w::ProbabilityWeights, corrected::Bool=false) + s = w.sum + + if corrected + n = length(w) + n / (s * (n - 1)) + else + 1 / s + end end """ From 7072eaee1445a62b4243229f3116338c40e5fdf3 Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 28 Apr 2017 17:52:24 -0500 Subject: [PATCH 19/50] Updated weights docstrings. --- src/weights.jl | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index 7899f38f4..397b23d9b 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -10,7 +10,7 @@ end """ @weights name -generates a new generic weight type with specified `name`, which subtypes `AbstractWeights` +Generates a new generic weight type with specified `name`, which subtypes `AbstractWeights` and stores the `values` (`V<:RealVector`) and `sum` (`S<:Real`). """ macro weights(name) @@ -41,10 +41,10 @@ Base.size(wv::AbstractWeights) = size(wv.values) """ varcorrection(n::Integer, corrected=false) -Computes a correction factor for calculating `var`, `std` and `cov` with `n` observations. 
-If `corrected=true` this will return ``\\frac{1}{n - 1}`` -(ie: [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), -otherwise it will return ``\\frac{1}{n}``. +Compute a bias correction factor for calculating `var`, `std` and `cov` with +`n` observations. Returns ``\\frac{1}{n - 1}`` when `corrected=true` +(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), +otherwise returns ``\\frac{1}{n}`` (i.e no correction). """ varcorrection(n::Integer, corrected::Bool=false) = 1 / (n - Int(corrected)) @@ -54,7 +54,8 @@ varcorrection(n::Integer, corrected::Bool=false) = 1 / (n - Int(corrected)) """ AnalyticWeights(vs, wsum=sum(vs)) -Construct an `AnalyticWeights` vector with weight values `vs` and sum of weights `wsum`. +Construct an `AnalyticWeights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. Analytic weights describe a non-random relative importance (usually between 0 and 1) for each observation. These weights may also be referred to as reliability weights, @@ -66,7 +67,7 @@ AnalyticWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = """ aweights(vs) -Construct an `AnalyticWeights` vector from a given array. +Construct an `AnalyticWeights` vector from array `vs`. See the documentation for `AnalyticWeights` for more details. """ aweights(vs::RealVector) = AnalyticWeights(vs) @@ -81,8 +82,8 @@ function varcorrection(w::AnalyticWeights, corrected::Bool=false) s = w.sum if corrected - sum_sn = 0.0 - for x in w + sum_sn = zero(eltype(w)) / one(typeof(s)) ^ 2 # to ensure type stability + @inbounds for x in w sum_sn += (x / s) ^ 2 end @@ -97,9 +98,10 @@ end """ FrequencyWeights(vs, wsum=sum(vs)) -Construct a `FrequencyWeights` vector with weight values `vs` and sum of weights `wsum`. +Construct a `FrequencyWeights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. 
-Frequency weights describe the number of cases (or frequency) in which each observation +Frequency weights describe the number of times (or frequency) each observation was observed. These weight may also be referred to as case weights or repeat weights. """ FrequencyWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = @@ -134,11 +136,12 @@ end """ ProbabilityWeights(vs, wsum=sum(vs)) -Construct a `ProbabilityWeights` vector with weight values `vs` and sum of weights `wsum`. +Construct a `ProbabilityWeights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. Probability weights represent the inverse of the sampling probability for each observation, providing a correction mechanism for under- or over-sampling certain population groups. -These weight may also be referred to as sampling weights. +These weights may also be referred to as sampling weights. """ ProbabilityWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = ProbabilityWeights{S, eltype(vs), V}(vs, s) @@ -169,18 +172,16 @@ function varcorrection(w::ProbabilityWeights, corrected::Bool=false) end """ - eweights(n, [λ]) + eweights(n, λ) -Constructs an `AnalyticWeights` vector with a desired length `n` and smoothing factor `λ`, -where each element is set to ``λ * (1 - λ)^(1 - i)``. +Construct an `AnalyticWeights` vector with length `n`, +where each element in position ``i`` is set to ``λ * (1 - λ)^(1 - i)``. -# Arguments -* `n::Integer`: the desired length of the `Weights` -* `λ::Real`: a smoothing factor or rate parameter between 0 and 1. - As this value approaches 0 the resulting weights will be almost equal, - while values closer to 1 will put higher weight on the end elements of the vector. +``λ`` is a smoothing factor or rate parameter between 0 and 1. +As this value approaches 0 the resulting weights will be almost equal, +while values closer to 1 will put higher weight on the end elements of the vector. 
""" -function eweights(n::Integer, λ::Real=0.99) +function eweights(n::Integer, λ::Real) n > 0 || throw(ArgumentError("cannot construct weights of length < 1")) 0 <= λ <= 1 || throw(ArgumentError("smoothing factor must be between 0 and 1")) w0 = map(i -> λ * (1 - λ)^(1 - i), 1:n) From 46a1aa9d1a8509f9dea59dedcb440d8905975f1a Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 28 Apr 2017 23:32:09 -0500 Subject: [PATCH 20/50] Reworked `var` and `std` definitions and doc strings. * `corrected` is now an optional argument rather than a keyword so we can produce a deprecation warning about `corrected=false` becoming `corrected=true`. NOTE: this is contrary to the `std` and `var` methods in base, but base julia isn't even consistent about whether `corrected` is a keyword or not betwen `var`, `std` and `cov`. * The `var` and `std` methods now have doc strings explaining how bias correction works with weight vectors. * `mean_and_x` methods default to what they were before. --- src/deprecates.jl | 95 ++++++++++++++++ src/moments.jl | 262 ++++++++++++++++++++++++++------------------ test/moments.jl | 86 +++++++-------- test/scalarstats.jl | 6 +- 4 files changed, 296 insertions(+), 153 deletions(-) diff --git a/src/deprecates.jl b/src/deprecates.jl index 401a6b8ac..b7f7ab709 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -79,3 +79,98 @@ function varcorrections(w::WeightVec, corrected::Bool=false) corrected && throw(ArgumentError("WeightVec does not support bias correction.")) 1 / w.sum end + +_correction_dep_msg(fname) = + string(fname, " will default to `corrected=true` in the future.") + +# The following methods are for wrapping the deprecated `correction=false` behaviour. +# When we default to `correction=true` these methods should be removed in favour of +# adding `corrected::Bool=true` in the appropriate methods. 
+ +function Base.varm(v::RealArray, wv::AbstractWeights, m::Real) + Base.depwarn(_correction_dep_msg("`varm`"), :varm) + varm(v, wv, m, false) +end + +function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int) + Base.depwarn(_correction_dep_msg("`varm`"), :varm) + varm(A, wv, M, dim, false) +end + +function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing) + Base.depwarn(_correction_dep_msg("`var`"), :var) + var(v, wv, false; mean=mean) +end + +function Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) + Base.depwarn(_correction_dep_msg("`var`"), :var) + var(A, wv, dim, false; mean=mean) +end + +function Base.stdm(v::RealArray, wv::AbstractWeights, m::Real) + Base.depwarn(_correction_dep_msg("`stdm`"), :stdm) + stdm(v, wv, m, false) +end + +function Base.stdm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int) + Base.depwarn(_correction_dep_msg("`stdm`"), :stdm) + stdm(A, wv, M, dim, false) +end + +function Base.std(v::RealArray, wv::AbstractWeights; mean=nothing) + Base.depwarn(_correction_dep_msg("`std`"), :std) + std(v, wv, false; mean=mean) +end + +function Base.std(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) + Base.depwarn(_correction_dep_msg("`std`"), :std) + std(A, wv, dim, false; mean=mean) +end + +function mean_and_var(A::RealArray) + m = mean(A) + v = varm(A, m; corrected=true) + m, v +end + +function mean_and_var(A::RealArray, wv::AbstractWeights) + m = mean(A, wv) + v = varm(A, wv, m, true) + m, v +end + +function mean_and_var(A::RealArray, dim::Int) + m = mean(A, dim) + v = varm(A, m, dim; corrected=true) + m, v +end + +function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int) + m = mean(A, wv, dim) + v = varm(A, wv, m, dim, true) + m, v +end + +function mean_and_std(A::RealArray) + m = mean(A) + s = stdm(A, m; corrected=true) + m, s +end + +function mean_and_std(A::RealArray, wv::AbstractWeights) + m = mean(A, wv) + s = stdm(A, wv, m, true) + m, s +end + +function 
mean_and_std(A::RealArray, dim::Int) + m = mean(A, dim) + s = stdm(A, m, dim, true) + m, s +end + +function mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int) + m = mean(A, wv, dim) + s = stdm(A, wv, m, dim, true) + m, s +end diff --git a/src/moments.jl b/src/moments.jl index 540583105..d4219e443 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -1,61 +1,100 @@ ##### Weighted var & std ## var - """ - varm(x, wv::AbstractWeights, m, [dim]) + varm(x, wv::AbstractWeights, m, [dim, corrected]) Return the variance of a real-valued array `x` with a known mean `m`, optionally -over a dimension `dim`. The weighting vector `wv` specifies frequency weights -(also called case weights) for the result. - -This function differs from its counterpart in Base in that Bessel's correction -is not used. That is, here the denominator for the variance is `sum(wv)`, -whereas it's `length(x)-1` in `Base.varm`. The impact is that this is not a -weighted estimate of the population variance based on the sample; it's the weighted -variance of the sample. +over a dimension `dim`. Observations in `x` are weighted via `wv`. + +In base julia a biased variance (`corrected=false`) is calculated as: + +``\\frac{1}{N} \sum\limits_{i = 1}^N {\left( {x_i - \bar x} \right)^2 }`` + +An unbiased variance (`corrected=true`) is calculated by replacing +``\\frac{1}{N}`` with ``\\frac{1}{N - 1}`` +(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)).
+ +Here we calculate the biased weighted variance (`corrected=false`) as: + +``\\frac{1}{\sum{w}} \sum\limits_{i = 1}^N {w_i\left( {x_i - \bar x} \right)^2 }`` + +An unbiased weighted variance (`corrected=true`) is dependent on the type of weights used: + +* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` +* FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ -Base.varm(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) = +Base.varm(v::RealArray, wv::AbstractWeights, m::Real, corrected::Bool) = _moment2(v, wv, m, corrected=corrected) +function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int, corrected) + @static if VERSION < v"0.6.0-dev.1121" + Base.varm!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, M, dim, + corrected) + else + Base.varm!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, M, + dim, corrected) + end +end + +function Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, + dim::Int, corrected::Bool) + scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), + varcorrection(wv, corrected)) +end + """ - var(x, wv::AbstractWeights, [dim]; mean=nothing) + var(x, wv::AbstractWeights, [dim, corrected]; mean=nothing) + +Return the variance of a real-valued array `x` with a known mean `m`, optionally +over a dimension `dim`. Observations in `x` are weighted via `wv`. + +In base julia a biased variance (`corrected=false`) is calculated as: -Return the variance of a real-valued array `x`, optionally over a dimension `dim`. -The weighting vector `wv` specifies frequency weights (also called case weights) -for the estimate. +``\\frac{1}{N} \sum\limits_{i = 1}^N {\left( {x_i - \bar x} \right)^2 }`` -This function differs from its counterpart in Base in that Bessel's correction -is not used.
That is, here the denominator for the variance is `sum(wv)`, -whereas it's `length(x)-1` in `Base.var`. The impact is that this is not a -weighted estimate of the population variance based on the sample; it's the weighted -variance of the sample. +An unbiased variance (`corrected=true`) is calculated by replacing +``\\frac{1}{N}`` with ``\\frac{1}{N - 1}`` +(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)). + +Here we calculate the biased weighted variance (`corrected=false`) as: + +``\\frac{1}{\sum{w}} \sum\limits_{i = 1}^N {w_i\left( {x_i - \bar x} \right)^2 }`` + +An unbiased weighted variance (`corrected=true`) is dependent on the type of weights used: + +* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` +* FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ -function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=false) +function Base.var(v::RealArray, wv::AbstractWeights, corrected::Bool; mean=nothing) if mean == 0 - varm(v, wv, 0; corrected=corrected) + varm(v, wv, 0, corrected) elseif mean == nothing - varm(v, wv, Base.mean(v, wv); corrected=corrected) + varm(v, wv, Base.mean(v, wv), corrected) else - varm(v, wv, mean; corrected=corrected) + varm(v, wv, mean, corrected) end end -## var along dim - -function Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, - dim::Int; corrected=false) - scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), - varcorrection(wv, corrected)) +function Base.var(A::RealArray, wv::AbstractWeights, dim::Int, corrected::Bool; + mean=nothing) + @static if VERSION < v"0.6.0-dev.1121" + var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, dim, corrected; + mean=mean) + else + var!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, dim, + corrected; mean=mean) + end end -function var!(R::AbstractArray, A::RealArray,
wv::AbstractWeights, dim::Int; mean=nothing, - corrected=false) +function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int, + corrected::Bool; mean=nothing) if mean == 0 - Base.varm!(R, A, wv, Base.reducedim_initarray(A, dim, 0, eltype(R)), dim; - corrected=corrected) + Base.varm!(R, A, wv, Base.reducedim_initarray(A, dim, 0, eltype(R)), dim, corrected) elseif mean == nothing - Base.varm!(R, A, wv, Base.mean(A, wv, dim), dim; corrected=corrected) + Base.varm!(R, A, wv, Base.mean(A, wv, dim), dim, corrected) else # check size of mean for i = 1:ndims(A) @@ -67,73 +106,102 @@ function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; mea dM == dA || throw(DimensionMismatch("Incorrect size of mean.")) end end - Base.varm!(R, A, wv, mean, dim; corrected=corrected) - end -end - -function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; - corrected=false) - @static if VERSION < v"0.6.0-dev.1121" - Base.varm!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, M, dim; - corrected=corrected) - else - Base.varm!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, M, - dim; corrected=corrected) - end -end - -function Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, - corrected=false) - @static if VERSION < v"0.6.0-dev.1121" - var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, dim; - mean=mean, corrected=corrected) - else - var!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, dim; - mean=mean, corrected=corrected) + Base.varm!(R, A, wv, mean, dim, corrected) end end ## std """ - stdm(v, wv::AbstractWeights, m, [dim]) + stdm(v, wv::AbstractWeights, m, [dim, corrected]) Return the standard deviation of a real-valued array `v` with a known mean `m`, -optionally over a dimension `dim`. The weighting vector `wv` specifies frequency -weights (also called case weights) for the estimate. 
-""" -Base.stdm(v::RealArray, wv::AbstractWeights, m::Real; corrected=false) = - sqrt(varm(v, wv, m; corrected=corrected)) +optionally over a dimension `dim`. Observations in `x` or weighted via `wv`. + +In base julia a biased standard deviation (`corrected=false`) is calculated as: + +``\\sqrt{\frac{1}{N} \sum\limits_{i = 1}^N {\left( {x_i - \bar x} \right)^2 }}`` + +An unbiased standard deviation (`corrected=true`) is calculated by replacing +``\\frac{1}{N - 1}`` with ``\\frac{1}{N - 1}`` +(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)). + +Here we calculate the biased weighted standard deviation (`corrected=false`) as: + +``\sqrt{\frac{1}{\sum{w}} \sum\limits_{i = 1}^N {w_i\left( {x_i - \bar x} \right)^2 }}`` +An unbiased standard deviation (`corrected=true`) is dependent on the type of weights used: + +* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum{w}^2}`` +* FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ - std(v, wv::AbstractWeights, [dim]; mean=nothing) +Base.stdm(v::RealArray, wv::AbstractWeights, m::Real, corrected::Bool) = + sqrt(varm(v, wv, m, corrected)) + +Base.stdm(v::RealArray, m::RealArray, dim::Int, corrected::Bool) = + Base.sqrt!(varm(v, m, dim, corrected=corrected)) + +Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int, corrected::Bool) = + sqrt.(varm(v, wv, m, dim, corrected)) -Return the standard deviation of a real-valued array `v`, optionally over a -dimension `dim`. The weighting vector `wv` specifies frequency weights (also -called case weights) for the estimate. """ -Base.std(v::RealArray, wv::AbstractWeights; mean=nothing, corrected=false) = - sqrt.(var(v, wv; mean=mean, corrected=corrected)) + std(v, wv::AbstractWeights, [dim, corrected]; mean=nothing) + +Return the standard deviation of a real-valued array `v` with a known mean `m`, +optionally over a dimension `dim`. 
Observations in `x` are weighted via `wv`. -Base.stdm(v::RealArray, m::RealArray, dim::Int; corrected=false) = - Base.sqrt!(varm(v, m, dim; corrected=corrected)) +In base julia a biased standard deviation (`corrected=false`) is calculated as: -Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int; corrected=false) = - sqrt.(varm(v, wv, m, dim; corrected=corrected)) +``\\sqrt{\frac{1}{N} \sum\limits_{i = 1}^N {\left( {x_i - \bar x} \right)^2 }}`` -Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, corrected=false) = - sqrt.(var(v, wv, dim; mean=mean, corrected=corrected)) +An unbiased standard deviation (`corrected=true`) is calculated by replacing +``\\frac{1}{N}`` with ``\\frac{1}{N - 1}`` +(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)). + +Here we calculate the biased weighted standard deviation (`corrected=false`) as: + +``\sqrt{\frac{1}{\sum{w}} \sum\limits_{i = 1}^N {w_i\left( {x_i - \bar x} \right)^2 }}`` + +An unbiased standard deviation (`corrected=true`) is dependent on the type of weights used: + +* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` +* FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` +""" +Base.std(v::RealArray, wv::AbstractWeights, corrected::Bool; mean=nothing) = + sqrt.(var(v, wv, corrected; mean=mean)) + +Base.std(v::RealArray, wv::AbstractWeights, dim::Int, corrected::Bool; mean=nothing) = + sqrt.(var(v, wv, dim, corrected; mean=mean)) ##### Fused statistics """ - mean_and_var(x, [wv::AbstractWeights], [dim]) -> (mean, var) + mean_and_var(x, [wv::AbstractWeights, dim, corrected]) -> (mean, var) Return the mean and variance of a real-valued array `x`, optionally over a dimension -`dim`, as a tuple. A weighting vector `wv` can be specified to weight the estimates. -The weights are assumed to be frequency weights, also called case weights.
""" -function mean_and_var(A::RealArray; corrected=false) +function mean_and_var(A::RealArray, corrected::Bool) m = mean(A) - v = varm(A, m; corrected=corrected) + v = varm(A, m; corrected=correted) + m, v +end + +function mean_and_var(A::RealArray, wv::AbstractWeights, corrected::Bool) + m = mean(A, wv) + v = varm(A, wv, m, corrected) + m, v +end + +function mean_and_var(A::RealArray, dim::Int, corrected::Bool) + m = mean(A, dim) + v = varm(A, m, dim; corrected=corrected) + m, v +end + +function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int, corrected) + m = mean(A, wv, dim) + v = varm(A, wv, m, dim, corrected) m, v end @@ -145,45 +213,27 @@ over a dimension `dim`, as a tuple. A weighting vector `wv` can be specified to weight the estimates. The weights are assumed to be frequency weights, also called case weights. """ -function mean_and_std(A::RealArray; corrected=false) +function mean_and_std(A::RealArray, corrected::Bool) m = mean(A) s = stdm(A, m; corrected=corrected) m, s end -function mean_and_var(A::RealArray, wv::AbstractWeights; corrected=false) +function mean_and_std(A::RealArray, wv::AbstractWeights, corrected::Bool) m = mean(A, wv) - v = varm(A, wv, m; corrected=corrected) - m, v -end - -function mean_and_std(A::RealArray, wv::AbstractWeights; corrected=false) - m = mean(A, wv) - s = stdm(A, wv, m; corrected=corrected) + s = stdm(A, wv, m, corrected) m, s end -function mean_and_var(A::RealArray, dim::Int; corrected=false) - m = mean(A, dim) - v = varm(A, m, dim; corrected=corrected) - m, v -end - -function mean_and_std(A::RealArray, dim::Int; corrected=false) +function mean_and_std(A::RealArray, dim::Int, corrected::Bool) m = mean(A, dim) - s = stdm(A, m, dim; corrected=corrected) + s = stdm(A, m, dim, corrected) m, s end -function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int; corrected=false) - m = mean(A, wv, dim) - v = varm(A, wv, m, dim; corrected=corrected) - m, v -end - -function mean_and_std(A::RealArray, 
wv::AbstractWeights, dim::Int; corrected=false) +function mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int, corrected::Bool) m = mean(A, wv, dim) - s = stdm(A, wv, m, dim; corrected=corrected) + s = stdm(A, wv, m, dim, corrected) m, s end diff --git a/test/moments.jl b/test/moments.jl index befbd49fb..966281cf1 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -7,29 +7,29 @@ x = rand(10) wv = fweights(rand(10)) m = mean(x, wv) -@test var(x, wv; corrected=false) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) -@test var(x, wv; mean=0, corrected=false) ≈ sum(abs2.(x), wv) ./ sum(wv) -@test var(x, wv; mean=1.0, corrected=false) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) +@test var(x, wv, false) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) +@test var(x, wv, false; mean=0) ≈ sum(abs2.(x), wv) ./ sum(wv) +@test var(x, wv, false; mean=1.0) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) -@test std(x, wv; corrected=false) ≈ sqrt(var(x, wv; corrected=false)) -@test std(x, wv; mean=0, corrected=false) ≈ sqrt(var(x, wv; mean=0, corrected=false)) -@test std(x, wv; mean=1.0, corrected=false) ≈ sqrt(var(x, wv; mean=1.0, corrected=false)) +@test std(x, wv, false) ≈ sqrt(var(x, wv, false)) +@test std(x, wv, false; mean=0) ≈ sqrt(var(x, wv, false; mean=0)) +@test std(x, wv, false; mean=1.0) ≈ sqrt(var(x, wv, false; mean=1.0)) -(m, v) = mean_and_var(x; corrected=false) +(m, v) = mean_and_var(x) @test m == mean(x) -@test v == var(x; corrected=false) +@test v == var(x) -(m, s) = mean_and_std(x; corrected=false) +(m, s) = mean_and_std(x) @test m == mean(x) -@test s == std(x; corrected=false) +@test s == std(x) -(m, v) = mean_and_var(x, wv; corrected=false) +(m, v) = mean_and_var(x, wv) @test m == mean(x, wv) -@test v == var(x, wv; corrected=false) +@test v == var(x, wv, true) -(m, s) = mean_and_std(x, wv; corrected=false) +(m, s) = mean_and_std(x, wv) @test m == mean(x, wv) -@test s == std(x, wv; corrected=false) +@test s == std(x, wv, true) x = rand(5, 6) w1 = rand(5) @@ -39,49 +39,47 @@ wv2 = fweights(w2) 
m1 = mean(x, wv1, 1) m2 = mean(x, wv2, 2) -@test var(x, wv1, 1; mean=0, corrected=false) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) -@test var(x, wv2, 2; mean=0, corrected=false) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) +@test var(x, wv1, 1, false; mean=0) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) +@test var(x, wv2, 2, false; mean=0) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) -@test var(x, wv1, 1; mean=m1, corrected=false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) -@test var(x, wv2, 2; mean=m2, corrected=false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) +@test var(x, wv1, 1, false; mean=m1) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) +@test var(x, wv2, 2, false; mean=m2) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) -@test var(x, wv1, 1; corrected=false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) -@test var(x, wv2, 2; corrected=false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) +@test var(x, wv1, 1, false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) +@test var(x, wv2, 2, false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) -@test std(x, wv1, 1; corrected=false) ≈ sqrt.(var(x, wv1, 1; corrected=false)) -@test std(x, wv2, 2; corrected=false) ≈ sqrt.(var(x, wv2, 2; corrected=false)) -@test std(x, wv1, 1; mean=0, corrected=false) ≈ sqrt.(var(x, wv1, 1; mean=0, corrected=false)) -@test std(x, wv2, 2; mean=0, corrected=false) ≈ sqrt.(var(x, wv2, 2; mean=0, corrected=false)) -@test std(x, wv1, 1; mean=m1, corrected=false) ≈ sqrt.(var(x, wv1, 1; mean=m1, corrected=false)) -@test std(x, wv2, 2; mean=m2, corrected=false) ≈ sqrt.(var(x, wv2, 2; mean=m2, corrected=false)) +@test std(x, wv1, 1, false) ≈ sqrt.(var(x, wv1, 1, false)) +@test std(x, wv2, 2, false) ≈ sqrt.(var(x, wv2, 2, false)) +@test std(x, wv1, 1, false; mean=0) ≈ sqrt.(var(x, wv1, 1, false; mean=0)) +@test std(x, wv2, 2, false; mean=0) ≈ sqrt.(var(x, wv2, 2, false; mean=0)) +@test std(x, wv1, 1, false; mean=m1) ≈ sqrt.(var(x, wv1, 1, false; mean=m1)) +@test std(x, wv2, 2, false; mean=m2) ≈ sqrt.(var(x, wv2, 2, false; mean=m2)) for d 
in 1:2 - (m, v) = mean_and_var(x, d; corrected=false) + (m, v) = mean_and_var(x, d, false) @test m == mean(x, d) @test v == var(x, d; corrected=false) - (m, s) = mean_and_std(x, d; corrected=false) + (m, s) = mean_and_std(x, d, false) @test m == mean(x, d) @test s == std(x, d; corrected=false) end -(m, v) = mean_and_var(x, wv1, 1; corrected=false) +(m, v) = mean_and_var(x, wv1, 1) @test m == mean(x, wv1, 1) -@test v == var(x, wv1, 1; corrected=false) +@test v == var(x, wv1, 1, true) -(m, v) = mean_and_var(x, wv2, 2; corrected=false) +(m, v) = mean_and_var(x, wv2, 2, false) @test m == mean(x, wv2, 2) -@test v == var(x, wv2, 2; corrected=false) +@test v == var(x, wv2, 2, false) -(m, s) = mean_and_std(x, wv1, 1; corrected=false) +(m, s) = mean_and_std(x, wv1, 1, false) @test m == mean(x, wv1, 1) -@test s == std(x, wv1, 1; corrected=false) +@test s == std(x, wv1, 1, false) -(m, s) = mean_and_std(x, wv2, 2; corrected=false) +(m, s) = mean_and_std(x, wv2, 2, false) @test m == mean(x, wv2, 2) -@test s == std(x, wv2, 2; corrected=false) - - +@test s == std(x, wv2, 2, false) ##### skewness & kurtosis @@ -125,7 +123,7 @@ x2 = collect(2.0:6.0) x = rand(10) # AnalyticWeights -@test var(x, aweights(ones(10)); corrected=true) ≈ var(x) +@test var(x, aweights(ones(10)), true) ≈ var(x) w = aweights(rand(10)) n = length(w) # Could be count(!iszero, w) instead @@ -133,21 +131,21 @@ w = aweights(w .* (n / sum(w))) sw = sum(w) # This is now equal to n, but maybe we should support non-normalized weights? 
xbar = sum(w .* x) ./ sw expected = sum(w .* (x .- xbar).^2)/(sw - sum(w.^2)/sw) -@test var(x, w; corrected=true) ≈ expected +@test var(x, w, true) ≈ expected # FrequencyWeights -@test var(x, fweights(ones(Int, 10)); corrected=true) ≈ var(x) +@test var(x, fweights(ones(Int, 10)), true) ≈ var(x) w = fweights(rand(UInt, 10)) sw = sum(w) xbar = sum(w .* x) / sw expected = sum(w .* (x .- xbar).^2) ./ (sum(w) - 1) -@test var(x, w; corrected=true) ≈ expected +@test var(x, w, true) ≈ expected # ProbabilityWeights -@test var(x, pweights(ones(10)); corrected=true) ≈ var(x) +@test var(x, pweights(ones(10)), true) ≈ var(x) w = pweights(rand(10)) n = count(!iszero, w) sw = sum(w) xbar = sum(w .* x)/sw expected = sum(w .* (x .- xbar).^2)/sw * n/(n - 1) -@test var(x, w; corrected=true) ≈ expected +@test var(x, w, true) ≈ expected diff --git a/test/scalarstats.jl b/test/scalarstats.jl index e741226f2..51d96d10a 100755 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -66,9 +66,9 @@ z2 = [8. 2. 3. 1.; 24. 10. -1. -1.; 20. 12. 1. -2.] @test zscore!(zeros(size(a)), a, [1, 2, 3], [0.5, 1.0, 2.0]) ≈ z1 @test zscore!(zeros(size(a)), a, [1 3 2 4], [0.25 0.5 1.0 2.0]) ≈ z2 -@test zscore(a) ≈ zscore(a, mean(a), std(a; corrected=false)) -@test zscore(a, 1) ≈ zscore(a, mean(a,1), std(a,1; corrected=false)) -@test zscore(a, 2) ≈ zscore(a, mean(a,2), std(a,2; corrected=false)) +@test zscore(a) ≈ zscore(a, mean(a), std(a)) +@test zscore(a, 1) ≈ zscore(a, mean(a,1), std(a,1)) +@test zscore(a, 2) ≈ zscore(a, mean(a,2), std(a,2)) ###### quantile & friends From e831d154e4feee4612fe38a2a5b37100635c2199 Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 28 Apr 2017 23:50:46 -0500 Subject: [PATCH 21/50] Removed a couple unnecessary functions from deprecates. 
--- src/deprecates.jl | 18 ------------------ src/moments.jl | 8 ++++---- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/src/deprecates.jl b/src/deprecates.jl index b7f7ab709..0d8241d28 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -127,36 +127,18 @@ function Base.std(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) std(A, wv, dim, false; mean=mean) end -function mean_and_var(A::RealArray) - m = mean(A) - v = varm(A, m; corrected=true) - m, v -end - function mean_and_var(A::RealArray, wv::AbstractWeights) m = mean(A, wv) v = varm(A, wv, m, true) m, v end -function mean_and_var(A::RealArray, dim::Int) - m = mean(A, dim) - v = varm(A, m, dim; corrected=true) - m, v -end - function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int) m = mean(A, wv, dim) v = varm(A, wv, m, dim, true) m, v end -function mean_and_std(A::RealArray) - m = mean(A) - s = stdm(A, m; corrected=true) - m, s -end - function mean_and_std(A::RealArray, wv::AbstractWeights) m = mean(A, wv) s = stdm(A, wv, m, true) diff --git a/src/moments.jl b/src/moments.jl index d4219e443..8e891ea50 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -181,9 +181,9 @@ Base.std(v::RealArray, wv::AbstractWeights, dim::Int, corrected::Bool; mean=noth Return the mean and variance of a real-valued array `x`, optionally over a dimension `dim`, as a tuple. """ -function mean_and_var(A::RealArray, corrected::Bool) +function mean_and_var(A::RealArray, corrected::Bool=true) m = mean(A) - v = varm(A, m; corrected=correted) + v = varm(A, m; corrected=corrected) m, v end @@ -193,7 +193,7 @@ function mean_and_var(A::RealArray, wv::AbstractWeights, corrected::Bool) m, v end -function mean_and_var(A::RealArray, dim::Int, corrected::Bool) +function mean_and_var(A::RealArray, dim::Int, corrected::Bool=true) m = mean(A, dim) v = varm(A, m, dim; corrected=corrected) m, v @@ -213,7 +213,7 @@ over a dimension `dim`, as a tuple. A weighting vector `wv` can be specified to weight the estimates. 
The weights are assumed to be frequency weights, also called case weights. """ -function mean_and_std(A::RealArray, corrected::Bool) +function mean_and_std(A::RealArray, corrected::Bool=true) m = mean(A) s = stdm(A, m; corrected=corrected) m, s From 4ade783c832e8d7723b3c9faa075ae8714e1c8b9 Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 28 Apr 2017 23:58:49 -0500 Subject: [PATCH 22/50] Updated mean_and_x docstrings to mention `corrected` and point to the `var` or `std` documentation. --- src/moments.jl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/moments.jl b/src/moments.jl index 8e891ea50..bf055212b 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -179,7 +179,9 @@ Base.std(v::RealArray, wv::AbstractWeights, dim::Int, corrected::Bool; mean=noth mean_and_var(x, [wv::AbstractWeights, dim, corrected]) -> (mean, var) Return the mean and variance of a real-valued array `x`, optionally over a dimension -`dim`, as a tuple. +`dim`, as a tuple. Observations in `x` can be weighted via `wv`. Finally, bias correction +can be applied to the variance calculation if `corrected=true`. +See `var` documentation for more details. """ function mean_and_var(A::RealArray, corrected::Bool=true) m = mean(A) @@ -206,12 +208,13 @@ function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int, corrected) end """ - mean_and_std(x, [wv::AbstractWeights], [dim]) -> (mean, std) + mean_and_std(x, [wv::AbstractWeights, dim, corrected]) -> (mean, std) Return the mean and standard deviation of a real-valued array `x`, optionally over a dimension `dim`, as a tuple. A weighting vector `wv` can be specified -to weight the estimates. The weights are assumed to be frequency weights, also -called case weights. +to weight the estimates. Finally, bias correction can be applied to the +standard deviation calculation if `corrected=true`. +See `std` documentation for more details. 
""" function mean_and_std(A::RealArray, corrected::Bool=true) m = mean(A) From 7bdc11218e2befee6b5b1ad873e18268442f931c Mon Sep 17 00:00:00 2001 From: rofinn Date: Sat, 29 Apr 2017 00:46:19 -0500 Subject: [PATCH 23/50] Added documentation about `corrected` argument in `cov` and `mean_and_cov` docstring. Also, added deprecation for cov `corrected=false` behaviour. --- src/cov.jl | 85 +++++++++++++++++++++++++++-------------------- src/deprecates.jl | 30 +++++++++++++++++ test/cov.jl | 16 ++++----- 3 files changed, 87 insertions(+), 44 deletions(-) diff --git a/src/cov.jl b/src/cov.jl index 2e25e6376..5e71f01de 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -22,13 +22,6 @@ function _scalevars(x::DenseMatrix, s::DenseVector, vardim::Int) end ## scatter matrix - -scattermat_zm(x::DenseMatrix, vardim::Int) = Base.unscaled_covzm(x, vardim) - -scattermat_zm(x::DenseMatrix, wv::AbstractWeights, vardim::Int) = - _symmetrize!(Base.unscaled_covzm(x, _scalevars(x, values(wv), vardim), vardim)) - - """ scattermat(X, [wv::AbstractWeights]; mean=nothing, vardim=1) @@ -44,28 +37,6 @@ that the data are centered and hence there's no need to subtract the mean. When `vardim = 1`, the variables are considered columns with observations in rows; when `vardim = 2`, variables are in rows with observations in columns. """ -function scattermat end - - -""" - cov(X, wv::AbstractWeights; mean=nothing, vardim=1) - -Compute the weighted covariance matrix. By default, the covariance -matrix is normalized by the sum of the weights. That is, `cov(X, wv)` -is equivalent to `scattermat(X, wv) / sum(wv)`. -""" -cov - - -""" - mean_and_cov(x, [wv::AbstractWeights]; vardim=1) -> (mean, cov) - -Return the mean and covariance matrix as a tuple. A weighting -vector `wv` can be specified. `vardim` that designates whether -the variables are columns in the matrix (`1`) or rows (`2`). 
-""" -function mean_and_cov end - scattermatm(x::DenseMatrix, mean, vardim::Int=1) = scattermat_zm(x .- mean, vardim) @@ -78,19 +49,61 @@ scattermat(x::DenseMatrix, vardim::Int=1) = scattermat(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = scattermatm(x, Base.mean(x, wv, vardim), wv, vardim) -## weighted cov -Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1, corrected::Bool=false) = - scale!(scattermatm(x, mean, wv, vardim), varcorrection(wv, corrected)) +scattermat_zm(x::DenseMatrix, vardim::Int) = Base.unscaled_covzm(x, vardim) + +scattermat_zm(x::DenseMatrix, wv::AbstractWeights, vardim::Int) = + _symmetrize!(Base.unscaled_covzm(x, _scalevars(x, values(wv), vardim), vardim)) + +""" + cov(X, wv::AbstractWeights, [vardim, corrected]) -Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=false) = +Compute the weighted covariance matrix. Similar to `var` and `std` the biased covariance +matrix (`corrected=false`) can be computed by multiplying `scattermat(X, wv)` by +``\frac{1}{\sum{w}}`` to normalize. 
However, the unbiased covariance matrix +(`corrected=true`) is dependent on the type of weights used: + +* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum{w}^2}`` +* FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` +""" +Base.cov(x::DenseMatrix, wv::AbstractWeights, corrected::Bool) = + Base.covm(x, Base.mean(x, wv, 1), wv, 1, corrected) + +Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int, corrected::Bool) = Base.covm(x, Base.mean(x, wv, vardim), wv, vardim, corrected) -function mean_and_cov(x::DenseMatrix, vardim::Int=1; corrected=false) +Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, corrected::Bool) = + scale!(scattermatm(x, mean, wv, 1), varcorrection(wv, corrected)) + +Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int, corrected::Bool) = + scale!(scattermatm(x, mean, wv, vardim), varcorrection(wv, corrected)) + +""" + mean_and_cov(x, [wv::AbstractWeights, vardim, corrected]) -> (mean, cov) + +Return the mean and covariance matrix as a tuple. A weighting +vector `wv` can be specified. `vardim` that designates whether +the variables are columns in the matrix (`1`) or rows (`2`). +Finally, bias correction can be applied to the covariance calculation if +`corrected=true`. +See `cov` documentation for more details. 
+""" +function mean_and_cov(x::DenseMatrix, corrected::Bool=true) + m = mean(x, 1) + return m, Base.covm(x, m, 1, corrected) +end + +function mean_and_cov(x::DenseMatrix, vardim::Int, corrected::Bool=true) m = mean(x, vardim) return m, Base.covm(x, m, vardim, corrected) end -function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected=false) +function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, corrected::Bool) + m = mean(x, wv, 1) + return m, Base.cov(x, wv, 1, corrected) +end + +function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int, corrected::Bool) m = mean(x, wv, vardim) - return m, Base.cov(x, wv, vardim; corrected=corrected) + return m, Base.cov(x, wv, vardim, corrected) end diff --git a/src/deprecates.jl b/src/deprecates.jl index 0d8241d28..4471a5ab2 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -156,3 +156,33 @@ function mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int) s = stdm(A, wv, m, dim, true) m, s end + +function Base.cov(x::DenseMatrix, wv::AbstractWeights) + Base.depwarn(_correction_dep_msg("`cov`"), :cov) + Base.covm(x, Base.mean(x, wv, 1), wv, 1, false) +end + +function Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int) + Base.depwarn(_correction_dep_msg("`cov`"), :cov) + Base.covm(x, Base.mean(x, wv, vardim), wv, vardim, false) +end + +function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights) + Base.depwarn(_correction_dep_msg("`covm`"), :covm) + scale!(scattermatm(x, mean, wv, 1), varcorrection(wv, false)) +end + +function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int) + Base.depwarn(_correction_dep_msg("`covm`"), :covm) + scale!(scattermatm(x, mean, wv, vardim), varcorrection(wv, false)) +end + +function mean_and_cov(x::DenseMatrix, wv::AbstractWeights) + m = mean(x, wv, 1) + return m, Base.cov(x, wv, 1) +end + +function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int) + m = mean(x, wv, vardim) + return m, Base.cov(x, wv, vardim) +end 
diff --git a/test/cov.jl b/test/cov.jl index 9cc501d82..449712f88 100644 --- a/test/cov.jl +++ b/test/cov.jl @@ -60,8 +60,8 @@ Sz2w = X * diagm(w2) * X' # weighted covariance -@test cov(X, wv1; corrected=false) ≈ S1w ./ sum(wv1) -@test cov(X, wv2, 2; corrected=false) ≈ S2w ./ sum(wv2) +@test cov(X, wv1, false) ≈ S1w ./ sum(wv1) +@test cov(X, wv2, 2, false) ≈ S2w ./ sum(wv2) @test Base.covm(X, 0, wv1, 1, false) ≈ Sz1w ./ sum(wv1) @test Base.covm(X, 0, wv2, 2, false) ≈ Sz2w ./ sum(wv2) @@ -74,18 +74,18 @@ Sz2w = X * diagm(w2) * X' # mean_and_cov -(m, C) = mean_and_cov(X, 1; corrected=false) +(m, C) = mean_and_cov(X, 1, false) @test m == mean(X, 1) @test C == cov(X, 1, false) -(m, C) = mean_and_cov(X, 2; corrected=false) +(m, C) = mean_and_cov(X, 2, false) @test m == mean(X, 2) @test C == cov(X, 2, false) -(m, C) = mean_and_cov(X, wv1, 1; corrected=false) +(m, C) = mean_and_cov(X, wv1, 1, false) @test m == mean(X, wv1, 1) -@test C == cov(X, wv1, 1; corrected=false) +@test C == cov(X, wv1, 1, false) -(m, C) = mean_and_cov(X, wv2, 2; corrected=false) +(m, C) = mean_and_cov(X, wv2, 2, false) @test m == mean(X, wv2, 2) -@test C == cov(X, wv2, 2; corrected=false) +@test C == cov(X, wv2, 2, false) From f6f3b38084b5bd6ebcd68627df446b882d4e407a Mon Sep 17 00:00:00 2001 From: rofinn Date: Sun, 30 Apr 2017 15:55:19 -0500 Subject: [PATCH 24/50] Added deprecation tests and fixed a few bugs with our deprecations. 
--- .travis.yml | 6 +- src/deprecates.jl | 12 +-- src/moments.jl | 4 +- test/deprecates.jl | 201 +++++++++++++++++++++++++++++++++++++++++++++ test/moments.jl | 6 +- test/runtests.jl | 12 +++ 6 files changed, 227 insertions(+), 14 deletions(-) create mode 100644 test/deprecates.jl diff --git a/.travis.yml b/.travis.yml index ef0f418ac..2626aacd5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,8 +11,8 @@ notifications: git: depth: 999999 # Uncomment the following lines to override the default test script -#script: -# - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi -# - julia -e 'Pkg.clone(pwd()); Pkg.build("StatsBase"); Pkg.test("StatsBase"; coverage=true)' +script: + - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi + - julia --depwarn=no -e 'Pkg.clone(pwd()); Pkg.build("StatsBase"); Pkg.test("StatsBase"; coverage=true)' after_success: - julia -e 'cd(Pkg.dir("StatsBase")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())' diff --git a/src/deprecates.jl b/src/deprecates.jl index 4471a5ab2..fdcce7abd 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -75,7 +75,7 @@ end Returns ``\\frac{1}{\sum w}`` when corrected is false and throws an `ArgumentError` when correction is true. 
""" -function varcorrections(w::WeightVec, corrected::Bool=false) +function varcorrection(w::WeightVec, corrected::Bool=false) corrected && throw(ArgumentError("WeightVec does not support bias correction.")) 1 / w.sum end @@ -129,31 +129,31 @@ end function mean_and_var(A::RealArray, wv::AbstractWeights) m = mean(A, wv) - v = varm(A, wv, m, true) + v = varm(A, wv, m) m, v end function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int) m = mean(A, wv, dim) - v = varm(A, wv, m, dim, true) + v = varm(A, wv, m, dim) m, v end function mean_and_std(A::RealArray, wv::AbstractWeights) m = mean(A, wv) - s = stdm(A, wv, m, true) + s = stdm(A, wv, m) m, s end function mean_and_std(A::RealArray, dim::Int) m = mean(A, dim) - s = stdm(A, m, dim, true) + s = stdm(A, m, dim) m, s end function mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int) m = mean(A, wv, dim) - s = stdm(A, wv, m, dim, true) + s = stdm(A, wv, m, dim) m, s end diff --git a/src/moments.jl b/src/moments.jl index bf055212b..77da18803 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -26,7 +26,7 @@ An unbiased weighted variance (`corrected=true`) is dependent on the type of wei * ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ Base.varm(v::RealArray, wv::AbstractWeights, m::Real, corrected::Bool) = - _moment2(v, wv, m, corrected=corrected) + _moment2(v, wv, m; corrected=corrected) function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int, corrected) @static if VERSION < v"0.6.0-dev.1121" @@ -138,7 +138,7 @@ An unbiased standard deviation (`corrected=true`) is dependent on the type of we Base.stdm(v::RealArray, wv::AbstractWeights, m::Real, corrected::Bool) = sqrt(varm(v, wv, m, corrected)) -Base.stdm(v::RealArray, m::RealArray, dim::Int, corrected::Bool) = +Base.stdm(v::RealArray, m::RealArray, dim::Int, corrected::Bool=true) = Base.sqrt!(varm(v, m, dim, corrected=corrected)) Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int, 
corrected::Bool) = diff --git a/test/deprecates.jl b/test/deprecates.jl new file mode 100644 index 000000000..02c87d94b --- /dev/null +++ b/test/deprecates.jl @@ -0,0 +1,201 @@ +using StatsBase +using Base.Test +using Compat +import Compat: view + +@testset "StatsBase.Deprecates" begin + +@testset "Deprecates WeightVec and weights" begin + @test isa(weights([1, 2, 3]), WeightVec{Int}) + @test isa(weights([1., 2., 3.]), WeightVec{Float64}) + @test isa(weights([1 2 3; 4 5 6]), WeightVec{Int}) + + @test isa(WeightVec([1, 2, 3], 6), WeightVec{Int}) + + @test isempty(weights(Float64[])) + @test size(weights([1, 2, 3])) == (3,) + + w = [1., 2., 3.] + wv = weights(w) + @test eltype(wv) === Float64 + @test length(wv) === 3 + @test values(wv) === w + @test sum(wv) === 6.0 + @test !isempty(wv) + + b = trues(3) + bv = weights(b) + @test eltype(bv) === Bool + @test length(bv) === 3 + @test values(bv) === b + @test sum(bv) === 3 + @test !isempty(bv) + + ba = BitArray([true, false, true]) + sa = sparsevec([1., 0., 2.]) + + @test sum(ba, wv) === 4.0 + @test sum(sa, wv) === 7.0 +end + +@testset "Moments" begin + @testset "Vectors" begin + x = rand(10) + wv = weights(rand(10)) + m = mean(x, wv) + + @testset "var" begin + @test var(x, wv) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) + @test var(x, wv; mean=0) ≈ sum(abs2.(x), wv) ./ sum(wv) + @test var(x, wv; mean=1.0) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) + end + + @testset "std" begin + @test std(x, wv) ≈ sqrt(var(x, wv)) + @test std(x, wv; mean=0) ≈ sqrt(var(x, wv; mean=0)) + @test std(x, wv; mean=1.0) ≈ sqrt(var(x, wv; mean=1.0)) + end + + @testset "mean_and_var" begin + (m, v) = mean_and_var(x) + @test m == mean(x) + @test v == var(x) + + (m, v) = mean_and_var(x, wv) + @test m == mean(x, wv) + @test v == var(x, wv) + end + + @testset "mean_and_std" begin + (m, s) = mean_and_std(x) + @test m == mean(x) + @test s == std(x) + + (m, s) = mean_and_std(x, wv) + @test m == mean(x, wv) + @test s == std(x, wv) + end + end + + @testset "Matrices" 
begin + x = rand(5, 6) + w1 = rand(5) + w2 = rand(6) + wv1 = weights(w1) + wv2 = weights(w2) + m1 = mean(x, wv1, 1) + m2 = mean(x, wv2, 2) + + @testset "var" begin + @test var(x, wv1, 1; mean=0) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) + @test var(x, wv2, 2; mean=0) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) + @test var(x, wv1, 1; mean=m1) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) + @test var(x, wv2, 2; mean=m2) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) + @test var(x, wv1, 1) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) + @test var(x, wv2, 2) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) + end + + @testset "std" begin + @test std(x, wv1, 1) ≈ sqrt.(var(x, wv1, 1)) + @test std(x, wv2, 2) ≈ sqrt.(var(x, wv2, 2)) + @test std(x, wv1, 1; mean=0) ≈ sqrt.(var(x, wv1, 1; mean=0)) + @test std(x, wv2, 2; mean=0) ≈ sqrt.(var(x, wv2, 2; mean=0)) + @test std(x, wv1, 1; mean=m1) ≈ sqrt.(var(x, wv1, 1; mean=m1)) + @test std(x, wv2, 2; mean=m2) ≈ sqrt.(var(x, wv2, 2; mean=m2)) + end + + @testset "mean_and_var" begin + for d in 1:2 + (m, v) = mean_and_var(x, d) + @test m == mean(x, d) + @test v == var(x, d) + end + + (m, v) = mean_and_var(x, wv1, 1) + @test m == mean(x, wv1, 1) + @test v == var(x, wv1, 1) + + (m, v) = mean_and_var(x, wv2, 2) + @test m == mean(x, wv2, 2) + @test v == var(x, wv2, 2) + end + + @testset "mean_and_std" begin + for d in 1:2 + (m, s) = mean_and_std(x, d) + @test m == mean(x, d) + @test s == std(x, d) + end + + (m, s) = mean_and_std(x, wv1, 1) + @test m == mean(x, wv1, 1) + @test s == std(x, wv1, 1) + + (m, s) = mean_and_std(x, wv2, 2) + @test m == mean(x, wv2, 2) + @test s == std(x, wv2, 2) + end + end +end + +@testset "Covariance" begin + X = randn(3, 8) + + Z1 = X .- mean(X, 1) + Z2 = X .- mean(X, 2) + + w1 = rand(3) + w2 = rand(8) + + wv1 = weights(w1) + wv2 = weights(w2) + + Z1w = X .- mean(X, wv1, 1) + Z2w = X .- mean(X, wv2, 2) + + S1 = Z1'Z1 + S2 = Z2 * Z2' + + Sz1 = X'X + Sz2 = X * X' + + S1w = Z1w' * diagm(w1) * Z1w + S2w = Z2w * diagm(w2) * Z2w' + + Sz1w 
= X' * diagm(w1) * X + Sz2w = X * diagm(w2) * X' + + @testset "cov" begin + @test cov(X, wv1) ≈ S1w ./ sum(wv1) + @test cov(X, wv2, 2) ≈ S2w ./ sum(wv2) + + @test Base.covm(X, 0, wv1) ≈ Sz1w ./ sum(wv1) + @test Base.covm(X, 0, wv2, 2) ≈ Sz2w ./ sum(wv2) + + @test Base.covm(X, mean(X, wv1, 1), wv1) ≈ S1w ./ sum(wv1) + @test Base.covm(X, mean(X, wv2, 2), wv2, 2) ≈ S2w ./ sum(wv2) + + @test Base.covm(X, zeros(1,8), wv1) ≈ Sz1w ./ sum(wv1) + @test Base.covm(X, zeros(3), wv2, 2) ≈ Sz2w ./ sum(wv2) + end + + @testset "mean_and_cov" begin + (m, C) = mean_and_cov(X, 1) + @test m == mean(X, 1) + @test C == cov(X, 1) + + (m, C) = mean_and_cov(X, 2) + @test m == mean(X, 2) + @test C == cov(X, 2) + + (m, C) = mean_and_cov(X, wv1, 1) + @test m == mean(X, wv1, 1) + @test C == cov(X, wv1, 1) + + (m, C) = mean_and_cov(X, wv2, 2) + @test m == mean(X, wv2, 2) + @test C == cov(X, wv2, 2) + end +end + +end # @testset "StatsBase.Deprecates" diff --git a/test/moments.jl b/test/moments.jl index 966281cf1..984790338 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -23,11 +23,11 @@ m = mean(x, wv) @test m == mean(x) @test s == std(x) -(m, v) = mean_and_var(x, wv) +(m, v) = mean_and_var(x, wv, true) @test m == mean(x, wv) @test v == var(x, wv, true) -(m, s) = mean_and_std(x, wv) +(m, s) = mean_and_std(x, wv, true) @test m == mean(x, wv) @test s == std(x, wv, true) @@ -65,7 +65,7 @@ for d in 1:2 @test s == std(x, d; corrected=false) end -(m, v) = mean_and_var(x, wv1, 1) +(m, v) = mean_and_var(x, wv1, 1, true) @test m == mean(x, wv1, 1) @test v == var(x, wv1, 1, true) diff --git a/test/runtests.jl b/test/runtests.jl index e3bea13bd..8216df897 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,13 @@ using StatsBase +opts = Base.JLOptions() +depwarns = isdefined(opts, :depwarn) ? 
opts.depwarn == "no" : true +test_deprecates = if haskey(ENV, "TEST_DEPRECATES") + lowercase(ENV["TEST_DEPRECATES"]) == "true" +else + false +end + tests = ["weights", "moments", "scalarstats", @@ -17,6 +25,10 @@ tests = ["weights", "statmodels"]#, #"statquiz"] +if !depwarns || test_deprecates + push!(tests, "deprecates") +end + println("Running tests:") for t in tests From ad49920afa0c36216e4b88a830979545612b68b0 Mon Sep 17 00:00:00 2001 From: rofinn Date: Sun, 30 Apr 2017 15:58:14 -0500 Subject: [PATCH 25/50] Fixed analytic weights `varcorrection` equations. --- src/cov.jl | 2 +- src/moments.jl | 8 ++++---- src/weights.jl | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cov.jl b/src/cov.jl index 5e71f01de..ab408c0c1 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -62,7 +62,7 @@ matrix (`corrected=false`) can be computed by multiplying `scattermat(X, wv)` by ``\frac{1}{\sum{w}}`` to normalize. However, the unbiased covariance matrix (`corrected=true`) is dependent on the type of weights used: -* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum{w}^2}`` +* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` * FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` * ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ diff --git a/src/moments.jl b/src/moments.jl index 77da18803..aaa405617 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -21,7 +21,7 @@ Here we calculate the biased weighted variance (`corrected=false`) as: An unbiased weighted variance (`corrected=true`) is dependent on the type of weights used: -* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum{w}^2}`` +* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` * FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` * ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ @@ -64,7 +64,7 @@ Here we calculate the biased weighted variance (`corrected=false`) as: An unbiased weighted variance (`corrected=true`) is dependent on the 
type of weights used: -* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum{w}^2}`` +* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` * FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` * ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ @@ -131,7 +131,7 @@ Here we calculate the biased weighted standard deviation (`corrected=false`) as: An unbiased standard deviation (`corrected=true`) is dependent on the type of weights used: -* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum{w}^2}`` +* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` * FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` * ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ @@ -164,7 +164,7 @@ Here we calculate the biased weighted standard deviation (`corrected=false`) as: An unbiased standard deviation (`corrected=true`) is dependent on the type of weights used: -* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum{w}^2}`` +* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` * FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` * ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ diff --git a/src/weights.jl b/src/weights.jl index 397b23d9b..cfc538f36 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -76,7 +76,7 @@ aweights(vs::RealArray) = AnalyticWeights(vec(vs)) """ varcorrection(w::AnalyticWeights, corrected=false) -``\\frac{1}{\sum w - \sum {w^2} / \sum{w}^2}`` +``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` """ function varcorrection(w::AnalyticWeights, corrected::Bool=false) s = w.sum From def70e27b81290981d42bc224724755fa753e914 Mon Sep 17 00:00:00 2001 From: rofinn Date: Mon, 1 May 2017 15:37:06 -0500 Subject: [PATCH 26/50] Added an extra deprecation test and fixed --depwarn=no check. 
--- test/deprecates.jl | 6 +++++- test/runtests.jl | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/deprecates.jl b/test/deprecates.jl index 02c87d94b..e3827f387 100644 --- a/test/deprecates.jl +++ b/test/deprecates.jl @@ -76,7 +76,7 @@ end @test s == std(x, wv) end end - + @testset "Matrices" begin x = rand(5, 6) w1 = rand(5) @@ -188,6 +188,10 @@ end @test m == mean(X, 2) @test C == cov(X, 2) + (m, C) = mean_and_cov(X, wv1) + @test m == mean(X, wv1, 1) + @test C == cov(X, wv1, 1) + (m, C) = mean_and_cov(X, wv1, 1) @test m == mean(X, wv1, 1) @test C == cov(X, wv1, 1) diff --git a/test/runtests.jl b/test/runtests.jl index 8216df897..54527b77e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,7 @@ using StatsBase opts = Base.JLOptions() -depwarns = isdefined(opts, :depwarn) ? opts.depwarn == "no" : true +depwarns = isdefined(opts, :depwarn) ? opts.depwarn != 0 : true test_deprecates = if haskey(ENV, "TEST_DEPRECATES") lowercase(ENV["TEST_DEPRECATES"]) == "true" else From 06ec7f9a64a5e972cadcc3757eefb7df5701561e Mon Sep 17 00:00:00 2001 From: rofinn Date: Mon, 1 May 2017 17:02:42 -0500 Subject: [PATCH 27/50] Updated test/moments.jl to use `@testset` so that we can test against multiple weight types corrected vs uncorrected calculations. 
--- test/moments.jl | 334 +++++++++++++++++++++++++++--------------------- 1 file changed, 188 insertions(+), 146 deletions(-) diff --git a/test/moments.jl b/test/moments.jl index 984790338..b3f33c9a5 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -1,151 +1,193 @@ using StatsBase using Base.Test -##### weighted var & std - -x = rand(10) -wv = fweights(rand(10)) -m = mean(x, wv) - -@test var(x, wv, false) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) -@test var(x, wv, false; mean=0) ≈ sum(abs2.(x), wv) ./ sum(wv) -@test var(x, wv, false; mean=1.0) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) - -@test std(x, wv, false) ≈ sqrt(var(x, wv, false)) -@test std(x, wv, false; mean=0) ≈ sqrt(var(x, wv, false; mean=0)) -@test std(x, wv, false; mean=1.0) ≈ sqrt(var(x, wv, false; mean=1.0)) - -(m, v) = mean_and_var(x) -@test m == mean(x) -@test v == var(x) - -(m, s) = mean_and_std(x) -@test m == mean(x) -@test s == std(x) - -(m, v) = mean_and_var(x, wv, true) -@test m == mean(x, wv) -@test v == var(x, wv, true) - -(m, s) = mean_and_std(x, wv, true) -@test m == mean(x, wv) -@test s == std(x, wv, true) - -x = rand(5, 6) -w1 = rand(5) -w2 = rand(6) -wv1 = fweights(w1) -wv2 = fweights(w2) -m1 = mean(x, wv1, 1) -m2 = mean(x, wv2, 2) - -@test var(x, wv1, 1, false; mean=0) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) -@test var(x, wv2, 2, false; mean=0) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) - -@test var(x, wv1, 1, false; mean=m1) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) -@test var(x, wv2, 2, false; mean=m2) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) - -@test var(x, wv1, 1, false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) -@test var(x, wv2, 2, false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) - -@test std(x, wv1, 1, false) ≈ sqrt.(var(x, wv1, 1, false)) -@test std(x, wv2, 2, false) ≈ sqrt.(var(x, wv2, 2, false)) -@test std(x, wv1, 1, false; mean=0) ≈ sqrt.(var(x, wv1, 1, false; mean=0)) -@test std(x, wv2, 2, false; mean=0) ≈ sqrt.(var(x, wv2, 2, false; mean=0)) -@test std(x, wv1, 1, false; mean=m1) 
≈ sqrt.(var(x, wv1, 1, false; mean=m1)) -@test std(x, wv2, 2, false; mean=m2) ≈ sqrt.(var(x, wv2, 2, false; mean=m2)) - -for d in 1:2 - (m, v) = mean_and_var(x, d, false) - @test m == mean(x, d) - @test v == var(x, d; corrected=false) - - (m, s) = mean_and_std(x, d, false) - @test m == mean(x, d) - @test s == std(x, d; corrected=false) +@testset "StatsBase.Moments" begin +weight_funcs = (aweights, fweights, pweights) + +@testset "Variance and Standard Deviation" begin + @testset "Vectors" begin + x = [0.57, 0.10, 0.91, 0.72, 0.46] + w = [3.84, 2.70, 8.29, 8.91, 9.71] + + @testset "Uncorrected" begin + @testset "Variance with $f" for f in weight_funcs + wv = f(w) + m = mean(x, wv) + @test var(x, wv, false) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) + @test var(x, wv, false; mean=0) ≈ sum(abs2.(x), wv) ./ sum(wv) + @test var(x, wv, false; mean=1.0) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) + end + + @testset "Standard Deviation with $f" for f in weight_funcs + wv = f(w) + m = mean(x, wv) + @test std(x, wv, false) ≈ sqrt(var(x, wv, false)) + @test std(x, wv, false; mean=0) ≈ sqrt(var(x, wv, false; mean=0)) + @test std(x, wv, false; mean=1.0) ≈ sqrt(var(x, wv, false; mean=1.0)) + end + + @testset "Mean and Variance with $f" for f in weight_funcs + wv = f(w) + (m, v) = mean_and_var(x, false) + @test m == mean(x) + @test v == var(x, corrected=false) + + (m, v) = mean_and_var(x, wv, false) + @test m == mean(x, wv) + @test v == var(x, wv, false) + end + + @testset "Mean and Standard Deviation with $f" for f in weight_funcs + wv = f(w) + (m, s) = mean_and_std(x, false) + @test m == mean(x) + @test s == std(x, corrected=false) + + (m, s) = mean_and_std(x, wv, false) + @test m == mean(x, wv) + @test s == std(x, wv, false) + end + end + + @testset "Corrected" begin + @testset "Variance" begin + # expected `var` output for (aweights, fweights, pweights) + expected = (0.0694434191182236, 0.05466601256158146, 0.06628969012045285) + expected_0 = (0.5798908707332937, 0.45649137134052387, 
0.5535554932735426) + expected_1 = (0.25422659392845115, 0.20012773497688754, 0.24268105381165922) + + @testset "$(weight_funcs[i])" for i in 1:3 + wv = weight_funcs[i](w) + m = mean(x, wv) + + @test var(x, wv, true) ≈ expected[i] + @test var(x, wv, true; mean=0) ≈ expected_0[i] + @test var(x, wv, true; mean=1.0) ≈ expected_1[i] + end + end + + @testset "Standard Deviation with $f" for f in weight_funcs + wv = f(w) + m = mean(x, wv) + @test std(x, wv, true) ≈ sqrt(var(x, wv, true)) + @test std(x, wv, true; mean=0) ≈ sqrt(var(x, wv, true; mean=0)) + @test std(x, wv, true; mean=1.0) ≈ sqrt(var(x, wv, true; mean=1.0)) + end + + @testset "Mean and Variance with $f" for f in weight_funcs + wv = f(w) + + (m, v) = mean_and_var(x, true) + @test m == mean(x) + @test v == var(x, corrected=true) + + (m, v) = mean_and_var(x, wv, true) + @test m == mean(x, wv) + @test v == var(x, wv, true) + end + + @testset "Mean and Standard Deviation with $f" for f in weight_funcs + wv = f(w) + + (m, s) = mean_and_std(x, true) + @test m == mean(x) + @test s == std(x, corrected=true) + + (m, s) = mean_and_std(x, wv, true) + @test m == mean(x, wv) + @test s == std(x, wv, true) + end + end + end + + @testset "Matrices" begin + x = rand(5, 6) + w1 = rand(5) + w2 = rand(6) + wv1 = fweights(w1) + wv2 = fweights(w2) + m1 = mean(x, wv1, 1) + m2 = mean(x, wv2, 2) + + @test var(x, wv1, 1, false; mean=0) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) + @test var(x, wv2, 2, false; mean=0) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) + + @test var(x, wv1, 1, false; mean=m1) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) + @test var(x, wv2, 2, false; mean=m2) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) + + @test var(x, wv1, 1, false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) + @test var(x, wv2, 2, false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) + + @test std(x, wv1, 1, false) ≈ sqrt.(var(x, wv1, 1, false)) + @test std(x, wv2, 2, false) ≈ sqrt.(var(x, wv2, 2, false)) + @test std(x, wv1, 1, false; mean=0) ≈ sqrt.(var(x, 
wv1, 1, false; mean=0)) + @test std(x, wv2, 2, false; mean=0) ≈ sqrt.(var(x, wv2, 2, false; mean=0)) + @test std(x, wv1, 1, false; mean=m1) ≈ sqrt.(var(x, wv1, 1, false; mean=m1)) + @test std(x, wv2, 2, false; mean=m2) ≈ sqrt.(var(x, wv2, 2, false; mean=m2)) + + for d in 1:2 + (m, v) = mean_and_var(x, d, false) + @test m == mean(x, d) + @test v == var(x, d; corrected=false) + + (m, s) = mean_and_std(x, d, false) + @test m == mean(x, d) + @test s == std(x, d; corrected=false) + end + + (m, v) = mean_and_var(x, wv1, 1, true) + @test m == mean(x, wv1, 1) + @test v == var(x, wv1, 1, true) + + (m, v) = mean_and_var(x, wv2, 2, false) + @test m == mean(x, wv2, 2) + @test v == var(x, wv2, 2, false) + + (m, s) = mean_and_std(x, wv1, 1, false) + @test m == mean(x, wv1, 1) + @test s == std(x, wv1, 1, false) + + (m, s) = mean_and_std(x, wv2, 2, false) + @test m == mean(x, wv2, 2) + @test s == std(x, wv2, 2, false) + end end -(m, v) = mean_and_var(x, wv1, 1, true) -@test m == mean(x, wv1, 1) -@test v == var(x, wv1, 1, true) - -(m, v) = mean_and_var(x, wv2, 2, false) -@test m == mean(x, wv2, 2) -@test v == var(x, wv2, 2, false) - -(m, s) = mean_and_std(x, wv1, 1, false) -@test m == mean(x, wv1, 1) -@test s == std(x, wv1, 1, false) - -(m, s) = mean_and_std(x, wv2, 2, false) -@test m == mean(x, wv2, 2) -@test s == std(x, wv2, 2, false) - -##### skewness & kurtosis - -wv = fweights(ones(5) * 2.0) - -@test skewness(1:5) ≈ 0.0 -@test skewness([1, 2, 3, 4, 5]) ≈ 0.0 -@test skewness([1, 2, 2, 2, 5]) ≈ 1.1731251294063556 -@test skewness([1, 4, 4, 4, 5]) ≈ -1.1731251294063556 - -@test skewness([1, 2, 2, 2, 5], wv) ≈ 1.1731251294063556 - -@test kurtosis(1:5) ≈ -1.3 -@test kurtosis([1, 2, 3, 4, 5]) ≈ -1.3 -@test kurtosis([1, 2, 3, 3, 2]) ≈ -1.1530612244897953 - -@test kurtosis([1, 2, 3, 4, 5], wv) ≈ -1.3 - - -##### general moments - -x = collect(2.0:8.0) -@test moment(x, 2) ≈ sum((x .- 5).^2) / length(x) -@test moment(x, 3) ≈ sum((x .- 5).^3) / length(x) -@test moment(x, 4) ≈ sum((x .- 
5).^4) / length(x) -@test moment(x, 5) ≈ sum((x .- 5).^5) / length(x) - -@test moment(x, 2, 4.0) ≈ sum((x .- 4).^2) / length(x) -@test moment(x, 3, 4.0) ≈ sum((x .- 4).^3) / length(x) -@test moment(x, 4, 4.0) ≈ sum((x .- 4).^4) / length(x) -@test moment(x, 5, 4.0) ≈ sum((x .- 4).^5) / length(x) - -w = fweights([1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) -x2 = collect(2.0:6.0) -@test moment(x, 2, w) ≈ sum((x2 .- 4).^2) / 5 -@test moment(x, 3, w) ≈ sum((x2 .- 4).^3) / 5 -@test moment(x, 4, w) ≈ sum((x2 .- 4).^4) / 5 -@test moment(x, 5, w) ≈ sum((x2 .- 4).^5) / 5 - -# Test corrected cases (this will be cleaner in testsets) -x = rand(10) - -# AnalyticWeights -@test var(x, aweights(ones(10)), true) ≈ var(x) - -w = aweights(rand(10)) -n = length(w) # Could be count(!iszero, w) instead -w = aweights(w .* (n / sum(w))) -sw = sum(w) # This is now equal to n, but maybe we should support non-normalized weights? -xbar = sum(w .* x) ./ sw -expected = sum(w .* (x .- xbar).^2)/(sw - sum(w.^2)/sw) -@test var(x, w, true) ≈ expected - -# FrequencyWeights -@test var(x, fweights(ones(Int, 10)), true) ≈ var(x) -w = fweights(rand(UInt, 10)) -sw = sum(w) -xbar = sum(w .* x) / sw -expected = sum(w .* (x .- xbar).^2) ./ (sum(w) - 1) -@test var(x, w, true) ≈ expected - -# ProbabilityWeights -@test var(x, pweights(ones(10)), true) ≈ var(x) -w = pweights(rand(10)) -n = count(!iszero, w) -sw = sum(w) -xbar = sum(w .* x)/sw -expected = sum(w .* (x .- xbar).^2)/sw * n/(n - 1) -@test var(x, w, true) ≈ expected +@testset "Skewness and Kurtosis" begin + wv = fweights(ones(5) * 2.0) + + @test skewness(1:5) ≈ 0.0 + @test skewness([1, 2, 3, 4, 5]) ≈ 0.0 + @test skewness([1, 2, 2, 2, 5]) ≈ 1.1731251294063556 + @test skewness([1, 4, 4, 4, 5]) ≈ -1.1731251294063556 + + @test skewness([1, 2, 2, 2, 5], wv) ≈ 1.1731251294063556 + + @test kurtosis(1:5) ≈ -1.3 + @test kurtosis([1, 2, 3, 4, 5]) ≈ -1.3 + @test kurtosis([1, 2, 3, 3, 2]) ≈ -1.1530612244897953 + + @test kurtosis([1, 2, 3, 4, 5], wv) ≈ -1.3 +end + 
+@testset "General Moments" begin + x = collect(2.0:8.0) + @test moment(x, 2) ≈ sum((x .- 5).^2) / length(x) + @test moment(x, 3) ≈ sum((x .- 5).^3) / length(x) + @test moment(x, 4) ≈ sum((x .- 5).^4) / length(x) + @test moment(x, 5) ≈ sum((x .- 5).^5) / length(x) + + @test moment(x, 2, 4.0) ≈ sum((x .- 4).^2) / length(x) + @test moment(x, 3, 4.0) ≈ sum((x .- 4).^3) / length(x) + @test moment(x, 4, 4.0) ≈ sum((x .- 4).^4) / length(x) + @test moment(x, 5, 4.0) ≈ sum((x .- 4).^5) / length(x) + + w = fweights([1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) + x2 = collect(2.0:6.0) + @test moment(x, 2, w) ≈ sum((x2 .- 4).^2) / 5 + @test moment(x, 3, w) ≈ sum((x2 .- 4).^3) / 5 + @test moment(x, 4, w) ≈ sum((x2 .- 4).^4) / 5 + @test moment(x, 5, w) ≈ sum((x2 .- 4).^5) / 5 +end + +end # @testset "StatsBase.Moments" From 1765f6751da7fca64ab10c4a49cf80e6d4196bfa Mon Sep 17 00:00:00 2001 From: rofinn Date: Mon, 1 May 2017 17:41:12 -0500 Subject: [PATCH 28/50] Updated test/weights.jl to use `@testset` so that we can test against multiple weight types and added tests for `eweights`. --- test/weights.jl | 641 +++++++++++++++++++++++++----------------------- 1 file changed, 331 insertions(+), 310 deletions(-) diff --git a/test/weights.jl b/test/weights.jl index 3df3e418a..aa290f439 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -3,355 +3,376 @@ using Base.Test using Compat import Compat: view -@test isa(fweights([1, 2, 3]), AbstractWeights{Int}) -@test isa(fweights([1., 2., 3.]), AbstractWeights{Float64}) -@test isa(fweights([1 2 3; 4 5 6]), AbstractWeights{Int}) - -@test isa(AnalyticWeights([1, 2, 3], 6), AbstractWeights{Int}) - -@test isempty(fweights(Float64[])) -@test size(fweights([1, 2, 3])) == (3,) - -w = [1., 2., 3.] 
-wv = fweights(w) -@test eltype(wv) === Float64 -@test length(wv) === 3 -@test values(wv) === w -@test sum(wv) === 6.0 -@test !isempty(wv) - -b = trues(3) -bv = fweights(b) -@test eltype(bv) === Bool -@test length(bv) === 3 -@test values(bv) === b -@test sum(bv) === 3 -@test !isempty(bv) - -ba = BitArray([true, false, true]) -sa = sparsevec([1., 0., 2.]) - -@test sum(ba, wv) === 4.0 -@test sum(sa, wv) === 7.0 - -## wsum - -x = [6., 8., 9.] -w = [2., 3., 4.] -p = [1. 2. ; 3. 4.] -q = [1., 2., 3., 4.] - -@test wsum(Float64[], Float64[]) === 0.0 -@test wsum(x, w) === 72.0 -@test wsum(p, q) === 29.0 +@testset "StatsBase.Weights" begin +weight_funcs = (aweights, fweights, pweights) + +@testset "Construction" begin + @testset "$f" for f in weight_funcs + @test isa(f([1, 2, 3]), AbstractWeights{Int}) + @test isa(f([1., 2., 3.]), AbstractWeights{Float64}) + @test isa(f([1 2 3; 4 5 6]), AbstractWeights{Int}) + + @test isempty(f(Float64[])) + @test size(f([1, 2, 3])) == (3,) + + w = [1., 2., 3.] + wv = f(w) + @test eltype(wv) === Float64 + @test length(wv) === 3 + @test values(wv) === w + @test sum(wv) === 6.0 + @test !isempty(wv) + + b = trues(3) + bv = f(b) + @test eltype(bv) === Bool + @test length(bv) === 3 + @test values(bv) === b + @test sum(bv) === 3 + @test !isempty(bv) + + ba = BitArray([true, false, true]) + sa = sparsevec([1., 0., 2.]) + + @test sum(ba, wv) === 4.0 + @test sum(sa, wv) === 7.0 + end -## wsum along dimensions + @testset "eweights" begin + λ = 0.2 + wv = eweights(4, λ) + @test round(values(wv), 4) == [0.2, 0.25, 0.3125, 0.3906] + end +end -@test wsum(x, w, 1) == [72.0] +@testset "Sum" begin + x = [6., 8., 9.] + w = [2., 3., 4.] + p = [1. 2. ; 3. 4.] + q = [1., 2., 3., 4.] 
-x = rand(6, 8) -w1 = rand(6) -w2 = rand(8) + @test wsum(Float64[], Float64[]) === 0.0 + @test wsum(x, w) === 72.0 + @test wsum(p, q) === 29.0 -@test size(wsum(x, w1, 1)) == (1, 8) -@test size(wsum(x, w2, 2)) == (6, 1) + @testset "Along dimensions" begin + @test wsum(x, w, 1) == [72.0] -@test wsum(x, w1, 1) ≈ sum(x .* w1, 1) -@test wsum(x, w2, 2) ≈ sum(x .* w2', 2) + x = rand(6, 8) + w1 = rand(6) + w2 = rand(8) -x = rand(6, 5, 4) -w1 = rand(6) -w2 = rand(5) -w3 = rand(4) + @test size(wsum(x, w1, 1)) == (1, 8) + @test size(wsum(x, w2, 2)) == (6, 1) -@test size(wsum(x, w1, 1)) == (1, 5, 4) -@test size(wsum(x, w2, 2)) == (6, 1, 4) -@test size(wsum(x, w3, 3)) == (6, 5, 1) + @test wsum(x, w1, 1) ≈ sum(x .* w1, 1) + @test wsum(x, w2, 2) ≈ sum(x .* w2', 2) -@test wsum(x, w1, 1) ≈ sum(x .* w1, 1) -@test wsum(x, w2, 2) ≈ sum(x .* w2', 2) -@test wsum(x, w3, 3) ≈ sum(x .* reshape(w3, 1, 1, 4), 3) + x = rand(6, 5, 4) + w1 = rand(6) + w2 = rand(5) + w3 = rand(4) -v = view(x, 2:4, :, :) + @test size(wsum(x, w1, 1)) == (1, 5, 4) + @test size(wsum(x, w2, 2)) == (6, 1, 4) + @test size(wsum(x, w3, 3)) == (6, 5, 1) -@test wsum(v, w1[1:3], 1) ≈ sum(v .* w1[1:3], 1) -@test wsum(v, w2, 2) ≈ sum(v .* w2', 2) -@test wsum(v, w3, 3) ≈ sum(v .* reshape(w3, 1, 1, 4), 3) + @test wsum(x, w1, 1) ≈ sum(x .* w1, 1) + @test wsum(x, w2, 2) ≈ sum(x .* w2', 2) + @test wsum(x, w3, 3) ≈ sum(x .* reshape(w3, 1, 1, 4), 3) -## wsum for Arrays with non-BlasReal elements + v = view(x, 2:4, :, :) -x = rand(1:100, 6, 8) -w1 = rand(6) -w2 = rand(8) + @test wsum(v, w1[1:3], 1) ≈ sum(v .* w1[1:3], 1) + @test wsum(v, w2, 2) ≈ sum(v .* w2', 2) + @test wsum(v, w3, 3) ≈ sum(v .* reshape(w3, 1, 1, 4), 3) + end -@test wsum(x, w1, 1) ≈ sum(x .* w1, 1) -@test wsum(x, w2, 2) ≈ sum(x .* w2', 2) + @testset "Arrays with non-BlasReal elements" begin + x = rand(1:100, 6, 8) + w1 = rand(6) + w2 = rand(8) -## wsum! 
+ @test wsum(x, w1, 1) ≈ sum(x .* w1, 1) + @test wsum(x, w2, 2) ≈ sum(x .* w2', 2) + end -x = rand(6) -w = rand(6) + @testset "In place" begin + x = rand(6) + w = rand(6) -r = ones(1) -@test wsum!(r, x, w, 1; init=true) === r -@test r ≈ [dot(x, w)] + r = ones(1) + @test wsum!(r, x, w, 1; init=true) === r + @test r ≈ [dot(x, w)] -r = ones(1) -@test wsum!(r, x, w, 1; init=false) === r -@test r ≈ [dot(x, w) + 1.0] + r = ones(1) + @test wsum!(r, x, w, 1; init=false) === r + @test r ≈ [dot(x, w) + 1.0] -x = rand(6, 8) -w1 = rand(6) -w2 = rand(8) + x = rand(6, 8) + w1 = rand(6) + w2 = rand(8) -r = ones(1, 8) -@test wsum!(r, x, w1, 1; init=true) === r -@test r ≈ sum(x .* w1, 1) + r = ones(1, 8) + @test wsum!(r, x, w1, 1; init=true) === r + @test r ≈ sum(x .* w1, 1) -r = ones(1, 8) -@test wsum!(r, x, w1, 1; init=false) === r -@test r ≈ sum(x .* w1, 1) .+ 1.0 + r = ones(1, 8) + @test wsum!(r, x, w1, 1; init=false) === r + @test r ≈ sum(x .* w1, 1) .+ 1.0 -r = ones(6) -@test wsum!(r, x, w2, 2; init=true) === r -@test r ≈ sum(x .* w2', 2) + r = ones(6) + @test wsum!(r, x, w2, 2; init=true) === r + @test r ≈ sum(x .* w2', 2) -r = ones(6) -@test wsum!(r, x, w2, 2; init=false) === r -@test r ≈ sum(x .* w2', 2) .+ 1.0 + r = ones(6) + @test wsum!(r, x, w2, 2; init=false) === r + @test r ≈ sum(x .* w2', 2) .+ 1.0 -x = rand(8, 6, 5) -w1 = rand(8) -w2 = rand(6) -w3 = rand(5) + x = rand(8, 6, 5) + w1 = rand(8) + w2 = rand(6) + w3 = rand(5) -r = ones(1, 6, 5) -@test wsum!(r, x, w1, 1; init=true) === r -@test r ≈ sum(x .* w1, 1) + r = ones(1, 6, 5) + @test wsum!(r, x, w1, 1; init=true) === r + @test r ≈ sum(x .* w1, 1) -r = ones(1, 6, 5) -@test wsum!(r, x, w1, 1; init=false) === r -@test r ≈ sum(x .* w1, 1) .+ 1.0 + r = ones(1, 6, 5) + @test wsum!(r, x, w1, 1; init=false) === r + @test r ≈ sum(x .* w1, 1) .+ 1.0 -r = ones(8, 1, 5) -@test wsum!(r, x, w2, 2; init=true) === r -@test r ≈ sum(x .* w2', 2) + r = ones(8, 1, 5) + @test wsum!(r, x, w2, 2; init=true) === r + @test r ≈ sum(x .* 
w2', 2) -r = ones(8, 1, 5) -@test wsum!(r, x, w2, 2; init=false) === r -@test r ≈ sum(x .* w2', 2) .+ 1.0 + r = ones(8, 1, 5) + @test wsum!(r, x, w2, 2; init=false) === r + @test r ≈ sum(x .* w2', 2) .+ 1.0 -r = ones(8, 6) -@test wsum!(r, x, w3, 3; init=true) === r -@test r ≈ sum(x .* reshape(w3, (1, 1, 5)), 3) + r = ones(8, 6) + @test wsum!(r, x, w3, 3; init=true) === r + @test r ≈ sum(x .* reshape(w3, (1, 1, 5)), 3) -r = ones(8, 6) -@test wsum!(r, x, w3, 3; init=false) === r -@test r ≈ sum(x .* reshape(w3, (1, 1, 5)), 3) .+ 1.0 + r = ones(8, 6) + @test wsum!(r, x, w3, 3; init=false) === r + @test r ≈ sum(x .* reshape(w3, (1, 1, 5)), 3) .+ 1.0 + end +end +@testset "Sum and mean syntax" begin + a = reshape(1.0:27.0, 3, 3, 3) -## the sum and mean syntax + @testset "Sum" begin + @test sum([1.0, 2.0, 3.0], fweights([1.0, 0.5, 0.5])) ≈ 3.5 + @test sum(1:3, fweights([1.0, 1.0, 0.5])) ≈ 4.5 -@test sum([1.0, 2.0, 3.0], fweights([1.0, 0.5, 0.5])) ≈ 3.5 -@test sum(1:3, fweights([1.0, 1.0, 0.5])) ≈ 4.5 + for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) + @test sum(a, fweights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) + @test sum(a, fweights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) + @test sum(a, fweights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) + end + end -@test mean([1:3;], fweights([1.0, 1.0, 0.5])) ≈ 1.8 -@test mean(1:3, fweights([1.0, 1.0, 0.5])) ≈ 1.8 + @testset "Mean" begin + @test mean([1:3;], fweights([1.0, 1.0, 0.5])) ≈ 1.8 + @test mean(1:3, fweights([1.0, 1.0, 0.5])) ≈ 1.8 -a = reshape(1.0:27.0, 3, 3, 3) -for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) - @test sum(a, fweights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) - @test sum(a, fweights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) - @test sum(a, fweights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) - @test mean(a, fweights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) - @test mean(a, fweights(wt), 2) ≈ sum(a.*reshape(wt, 
1, length(wt), 1), 2)/sum(wt) - @test mean(a, fweights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) - @test_throws ErrorException mean(a, fweights(wt), 4) + for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) + @test mean(a, fweights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) + @test mean(a, fweights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) + @test mean(a, fweights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) + @test_throws ErrorException mean(a, fweights(wt), 4) + end + end end -# Weighted median tests -data = ( - [7, 1, 2, 4, 10], - [7, 1, 2, 4, 10], - [7, 1, 2, 4, 10, 15], - [1, 2, 4, 7, 10, 15], - [0, 10, 20, 30], - [1, 2, 3, 4, 5], - [1, 2, 3, 4, 5], - [30, 40, 50, 60, 35], - [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7, 0.4], - [3.7, 3.3, 3.5, 2.8], - [100, 125, 123, 60, 45, 56, 66], - [2, 2, 2, 2, 2, 2], - [2.3], - [-2, -3, 1, 2, -10], - [1, 2, 3, 4, 5], - [5, 4, 3, 2, 1], - [-2, 2, -1, 3, 6], - [-10, 1, 1, -10, -10], - [2, 4], - [2, 2, 4, 4], - [2, 2, 2, 4] -) -wt = ( - [1, 1/3, 1/3, 1/3, 1], - [1, 1, 1, 1, 1], - [1, 1/3, 1/3, 1/3, 1, 1], - [1/3, 1/3, 1/3, 1, 1, 1], - [30, 191, 9, 0], - [10, 1, 1, 1, 9], - [10, 1, 1, 1, 900], - [1, 3, 5, 4, 2], - [2, 2, 0, 1, 2, 2, 1, 6, 0], - [5, 5, 4, 1], - [30, 56, 144, 24, 55, 43, 67], - [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], - [12], - [7, 1, 1, 1, 6], - [1, 0, 0, 0, 2], - [1, 2, -3, 4, -5], - [0.1, 0.2, 0.3, -0.2, 0.1], - [-1, -1, -1, -1, 1], - [1, 1], - [1, 1, 1, 1], - [1, 1, 1, 1] -) -median_answers = (7.0, 4.0, 8.5, - 8.5, 10.0, 2.5, - 5.0, 50.0, 1.7, - 3.5, 100.0, 2.0, - 2.3, -2.0, 5.0, - 2.0, -1.0, -10.0, - 3.0, 3.0, 2.0) -num_tests = length(data) -for i = 1:num_tests - @test wmedian(data[i], wt[i]) == median_answers[i] - @test wmedian(data[i], fweights(wt[i])) == median_answers[i] - @test median(data[i], fweights(wt[i])) == median_answers[i] - for j = 1:100 - # Make sure the weighted median does not change if the data - # and weights are reordered. 
- reorder = sortperm(rand(length(data[i]))) - @test median(data[i][reorder], fweights(wt[i][reorder])) == median_answers[i] +@testset "Median" begin + data = ( + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10, 15], + [1, 2, 4, 7, 10, 15], + [0, 10, 20, 30], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [30, 40, 50, 60, 35], + [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7, 0.4], + [3.7, 3.3, 3.5, 2.8], + [100, 125, 123, 60, 45, 56, 66], + [2, 2, 2, 2, 2, 2], + [2.3], + [-2, -3, 1, 2, -10], + [1, 2, 3, 4, 5], + [5, 4, 3, 2, 1], + [-2, 2, -1, 3, 6], + [-10, 1, 1, -10, -10], + [2, 4], + [2, 2, 4, 4], + [2, 2, 2, 4] + ) + wt = ( + [1, 1/3, 1/3, 1/3, 1], + [1, 1, 1, 1, 1], + [1, 1/3, 1/3, 1/3, 1, 1], + [1/3, 1/3, 1/3, 1, 1, 1], + [30, 191, 9, 0], + [10, 1, 1, 1, 9], + [10, 1, 1, 1, 900], + [1, 3, 5, 4, 2], + [2, 2, 0, 1, 2, 2, 1, 6, 0], + [5, 5, 4, 1], + [30, 56, 144, 24, 55, 43, 67], + [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], + [12], + [7, 1, 1, 1, 6], + [1, 0, 0, 0, 2], + [1, 2, -3, 4, -5], + [0.1, 0.2, 0.3, -0.2, 0.1], + [-1, -1, -1, -1, 1], + [1, 1], + [1, 1, 1, 1], + [1, 1, 1, 1] + ) + median_answers = (7.0, 4.0, 8.5, + 8.5, 10.0, 2.5, + 5.0, 50.0, 1.7, + 3.5, 100.0, 2.0, + 2.3, -2.0, 5.0, + 2.0, -1.0, -10.0, + 3.0, 3.0, 2.0) + num_tests = length(data) + for i = 1:num_tests + @test wmedian(data[i], wt[i]) == median_answers[i] + @test wmedian(data[i], fweights(wt[i])) == median_answers[i] + @test median(data[i], fweights(wt[i])) == median_answers[i] + for j = 1:100 + # Make sure the weighted median does not change if the data + # and weights are reordered. 
+ reorder = sortperm(rand(length(data[i]))) + @test median(data[i][reorder], fweights(wt[i][reorder])) == median_answers[i] + end end + data = [4, 3, 2, 1] + wt = [0, 0, 0, 0] + @test_throws MethodError wmedian(data[1]) + @test_throws ErrorException median(data, fweights(wt)) + @test_throws ErrorException wmedian(data, wt) + @test_throws ErrorException median((Float64)[], fweights((Float64)[])) + wt = [1, 2, 3, 4, 5] + @test_throws ErrorException median(data, fweights(wt)) + @test_throws MethodError median([4 3 2 1 0], fweights(wt)) + @test_throws MethodError median([[1 2];[4 5];[7 8];[10 11];[13 14]], fweights(wt)) + data = [1, 3, 2, NaN, 2] + @test isnan(median(data, fweights(wt))) + wt = [1, 2, NaN, 4, 5] + @test_throws ErrorException median(data, fweights(wt)) + data = [1, 3, 2, 1, 2] + @test_throws ErrorException median(data, fweights(wt)) + wt = [-1, -1, -1, -1, -1] + @test_throws ErrorException median(data, fweights(wt)) + wt = [-1, -1, -1, 0, 0] + @test_throws ErrorException median(data, fweights(wt)) end -data = [4, 3, 2, 1] -wt = [0, 0, 0, 0] -@test_throws MethodError wmedian(data[1]) -@test_throws ErrorException median(data, fweights(wt)) -@test_throws ErrorException wmedian(data, wt) -@test_throws ErrorException median((Float64)[], fweights((Float64)[])) -wt = [1, 2, 3, 4, 5] -@test_throws ErrorException median(data, fweights(wt)) -@test_throws MethodError median([4 3 2 1 0], fweights(wt)) -@test_throws MethodError median([[1 2];[4 5];[7 8];[10 11];[13 14]], fweights(wt)) -data = [1, 3, 2, NaN, 2] -@test isnan(median(data, fweights(wt))) -wt = [1, 2, NaN, 4, 5] -@test_throws ErrorException median(data, fweights(wt)) -data = [1, 3, 2, 1, 2] -@test_throws ErrorException median(data, fweights(wt)) -wt = [-1, -1, -1, -1, -1] -@test_throws ErrorException median(data, fweights(wt)) -wt = [-1, -1, -1, 0, 0] -@test_throws ErrorException median(data, fweights(wt)) - - -# Weighted quantile tests -data = ( - [7, 1, 2, 4, 10], - [7, 1, 2, 4, 10], - [7, 1, 2, 4, 10, 
15], - [1, 2, 4, 7, 10, 15], - [0, 10, 20, 30], - [1, 2, 3, 4, 5], - [1, 2, 3, 4, 5], - [30, 40, 50, 60, 35], - [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7], - [1, 2, 2], - [3.7, 3.3, 3.5, 2.8], - [100, 125, 123, 60, 45, 56, 66], - [2, 2, 2, 2, 2, 2], - [2.3], - [-2, -3, 1, 2, -10], - [1, 2, 3, 4, 5], - [5, 4, 3, 2, 1], - [-2, 2, -1, 3, 6], - [-10, 1, 1, -10, -10], -) -wt = ( - fweights([1, 1/3, 1/3, 1/3, 1]), - fweights([1, 1, 1, 1, 1]), - fweights([1, 1/3, 1/3, 1/3, 1, 1]), - fweights([1/3, 1/3, 1/3, 1, 1, 1]), - fweights([30, 191, 9, 0]), - fweights([10, 1, 1, 1, 9]), - fweights([10, 1, 1, 1, 900]), - fweights([1, 3, 5, 4, 2]), - fweights([2, 2, 5, 1, 2, 2, 1, 6]), - fweights([0.1, 0.1, 0.8]), - fweights([5, 5, 4, 1]), - fweights([30, 56, 144, 24, 55, 43, 67]), - fweights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]), - fweights([12]), - fweights([7, 1, 1, 1, 6]), - fweights([1, 0, 0, 0, 2]), - fweights([1, 2, 3, 4, 5]), - fweights([0.1, 0.2, 0.3, 0.2, 0.1]), - fweights([1, 1, 1, 1, 1]), -) -quantile_answers = ( - [1.0,3.6000000000000005,6.181818181818182,8.2,10.0], - [1.0,2.0,4.0,7.0,10.0], - [1.0,4.75,8.0,10.833333333333334,15.0], - [1.0,4.75,8.0,10.833333333333334,15.0], - [0.0,6.1387900355871885,11.600000000000001,15.912500000000001,30.0], - [1.0,1.5365853658536586,2.5999999999999996,4.405405405405405,5.0], - [1.0,4.239377950569287,4.492918633712858,4.746459316856429,5.0], - [30.0,38.75,45.714285714285715,52.85714285714286,60.0], - [0.3,0.6903846153846154,1.484,1.7,2.0], - [1.0,2.0,2.0,2.0,2.0], - [2.8,3.3361111111111112,3.4611111111111112,3.581578947368421,3.7], - [45.0,59.88593155893536,100.08846153846153,118.62115384615385,125.0], - [2.0,2.0,2.0,2.0,2.0], - [2.3,2.3,2.3,2.3,2.3], - [-10.0,-5.52,-2.5882352941176467,-0.9411764705882351,2.0], - [1.0,1.75,4.25,4.625,5.0], - [1.0,1.625,2.3333333333333335,3.25,5.0], - [-2.0,-0.5384615384615388,1.5384615384615383,2.6999999999999997,6.0], - [-10.0,-10.0,-10.0,1.0,1.0] -) -p = [0.0, 0.25, 0.5, 0.75, 1.0] - -srand(10) -for i = 
1:length(data) - @test quantile(data[i], wt[i], p) ≈ quantile_answers[i] - for j = 1:10 - # order of p does not matter - reorder = sortperm(rand(length(p))) - @test quantile(data[i], wt[i], p[reorder]) ≈ quantile_answers[i][reorder] + +@testset "Quantile" begin + data = ( + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10, 15], + [1, 2, 4, 7, 10, 15], + [0, 10, 20, 30], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [30, 40, 50, 60, 35], + [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7], + [1, 2, 2], + [3.7, 3.3, 3.5, 2.8], + [100, 125, 123, 60, 45, 56, 66], + [2, 2, 2, 2, 2, 2], + [2.3], + [-2, -3, 1, 2, -10], + [1, 2, 3, 4, 5], + [5, 4, 3, 2, 1], + [-2, 2, -1, 3, 6], + [-10, 1, 1, -10, -10], + ) + wt = ( + fweights([1, 1/3, 1/3, 1/3, 1]), + fweights([1, 1, 1, 1, 1]), + fweights([1, 1/3, 1/3, 1/3, 1, 1]), + fweights([1/3, 1/3, 1/3, 1, 1, 1]), + fweights([30, 191, 9, 0]), + fweights([10, 1, 1, 1, 9]), + fweights([10, 1, 1, 1, 900]), + fweights([1, 3, 5, 4, 2]), + fweights([2, 2, 5, 1, 2, 2, 1, 6]), + fweights([0.1, 0.1, 0.8]), + fweights([5, 5, 4, 1]), + fweights([30, 56, 144, 24, 55, 43, 67]), + fweights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]), + fweights([12]), + fweights([7, 1, 1, 1, 6]), + fweights([1, 0, 0, 0, 2]), + fweights([1, 2, 3, 4, 5]), + fweights([0.1, 0.2, 0.3, 0.2, 0.1]), + fweights([1, 1, 1, 1, 1]), + ) + quantile_answers = ( + [1.0,3.6000000000000005,6.181818181818182,8.2,10.0], + [1.0,2.0,4.0,7.0,10.0], + [1.0,4.75,8.0,10.833333333333334,15.0], + [1.0,4.75,8.0,10.833333333333334,15.0], + [0.0,6.1387900355871885,11.600000000000001,15.912500000000001,30.0], + [1.0,1.5365853658536586,2.5999999999999996,4.405405405405405,5.0], + [1.0,4.239377950569287,4.492918633712858,4.746459316856429,5.0], + [30.0,38.75,45.714285714285715,52.85714285714286,60.0], + [0.3,0.6903846153846154,1.484,1.7,2.0], + [1.0,2.0,2.0,2.0,2.0], + [2.8,3.3361111111111112,3.4611111111111112,3.581578947368421,3.7], + [45.0,59.88593155893536,100.08846153846153,118.62115384615385,125.0], + 
[2.0,2.0,2.0,2.0,2.0], + [2.3,2.3,2.3,2.3,2.3], + [-10.0,-5.52,-2.5882352941176467,-0.9411764705882351,2.0], + [1.0,1.75,4.25,4.625,5.0], + [1.0,1.625,2.3333333333333335,3.25,5.0], + [-2.0,-0.5384615384615388,1.5384615384615383,2.6999999999999997,6.0], + [-10.0,-10.0,-10.0,1.0,1.0] + ) + p = [0.0, 0.25, 0.5, 0.75, 1.0] + + srand(10) + for i = 1:length(data) + @test quantile(data[i], wt[i], p) ≈ quantile_answers[i] + for j = 1:10 + # order of p does not matter + reorder = sortperm(rand(length(p))) + @test quantile(data[i], wt[i], p[reorder]) ≈ quantile_answers[i][reorder] + end + for j = 1:10 + # order of w does not matter + reorder = sortperm(rand(length(data[i]))) + @test quantile(data[i][reorder], fweights(wt[i][reorder]), p) ≈ quantile_answers[i] + end end - for j = 1:10 - # order of w does not matter - reorder = sortperm(rand(length(data[i]))) - @test quantile(data[i][reorder], fweights(wt[i][reorder]), p) ≈ quantile_answers[i] - end -end -# w = 1 corresponds to base quantile -for i = 1:length(data) - @test quantile(data[i], fweights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) - for j = 1:10 - prandom = rand(4) - @test quantile(data[i], fweights(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) + # w = 1 corresponds to base quantile + for i = 1:length(data) + @test quantile(data[i], fweights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) + for j = 1:10 + prandom = rand(4) + @test quantile(data[i], fweights(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) + end end + + # other syntaxes + v = [7, 1, 2, 4, 10] + w = [1, 1/3, 1/3, 1/3, 1] + answer = 6.181818181818182 + @test quantile(data[1], fweights(w), 0.5) ≈ answer + @test wquantile(data[1], fweights(w), [0.5]) ≈ [answer] + @test wquantile(data[1], fweights(w), 0.5) ≈ answer + @test wquantile(data[1], w, [0.5]) ≈ [answer] + @test wquantile(data[1], w, 0.5) ≈ answer end -# other syntaxes -v = [7, 1, 2, 4, 10] -w = [1, 1/3, 1/3, 1/3, 1] -answer = 
6.181818181818182 -@test quantile(data[1], fweights(w), 0.5) ≈ answer -@test wquantile(data[1], fweights(w), [0.5]) ≈ [answer] -@test wquantile(data[1], fweights(w), 0.5) ≈ answer -@test wquantile(data[1], w, [0.5]) ≈ [answer] -@test wquantile(data[1], w, 0.5) ≈ answer +end # @testset StatsBase.Weights From 4b425701a09b885975bb432c6abdbd117afea3ef Mon Sep 17 00:00:00 2001 From: rofinn Date: Mon, 1 May 2017 18:03:55 -0500 Subject: [PATCH 29/50] Updated test/cov.jl to use `@testset` and added a few extra tests. --- test/cov.jl | 94 +++++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 42 deletions(-) diff --git a/test/cov.jl b/test/cov.jl index 449712f88..3aedc95d9 100644 --- a/test/cov.jl +++ b/test/cov.jl @@ -1,6 +1,7 @@ using StatsBase using Base.Test +@testset "StatsBase.Covariance" begin X = randn(3, 8) Z1 = X .- mean(X, 1) @@ -29,63 +30,72 @@ S2w = Z2w * diagm(w2) * Z2w' Sz1w = X' * diagm(w1) * X Sz2w = X * diagm(w2) * X' -## scattermat +@testset "Scattermat" begin + @test scattermat(X) ≈ S1 + @test scattermat(X, 2) ≈ S2 + @test StatsBase.scattermatm(X, 0) ≈ Sz1 + @test StatsBase.scattermatm(X, 0, 2) ≈ Sz2 -@test scattermat(X) ≈ S1 -@test scattermat(X, 2) ≈ S2 + @test StatsBase.scattermatm(X, mean(X,1)) ≈ S1 + @test StatsBase.scattermatm(X, mean(X,2), 2) ≈ S2 -@test StatsBase.scattermatm(X, 0) ≈ Sz1 -@test StatsBase.scattermatm(X, 0, 2) ≈ Sz2 + @test StatsBase.scattermatm(X, zeros(1,8)) ≈ Sz1 + @test StatsBase.scattermatm(X, zeros(3), 2) ≈ Sz2 -@test StatsBase.scattermatm(X, mean(X,1)) ≈ S1 -@test StatsBase.scattermatm(X, mean(X,2), 2) ≈ S2 + @testset "Weighted" begin + @test scattermat(X, wv1) ≈ S1w + @test scattermat(X, wv2, 2) ≈ S2w -@test StatsBase.scattermatm(X, zeros(1,8)) ≈ Sz1 -@test StatsBase.scattermatm(X, zeros(3), 2) ≈ Sz2 + @test StatsBase.scattermatm(X, 0, wv1) ≈ Sz1w + @test StatsBase.scattermatm(X, 0, wv2, 2) ≈ Sz2w -## weighted scatter mat + @test StatsBase.scattermatm(X, mean(X, wv1, 1), wv1) ≈ S1w + @test 
StatsBase.scattermatm(X, mean(X, wv2, 2), wv2, 2) ≈ S2w -@test scattermat(X, wv1) ≈ S1w -@test scattermat(X, wv2, 2) ≈ S2w + @test StatsBase.scattermatm(X, zeros(1,8), wv1) ≈ Sz1w + @test StatsBase.scattermatm(X, zeros(3), wv2, 2) ≈ Sz2w + end +end -@test StatsBase.scattermatm(X, 0, wv1) ≈ Sz1w -@test StatsBase.scattermatm(X, 0, wv2, 2) ≈ Sz2w +@testset "Weighted Covariance" begin + @test cov(X, wv1, false) ≈ S1w ./ sum(wv1) + @test cov(X, wv2, 2, false) ≈ S2w ./ sum(wv2) -@test StatsBase.scattermatm(X, mean(X, wv1, 1), wv1) ≈ S1w -@test StatsBase.scattermatm(X, mean(X, wv2, 2), wv2, 2) ≈ S2w + @test Base.covm(X, 0, wv1, 1, false) ≈ Sz1w ./ sum(wv1) + @test Base.covm(X, 0, wv2, 2, false) ≈ Sz2w ./ sum(wv2) -@test StatsBase.scattermatm(X, zeros(1,8), wv1) ≈ Sz1w -@test StatsBase.scattermatm(X, zeros(3), wv2, 2) ≈ Sz2w + @test Base.covm(X, mean(X, wv1, 1), wv1, 1, false) ≈ S1w ./ sum(wv1) + @test Base.covm(X, mean(X, wv2, 2), wv2, 2, false) ≈ S2w ./ sum(wv2) -# weighted covariance + @test Base.covm(X, zeros(1,8), wv1, 1, false) ≈ Sz1w ./ sum(wv1) + @test Base.covm(X, zeros(3), wv2, 2, false) ≈ Sz2w ./ sum(wv2) +end -@test cov(X, wv1, false) ≈ S1w ./ sum(wv1) -@test cov(X, wv2, 2, false) ≈ S2w ./ sum(wv2) +@testset "Mean and covariance" begin + (m, C) = mean_and_cov(X, false) + @test m == mean(X, 1) + @test C == cov(X, 1, false) -@test Base.covm(X, 0, wv1, 1, false) ≈ Sz1w ./ sum(wv1) -@test Base.covm(X, 0, wv2, 2, false) ≈ Sz2w ./ sum(wv2) + (m, C) = mean_and_cov(X, 1, false) + @test m == mean(X, 1) + @test C == cov(X, 1, false) -@test Base.covm(X, mean(X, wv1, 1), wv1, 1, false) ≈ S1w ./ sum(wv1) -@test Base.covm(X, mean(X, wv2, 2), wv2, 2, false) ≈ S2w ./ sum(wv2) + (m, C) = mean_and_cov(X, 2, false) + @test m == mean(X, 2) + @test C == cov(X, 2, false) -@test Base.covm(X, zeros(1,8), wv1, 1, false) ≈ Sz1w ./ sum(wv1) -@test Base.covm(X, zeros(3), wv2, 2, false) ≈ Sz2w ./ sum(wv2) + (m, C) = mean_and_cov(X, wv1, false) + @test m == mean(X, wv1, 1) + @test C == 
cov(X, wv1, 1, false) -# mean_and_cov + (m, C) = mean_and_cov(X, wv1, 1, false) + @test m == mean(X, wv1, 1) + @test C == cov(X, wv1, 1, false) -(m, C) = mean_and_cov(X, 1, false) -@test m == mean(X, 1) -@test C == cov(X, 1, false) + (m, C) = mean_and_cov(X, wv2, 2, false) + @test m == mean(X, wv2, 2) + @test C == cov(X, wv2, 2, false) +end -(m, C) = mean_and_cov(X, 2, false) -@test m == mean(X, 2) -@test C == cov(X, 2, false) - -(m, C) = mean_and_cov(X, wv1, 1, false) -@test m == mean(X, wv1, 1) -@test C == cov(X, wv1, 1, false) - -(m, C) = mean_and_cov(X, wv2, 2, false) -@test m == mean(X, wv2, 2) -@test C == cov(X, wv2, 2, false) +end # @testset "StatsBase.Covariance" From dd9e94e7d3e8a5303bc55501036602a638a90885 Mon Sep 17 00:00:00 2001 From: rofinn Date: Tue, 2 May 2017 16:42:41 -0500 Subject: [PATCH 30/50] Switched positional corrected back to a keyword with `Union{Bool, Void}` deprecation approach. * Reverted some of the code reorganization changes in moments.jl and cov.jl to make the PR review easier * Added a DepBool alias for `Union{Bool, Void}` * Added a `depcheck` method for handling deprecation of unset `corrected` keyword.
--- .travis.yml | 6 +- src/common.jl | 16 +++++ src/cov.jl | 76 ++++++++++----------- src/deprecates.jl | 107 ----------------------------- src/moments.jl | 167 ++++++++++++++++++++++++++-------------------- test/cov.jl | 34 +++++----- test/moments.jl | 100 +++++++++++++-------------- 7 files changed, 217 insertions(+), 289 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2626aacd5..ef0f418ac 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,8 +11,8 @@ notifications: git: depth: 999999 # Uncomment the following lines to override the default test script -script: - - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi - - julia --depwarn=no -e 'Pkg.clone(pwd()); Pkg.build("StatsBase"); Pkg.test("StatsBase"; coverage=true)' +#script: +# - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi +# - julia -e 'Pkg.clone(pwd()); Pkg.build("StatsBase"); Pkg.test("StatsBase"; coverage=true)' after_success: - julia -e 'cd(Pkg.dir("StatsBase")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())' diff --git a/src/common.jl b/src/common.jl index 4b8ff439a..2124279c2 100644 --- a/src/common.jl +++ b/src/common.jl @@ -26,3 +26,19 @@ @compat fptype{T<:Union{Float64,Int32,UInt32,Int64,UInt64,Int128,UInt128}}(::Type{T}) = Float64 fptype(::Type{Complex64}) = Complex64 fptype(::Type{Complex128}) = Complex128 + +# A convenient typealias for deprecating default corrected Bool +@compat const DepBool = Union{Bool, Void} + +const CORRECTED_DEP_MSG = string("Will default to `corrected=true` in the future. ", + "Use `corrected=false` for previous behaviour.") + +function depcheck(fname::Symbol, b::DepBool; msg::AbstractString=CORRECTED_DEP_MSG, + default::Bool=false) + if b === nothing + Base.depwarn(string(fname, ": ", msg), fname) + default + else + b + end +end diff --git a/src/cov.jl b/src/cov.jl index ab408c0c1..58837b5f4 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -22,6 +22,14 @@ function _scalevars(x::DenseMatrix, s::DenseVector,
vardim::Int) end ## scatter matrix + + +scattermat_zm(x::DenseMatrix, vardim::Int) = Base.unscaled_covzm(x, vardim) + + +scattermat_zm(x::DenseMatrix, wv::AbstractWeights, vardim::Int) = + _symmetrize!(Base.unscaled_covzm(x, _scalevars(x, values(wv), vardim), vardim)) + """ scattermat(X, [wv::AbstractWeights]; mean=nothing, vardim=1) @@ -37,22 +45,8 @@ that the data are centered and hence there's no need to subtract the mean. When `vardim = 1`, the variables are considered columns with observations in rows; when `vardim = 2`, variables are in rows with observations in columns. """ -scattermatm(x::DenseMatrix, mean, vardim::Int=1) = - scattermat_zm(x .- mean, vardim) +function scattermat end -scattermatm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1) = - scattermat_zm(x .- mean, wv, vardim) - -scattermat(x::DenseMatrix, vardim::Int=1) = - scattermatm(x, Base.mean(x, vardim), vardim) - -scattermat(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = - scattermatm(x, Base.mean(x, wv, vardim), wv, vardim) - -scattermat_zm(x::DenseMatrix, vardim::Int) = Base.unscaled_covzm(x, vardim) - -scattermat_zm(x::DenseMatrix, wv::AbstractWeights, vardim::Int) = - _symmetrize!(Base.unscaled_covzm(x, _scalevars(x, values(wv), vardim), vardim)) """ cov(X, wv::AbstractWeights, [vardim, corrected]) @@ -66,17 +60,8 @@ matrix (`corrected=false`) can be computed by multiplying `scattermat(X, wv)` by * FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` * ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ -Base.cov(x::DenseMatrix, wv::AbstractWeights, corrected::Bool) = - Base.covm(x, Base.mean(x, wv, 1), wv, 1, corrected) +cov -Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int, corrected::Bool) = - Base.covm(x, Base.mean(x, wv, vardim), wv, vardim, corrected) - -Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, corrected::Bool) = - scale!(scattermatm(x, mean, wv, 1), varcorrection(wv, corrected)) - -Base.covm(x::DenseMatrix, mean, 
wv::AbstractWeights, vardim::Int, corrected::Bool) = - scale!(scattermatm(x, mean, wv, vardim), varcorrection(wv, corrected)) """ mean_and_cov(x, [wv::AbstractWeights, vardim, corrected]) -> (mean, cov) @@ -88,22 +73,37 @@ Finally, bias correction can be applied to the covariance calculation if `corrected=true`. See `cov` documentation for more details. """ -function mean_and_cov(x::DenseMatrix, corrected::Bool=true) - m = mean(x, 1) - return m, Base.covm(x, m, 1, corrected) -end +function mean_and_cov end + + +scattermatm(x::DenseMatrix, mean, vardim::Int=1) = + scattermat_zm(x .- mean, vardim) + +scattermatm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1) = + scattermat_zm(x .- mean, wv, vardim) -function mean_and_cov(x::DenseMatrix, vardim::Int, corrected::Bool=true) +scattermat(x::DenseMatrix, vardim::Int=1) = + scattermatm(x, Base.mean(x, vardim), vardim) + +scattermat(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = + scattermatm(x, Base.mean(x, wv, vardim), wv, vardim) + +## weighted cov +Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1; + corrected::DepBool=nothing) = + scale!(scattermatm(x, mean, wv, vardim), varcorrection(wv, depcheck(:covm, corrected))) + + +Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected::DepBool=nothing) = + Base.covm(x, Base.mean(x, wv, vardim), wv, vardim; corrected=depcheck(:cov, corrected)) + + +function mean_and_cov(x::DenseMatrix, vardim::Int=1; corrected::Bool=true) m = mean(x, vardim) return m, Base.covm(x, m, vardim, corrected) end - -function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, corrected::Bool) - m = mean(x, wv, 1) - return m, Base.cov(x, wv, 1, corrected) -end - -function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int, corrected::Bool) +function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; + corrected::DepBool=nothing) m = mean(x, wv, vardim) - return m, Base.cov(x, wv, vardim, corrected) + return m, Base.cov(x, wv, 
vardim; corrected=depcheck(:mean_and_cov, corrected)) end diff --git a/src/deprecates.jl b/src/deprecates.jl index fdcce7abd..a84b6a6f1 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -79,110 +79,3 @@ function varcorrection(w::WeightVec, corrected::Bool=false) corrected && throw(ArgumentError("WeightVec does not support bias correction.")) 1 / w.sum end - -_correction_dep_msg(fname) = - string(fname, " will default to `corrected=true` in the future.") - -# The following methods are for wrapping the deprecated `correction=false` behaviour. -# When we default to `correction=true` these methods should be removed in favour of -# adding `corrected::Bool=true` in the appropriate methods. - -function Base.varm(v::RealArray, wv::AbstractWeights, m::Real) - Base.depwarn(_correction_dep_msg("`varm`"), :varm) - varm(v, wv, m, false) -end - -function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int) - Base.depwarn(_correction_dep_msg("`varm`"), :varm) - varm(A, wv, M, dim, false) -end - -function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing) - Base.depwarn(_correction_dep_msg("`var`"), :var) - var(v, wv, false; mean=mean) -end - -function Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) - Base.depwarn(_correction_dep_msg("`var`"), :var) - var(A, wv, dim, false; mean=mean) -end - -function Base.stdm(v::RealArray, wv::AbstractWeights, m::Real) - Base.depwarn(_correction_dep_msg("`stdm`"), :stdm) - stdm(v, wv, m, false) -end - -function Base.stdm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int) - Base.depwarn(_correction_dep_msg("`stdm`"), :stdm) - stdm(A, wv, M, dim, false) -end - -function Base.std(v::RealArray, wv::AbstractWeights; mean=nothing) - Base.depwarn(_correction_dep_msg("`std`"), :std) - std(v, wv, false; mean=mean) -end - -function Base.std(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing) - Base.depwarn(_correction_dep_msg("`std`"), :std) - std(A, wv, dim, false; mean=mean) -end - 
-function mean_and_var(A::RealArray, wv::AbstractWeights) - m = mean(A, wv) - v = varm(A, wv, m) - m, v -end - -function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int) - m = mean(A, wv, dim) - v = varm(A, wv, m, dim) - m, v -end - -function mean_and_std(A::RealArray, wv::AbstractWeights) - m = mean(A, wv) - s = stdm(A, wv, m) - m, s -end - -function mean_and_std(A::RealArray, dim::Int) - m = mean(A, dim) - s = stdm(A, m, dim) - m, s -end - -function mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int) - m = mean(A, wv, dim) - s = stdm(A, wv, m, dim) - m, s -end - -function Base.cov(x::DenseMatrix, wv::AbstractWeights) - Base.depwarn(_correction_dep_msg("`cov`"), :cov) - Base.covm(x, Base.mean(x, wv, 1), wv, 1, false) -end - -function Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int) - Base.depwarn(_correction_dep_msg("`cov`"), :cov) - Base.covm(x, Base.mean(x, wv, vardim), wv, vardim, false) -end - -function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights) - Base.depwarn(_correction_dep_msg("`covm`"), :covm) - scale!(scattermatm(x, mean, wv, 1), varcorrection(wv, false)) -end - -function Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int) - Base.depwarn(_correction_dep_msg("`covm`"), :covm) - scale!(scattermatm(x, mean, wv, vardim), varcorrection(wv, false)) -end - -function mean_and_cov(x::DenseMatrix, wv::AbstractWeights) - m = mean(x, wv, 1) - return m, Base.cov(x, wv, 1) -end - -function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int) - m = mean(x, wv, vardim) - return m, Base.cov(x, wv, vardim) -end diff --git a/src/moments.jl b/src/moments.jl index aaa405617..5ee4a9f86 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -25,24 +25,8 @@ An unbiased weighted variance (`corrected=true`) is dependent on the type of wei * FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` * ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ -Base.varm(v::RealArray, wv::AbstractWeights, m::Real, 
corrected::Bool) = - _moment2(v, wv, m; corrected=corrected) - -function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int, corrected) - @static if VERSION < v"0.6.0-dev.1121" - Base.varm!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, M, dim, - corrected) - else - Base.varm!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, M, - dim, corrected) - end -end - -function Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, - dim::Int, corrected::Bool) - scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), - varcorrection(wv, corrected)) -end +Base.varm(v::RealArray, wv::AbstractWeights, m::Real; corrected::DepBool=nothing) = + _moment2(v, wv, m; corrected=depcheck(:varm, corrected)) """ var(x, wv::AbstractWeights, [dim, corrected]; mean=nothing) @@ -68,33 +52,37 @@ An unbiased weighted variance (`corrected=true`) is dependent on the type of wei * FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` * ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ -function Base.var(v::RealArray, wv::AbstractWeights, corrected::Bool; mean=nothing) +function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, + corrected::DepBool=nothing) + corrected = depcheck(:var, corrected) + if mean == 0 - varm(v, wv, 0, corrected) + varm(v, wv, 0; corrected=corrected) elseif mean == nothing - varm(v, wv, Base.mean(v, wv), corrected) + varm(v, wv, Base.mean(v, wv); corrected=corrected) else - varm(v, wv, mean, corrected) + varm(v, wv, mean; corrected=corrected) end end -function Base.var(A::RealArray, wv::AbstractWeights, dim::Int, corrected::Bool; - mean=nothing) - @static if VERSION < v"0.6.0-dev.1121" - var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, dim, corrected; - mean=mean) - else - var!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, dim, - corrected; mean=mean) - end +## var along dim + +function Base.varm!(R::AbstractArray, A::RealArray, 
wv::AbstractWeights, M::RealArray, + dim::Int; corrected::DepBool=nothing) + corrected = depcheck(:varm!, corrected) + scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), + varcorrection(wv, corrected)) end -function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int, - corrected::Bool; mean=nothing) +function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; + mean=nothing, corrected::DepBool=nothing) + corrected = depcheck(:var!, corrected) + if mean == 0 - Base.varm!(R, A, wv, Base.reducedim_initarray(A, dim, 0, eltype(R)), dim, corrected) + Base.varm!(R, A, wv, Base.reducedim_initarray(A, dim, 0, eltype(R)), dim; + corrected=corrected) elseif mean == nothing - Base.varm!(R, A, wv, Base.mean(A, wv, dim), dim, corrected) + Base.varm!(R, A, wv, Base.mean(A, wv, dim), dim; corrected=corrected) else # check size of mean for i = 1:ndims(A) @@ -106,7 +94,33 @@ function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int, dM == dA || throw(DimensionMismatch("Incorrect size of mean.")) end end - Base.varm!(R, A, wv, mean, dim, corrected) + Base.varm!(R, A, wv, mean, dim; corrected=corrected) + end +end + +function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; + corrected::DepBool=nothing) + corrected = depcheck(:varm, corrected) + + @static if VERSION < v"0.6.0-dev.1121" + Base.varm!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, M, dim; + corrected=corrected) + else + Base.varm!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, M, + dim; corrected=corrected) + end +end + +function Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, + corrected::DepBool=nothing) + corrected = depcheck(:var, corrected) + + @static if VERSION < v"0.6.0-dev.1121" + var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, dim; mean=mean, + corrected=corrected) + else + var!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, dim; + 
mean=mean, corrected=corrected) end end @@ -135,14 +149,8 @@ An unbiased standard deviation (`corrected=true`) is dependent on the type of we * FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` * ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ -Base.stdm(v::RealArray, wv::AbstractWeights, m::Real, corrected::Bool) = - sqrt(varm(v, wv, m, corrected)) - -Base.stdm(v::RealArray, m::RealArray, dim::Int, corrected::Bool=true) = - Base.sqrt!(varm(v, m, dim, corrected=corrected)) - -Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int, corrected::Bool) = - sqrt.(varm(v, wv, m, dim, corrected)) +Base.stdm(v::RealArray, wv::AbstractWeights, m::Real; corrected::DepBool=nothing) = + sqrt(varm(v, wv, m, corrected=depcheck(:stdm, corrected))) """ std(v, wv::AbstractWeights, [dim, corrected]; mean=nothing) @@ -168,11 +176,19 @@ An unbiased standard deviation (`corrected=true`) is dependent on the type of we * FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` * ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ -Base.std(v::RealArray, wv::AbstractWeights, corrected::Bool; mean=nothing) = - sqrt.(var(v, wv, corrected; mean=mean)) +Base.std(v::RealArray, wv::AbstractWeights; mean=nothing, corrected::DepBool=nothing) = + sqrt.(var(v, wv; mean=mean, corrected=depcheck(:std, corrected))) + +Base.stdm(v::RealArray, m::RealArray, dim::Int; corrected::DepBool=nothing) = + Base.sqrt!(varm(v, m, dim; corrected=corrected)) + +Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int; + corrected::DepBool=nothing) = + sqrt.(varm(v, wv, m, dim; corrected=depcheck(:stdm, corrected))) -Base.std(v::RealArray, wv::AbstractWeights, dim::Int, corrected::Bool; mean=nothing) = - sqrt.(var(v, wv, dim, corrected; mean=mean)) +Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, + corrected::DepBool=nothing) = + sqrt.(var(v, wv, dim; mean=mean, corrected=depcheck(:std, corrected))) ##### Fused statistics """ @@ -183,30 
+199,12 @@ Return the mean and variance of a real-valued array `x`, optionally over a dimen can be applied to the variance calculation if `corrected=true`. See `var` documentation for more details. """ -function mean_and_var(A::RealArray, corrected::Bool=true) +function mean_and_var(A::RealArray; corrected::Bool=true) m = mean(A) v = varm(A, m; corrected=corrected) m, v end -function mean_and_var(A::RealArray, wv::AbstractWeights, corrected::Bool) - m = mean(A, wv) - v = varm(A, wv, m, corrected) - m, v -end - -function mean_and_var(A::RealArray, dim::Int, corrected::Bool=true) - m = mean(A, dim) - v = varm(A, m, dim; corrected=corrected) - m, v -end - -function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int, corrected) - m = mean(A, wv, dim) - v = varm(A, wv, m, dim, corrected) - m, v -end - """ mean_and_std(x, [wv::AbstractWeights, dim, corrected]) -> (mean, std) @@ -216,30 +214,51 @@ to weight the estimates. Finally, bias correction can be applied to the standard deviation calculation if `corrected=true`. See `std` documentation for more details. 
""" -function mean_and_std(A::RealArray, corrected::Bool=true) +function mean_and_std(A::RealArray; corrected::Bool=true) m = mean(A) s = stdm(A, m; corrected=corrected) m, s end -function mean_and_std(A::RealArray, wv::AbstractWeights, corrected::Bool) +function mean_and_var(A::RealArray, wv::AbstractWeights; corrected::DepBool=nothing) + m = mean(A, wv) + v = varm(A, wv, m; corrected=depcheck(:mean_and_var, corrected)) + m, v +end +function mean_and_std(A::RealArray, wv::AbstractWeights; corrected::DepBool=nothing) m = mean(A, wv) - s = stdm(A, wv, m, corrected) + s = stdm(A, wv, m; corrected=depcheck(:mean_and_std, corrected)) m, s end -function mean_and_std(A::RealArray, dim::Int, corrected::Bool) + +function mean_and_var(A::RealArray, dim::Int; corrected::Bool=true) m = mean(A, dim) - s = stdm(A, m, dim, corrected) + v = varm(A, m, dim; corrected=corrected) + m, v +end +function mean_and_std(A::RealArray, dim::Int; corrected::Bool=true) + m = mean(A, dim) + s = stdm(A, m, dim; corrected=corrected) m, s end -function mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int, corrected::Bool) + +function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int; + corrected::DepBool=nothing) m = mean(A, wv, dim) - s = stdm(A, wv, m, dim, corrected) + v = varm(A, wv, m, dim; corrected=depcheck(:mean_and_var, corrected)) + m, v +end +function mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int; + corrected::DepBool=nothing) + m = mean(A, wv, dim) + s = stdm(A, wv, m, dim; corrected=depcheck(:mean_and_std, corrected)) m, s end + + ##### General central moment function _moment2(v::RealArray, m::Real; corrected=false) n = length(v) diff --git a/test/cov.jl b/test/cov.jl index 3aedc95d9..a3ae7ffcb 100644 --- a/test/cov.jl +++ b/test/cov.jl @@ -59,43 +59,43 @@ Sz2w = X * diagm(w2) * X' end @testset "Weighted Covariance" begin - @test cov(X, wv1, false) ≈ S1w ./ sum(wv1) - @test cov(X, wv2, 2, false) ≈ S2w ./ sum(wv2) + @test cov(X, wv1; corrected=false) ≈ S1w ./ 
sum(wv1) + @test cov(X, wv2, 2; corrected=false) ≈ S2w ./ sum(wv2) - @test Base.covm(X, 0, wv1, 1, false) ≈ Sz1w ./ sum(wv1) - @test Base.covm(X, 0, wv2, 2, false) ≈ Sz2w ./ sum(wv2) + @test Base.covm(X, 0, wv1, 1; corrected=false) ≈ Sz1w ./ sum(wv1) + @test Base.covm(X, 0, wv2, 2; corrected=false) ≈ Sz2w ./ sum(wv2) - @test Base.covm(X, mean(X, wv1, 1), wv1, 1, false) ≈ S1w ./ sum(wv1) - @test Base.covm(X, mean(X, wv2, 2), wv2, 2, false) ≈ S2w ./ sum(wv2) + @test Base.covm(X, mean(X, wv1, 1), wv1, 1; corrected=false) ≈ S1w ./ sum(wv1) + @test Base.covm(X, mean(X, wv2, 2), wv2, 2; corrected=false) ≈ S2w ./ sum(wv2) - @test Base.covm(X, zeros(1,8), wv1, 1, false) ≈ Sz1w ./ sum(wv1) - @test Base.covm(X, zeros(3), wv2, 2, false) ≈ Sz2w ./ sum(wv2) + @test Base.covm(X, zeros(1,8), wv1, 1; corrected=false) ≈ Sz1w ./ sum(wv1) + @test Base.covm(X, zeros(3), wv2, 2; corrected=false) ≈ Sz2w ./ sum(wv2) end @testset "Mean and covariance" begin - (m, C) = mean_and_cov(X, false) + (m, C) = mean_and_cov(X; corrected=false) @test m == mean(X, 1) @test C == cov(X, 1, false) - (m, C) = mean_and_cov(X, 1, false) + (m, C) = mean_and_cov(X, 1; corrected=false) @test m == mean(X, 1) @test C == cov(X, 1, false) - (m, C) = mean_and_cov(X, 2, false) + (m, C) = mean_and_cov(X, 2; corrected=false) @test m == mean(X, 2) @test C == cov(X, 2, false) - (m, C) = mean_and_cov(X, wv1, false) + (m, C) = mean_and_cov(X, wv1; corrected=false) @test m == mean(X, wv1, 1) - @test C == cov(X, wv1, 1, false) + @test C == cov(X, wv1, 1; corrected=false) - (m, C) = mean_and_cov(X, wv1, 1, false) + (m, C) = mean_and_cov(X, wv1, 1; corrected=false) @test m == mean(X, wv1, 1) - @test C == cov(X, wv1, 1, false) + @test C == cov(X, wv1, 1; corrected=false) - (m, C) = mean_and_cov(X, wv2, 2, false) + (m, C) = mean_and_cov(X, wv2, 2; corrected=false) @test m == mean(X, wv2, 2) - @test C == cov(X, wv2, 2, false) + @test C == cov(X, wv2, 2; corrected=false) end end # @testset "StatsBase.Covariance" diff --git 
a/test/moments.jl b/test/moments.jl index b3f33c9a5..c5fe79aa9 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -13,39 +13,39 @@ weight_funcs = (aweights, fweights, pweights) @testset "Variance with $f" for f in weight_funcs wv = f(w) m = mean(x, wv) - @test var(x, wv, false) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) - @test var(x, wv, false; mean=0) ≈ sum(abs2.(x), wv) ./ sum(wv) - @test var(x, wv, false; mean=1.0) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) + @test var(x, wv; corrected=false) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) + @test var(x, wv; mean=0, corrected=false) ≈ sum(abs2.(x), wv) ./ sum(wv) + @test var(x, wv; mean=1.0, corrected=false) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) end @testset "Standard Deviation with $f" for f in weight_funcs wv = f(w) m = mean(x, wv) - @test std(x, wv, false) ≈ sqrt(var(x, wv, false)) - @test std(x, wv, false; mean=0) ≈ sqrt(var(x, wv, false; mean=0)) - @test std(x, wv, false; mean=1.0) ≈ sqrt(var(x, wv, false; mean=1.0)) + @test std(x, wv; corrected=false) ≈ sqrt(var(x, wv; corrected=false)) + @test std(x, wv; mean=0, corrected=false) ≈ sqrt(var(x, wv; mean=0, corrected=false)) + @test std(x, wv; mean=1.0, corrected=false) ≈ sqrt(var(x, wv; mean=1.0, corrected=false)) end @testset "Mean and Variance with $f" for f in weight_funcs wv = f(w) - (m, v) = mean_and_var(x, false) + (m, v) = mean_and_var(x; corrected=false) @test m == mean(x) - @test v == var(x, corrected=false) + @test v == var(x; corrected=corrected=false) - (m, v) = mean_and_var(x, wv, false) + (m, v) = mean_and_var(x, wv; corrected=false) @test m == mean(x, wv) - @test v == var(x, wv, false) + @test v == var(x, wv; corrected=false) end @testset "Mean and Standard Deviation with $f" for f in weight_funcs wv = f(w) - (m, s) = mean_and_std(x, false) + (m, s) = mean_and_std(x; corrected=false) @test m == mean(x) - @test s == std(x, corrected=false) + @test s == std(x; corrected=false) - (m, s) = mean_and_std(x, wv, false) + (m, s) = mean_and_std(x, wv; corrected=false) 
@test m == mean(x, wv) - @test s == std(x, wv, false) + @test s == std(x, wv; corrected=false) end end @@ -60,42 +60,42 @@ weight_funcs = (aweights, fweights, pweights) wv = weight_funcs[i](w) m = mean(x, wv) - @test var(x, wv, true) ≈ expected[i] - @test var(x, wv, true; mean=0) ≈ expected_0[i] - @test var(x, wv, true; mean=1.0) ≈ expected_1[i] + @test var(x, wv; corrected=true) ≈ expected[i] + @test var(x, wv; mean=0, corrected=true) ≈ expected_0[i] + @test var(x, wv; mean=1.0, corrected=true) ≈ expected_1[i] end end @testset "Standard Deviation with $f" for f in weight_funcs wv = f(w) m = mean(x, wv) - @test std(x, wv, true) ≈ sqrt(var(x, wv, true)) - @test std(x, wv, true; mean=0) ≈ sqrt(var(x, wv, true; mean=0)) - @test std(x, wv, true; mean=1.0) ≈ sqrt(var(x, wv, true; mean=1.0)) + @test std(x, wv; corrected=true) ≈ sqrt(var(x, wv; corrected=true)) + @test std(x, wv; mean=0, corrected=true) ≈ sqrt(var(x, wv; mean=0, corrected=true)) + @test std(x, wv; mean=1.0, corrected=true) ≈ sqrt(var(x, wv; mean=1.0, corrected=true)) end @testset "Mean and Variance with $f" for f in weight_funcs wv = f(w) - (m, v) = mean_and_var(x, true) + (m, v) = mean_and_var(x; corrected=true) @test m == mean(x) - @test v == var(x, corrected=true) + @test v == var(x; corrected=true) - (m, v) = mean_and_var(x, wv, true) + (m, v) = mean_and_var(x, wv; corrected=true) @test m == mean(x, wv) - @test v == var(x, wv, true) + @test v == var(x, wv; corrected=true) end @testset "Mean and Standard Deviation with $f" for f in weight_funcs wv = f(w) - (m, s) = mean_and_std(x, true) + (m, s) = mean_and_std(x; corrected=true) @test m == mean(x) - @test s == std(x, corrected=true) + @test s == std(x; corrected=true) - (m, s) = mean_and_std(x, wv, true) + (m, s) = mean_and_std(x, wv; corrected=true) @test m == mean(x, wv) - @test s == std(x, wv, true) + @test s == std(x, wv; corrected=true) end end end @@ -109,47 +109,47 @@ weight_funcs = (aweights, fweights, pweights) m1 = mean(x, wv1, 1) m2 = 
mean(x, wv2, 2) - @test var(x, wv1, 1, false; mean=0) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) - @test var(x, wv2, 2, false; mean=0) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) + @test var(x, wv1, 1; mean=0, corrected=false) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) + @test var(x, wv2, 2; mean=0, corrected=false) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) - @test var(x, wv1, 1, false; mean=m1) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) - @test var(x, wv2, 2, false; mean=m2) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) + @test var(x, wv1, 1; mean=m1, corrected=false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) + @test var(x, wv2, 2; mean=m2, corrected=false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) - @test var(x, wv1, 1, false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) - @test var(x, wv2, 2, false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) + @test var(x, wv1, 1; corrected=false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) + @test var(x, wv2, 2; corrected=false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) - @test std(x, wv1, 1, false) ≈ sqrt.(var(x, wv1, 1, false)) - @test std(x, wv2, 2, false) ≈ sqrt.(var(x, wv2, 2, false)) - @test std(x, wv1, 1, false; mean=0) ≈ sqrt.(var(x, wv1, 1, false; mean=0)) - @test std(x, wv2, 2, false; mean=0) ≈ sqrt.(var(x, wv2, 2, false; mean=0)) - @test std(x, wv1, 1, false; mean=m1) ≈ sqrt.(var(x, wv1, 1, false; mean=m1)) - @test std(x, wv2, 2, false; mean=m2) ≈ sqrt.(var(x, wv2, 2, false; mean=m2)) + @test std(x, wv1, 1; corrected=false) ≈ sqrt.(var(x, wv1, 1; corrected=false)) + @test std(x, wv2, 2; corrected=false) ≈ sqrt.(var(x, wv2, 2; corrected=false)) + @test std(x, wv1, 1; mean=0, corrected=false) ≈ sqrt.(var(x, wv1, 1; mean=0, corrected=false)) + @test std(x, wv2, 2; mean=0, corrected=false) ≈ sqrt.(var(x, wv2, 2; mean=0, corrected=false)) + @test std(x, wv1, 1; mean=m1, corrected=false) ≈ sqrt.(var(x, wv1, 1; mean=m1, corrected=false)) + @test std(x, wv2, 2; mean=m2, corrected=false) ≈ sqrt.(var(x, wv2, 2; mean=m2, corrected=false)) for d in 
1:2 - (m, v) = mean_and_var(x, d, false) + (m, v) = mean_and_var(x, d; corrected=false) @test m == mean(x, d) @test v == var(x, d; corrected=false) - (m, s) = mean_and_std(x, d, false) + (m, s) = mean_and_std(x, d; corrected=false) @test m == mean(x, d) @test s == std(x, d; corrected=false) end - (m, v) = mean_and_var(x, wv1, 1, true) + (m, v) = mean_and_var(x, wv1, 1; corrected=true) @test m == mean(x, wv1, 1) - @test v == var(x, wv1, 1, true) + @test v == var(x, wv1, 1; corrected=true) - (m, v) = mean_and_var(x, wv2, 2, false) + (m, v) = mean_and_var(x, wv2, 2; corrected=false) @test m == mean(x, wv2, 2) - @test v == var(x, wv2, 2, false) + @test v == var(x, wv2, 2; corrected=false) - (m, s) = mean_and_std(x, wv1, 1, false) + (m, s) = mean_and_std(x, wv1, 1; corrected=false) @test m == mean(x, wv1, 1) - @test s == std(x, wv1, 1, false) + @test s == std(x, wv1, 1; corrected=false) - (m, s) = mean_and_std(x, wv2, 2, false) + (m, s) = mean_and_std(x, wv2, 2; corrected=false) @test m == mean(x, wv2, 2) - @test s == std(x, wv2, 2, false) + @test s == std(x, wv2, 2; corrected=false) end end From a627e013f829ec71ef035dfa63cca03821d2507d Mon Sep 17 00:00:00 2001 From: rofinn Date: Wed, 3 May 2017 10:03:12 -0500 Subject: [PATCH 31/50] Added generic `Weights` type and deprecated `WeightVec` to it. Also, did a bunch of docs cleanup from the previous commit. 
--- src/StatsBase.jl | 4 +- src/common.jl | 6 +- src/cov.jl | 17 +++--- src/deprecates.jl | 37 +----------- src/moments.jl | 144 +++++++++++++++++++++------------------------- src/weights.jl | 56 ++++++++++++++---- 6 files changed, 123 insertions(+), 141 deletions(-) diff --git a/src/StatsBase.jl b/src/StatsBase.jl index 87112a130..afeff5a7f 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -18,11 +18,11 @@ module StatsBase ## weights AbstractWeights, # abstract type to represent any weight vector - WeightVec, # deprecated type to represent any weight vector + Weights, # to represent a generic weight vector AnalyticWeights, # to represent an analytic/precision/reliability weight vector FrequencyWeights, # to representing a frequency/case/repeat weight vector ProbabilityWeights, # to representing a probability/sampling weight vector - weights, # deprecated function for constructing a WeightVec vector + weights, # construct a generic Weights vector aweights, # construct an AnalyticWeights vector fweights, # construct a FrequencyWeights vector pweights, # construct a ProbabilityWeights vector diff --git a/src/common.jl b/src/common.jl index 2124279c2..d556322f0 100644 --- a/src/common.jl +++ b/src/common.jl @@ -30,13 +30,13 @@ fptype(::Type{Complex128}) = Complex128 # A convenient typealias for deprecating default corrected Bool @compat const DepBool = Union{Bool, Void} -const CORRECTED_DEP_MSG = string("Will default to `corrected=true` in the future.", - "Use `corrected=false` for previous behaviour.") +const CORRECTED_DEP_MSG = string("will default to corrected=true in the future. 
", + "Use corrected=false for previous behaviour.") function depcheck(fname::Symbol, b::DepBool; msg::AbstractString=CORRECTED_DEP_MSG, default::Bool=false) if b == nothing - Base.depwarn(string(fname, ": ", msg), fname) + Base.depwarn(string(fname, " ", msg), fname) default else b diff --git a/src/cov.jl b/src/cov.jl index 58837b5f4..25d44fdfe 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -49,29 +49,28 @@ function scattermat end """ - cov(X, wv::AbstractWeights, [vardim, corrected]) + cov(X, wv::AbstractWeights; vardim=1, corrected=false) Compute the weighted covariance matrix. Similar to `var` and `std` the biased covariance matrix (`corrected=false`) can be computed by multiplying `scattermat(X, wv)` by -``\frac{1}{\sum{w}}`` to normalize. However, the unbiased covariance matrix +``\\frac{1}{\\sum{w}}`` to normalize. However, the unbiased covariance matrix (`corrected=true`) is dependent on the type of weights used: -* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` -* FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` +* AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` where `n = length(w)` """ cov """ - mean_and_cov(x, [wv::AbstractWeights, vardim, corrected]) -> (mean, cov) + mean_and_cov(x, [wv::AbstractWeights]; vardim=1, corrected=false) -> (mean, cov) Return the mean and covariance matrix as a tuple. A weighting vector `wv` can be specified. `vardim` that designates whether the variables are columns in the matrix (`1`) or rows (`2`). -Finally, bias correction can be applied to the covariance calculation if -`corrected=true`. -See `cov` documentation for more details. +Finally, bias correction will be applied to the covariance calculation if +`corrected=true`. See [`cov`](@ref) documentation for more details. 
""" function mean_and_cov end diff --git a/src/deprecates.jl b/src/deprecates.jl index a84b6a6f1..2336e4a8a 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -1,5 +1,6 @@ import Base.@deprecate import Base.depwarn +import Base.@deprecate_binding import Base.varm, Base.stdm @deprecate varm(v::RealArray, m::Real, wv::AbstractWeights) varm(v, wv, m) @@ -44,38 +45,4 @@ findat(a::AbstractArray, b::AbstractArray) = findat!(Array{Int}(size(b)), a, b) @deprecate df(obj::StatisticalModel) dof(obj) @deprecate df_residual(obj::StatisticalModel) dof_residual(obj) -@weights WeightVec - -""" - WeightVec(vs, wsum=sum(vs)) - -Construct a `WeightVec` with weight values `vs` and sum of weights `wsum`. -""" -function WeightVec{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) - new_types = "AnalyticWeights, FrequencyWeights or ProbabilityWeights" - Base.depwarn("WeightVec is deprecated, use $new_types instead", :WeightVec) - WeightVec{S, eltype(vs), V}(vs, s) -end - -""" - weights(vs) - -Construct a `WeightVec` from a given array. -""" -function weights(vs::RealArray) - Base.depwarn("weights is deprecated, use aweights, fweights or pweights instead", :weights) - v = vec(vs) - s = sum(v) - WeightVec{typeof(s), eltype(v), typeof(v)}(v, s) -end - -""" - varcorrection(w::WeightVec, corrected=false) - -Returns ``\\frac{1}{\sum w}`` when corrected is false and throws an `ArgumentError` -when correction is true. -""" -function varcorrection(w::WeightVec, corrected::Bool=false) - corrected && throw(ArgumentError("WeightVec does not support bias correction.")) - 1 / w.sum -end +@deprecate_binding WeightVec Weights diff --git a/src/moments.jl b/src/moments.jl index 5ee4a9f86..c34b66505 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -2,55 +2,47 @@ ## var """ - varm(x, wv::AbstractWeights, m, [dim, corrected]) + varm(x, w::AbstractWeights, m, [dim]; corrected=false) -Return the variance of a real-valued array `x` with a known mean `m`, optionally -over a dimension `dim`. 
Observations in `x` or weighted via `wv`. +Compute the variance of a real-valued array `x` with a known mean `m`, optionally +over a dimension `dim`. Observations in `x` are weighted using weight vector `w`. -In base julia a biased variance (`corrected=false`) is calculated as: +The uncorrected (when `corrected=false`) sample variance is defined as: -``\\frac{1}{N} \sum\limits_{i = 1}^N {\left( {x_i - \bar x} \right)^2 }`` +```math +\\frac{1}{\\sum{w}} \\sum_{i=1}^n {w_i\\left({x_i - m}\\right)^2 } +``` +where ``n`` is the length of the input. -An unbiased variance (`corrected=true`) is calculated by replacing -``\\frac{1}{N - 1}`` with ``\\frac{1}{N - 1}`` -(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)). +The unbiased estimate of the population variance is computed by replacing +``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights used: -Here we calculate the biased weighted variance (`corrected=false`) as: - -``\\frac{1}{\sum{w}} \sum\limits_{i = 1}^N {w_i\left( {x_i - \bar x} \right)^2 }`` - -An unbiased weighted variance (`corrected=true`) is dependent on the type of weights used: - -* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` -* FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` +* AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` """ Base.varm(v::RealArray, wv::AbstractWeights, m::Real; corrected::DepBool=nothing) = _moment2(v, wv, m; corrected=depcheck(:varm, corrected)) """ - var(x, wv::AbstractWeights, [dim, corrected]; mean=nothing) - -Return the variance of a real-valued array `x` with a known mean `m`, optionally -over a dimension `dim`. Observations in `x` or weighted via `wv`. 
+ var(x, wv::AbstractWeights, [dim]; mean=nothing, corrected=false) -In base julia a biased variance (`corrected=false`) is calculated as: +Compute the variance of a real-valued array `x`, optionally over a dimension `dim`. +Observations in `x` are weighted using weight vector `wv`. -``\\frac{1}{N} \sum\limits_{i = 1}^N {\left( {x_i - \bar x} \right)^2 }`` +The uncorrected (when `corrected=false`) sample variance is defined as: -An unbiased variance (`corrected=true`) is calculated by replacing -``\\frac{1}{N - 1}`` with ``\\frac{1}{N - 1}`` -(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)). +```math +\\frac{1}{\\sum{w}} \\sum_{i=1}^n {w_i\\left({x_i - μ}\\right)^2 } +``` +where ``n`` is the length of the input and ``μ`` is the mean. -Here we calculate the biased weighted variance (`corrected=false`) as: +The unbiased estimate of the population variance is computed by replacing +``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights used: -``\\frac{1}{\sum{w}} \sum\limits_{i = 1}^N {w_i\left( {x_i - \bar x} \right)^2 }`` - -An unbiased weighted variance (`corrected=true`) is dependent on the type of weights used: - -* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` -* FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` +* AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` """ function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, corrected::DepBool=nothing) @@ -104,10 +96,10 @@ function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; @static if VERSION < v"0.6.0-dev.1121" Base.varm!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, M, dim; - corrected=corrected) + corrected=corrected) else Base.varm!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, M, - dim; 
corrected=corrected) + dim; corrected=corrected) end end @@ -126,55 +118,47 @@ end ## std """ - stdm(v, wv::AbstractWeights, m, [dim, corrected]) + stdm(v, wv::AbstractWeights, m, [dim]; corrected=false) -Return the standard deviation of a real-valued array `v` with a known mean `m`, -optionally over a dimension `dim`. Observations in `x` or weighted via `wv`. +Compute the standard deviation of a real-valued array `x` with a known mean `m`, +optionally over a dimension `dim`. Observations in `x` are weighted using weight vector `w`. -In base julia a biased standard deviation (`corrected=false`) is calculated as: +The uncorrected (when `corrected=false`) sample standard deviation is defined as: -``\\sqrt{\frac{1}{N} \sum\limits_{i = 1}^N {\left( {x_i - \bar x} \right)^2 }}`` +```math +\\sqrt{\\frac{1}{\\sum{w}} \\sum_{i=1}^n {w_i\\left({x_i - m}\\right)^2 }} +``` +where ``n`` is the length of the input. -An unbiased standard deviation (`corrected=true`) is calculated by replacing -``\\frac{1}{N - 1}`` with ``\\frac{1}{N - 1}`` -(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)). 
+The unbiased estimate of the population standard deviation is computed by replacing +``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights used: -Here we calculate the biased weighted standard deviation (`corrected=false`) as: - -``\sqrt{\frac{1}{\sum{w}} \sum\limits_{i = 1}^N {w_i\left( {x_i - \bar x} \right)^2 }}`` - -An unbiased standard deviation (`corrected=true`) is dependent on the type of weights used: - -* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` -* FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` +* AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` """ Base.stdm(v::RealArray, wv::AbstractWeights, m::Real; corrected::DepBool=nothing) = sqrt(varm(v, wv, m, corrected=depcheck(:stdm, corrected))) """ - std(v, wv::AbstractWeights, [dim, corrected]; mean=nothing) - -Return the standard deviation of a real-valued array `v` with a known mean `m`, -optionally over a dimension `dim`. Observations in `x` or weighted via `wv`. - -In base julia a biased standard deviation (`corrected=false`) is calculated as: - -``\\sqrt{\frac{1}{N} \sum\limits_{i = 1}^N {\left( {x_i - \bar x} \right)^2 }}`` + std(v, wv::AbstractWeights, [dim]; mean=nothing, corrected=false) -An unbiased standard deviation (`corrected=true`) is calculated by replacing -``\\frac{1}{N - 1}`` with ``\\frac{1}{N - 1}`` -(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)). +Compute the standard deviation of a real-valued array `x`, +optionally over a dimension `dim`. Observations in `x` are weighted using weight vector `w`. 
-Here we calculate the biased weighted standard deviation (`corrected=false`) as: +The uncorrected (when `corrected=false`) sample standard deviation is defined as: -``\sqrt{\frac{1}{\sum{w}} \sum\limits_{i = 1}^N {w_i\left( {x_i - \bar x} \right)^2 }}`` +```math +\\sqrt{\\frac{1}{\\sum{w}} \\sum{i=1}^n {w_i\\left({x_i - μ}\\right)^2 }} +``` +where ``n`` is the length of the input and ``μ`` is the mean. -An unbiased standard deviation (`corrected=true`) is dependent on the type of weights used: +The unbiased estimate of the population standard deviation is computed by replacing +``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights used: -* AnalyticWeights: ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` -* FrequencyWeights: ``\\frac{1}{\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` +* AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` """ Base.std(v::RealArray, wv::AbstractWeights; mean=nothing, corrected::DepBool=nothing) = sqrt.(var(v, wv; mean=mean, corrected=depcheck(:std, corrected))) @@ -192,12 +176,12 @@ Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, ##### Fused statistics """ - mean_and_var(x, [wv::AbstractWeights, dim, corrected]) -> (mean, var) + mean_and_var(x, [wv::AbstractWeights], [dim]; corrected=false) -> (mean, var) Return the mean and variance of a real-valued array `x`, optionally over a dimension `dim`, as a tuple. Observations in `x` can be weighted via `wv`. Finally, bias correction can be applied to the variance calculation if `corrected=true`. -See `var` documentation for more details. +See [`var`](@ref) documentation for more details. 
""" function mean_and_var(A::RealArray; corrected::Bool=true) m = mean(A) @@ -206,13 +190,13 @@ function mean_and_var(A::RealArray; corrected::Bool=true) end """ - mean_and_std(x, [wv::AbstractWeights, dim, corrected]) -> (mean, std) + mean_and_std(x, [wv::AbstractWeights], [dim]; corrected=false) -> (mean, std) Return the mean and standard deviation of a real-valued array `x`, optionally over a dimension `dim`, as a tuple. A weighting vector `wv` can be specified to weight the estimates. Finally, bias correction can be applied to the standard deviation calculation if `corrected=true`. -See `std` documentation for more details. +See [`std`](@ref) documentation for more details. """ function mean_and_std(A::RealArray; corrected::Bool=true) m = mean(A) @@ -289,7 +273,7 @@ function _moment3(v::RealArray, m::Real) @inbounds z = v[i] - m s += z * z * z end - varcorrection(n, false) * s + s / n end function _moment3(v::RealArray, wv::AbstractWeights, m::Real) @@ -300,7 +284,7 @@ function _moment3(v::RealArray, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m @inbounds s += (z * z * z) * w[i] end - varcorrection(wv, false) * s + s / sum(wv) end function _moment4(v::RealArray, m::Real; corrected=false) @@ -310,7 +294,7 @@ function _moment4(v::RealArray, m::Real; corrected=false) @inbounds z = v[i] - m s += abs2(z * z) end - varcorrection(n, false) * s + s / n end function _moment4(v::RealArray, wv::AbstractWeights, m::Real) @@ -321,7 +305,7 @@ function _moment4(v::RealArray, wv::AbstractWeights, m::Real) @inbounds z = v[i] - m @inbounds s += abs2(z * z) * w[i] end - varcorrection(wv, false) * s + s / sum(wv) end function _momentk(v::RealArray, k::Int, m::Real) @@ -331,7 +315,7 @@ function _momentk(v::RealArray, k::Int, m::Real) @inbounds z = v[i] - m s += (z ^ k) end - varcorrection(n, false) * s + s / n end function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real) @@ -342,7 +326,7 @@ function _momentk(v::RealArray, k::Int, wv::AbstractWeights, m::Real) 
@inbounds z = v[i] - m @inbounds s += (z ^ k) * w[i] end - varcorrection(wv, false) * s + s / sum(wv) end diff --git a/src/weights.jl b/src/weights.jl index cfc538f36..a7d3c516c 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -44,10 +44,42 @@ Base.size(wv::AbstractWeights) = size(wv.values) Compute a bias correction factor for calculating `var`, `std` and `cov` with `n` observations. Returns ``\\frac{1}{n - 1}`` when `corrected=true` (i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)), -otherwise returns ``\\frac{1}{n}`` (i.e no correction). +otherwise returns ``\\frac{1}{n}`` (i.e. no correction). """ -varcorrection(n::Integer, corrected::Bool=false) = 1 / (n - Int(corrected)) +@inline varcorrection(n::Integer, corrected::Bool=false) = 1 / (n - Int(corrected)) +@weights Weights + +""" + Weights(vs, wsum=sum(vs)) + +Construct a `Weights` vector with weight values `vs`. +A precomputed sum may be provided as `wsum`. + +The `Weights` type describes a generic weights vector which does not support +bias correction. +""" +Weights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = Weights{S, eltype(vs), V}(vs, s) + +""" + weights(vs) + +Construct a `Weights` vector from array `vs`. +See the documentation for [`Weights`](@ref) for more details. +""" +weights(vs::RealVector) = Weights(vs) +weights(vs::RealArray) = Weights(vec(vs)) + +""" + varcorrection(w::Weights, corrected=false) + +Returns ``\\frac{1}{\sum w}`` when corrected is false and throws an `ArgumentError` +when corrected is true. +""" +function varcorrection(w::Weights, corrected::Bool=false) + corrected && throw(ArgumentError("Weights does not support bias correction.")) + 1 / w.sum +end @weights AnalyticWeights @@ -68,7 +100,7 @@ AnalyticWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = aweights(vs) Construct an `AnalyticWeights` vector from array `vs`. -See the documentation for `AnalyticWeights` for more details. 
+See the documentation for [`AnalyticWeights`](@ref) for more details. """ aweights(vs::RealVector) = AnalyticWeights(vs) aweights(vs::RealArray) = AnalyticWeights(vec(vs)) @@ -78,11 +110,11 @@ aweights(vs::RealArray) = AnalyticWeights(vec(vs)) ``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` """ -function varcorrection(w::AnalyticWeights, corrected::Bool=false) +@inline function varcorrection(w::AnalyticWeights, corrected::Bool=false) s = w.sum if corrected - sum_sn = zero(eltype(w)) / one(typeof(s)) ^ 2 # to ensure type stability + sum_sn = zero((zero(eltype(w)) / one(typeof(s))) ^ 2) # to ensure type stability @inbounds for x in w sum_sn += (x / s) ^ 2 end @@ -102,7 +134,7 @@ Construct a `FrequencyWeights` vector with weight values `vs`. A precomputed sum may be provided as `wsum`. Frequency weights describe the number of times (or frequency) each observation -was observed. These weight may also be referred to as case weights or repeat weights. +was observed. These weights may also be referred to as case weights or repeat weights. """ FrequencyWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = FrequencyWeights{S, eltype(vs), V}(vs, s) @@ -111,7 +143,7 @@ FrequencyWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = fweights(vs) Construct a `FrequencyWeights` vector from a given array. -See the documentation for `FrequencyWeights` for more details. +See the documentation for [`FrequencyWeights`](@ref) for more details. """ fweights(vs::RealVector) = FrequencyWeights(vs) fweights(vs::RealArray) = FrequencyWeights(vec(vs)) @@ -121,7 +153,7 @@ fweights(vs::RealArray) = FrequencyWeights(vec(vs)) ``\\frac{1}{\sum{w} - 1}`` """ -function varcorrection(w::FrequencyWeights, corrected::Bool=false) +@inline function varcorrection(w::FrequencyWeights, corrected::Bool=false) s = w.sum if corrected @@ -150,7 +182,7 @@ ProbabilityWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = pweights(vs) Construct a `ProbabilityWeights` vector from a given array. 
-See the documentation for `ProbabilityWeights` for more details. +See the documentation for [`ProbabilityWeights`](@ref) for more details. """ pweights(vs::RealVector) = ProbabilityWeights(vs) pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) @@ -160,7 +192,7 @@ pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) ``\\frac{n}{(n - 1) \sum w}`` where `n = length(w)` """ -function varcorrection(w::ProbabilityWeights, corrected::Bool=false) +@inline function varcorrection(w::ProbabilityWeights, corrected::Bool=false) s = w.sum if corrected @@ -492,7 +524,7 @@ end wmedian(v, w) Compute the weighted median of an array `v` with weights `w`, given as either a -vector or `AbstractWeights`. +vector or an `AbstractWeights` object/vector. """ wmedian(v::RealVector, w::RealVector) = median(v, fweights(w)) wmedian{W<:Real}(v::RealVector, w::AbstractWeights{W}) = median(v, w) @@ -584,7 +616,7 @@ quantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::Number) = quantile( wquantile(v, w, p) Compute the `p`th quantile(s) of `v` with weights `w`, given as either a vector -or a `AbstractWeights`. +or an `AbstractWeights` object/vector. """ wquantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::RealVector) = quantile(v, w, p) wquantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::Number) = quantile(v, w, [p])[1] From 34c88f2a91c64d92b6a7551a11236c8a900b45ae Mon Sep 17 00:00:00 2001 From: rofinn Date: Wed, 3 May 2017 12:35:49 -0500 Subject: [PATCH 32/50] Removed eweights and fixed up test cases. * Removed eweights from this PR. * Fixed up weights and moments tests cases to loop over weight types more often. 
* Fixed Probability weights n = length(w) -> n = count(!iszero, w) --- src/StatsBase.jl | 1 - src/cov.jl | 2 +- src/moments.jl | 8 +- src/weights.jl | 27 +---- test/moments.jl | 281 +++++++++++++++++++++++++++++++---------------- test/weights.jl | 108 +++++++++--------- 6 files changed, 245 insertions(+), 182 deletions(-) diff --git a/src/StatsBase.jl b/src/StatsBase.jl index afeff5a7f..d3f5e739f 100644 --- a/src/StatsBase.jl +++ b/src/StatsBase.jl @@ -26,7 +26,6 @@ module StatsBase aweights, # construct an AnalyticWeights vector fweights, # construct a FrequencyWeights vector pweights, # construct a ProbabilityWeights vector - eweights, # construct an ExponentialWeights vector wsum, # weighted sum with vector as second argument wsum!, # weighted sum across dimensions with provided storage wmean, # weighted mean diff --git a/src/cov.jl b/src/cov.jl index 25d44fdfe..a1d3d9d07 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -58,7 +58,7 @@ matrix (`corrected=false`) can be computed by multiplying `scattermat(X, wv)` by * AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` * FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` where `n = length(w)` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` """ cov diff --git a/src/moments.jl b/src/moments.jl index c34b66505..3fb521a30 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -19,7 +19,7 @@ The unbiased estimate of the population variance is computed by replacing * AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` * FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` """ Base.varm(v::RealArray, wv::AbstractWeights, m::Real; corrected::DepBool=nothing) = _moment2(v, wv, m; corrected=depcheck(:varm, corrected)) @@ -42,7 +42,7 @@ The unbiased estimate of the population 
variance is computed by replacing * AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` * FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` """ function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, corrected::DepBool=nothing) @@ -135,7 +135,7 @@ The unbiased estimate of the population standard deviation is computed by replac * AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` * FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` """ Base.stdm(v::RealArray, wv::AbstractWeights, m::Real; corrected::DepBool=nothing) = sqrt(varm(v, wv, m, corrected=depcheck(:stdm, corrected))) @@ -158,7 +158,7 @@ The unbiased estimate of the population standard deviation is computed by replac * AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` * FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` +* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` """ Base.std(v::RealArray, wv::AbstractWeights; mean=nothing, corrected::DepBool=nothing) = sqrt.(var(v, wv; mean=mean, corrected=depcheck(:std, corrected))) diff --git a/src/weights.jl b/src/weights.jl index a7d3c516c..73128971a 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -114,11 +114,7 @@ aweights(vs::RealArray) = AnalyticWeights(vec(vs)) s = w.sum if corrected - sum_sn = zero((zero(eltype(w)) / one(typeof(s))) ^ 2) # to ensure type stability - @inbounds for x in w - sum_sn += (x / s) ^ 2 - end - + sum_sn = sum(x -> (x / s) ^ 2, w) 1 / (s * (1 - sum_sn)) else 1 / s @@ -190,36 +186,19 @@ pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) """ varcorrection(w::ProbabilityWeights, corrected=false) -``\\frac{n}{(n - 
1) \sum w}`` where `n = length(w)` +``\\frac{n}{(n - 1) \sum w}`` where ``n`` equals `count(!iszero, w)` """ @inline function varcorrection(w::ProbabilityWeights, corrected::Bool=false) s = w.sum if corrected - n = length(w) + n = count(!iszero, w) n / (s * (n - 1)) else 1 / s end end -""" - eweights(n, λ) - -Construct an `AnalyticWeights` vector with length `n`, -where each element in position ``i`` is set to ``λ * (1 - λ)^(1 - i)``. - -``λ`` is a smoothing factor or rate parameter between 0 and 1. -As this value approaches 0 the resulting weights will be almost equal, -while values closer to 1 will put higher weight on the end elements of the vector. -""" -function eweights(n::Integer, λ::Real) - n > 0 || throw(ArgumentError("cannot construct weights of length < 1")) - 0 <= λ <= 1 || throw(ArgumentError("smoothing factor must be between 0 and 1")) - w0 = map(i -> λ * (1 - λ)^(1 - i), 1:n) - aweights(w0) -end - ##### Weighted sum ##### ## weighted sum over vectors diff --git a/test/moments.jl b/test/moments.jl index c5fe79aa9..d9b66ff54 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -2,32 +2,32 @@ using StatsBase using Base.Test @testset "StatsBase.Moments" begin -weight_funcs = (aweights, fweights, pweights) +weight_funcs = (weights, aweights, fweights, pweights) @testset "Variance and Standard Deviation" begin @testset "Vectors" begin x = [0.57, 0.10, 0.91, 0.72, 0.46] w = [3.84, 2.70, 8.29, 8.91, 9.71] - @testset "Uncorrected" begin - @testset "Variance with $f" for f in weight_funcs - wv = f(w) - m = mean(x, wv) - @test var(x, wv; corrected=false) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) - @test var(x, wv; mean=0, corrected=false) ≈ sum(abs2.(x), wv) ./ sum(wv) - @test var(x, wv; mean=1.0, corrected=false) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) + @testset "Uncorrected with $f" for f in weight_funcs + wv = f(w) + m = mean(x, wv) + + # expected uncorrected output + expected_var = sum(abs2.(x .- m), wv) / sum(wv) + expected_std = sqrt(expected_var) + + @testset 
"Variance" begin + @test var(x, wv; corrected=false) ≈ expected_var + @test var(x, wv; mean=m, corrected=false) ≈ expected_var end - @testset "Standard Deviation with $f" for f in weight_funcs - wv = f(w) - m = mean(x, wv) - @test std(x, wv; corrected=false) ≈ sqrt(var(x, wv; corrected=false)) - @test std(x, wv; mean=0, corrected=false) ≈ sqrt(var(x, wv; mean=0, corrected=false)) - @test std(x, wv; mean=1.0, corrected=false) ≈ sqrt(var(x, wv; mean=1.0, corrected=false)) + @testset "Standard Deviation" begin + @test std(x, wv; corrected=false) ≈ expected_std + @test std(x, wv; mean=m, corrected=false) ≈ expected_std end - @testset "Mean and Variance with $f" for f in weight_funcs - wv = f(w) + @testset "Mean and Variance" begin (m, v) = mean_and_var(x; corrected=false) @test m == mean(x) @test v == var(x; corrected=corrected=false) @@ -37,8 +37,7 @@ weight_funcs = (aweights, fweights, pweights) @test v == var(x, wv; corrected=false) end - @testset "Mean and Standard Deviation with $f" for f in weight_funcs - wv = f(w) + @testset "Mean and Standard Deviation" begin (m, s) = mean_and_std(x; corrected=false) @test m == mean(x) @test s == std(x; corrected=false) @@ -49,112 +48,204 @@ weight_funcs = (aweights, fweights, pweights) end end - @testset "Corrected" begin + # expected corrected output for (weights, aweights, fweights, pweights) + expected_var = [NaN, 0.0694434191182236, 0.05466601256158146, 0.06628969012045285] + expected_std = sqrt(expected_var) + + @testset "Corrected with $(weight_funcs[i])" for i in eachindex(weight_funcs) + wv = weight_funcs[i](w) + m = mean(x, wv) + @testset "Variance" begin - # expected `var` output for (aweights, fweights, pweights) - expected = (0.0694434191182236, 0.05466601256158146, 0.06628969012045285) - expected_0 = (0.5798908707332937, 0.45649137134052387, 0.5535554932735426) - expected_1 = (0.25422659392845115, 0.20012773497688754, 0.24268105381165922) - - @testset "$(weight_funcs[i])" for i in 1:3 - wv = weight_funcs[i](w) - m 
= mean(x, wv) - - @test var(x, wv; corrected=true) ≈ expected[i] - @test var(x, wv; mean=0, corrected=true) ≈ expected_0[i] - @test var(x, wv; mean=1.0, corrected=true) ≈ expected_1[i] + if isa(wv, Weights) + @test_throws ArgumentError var(x, wv; corrected=true) + else + @test var(x, wv; corrected=true) ≈ expected_var[i] + @test var(x, wv; mean=m, corrected=true) ≈ expected_var[i] end end - @testset "Standard Deviation with $f" for f in weight_funcs - wv = f(w) - m = mean(x, wv) - @test std(x, wv; corrected=true) ≈ sqrt(var(x, wv; corrected=true)) - @test std(x, wv; mean=0, corrected=true) ≈ sqrt(var(x, wv; mean=0, corrected=true)) - @test std(x, wv; mean=1.0, corrected=true) ≈ sqrt(var(x, wv; mean=1.0, corrected=true)) + @testset "Standard Deviation" begin + if isa(wv, Weights) + @test_throws ArgumentError std(x, wv; corrected=true) + else + @test std(x, wv; corrected=true) ≈ expected_std[i] + @test std(x, wv; mean=m, corrected=true) ≈ expected_std[i] + end end - @testset "Mean and Variance with $f" for f in weight_funcs - wv = f(w) - + @testset "Mean and Variance" begin (m, v) = mean_and_var(x; corrected=true) @test m == mean(x) @test v == var(x; corrected=true) - (m, v) = mean_and_var(x, wv; corrected=true) - @test m == mean(x, wv) - @test v == var(x, wv; corrected=true) + if isa(wv, Weights) + @test_throws ArgumentError mean_and_var(x, wv; corrected=true) + else + (m, v) = mean_and_var(x, wv; corrected=true) + @test m == mean(x, wv) + @test v == var(x, wv; corrected=true) + end end - @testset "Mean and Standard Deviation with $f" for f in weight_funcs - wv = f(w) - + @testset "Mean and Standard Deviation" begin (m, s) = mean_and_std(x; corrected=true) @test m == mean(x) @test s == std(x; corrected=true) - (m, s) = mean_and_std(x, wv; corrected=true) - @test m == mean(x, wv) - @test s == std(x, wv; corrected=true) + if isa(wv, Weights) + @test_throws ArgumentError mean_and_std(x, wv; corrected=true) + else + (m, s) = mean_and_std(x, wv; corrected=true) + @test m 
== mean(x, wv) + @test s == std(x, wv; corrected=true) + end end end end - @testset "Matrices" begin x = rand(5, 6) w1 = rand(5) w2 = rand(6) - wv1 = fweights(w1) - wv2 = fweights(w2) - m1 = mean(x, wv1, 1) - m2 = mean(x, wv2, 2) - - @test var(x, wv1, 1; mean=0, corrected=false) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) - @test var(x, wv2, 2; mean=0, corrected=false) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) - - @test var(x, wv1, 1; mean=m1, corrected=false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) - @test var(x, wv2, 2; mean=m2, corrected=false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) - - @test var(x, wv1, 1; corrected=false) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) - @test var(x, wv2, 2; corrected=false) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) - - @test std(x, wv1, 1; corrected=false) ≈ sqrt.(var(x, wv1, 1; corrected=false)) - @test std(x, wv2, 2; corrected=false) ≈ sqrt.(var(x, wv2, 2; corrected=false)) - @test std(x, wv1, 1; mean=0, corrected=false) ≈ sqrt.(var(x, wv1, 1; mean=0, corrected=false)) - @test std(x, wv2, 2; mean=0, corrected=false) ≈ sqrt.(var(x, wv2, 2; mean=0, corrected=false)) - @test std(x, wv1, 1; mean=m1, corrected=false) ≈ sqrt.(var(x, wv1, 1; mean=m1, corrected=false)) - @test std(x, wv2, 2; mean=m2, corrected=false) ≈ sqrt.(var(x, wv2, 2; mean=m2, corrected=false)) - - for d in 1:2 - (m, v) = mean_and_var(x, d; corrected=false) - @test m == mean(x, d) - @test v == var(x, d; corrected=false) - - (m, s) = mean_and_std(x, d; corrected=false) - @test m == mean(x, d) - @test s == std(x, d; corrected=false) + + @testset "Uncorrected with $f" for f in weight_funcs + wv1 = f(w1) + wv2 = f(w2) + m1 = mean(x, wv1, 1) + m2 = mean(x, wv2, 2) + + expected_var1 = sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) + expected_var2 = sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) + expected_std1 = sqrt.(expected_var1) + expected_std2 = sqrt.(expected_var2) + + @testset "Variance" begin + @test var(x, wv1, 1; corrected=false) ≈ expected_var1 + @test var(x, wv2, 2; 
corrected=false) ≈ expected_var2 + @test var(x, wv1, 1; mean=m1, corrected=false) ≈ expected_var1 + @test var(x, wv2, 2; mean=m2, corrected=false) ≈ expected_var2 + end + + @testset "Standard Deviation" begin + @test std(x, wv1, 1; corrected=false) ≈ expected_std1 + @test std(x, wv2, 2; corrected=false) ≈ expected_std2 + @test std(x, wv1, 1; mean=m1, corrected=false) ≈ expected_std1 + @test std(x, wv2, 2; mean=m2, corrected=false) ≈ expected_std2 + end + + @testset "Mean and Variance" begin + for d in 1:2 + (m, v) = mean_and_var(x, d; corrected=false) + @test m == mean(x, d) + @test v == var(x, d; corrected=false) + end + + (m, v) = mean_and_var(x, wv1, 1; corrected=false) + @test m == mean(x, wv1, 1) + @test v == var(x, wv1, 1; corrected=false) + + (m, v) = mean_and_var(x, wv2, 2; corrected=false) + @test m == mean(x, wv2, 2) + @test v == var(x, wv2, 2; corrected=false) + end + + @testset "Mean and Standard Deviation" begin + for d in 1:2 + (m, s) = mean_and_std(x, d; corrected=false) + @test m == mean(x, d) + @test s == std(x, d; corrected=false) + end + + (m, s) = mean_and_std(x, wv1, 1; corrected=false) + @test m == mean(x, wv1, 1) + @test s == std(x, wv1, 1; corrected=false) + + (m, s) = mean_and_std(x, wv2, 2; corrected=false) + @test m == mean(x, wv2, 2) + @test s == std(x, wv2, 2; corrected=false) + end end - (m, v) = mean_and_var(x, wv1, 1; corrected=true) - @test m == mean(x, wv1, 1) - @test v == var(x, wv1, 1; corrected=true) + @testset "Corrected with $f" for f in weight_funcs + wv1 = f(w1) + wv2 = f(w2) + m1 = mean(x, wv1, 1) + m2 = mean(x, wv2, 2) + + if !isa(wv1, Weights) + expected_var1 = sum(abs2.(x .- m1) .* w1, 1) .* StatsBase.varcorrection(wv1, true) + expected_var2 = sum(abs2.(x .- m2) .* w2', 2) .* StatsBase.varcorrection(wv2, true) + expected_std1 = sqrt.(expected_var1) + expected_std2 = sqrt.(expected_var2) + end + + @testset "Variance" begin + if isa(wv1, Weights) + @test_throws ArgumentError var(x, wv1, 1; corrected=true) + else + @test 
var(x, wv1, 1; corrected=true) ≈ expected_var1 + @test var(x, wv2, 2; corrected=true) ≈ expected_var2 + @test var(x, wv1, 1; mean=m1, corrected=true) ≈ expected_var1 + @test var(x, wv2, 2; mean=m2, corrected=true) ≈ expected_var2 + end + end + + @testset "Standard Deviation" begin + if isa(wv1, Weights) + @test_throws ArgumentError std(x, wv1, 1; corrected=true) + else + @test std(x, wv1, 1; corrected=true) ≈ expected_std1 + @test std(x, wv2, 2; corrected=true) ≈ expected_std2 + @test std(x, wv1, 1; mean=m1, corrected=true) ≈ expected_std1 + @test std(x, wv2, 2; mean=m2, corrected=true) ≈ expected_std2 + end + end + + @testset "Mean and Variance" begin + for d in 1:2 + (m, v) = mean_and_var(x, d; corrected=true) + @test m == mean(x, d) + @test v == var(x, d; corrected=true) + end + + if isa(wv1, Weights) + @test_throws ArgumentError mean_and_var(x, wv1, 1; corrected=true) + else + (m, v) = mean_and_var(x, wv1, 1; corrected=true) + @test m == mean(x, wv1, 1) + @test v == var(x, wv1, 1; corrected=true) + + (m, v) = mean_and_var(x, wv2, 2; corrected=true) + @test m == mean(x, wv2, 2) + @test v == var(x, wv2, 2; corrected=true) + end + end - (m, v) = mean_and_var(x, wv2, 2; corrected=false) - @test m == mean(x, wv2, 2) - @test v == var(x, wv2, 2; corrected=false) + @testset "Mean and Standard Deviation" begin + for d in 1:2 + (m, s) = mean_and_std(x, d; corrected=true) + @test m == mean(x, d) + @test s == std(x, d; corrected=true) + end - (m, s) = mean_and_std(x, wv1, 1; corrected=false) - @test m == mean(x, wv1, 1) - @test s == std(x, wv1, 1; corrected=false) + if isa(wv1, Weights) + @test_throws ArgumentError mean_and_std(x, wv1, 1; corrected=true) + else + (m, s) = mean_and_std(x, wv1, 1; corrected=true) + @test m == mean(x, wv1, 1) + @test s == std(x, wv1, 1; corrected=true) - (m, s) = mean_and_std(x, wv2, 2; corrected=false) - @test m == mean(x, wv2, 2) - @test s == std(x, wv2, 2; corrected=false) + (m, s) = mean_and_std(x, wv2, 2; corrected=true) + @test m == 
mean(x, wv2, 2) + @test s == std(x, wv2, 2; corrected=true) + end + end + end end end -@testset "Skewness and Kurtosis" begin - wv = fweights(ones(5) * 2.0) +@testset "Skewness and Kurtosis with $f" for f in weight_funcs + wv = f(ones(5) * 2.0) @test skewness(1:5) ≈ 0.0 @test skewness([1, 2, 3, 4, 5]) ≈ 0.0 @@ -170,7 +261,7 @@ end @test kurtosis([1, 2, 3, 4, 5], wv) ≈ -1.3 end -@testset "General Moments" begin +@testset "General Moments with $f" for f in weight_funcs x = collect(2.0:8.0) @test moment(x, 2) ≈ sum((x .- 5).^2) / length(x) @test moment(x, 3) ≈ sum((x .- 5).^3) / length(x) @@ -182,7 +273,7 @@ end @test moment(x, 4, 4.0) ≈ sum((x .- 4).^4) / length(x) @test moment(x, 5, 4.0) ≈ sum((x .- 4).^5) / length(x) - w = fweights([1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) + w = f([1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) x2 = collect(2.0:6.0) @test moment(x, 2, w) ≈ sum((x2 .- 4).^2) / 5 @test moment(x, 3, w) ≈ sum((x2 .- 4).^3) / 5 diff --git a/test/weights.jl b/test/weights.jl index aa290f439..75fe0df66 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -37,12 +37,6 @@ weight_funcs = (aweights, fweights, pweights) @test sum(ba, wv) === 4.0 @test sum(sa, wv) === 7.0 end - - @testset "eweights" begin - λ = 0.2 - wv = eweights(4, λ) - @test round(values(wv), 4) == [0.2, 0.25, 0.3125, 0.3906] - end end @testset "Sum" begin @@ -163,31 +157,31 @@ end @testset "Sum and mean syntax" begin a = reshape(1.0:27.0, 3, 3, 3) - @testset "Sum" begin - @test sum([1.0, 2.0, 3.0], fweights([1.0, 0.5, 0.5])) ≈ 3.5 - @test sum(1:3, fweights([1.0, 1.0, 0.5])) ≈ 4.5 + @testset "Sum $f" for f in weight_funcs + @test sum([1.0, 2.0, 3.0], f([1.0, 0.5, 0.5])) ≈ 3.5 + @test sum(1:3, f([1.0, 1.0, 0.5])) ≈ 4.5 for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) - @test sum(a, fweights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) - @test sum(a, fweights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) - @test sum(a, fweights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) + 
@test sum(a, f(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) + @test sum(a, f(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) + @test sum(a, f(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) end end - @testset "Mean" begin - @test mean([1:3;], fweights([1.0, 1.0, 0.5])) ≈ 1.8 - @test mean(1:3, fweights([1.0, 1.0, 0.5])) ≈ 1.8 + @testset "Mean $f" for f in weight_funcs + @test mean([1:3;], f([1.0, 1.0, 0.5])) ≈ 1.8 + @test mean(1:3, f([1.0, 1.0, 0.5])) ≈ 1.8 for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) - @test mean(a, fweights(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) - @test mean(a, fweights(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) - @test mean(a, fweights(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) + @test mean(a, f(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) + @test mean(a, f(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) + @test mean(a, f(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) @test_throws ErrorException mean(a, fweights(wt), 4) end end end -@testset "Median" begin +@testset "Median $f" for f in weight_funcs data = ( [7, 1, 2, 4, 10], [7, 1, 2, 4, 10], @@ -244,38 +238,38 @@ end num_tests = length(data) for i = 1:num_tests @test wmedian(data[i], wt[i]) == median_answers[i] - @test wmedian(data[i], fweights(wt[i])) == median_answers[i] - @test median(data[i], fweights(wt[i])) == median_answers[i] + @test wmedian(data[i], f(wt[i])) == median_answers[i] + @test median(data[i], f(wt[i])) == median_answers[i] for j = 1:100 # Make sure the weighted median does not change if the data # and weights are reordered. 
reorder = sortperm(rand(length(data[i]))) - @test median(data[i][reorder], fweights(wt[i][reorder])) == median_answers[i] + @test median(data[i][reorder], f(wt[i][reorder])) == median_answers[i] end end data = [4, 3, 2, 1] wt = [0, 0, 0, 0] @test_throws MethodError wmedian(data[1]) - @test_throws ErrorException median(data, fweights(wt)) + @test_throws ErrorException median(data, f(wt)) @test_throws ErrorException wmedian(data, wt) - @test_throws ErrorException median((Float64)[], fweights((Float64)[])) + @test_throws ErrorException median((Float64)[], f((Float64)[])) wt = [1, 2, 3, 4, 5] - @test_throws ErrorException median(data, fweights(wt)) - @test_throws MethodError median([4 3 2 1 0], fweights(wt)) - @test_throws MethodError median([[1 2];[4 5];[7 8];[10 11];[13 14]], fweights(wt)) + @test_throws ErrorException median(data, f(wt)) + @test_throws MethodError median([4 3 2 1 0], f(wt)) + @test_throws MethodError median([[1 2];[4 5];[7 8];[10 11];[13 14]], f(wt)) data = [1, 3, 2, NaN, 2] @test isnan(median(data, fweights(wt))) wt = [1, 2, NaN, 4, 5] - @test_throws ErrorException median(data, fweights(wt)) + @test_throws ErrorException median(data, f(wt)) data = [1, 3, 2, 1, 2] - @test_throws ErrorException median(data, fweights(wt)) + @test_throws ErrorException median(data, f(wt)) wt = [-1, -1, -1, -1, -1] - @test_throws ErrorException median(data, fweights(wt)) + @test_throws ErrorException median(data, f(wt)) wt = [-1, -1, -1, 0, 0] - @test_throws ErrorException median(data, fweights(wt)) + @test_throws ErrorException median(data, f(wt)) end -@testset "Quantile" begin +@testset "Quantile $f" for f in weight_funcs data = ( [7, 1, 2, 4, 10], [7, 1, 2, 4, 10], @@ -298,25 +292,25 @@ end [-10, 1, 1, -10, -10], ) wt = ( - fweights([1, 1/3, 1/3, 1/3, 1]), - fweights([1, 1, 1, 1, 1]), - fweights([1, 1/3, 1/3, 1/3, 1, 1]), - fweights([1/3, 1/3, 1/3, 1, 1, 1]), - fweights([30, 191, 9, 0]), - fweights([10, 1, 1, 1, 9]), - fweights([10, 1, 1, 1, 900]), - fweights([1, 3, 
5, 4, 2]), - fweights([2, 2, 5, 1, 2, 2, 1, 6]), - fweights([0.1, 0.1, 0.8]), - fweights([5, 5, 4, 1]), - fweights([30, 56, 144, 24, 55, 43, 67]), - fweights([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]), - fweights([12]), - fweights([7, 1, 1, 1, 6]), - fweights([1, 0, 0, 0, 2]), - fweights([1, 2, 3, 4, 5]), - fweights([0.1, 0.2, 0.3, 0.2, 0.1]), - fweights([1, 1, 1, 1, 1]), + f([1, 1/3, 1/3, 1/3, 1]), + f([1, 1, 1, 1, 1]), + f([1, 1/3, 1/3, 1/3, 1, 1]), + f([1/3, 1/3, 1/3, 1, 1, 1]), + f([30, 191, 9, 0]), + f([10, 1, 1, 1, 9]), + f([10, 1, 1, 1, 900]), + f([1, 3, 5, 4, 2]), + f([2, 2, 5, 1, 2, 2, 1, 6]), + f([0.1, 0.1, 0.8]), + f([5, 5, 4, 1]), + f([30, 56, 144, 24, 55, 43, 67]), + f([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]), + f([12]), + f([7, 1, 1, 1, 6]), + f([1, 0, 0, 0, 2]), + f([1, 2, 3, 4, 5]), + f([0.1, 0.2, 0.3, 0.2, 0.1]), + f([1, 1, 1, 1, 1]), ) quantile_answers = ( [1.0,3.6000000000000005,6.181818181818182,8.2,10.0], @@ -352,15 +346,15 @@ end for j = 1:10 # order of w does not matter reorder = sortperm(rand(length(data[i]))) - @test quantile(data[i][reorder], fweights(wt[i][reorder]), p) ≈ quantile_answers[i] + @test quantile(data[i][reorder], f(wt[i][reorder]), p) ≈ quantile_answers[i] end end # w = 1 corresponds to base quantile for i = 1:length(data) - @test quantile(data[i], fweights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) + @test quantile(data[i], f(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) for j = 1:10 prandom = rand(4) - @test quantile(data[i], fweights(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) + @test quantile(data[i], f(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) end end @@ -368,9 +362,9 @@ end v = [7, 1, 2, 4, 10] w = [1, 1/3, 1/3, 1/3, 1] answer = 6.181818181818182 - @test quantile(data[1], fweights(w), 0.5) ≈ answer - @test wquantile(data[1], fweights(w), [0.5]) ≈ [answer] - @test wquantile(data[1], fweights(w), 0.5) ≈ answer + @test quantile(data[1], f(w), 0.5) ≈ answer + @test 
wquantile(data[1], f(w), [0.5]) ≈ [answer] + @test wquantile(data[1], f(w), 0.5) ≈ answer @test wquantile(data[1], w, [0.5]) ≈ [answer] @test wquantile(data[1], w, 0.5) ≈ answer end From 1f01bc0b30c0fa2a586cbe7da45a96ee6db1d27a Mon Sep 17 00:00:00 2001 From: rofinn Date: Wed, 3 May 2017 12:58:01 -0500 Subject: [PATCH 33/50] Added testing of all weights to test/cov.jl --- test/cov.jl | 191 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 124 insertions(+), 67 deletions(-) diff --git a/test/cov.jl b/test/cov.jl index a3ae7ffcb..bb2881917 100644 --- a/test/cov.jl +++ b/test/cov.jl @@ -2,100 +2,157 @@ using StatsBase using Base.Test @testset "StatsBase.Covariance" begin -X = randn(3, 8) +weight_funcs = (weights, aweights, fweights, pweights) -Z1 = X .- mean(X, 1) -Z2 = X .- mean(X, 2) +@testset "$f" for f in weight_funcs + X = randn(3, 8) -w1 = rand(3) -w2 = rand(8) + Z1 = X .- mean(X, 1) + Z2 = X .- mean(X, 2) -wv1 = fweights(w1) -wv2 = fweights(w2) + w1 = rand(3) + w2 = rand(8) -Z1w = X .- mean(X, wv1, 1) -Z2w = X .- mean(X, wv2, 2) + wv1 = f(w1) + wv2 = f(w2) -## reference results + Z1w = X .- mean(X, wv1, 1) + Z2w = X .- mean(X, wv2, 2) -S1 = Z1'Z1 -S2 = Z2 * Z2' + ## reference results -Sz1 = X'X -Sz2 = X * X' + S1 = Z1'Z1 + S2 = Z2 * Z2' -S1w = Z1w' * diagm(w1) * Z1w -S2w = Z2w * diagm(w2) * Z2w' + Sz1 = X'X + Sz2 = X * X' -Sz1w = X' * diagm(w1) * X -Sz2w = X * diagm(w2) * X' + S1w = Z1w' * diagm(w1) * Z1w + S2w = Z2w * diagm(w2) * Z2w' -@testset "Scattermat" begin - @test scattermat(X) ≈ S1 - @test scattermat(X, 2) ≈ S2 + Sz1w = X' * diagm(w1) * X + Sz2w = X * diagm(w2) * X' - @test StatsBase.scattermatm(X, 0) ≈ Sz1 - @test StatsBase.scattermatm(X, 0, 2) ≈ Sz2 + @testset "Scattermat" begin + @test scattermat(X) ≈ S1 + @test scattermat(X, 2) ≈ S2 - @test StatsBase.scattermatm(X, mean(X,1)) ≈ S1 - @test StatsBase.scattermatm(X, mean(X,2), 2) ≈ S2 + @test StatsBase.scattermatm(X, 0) ≈ Sz1 + @test StatsBase.scattermatm(X, 0, 2) ≈ Sz2 - @test 
StatsBase.scattermatm(X, zeros(1,8)) ≈ Sz1 - @test StatsBase.scattermatm(X, zeros(3), 2) ≈ Sz2 + @test StatsBase.scattermatm(X, mean(X,1)) ≈ S1 + @test StatsBase.scattermatm(X, mean(X,2), 2) ≈ S2 - @testset "Weighted" begin - @test scattermat(X, wv1) ≈ S1w - @test scattermat(X, wv2, 2) ≈ S2w + @test StatsBase.scattermatm(X, zeros(1,8)) ≈ Sz1 + @test StatsBase.scattermatm(X, zeros(3), 2) ≈ Sz2 - @test StatsBase.scattermatm(X, 0, wv1) ≈ Sz1w - @test StatsBase.scattermatm(X, 0, wv2, 2) ≈ Sz2w + @testset "Weighted" begin + @test scattermat(X, wv1) ≈ S1w + @test scattermat(X, wv2, 2) ≈ S2w - @test StatsBase.scattermatm(X, mean(X, wv1, 1), wv1) ≈ S1w - @test StatsBase.scattermatm(X, mean(X, wv2, 2), wv2, 2) ≈ S2w + @test StatsBase.scattermatm(X, 0, wv1) ≈ Sz1w + @test StatsBase.scattermatm(X, 0, wv2, 2) ≈ Sz2w - @test StatsBase.scattermatm(X, zeros(1,8), wv1) ≈ Sz1w - @test StatsBase.scattermatm(X, zeros(3), wv2, 2) ≈ Sz2w + @test StatsBase.scattermatm(X, mean(X, wv1, 1), wv1) ≈ S1w + @test StatsBase.scattermatm(X, mean(X, wv2, 2), wv2, 2) ≈ S2w + + @test StatsBase.scattermatm(X, zeros(1,8), wv1) ≈ Sz1w + @test StatsBase.scattermatm(X, zeros(3), wv2, 2) ≈ Sz2w + end end -end -@testset "Weighted Covariance" begin - @test cov(X, wv1; corrected=false) ≈ S1w ./ sum(wv1) - @test cov(X, wv2, 2; corrected=false) ≈ S2w ./ sum(wv2) + @testset "Uncorrected" begin + @testset "Weighted Covariance" begin + @test cov(X, wv1; corrected=false) ≈ S1w ./ sum(wv1) + @test cov(X, wv2, 2; corrected=false) ≈ S2w ./ sum(wv2) - @test Base.covm(X, 0, wv1, 1; corrected=false) ≈ Sz1w ./ sum(wv1) - @test Base.covm(X, 0, wv2, 2; corrected=false) ≈ Sz2w ./ sum(wv2) + @test Base.covm(X, 0, wv1, 1; corrected=false) ≈ Sz1w ./ sum(wv1) + @test Base.covm(X, 0, wv2, 2; corrected=false) ≈ Sz2w ./ sum(wv2) - @test Base.covm(X, mean(X, wv1, 1), wv1, 1; corrected=false) ≈ S1w ./ sum(wv1) - @test Base.covm(X, mean(X, wv2, 2), wv2, 2; corrected=false) ≈ S2w ./ sum(wv2) + @test Base.covm(X, mean(X, wv1, 1), wv1, 
1; corrected=false) ≈ S1w ./ sum(wv1) + @test Base.covm(X, mean(X, wv2, 2), wv2, 2; corrected=false) ≈ S2w ./ sum(wv2) - @test Base.covm(X, zeros(1,8), wv1, 1; corrected=false) ≈ Sz1w ./ sum(wv1) - @test Base.covm(X, zeros(3), wv2, 2; corrected=false) ≈ Sz2w ./ sum(wv2) -end + @test Base.covm(X, zeros(1,8), wv1, 1; corrected=false) ≈ Sz1w ./ sum(wv1) + @test Base.covm(X, zeros(3), wv2, 2; corrected=false) ≈ Sz2w ./ sum(wv2) + end -@testset "Mean and covariance" begin - (m, C) = mean_and_cov(X; corrected=false) - @test m == mean(X, 1) - @test C == cov(X, 1, false) + @testset "Mean and covariance" begin + (m, C) = mean_and_cov(X; corrected=false) + @test m == mean(X, 1) + @test C == cov(X, 1, false) - (m, C) = mean_and_cov(X, 1; corrected=false) - @test m == mean(X, 1) - @test C == cov(X, 1, false) + (m, C) = mean_and_cov(X, 1; corrected=false) + @test m == mean(X, 1) + @test C == cov(X, 1, false) - (m, C) = mean_and_cov(X, 2; corrected=false) - @test m == mean(X, 2) - @test C == cov(X, 2, false) + (m, C) = mean_and_cov(X, 2; corrected=false) + @test m == mean(X, 2) + @test C == cov(X, 2, false) - (m, C) = mean_and_cov(X, wv1; corrected=false) - @test m == mean(X, wv1, 1) - @test C == cov(X, wv1, 1; corrected=false) + (m, C) = mean_and_cov(X, wv1; corrected=false) + @test m == mean(X, wv1, 1) + @test C == cov(X, wv1, 1; corrected=false) - (m, C) = mean_and_cov(X, wv1, 1; corrected=false) - @test m == mean(X, wv1, 1) - @test C == cov(X, wv1, 1; corrected=false) + (m, C) = mean_and_cov(X, wv1, 1; corrected=false) + @test m == mean(X, wv1, 1) + @test C == cov(X, wv1, 1; corrected=false) - (m, C) = mean_and_cov(X, wv2, 2; corrected=false) - @test m == mean(X, wv2, 2) - @test C == cov(X, wv2, 2; corrected=false) -end + (m, C) = mean_and_cov(X, wv2, 2; corrected=false) + @test m == mean(X, wv2, 2) + @test C == cov(X, wv2, 2; corrected=false) + end + end + @testset "Corrected" begin + @testset "Weighted Covariance" begin + if isa(wv1, Weights) + @test_throws ArgumentError 
cov(X, wv1; corrected=true) + else + var_corr1 = StatsBase.varcorrection(wv1, true) + var_corr2 = StatsBase.varcorrection(wv2, true) + + @test cov(X, wv1; corrected=true) ≈ S1w .* var_corr1 + @test cov(X, wv2, 2; corrected=true) ≈ S2w .* var_corr2 + + @test Base.covm(X, 0, wv1, 1; corrected=true) ≈ Sz1w .* var_corr1 + @test Base.covm(X, 0, wv2, 2; corrected=true) ≈ Sz2w .* var_corr2 + + @test Base.covm(X, mean(X, wv1, 1), wv1, 1; corrected=true) ≈ S1w .* var_corr1 + @test Base.covm(X, mean(X, wv2, 2), wv2, 2; corrected=true) ≈ S2w .* var_corr2 + + @test Base.covm(X, zeros(1,8), wv1, 1; corrected=true) ≈ Sz1w .* var_corr1 + @test Base.covm(X, zeros(3), wv2, 2; corrected=true) ≈ Sz2w .* var_corr2 + end + end + @testset "Mean and covariance" begin + (m, C) = mean_and_cov(X; corrected=true) + @test m == mean(X, 1) + @test C == cov(X, 1, true) + + (m, C) = mean_and_cov(X, 1; corrected=true) + @test m == mean(X, 1) + @test C == cov(X, 1, true) + + (m, C) = mean_and_cov(X, 2; corrected=true) + @test m == mean(X, 2) + @test C == cov(X, 2, true) + + if isa(wv1, Weights) + @test_throws ArgumentError mean_and_cov(X, wv1; corrected=true) + else + (m, C) = mean_and_cov(X, wv1; corrected=true) + @test m == mean(X, wv1, 1) + @test C == cov(X, wv1, 1; corrected=true) + + (m, C) = mean_and_cov(X, wv1, 1; corrected=true) + @test m == mean(X, wv1, 1) + @test C == cov(X, wv1, 1; corrected=true) + + (m, C) = mean_and_cov(X, wv2, 2; corrected=true) + @test m == mean(X, wv2, 2) + @test C == cov(X, wv2, 2; corrected=true) + end + end + end +end end # @testset "StatsBase.Covariance" From bdec9e187f9dea15e23cd9c403ecaca530ff3318 Mon Sep 17 00:00:00 2001 From: rofinn Date: Wed, 3 May 2017 13:03:54 -0500 Subject: [PATCH 34/50] Removed unnecessary 0 mean condition from `var` --- src/moments.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/moments.jl b/src/moments.jl index 3fb521a30..442c49e3f 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -48,9 +48,7 @@ function 
Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, corrected::DepBool=nothing) corrected = depcheck(:var, corrected) - if mean == 0 - varm(v, wv, 0; corrected=corrected) - elseif mean == nothing + if mean == nothing varm(v, wv, Base.mean(v, wv); corrected=corrected) else varm(v, wv, mean; corrected=corrected) From c0f648877269c8572860523099f857c88eaec4f3 Mon Sep 17 00:00:00 2001 From: rofinn Date: Wed, 3 May 2017 13:14:50 -0500 Subject: [PATCH 35/50] Reverted changes to skewness and kurtosis. --- src/moments.jl | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/moments.jl b/src/moments.jl index 442c49e3f..0822b28e4 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -375,9 +375,8 @@ function skewness(v::RealArray, m::Real) cm2 += z2 cm3 += z2 * z end - cf = varcorrection(n, false) - cm3 *= cf - cm2 *= cf + cm3 /= n + cm2 /= n return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 end @@ -396,9 +395,9 @@ function skewness(v::RealArray, wv::AbstractWeights, m::Real) cm2 += z2w cm3 += z2w * z end - cf = varcorrection(wv, false) - cm3 *= cf - cm2 *= cf + sw = sum(wv) + cm3 /= sw + cm2 /= sw return cm3 / sqrt(cm2 * cm2 * cm2) # this is much faster than cm2^1.5 end @@ -423,9 +422,8 @@ function kurtosis(v::RealArray, m::Real) cm2 += z2 cm4 += z2 * z2 end - cf = varcorrection(n, false) - cm4 *= cf - cm2 *= cf + cm4 /= n + cm2 /= n return (cm4 / (cm2 * cm2)) - 3.0 end @@ -445,9 +443,9 @@ function kurtosis(v::RealArray, wv::AbstractWeights, m::Real) cm2 += z2w cm4 += z2w * z2 end - cf = varcorrection(wv, false) - cm4 *= cf - cm2 *= cf + sw = sum(wv) + cm4 /= sw + cm2 /= sw return (cm4 / (cm2 * cm2)) - 3.0 end From a8624cd30c088927d4ceebe241956d7d160c9865 Mon Sep 17 00:00:00 2001 From: rofinn Date: Wed, 3 May 2017 16:18:00 -0500 Subject: [PATCH 36/50] Updated docs to refer to `AbstractWeights` vs `WeightVec` and included a brief description of different weight types.
--- docs/source/counts.rst | 2 +- docs/source/cov.rst | 12 +++++----- docs/source/empirical.rst | 11 +++++---- docs/source/means.rst | 4 ++-- docs/source/sampling.rst | 29 +++++++++++------------- docs/source/scalarstats.rst | 12 +++++----- docs/source/weightvec.rst | 45 +++++++++++++++++++++++++++---------- 7 files changed, 66 insertions(+), 49 deletions(-) diff --git a/docs/source/counts.rst b/docs/source/counts.rst index e3e56b611..db059d594 100644 --- a/docs/source/counts.rst +++ b/docs/source/counts.rst @@ -8,7 +8,7 @@ Counting over an Integer Range .. function:: counts(x, a:b[, wv]) - Count the number of times (or total weights if a weight vector ``wv`` is given) values in ``a:b`` appear in array ``x``. Here, the optional argument ``wv`` should be a weight vector of type ``WeightVec`` (see :ref:`weightvec`). + Count the number of times (or total weights if a weight vector ``wv`` is given) values in ``a:b`` appear in array ``x``. Here, the optional argument ``wv`` should be a weight vector of type ``AbstractWeights`` (see :ref:`weightvec`). This function returns a vector ``r`` of length ``n``, with ``n = length(a:b) = b-a+1``. In particular, we have diff --git a/docs/source/cov.rst b/docs/source/cov.rst index 4745a0469..64655ab67 100644 --- a/docs/source/cov.rst +++ b/docs/source/cov.rst @@ -22,14 +22,14 @@ This package implements functions for computing scatter matrix, as well as weigh .. function:: scatter(X, wv[; vardim=..., mean=...]) - Weighted scatter matrix. The weights are given by a weight vector ``wv`` of type ``WeightVec`` (see :ref:`weightvec`). + Weighted scatter matrix. The weights are given by a weight vector ``wv`` of type ``AbstractWeights`` (see :ref:`weightvec`). -.. function:: cov(X, wv[; vardim=..., mean=...]) +.. function:: cov(X, wv[; vardim=..., mean=..., corrected=...]) - Weighted covariance matrix. + Weighted covariance matrix. 
- **Note:** By default, the covariance is normalized by the sum of weights, that is, ``cov(X, wv)`` is equal to ``scatter(X, wv) / sum(wv)``. + **Note:** By default, the covariance is normalized by the sum of weights, that is, ``cov(X, wv)`` is equal to ``scatter(X, wv) / sum(wv)``. However, if ``corrected`` is set to ``true`` then the appropriate bias correction is used for that ``wv``. -.. function:: mean_and_cov(x[, wv][; vardim=...]) +.. function:: mean_and_cov(x[, wv][; vardim=..., corrected=...]) - Jointly compute the mean and covariance of ``x``. + Jointly compute the mean and covariance of ``x``. diff --git a/docs/source/empirical.rst b/docs/source/empirical.rst index b9b265020..b7aa067f4 100644 --- a/docs/source/empirical.rst +++ b/docs/source/empirical.rst @@ -14,12 +14,12 @@ Histograms can be fitted to data using the ``fit`` method. **Arguments:** -``data`` +``data`` is either a vector (for a 1-dimensional histogram), or a tuple of vectors of equal length (for an *n*-dimensional histogram). ``weight`` - is an optional ``:ref:`weightvec` WeightVec``` (of the same length as the + is an optional ``:ref:`weightvec` AbstractWeights``` (of the same length as the data vectors), denoting the weight each observation contributes to the bin. If no weight vector is supples, each observation has weight 1. @@ -30,7 +30,7 @@ Histograms can be fitted to data using the ``fit`` method. **Keyword arguments:** -``closed=:left/:right`` +``closed=:left/:right`` determines whether the bin intervals are left-closed [a,b), or right-closed (a,b] (default = ``:right``). @@ -48,7 +48,7 @@ Histograms can be fitted to data using the ``fit`` method. h = fit(Histogram, rand(100), weights(rand(100)), 0:0.1:1.0) h = fit(Histogram, [20], 0:20:100) h = fit(Histogram, [20], 0:20:100, closed=:left) - + # Multivariate h = fit(Histogram, (rand(100),rand(100))) h = fit(Histogram, (rand(100),rand(100)),nbins=10) @@ -60,7 +60,6 @@ Empirical Cumulative Distribution Function ..
function:: ecdf(x) - Return an empirical cumulative distribution function based on a vector of samples given in ``x``. + Return an empirical cumulative distribution function based on a vector of samples given in ``x``. **Note:** this is a higher-level function that returns a function, which can then be applied to evaluate CDF values on other samples. - diff --git a/docs/source/means.rst b/docs/source/means.rst index 53961f222..210ac5e88 100644 --- a/docs/source/means.rst +++ b/docs/source/means.rst @@ -32,7 +32,7 @@ The package provides functions to compute means of different kinds. .. function:: mean(x, w) - The ``mean`` function is also extended to accept a weight vector of type ``WeightVec`` (see :ref:`weightvec`) to compute weighted mean. + The ``mean`` function is also extended to accept a weight vector of type ``AbstractWeights`` (see :ref:`weightvec`) to compute weighted mean. **Examples:** @@ -43,7 +43,7 @@ The package provides functions to compute means of different kinds. .. function:: mean(x, w, dim) - Compute weighted means of ``x`` along a certain dimension (specified by an integer ``dim``). The weights are given by a weight vector ``w`` (of type ``WeightVec``). + Compute weighted means of ``x`` along a certain dimension (specified by an integer ``dim``). The weights are given by a weight vector ``w`` (of type ``AbstractWeights``). .. function:: mean!(dst, x, w, dim) diff --git a/docs/source/sampling.rst b/docs/source/sampling.rst index 4c558e12b..963dc2b20 100644 --- a/docs/source/sampling.rst +++ b/docs/source/sampling.rst @@ -9,12 +9,11 @@ The package provides functions for sampling from a given population (with or wit .. function:: sample([rng], a) Randomly draw an element from an array ``a``. - Optionally specify a random number generator ``rng`` as the first argument (defaults to ``Base.GLOBAL_RNG``). -.. function:: sample([rng], a, n[; replace=true, ordered=false]) +.. 
function:: sample([rng], a, n[; replace=true, ordered=false]) - Randomly draw ``n`` elements from ``a``. + Randomly draw ``n`` elements from ``a``. Optionally specify a random number generator ``rng`` as the first argument (defaults to ``Base.GLOBAL_RNG``). @@ -26,14 +25,13 @@ The package provides functions for sampling from a given population (with or wit .. function:: sample!([rng], a, x[; replace=true, ordered=false]) Draw ``length(x)`` elements from ``a`` and write them to a pre-allocated array ``x``. - Optionally specify a random number generator ``rng`` as the first argument (defaults to ``Base.GLOBAL_RNG``). -.. function:: sample([rng], wv) +.. function:: sample([rng], wv) - Draw an integer in ``1:length(wv)`` with probabilities proportional to the weights given in ``wv``. + Draw an integer in ``1:length(wv)`` with probabilities proportional to the weights given in ``wv``. - Here, ``wv`` should be a weight vector of type ``WeightVec`` (see :ref:`weightvec`). + Here, ``wv`` should be a weight vector of type ``AbstractWeights`` (see :ref:`weightvec`). Optionally specify a random number generator ``rng`` as the first argument (defaults to ``Base.GLOBAL_RNG``). @@ -52,7 +50,7 @@ The package provides functions for sampling from a given population (with or wit **Keyword arguments** - ``replace``: indicates whether to have replacement (default = ``true``). - - ``ordered``: indicates whether to arrange the samples in ascending order (default = ``false``). + - ``ordered``: indicates whether to arrange the samples in ascending order (default = ``false``). .. function:: sample!([rng], a, wv, x[; replace=true, ordered=false]) @@ -74,7 +72,7 @@ Here are a list of algorithms implemented in the package. 
The functions below ar - ``a``: source array representing the population - ``x``: the destination array -- ``wv``: the weight vector (of type ``WeightVec``), for weighted sampling +- ``wv``: the weight vector (of type ``AbstractWeights``), for weighted sampling - ``n``: the length of ``a`` - ``k``: the length of ``x``. For sampling without replacement, ``k`` must not exceed ``n``. - ``rng``: optional random number generator (defaults to ``Base.GLOBAL_RNG``) @@ -108,7 +106,7 @@ All following functions write results to ``x`` (pre-allocated) and return ``x``. .. function:: fisher_yates_sample!([rng], a, x) - *Fisher-Yates shuffling* (with early termination). + *Fisher-Yates shuffling* (with early termination). Pseudo-code :: @@ -118,15 +116,15 @@ All following functions write results to ``x`` (pre-allocated) and return ``x``. swap inds[i] with a random one in inds[i:n] set x[i] = a[inds[i]] end - + This algorithm consumes ``k`` random numbers. It uses an integer array of length ``n`` internally to maintain the shuffled indices. It is considerably faster than Knuth's algorithm especially when ``n`` is greater than ``k``. .. function:: self_avoid_sample!([rng], a, x) - Use a set to maintain the index that has been sampled. Each time draw a new index, if the index has already been sampled, redraw until it draws an unsampled one. + Use a set to maintain the index that has been sampled. Each time draw a new index, if the index has already been sampled, redraw until it draws an unsampled one. - This algorithm consumes about (or slightly more than) ``k`` random numbers, and requires ``O(k)`` memory to store the set of sampled indices. Very fast when ``n >> k``. + This algorithm consumes about (or slightly more than) ``k`` random numbers, and requires ``O(k)`` memory to store the set of sampled indices. Very fast when ``n >> k``. However, if ``k`` is large and approaches ``n``, the rejection rate would increase drastically, resulting in poorer performance. 
@@ -153,7 +151,7 @@ All following functions write results to ``x`` (pre-allocated) and return ``x``. *Direct sampling.* - Draw each sample by scanning the weight vector. + Draw each sample by scanning the weight vector. This algorithm: (1) consumes ``k`` random numbers; (2) has time complexity ``O(n k)``, as scanning the weight vector each time takes ``O(n)``; and (3) requires no additional memory space. @@ -173,5 +171,4 @@ All following functions write results to ``x`` (pre-allocated) and return ``x``. It makes a copy of the weight vector at initialization, and sets the weight to zero when the corresponding sample is picked. - This algorithm consumes ``O(k)`` random numbers, and has overall time complexity ``O(n k)``. - + This algorithm consumes ``O(k)`` random numbers, and has overall time complexity ``O(n k)``. diff --git a/docs/source/scalarstats.rst b/docs/source/scalarstats.rst index 382296d75..ad8617879 100644 --- a/docs/source/scalarstats.rst +++ b/docs/source/scalarstats.rst @@ -40,20 +40,20 @@ Moments Compute the (standardized) `skewness `_ of ``x``. - One can optionally supply a weight vector of type ``WeightVec`` (see :ref:`weightvec`). + One can optionally supply a weight vector of type ``AbstractWeights`` (see :ref:`weightvec`). .. function:: kurtosis(x[, wv]) Compute the (excessive) `kurtosis `_ of ``x``. - One can optionally supply a weight vector of type ``WeightVec`` (see :ref:`weightvec`). + One can optionally supply a weight vector of type ``AbstractWeights`` (see :ref:`weightvec`). .. function:: moment(x, k[, m][, wv]) Compute the ``k``-th order central moment of the values in `x`. It is the sample mean of ``(x - mean(x)).^k``. - One can optionally supply the center ``m``, and/or a weight vector of type ``WeightVec`` (see :ref:`weightvec`). + One can optionally supply the center ``m``, and/or a weight vector of type ``AbstractWeights`` (see :ref:`weightvec`). Measurements of Variation @@ -160,7 +160,7 @@ Quantile and Friends .. 
function:: median(x, w) - Compute the weighted median of ``x``, using weights given by a weight vector ``w`` (of type ``WeightVec``). The weight and data vectors must have the same length. The weighted median :math:`x_k` is the element of ``x`` that satisfies :math:`\sum_{x_i < x_k} w_i \le \frac{1}{2} \sum_{j} w_j` and :math:`\sum_{x_i > x_k} w_i \le \frac{1}{2} \sum_{j} w_j`. If a weight has value zero, then its associated data point is ignored. If none of the weights are positive, an error is thrown. ``NaN`` is returned if ``x`` contains any ``NaN`` values. An error is raised if ``w`` contains any ``NaN`` values. + Compute the weighted median of ``x``, using weights given by a weight vector ``w`` (of type ``AbstractWeights``). The weight and data vectors must have the same length. The weighted median :math:`x_k` is the element of ``x`` that satisfies :math:`\sum_{x_i < x_k} w_i \le \frac{1}{2} \sum_{j} w_j` and :math:`\sum_{x_i > x_k} w_i \le \frac{1}{2} \sum_{j} w_j`. If a weight has value zero, then its associated data point is ignored. If none of the weights are positive, an error is thrown. ``NaN`` is returned if ``x`` contains any ``NaN`` values. An error is raised if ``w`` contains any ``NaN`` values. **Examples:** @@ -171,8 +171,8 @@ Quantile and Friends .. function:: quantile(x, w, p) - Compute the weighted quantiles of a vector ``x`` at a specified set of probability values ``p``, using weights given by a weight vector ``w`` (of type ``WeightVec``). Weights must not be negative. The weights and data vectors must have the same length. The quantile for :math:`p` is defined as follows. Denoting :math:`S_k = (k-1)w_k + (n-1) \sum_{i Date: Wed, 3 May 2017 17:25:47 -0500 Subject: [PATCH 37/50] More random fixes. Mostly to docstrings. 
--- src/common.jl | 11 ++-- src/cov.jl | 21 +++--- src/moments.jl | 167 ++++++++++++++++++++++-------------------------- src/weights.jl | 38 ++++++----- test/moments.jl | 4 +- test/weights.jl | 2 +- 6 files changed, 115 insertions(+), 128 deletions(-) diff --git a/src/common.jl b/src/common.jl index d556322f0..e596bcf95 100644 --- a/src/common.jl +++ b/src/common.jl @@ -30,14 +30,11 @@ fptype(::Type{Complex128}) = Complex128 # A convenient typealias for deprecating default corrected Bool @compat const DepBool = Union{Bool, Void} -const CORRECTED_DEP_MSG = string("will default to corrected=true in the future. ", - "Use corrected=false for previous behaviour.") - -function depcheck(fname::Symbol, b::DepBool; msg::AbstractString=CORRECTED_DEP_MSG, - default::Bool=false) +function depcheck(fname::Symbol, b::DepBool) if b == nothing - Base.depwarn(string(fname, " ", msg), fname) - default + msg = "$fname will default to corrected=true in the future. Use corrected=false for previous behaviour." + Base.depwarn(msg, fname) + false else b end diff --git a/src/cov.jl b/src/cov.jl index a1d3d9d07..98c6b942a 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -49,16 +49,15 @@ function scattermat end """ - cov(X, wv::AbstractWeights; vardim=1, corrected=false) + cov(X, w::AbstractWeights; mean=nothing, vardim=1, corrected=false) Compute the weighted covariance matrix. Similar to `var` and `std` the biased covariance -matrix (`corrected=false`) can be computed by multiplying `scattermat(X, wv)` by +matrix (`corrected=false`) is computed by multiplying `scattermat(X, w)` by ``\\frac{1}{\\sum{w}}`` to normalize. 
However, the unbiased covariance matrix (`corrected=true`) is dependent on the type of weights used: - -* AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` -* FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +* `AnalyticWeights`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* `FrequencyWeights`: ``\\frac{1}{\\sum{w} - 1}`` +* `ProbabilityWeights`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` """ cov @@ -69,7 +68,7 @@ cov Return the mean and covariance matrix as a tuple. A weighting vector `wv` can be specified. `vardim` that designates whether the variables are columns in the matrix (`1`) or rows (`2`). -Finally, bias correction will be applied to the covariance calculation if +Finally, bias correction is applied to the covariance calculation if `corrected=true`. See [`cov`](@ref) documentation for more details. """ function mean_and_cov end @@ -88,13 +87,13 @@ scattermat(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1) = scattermatm(x, Base.mean(x, wv, vardim), wv, vardim) ## weighted cov -Base.covm(x::DenseMatrix, mean, wv::AbstractWeights, vardim::Int=1; +Base.covm(x::DenseMatrix, mean, w::AbstractWeights, vardim::Int=1; corrected::DepBool=nothing) = - scale!(scattermatm(x, mean, wv, vardim), varcorrection(wv, depcheck(:covm, corrected))) + scale!(scattermatm(x, mean, w, vardim), varcorrection(w, depcheck(:covm, corrected))) -Base.cov(x::DenseMatrix, wv::AbstractWeights, vardim::Int=1; corrected::DepBool=nothing) = - Base.covm(x, Base.mean(x, wv, vardim), wv, vardim; corrected=depcheck(:cov, corrected)) +Base.cov(x::DenseMatrix, w::AbstractWeights, vardim::Int=1; corrected::DepBool=nothing) = + Base.covm(x, Base.mean(x, w, vardim), w, vardim; corrected=depcheck(:cov, corrected)) function mean_and_cov(x::DenseMatrix, vardim::Int=1; corrected::Bool=true) diff --git a/src/moments.jl b/src/moments.jl index 0822b28e4..e2e324b64 100644 
--- a/src/moments.jl +++ b/src/moments.jl @@ -6,73 +6,65 @@ Compute the variance of a real-valued array `x` with a known mean `m`, optionally over a dimension `dim`. Observations in `x` are weighted using weight vector `w`. - The uncorrected (when `corrected=false`) sample variance is defined as: - ```math -\\frac{1}{\\sum{w}} \\sum{i=1}^n {w_i\\left({x_i - m}\\right)^2 } +\\frac{1}{\\sum{w}} \\sum_{i=1}^n {w_i\\left({x_i - m}\\right)^2 } ``` -where ``n`` is the length of the input. - -The unbiased estimate of the population variance is computed by replacing +where ``n`` is the length of the input. The unbiased estimate (when `corrected=true`) of +the population variance is computed by replacing ``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights used: - -* AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` -* FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +* `AnalyticWeights`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* `FrequencyWeights`: ``\\frac{1}{\\sum{w} - 1}`` +* `ProbabilityWeights`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` """ -Base.varm(v::RealArray, wv::AbstractWeights, m::Real; corrected::DepBool=nothing) = - _moment2(v, wv, m; corrected=depcheck(:varm, corrected)) +Base.varm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) = + _moment2(v, w, m; corrected=depcheck(:varm, corrected)) """ - var(x, wv::AbstractWeights, [dim]; mean=nothing, corrected=false) + var(x, w::AbstractWeights, [dim]; mean=nothing, corrected=false) Compute the variance of a real-valued array `x`, optionally over a dimension `dim`. -Observations in `x` are weighted using weight vector `wv`. - +Observations in `x` are weighted using weight vector `w`. 
The uncorrected (when `corrected=false`) sample variance is defined as: - ```math -\\frac{1}{\\sum{w}} \\sum{i=1}^n {w_i\\left({x_i - μ}\\right)^2 } +\\frac{1}{\\sum{w}} \\sum_{i=1}^n {w_i\\left({x_i - μ}\\right)^2 } ``` where ``n`` is the length of the input and ``μ`` is the mean. - -The unbiased estimate of the population variance is computed by replacing -``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights used: - -* AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` -* FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +The unbiased estimate (when `corrected=true`) of the population variance is computed by +replacing ``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights used: +* `AnalyticWeights`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* `FrequencyWeights`: ``\\frac{1}{\\sum{w} - 1}`` +* `ProbabilityWeights`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` """ -function Base.var(v::RealArray, wv::AbstractWeights; mean=nothing, +function Base.var(v::RealArray, w::AbstractWeights; mean=nothing, corrected::DepBool=nothing) corrected = depcheck(:var, corrected) if mean == nothing - varm(v, wv, Base.mean(v, wv); corrected=corrected) + varm(v, w, Base.mean(v, w); corrected=corrected) else - varm(v, wv, mean; corrected=corrected) + varm(v, w, mean; corrected=corrected) end end ## var along dim -function Base.varm!(R::AbstractArray, A::RealArray, wv::AbstractWeights, M::RealArray, +function Base.varm!(R::AbstractArray, A::RealArray, w::AbstractWeights, M::RealArray, dim::Int; corrected::DepBool=nothing) corrected = depcheck(:varm!, corrected) - scale!(_wsum_centralize!(R, abs2, A, values(wv), M, dim, true), - varcorrection(wv, corrected)) + scale!(_wsum_centralize!(R, abs2, A, values(w), M, dim, true), + varcorrection(w, corrected)) end -function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, 
dim::Int; +function var!(R::AbstractArray, A::RealArray, w::AbstractWeights, dim::Int; mean=nothing, corrected::DepBool=nothing) corrected = depcheck(:var!, corrected) if mean == 0 - Base.varm!(R, A, wv, Base.reducedim_initarray(A, dim, 0, eltype(R)), dim; + Base.varm!(R, A, w, Base.reducedim_initarray(A, dim, 0, eltype(R)), dim; corrected=corrected) elseif mean == nothing - Base.varm!(R, A, wv, Base.mean(A, wv, dim), dim; corrected=corrected) + Base.varm!(R, A, w, Base.mean(A, w, dim), dim; corrected=corrected) else # check size of mean for i = 1:ndims(A) @@ -84,101 +76,94 @@ function var!(R::AbstractArray, A::RealArray, wv::AbstractWeights, dim::Int; dM == dA || throw(DimensionMismatch("Incorrect size of mean.")) end end - Base.varm!(R, A, wv, mean, dim; corrected=corrected) + Base.varm!(R, A, w, mean, dim; corrected=corrected) end end -function Base.varm(A::RealArray, wv::AbstractWeights, M::RealArray, dim::Int; +function Base.varm(A::RealArray, w::AbstractWeights, M::RealArray, dim::Int; corrected::DepBool=nothing) corrected = depcheck(:varm, corrected) @static if VERSION < v"0.6.0-dev.1121" - Base.varm!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, M, dim; + Base.varm!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, w, M, dim; corrected=corrected) else - Base.varm!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, wv, M, + Base.varm!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, w, M, dim; corrected=corrected) end end -function Base.var(A::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, +function Base.var(A::RealArray, w::AbstractWeights, dim::Int; mean=nothing, corrected::DepBool=nothing) corrected = depcheck(:var, corrected) @static if VERSION < v"0.6.0-dev.1121" - var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, wv, dim; mean=mean, + var!(similar(A, Float64, Base.reduced_dims(size(A), dim)), A, w, dim; mean=mean, corrected=corrected) else - var!(similar(A, Float64, 
Base.reduced_indices(indices(A), dim)), A, wv, dim; + var!(similar(A, Float64, Base.reduced_indices(indices(A), dim)), A, w, dim; mean=mean, corrected=corrected) end end ## std """ - stdm(v, wv::AbstractWeights, m, [dim]; corrected=false) + stdm(v, w::AbstractWeights, m, [dim]; corrected=false) Compute the standard deviation of a real-valued array `x` with a known mean `m`, optionally over a dimension `dim`. Observations in `x` are weighted using weight vector `w`. - The uncorrected (when `corrected=false`) sample standard deviation is defined as: - ```math -\\sqrt{\\frac{1}{\\sum{w}} \\sum{i=1}^n {w_i\\left({x_i - m}\\right)^2 }} +\\sqrt{\\frac{1}{\\sum{w}} \\sum_{i=1}^n {w_i\\left({x_i - m}\\right)^2 }} ``` -where ``n`` is the length of the input. - -The unbiased estimate of the population standard deviation is computed by replacing -``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights used: - -* AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` -* FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +where ``n`` is the length of the input. 
The unbiased estimate (when `corrected=true`) of the +population standard deviation is computed by replacing ``\\frac{1}{\\sum{w}}`` with a factor +dependent on the type of weights used: +* `AnalyticWeights`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* `FrequencyWeights`: ``\\frac{1}{\\sum{w} - 1}`` +* `ProbabilityWeights`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` """ -Base.stdm(v::RealArray, wv::AbstractWeights, m::Real; corrected::DepBool=nothing) = - sqrt(varm(v, wv, m, corrected=depcheck(:stdm, corrected))) +Base.stdm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) = + sqrt(varm(v, w, m, corrected=depcheck(:stdm, corrected))) """ - std(v, wv::AbstractWeights, [dim]; mean=nothing, corrected=false) + std(v, w::AbstractWeights, [dim]; mean=nothing, corrected=false) Compute the standard deviation of a real-valued array `x`, optionally over a dimension `dim`. Observations in `x` are weighted using weight vector `w`. - The uncorrected (when `corrected=false`) sample standard deviation is defined as: - ```math -\\sqrt{\\frac{1}{\\sum{w}} \\sum{i=1}^n {w_i\\left({x_i - μ}\\right)^2 }} +\\sqrt{\\frac{1}{\\sum{w}} \\sum_{i=1}^n {w_i\\left({x_i - μ}\\right)^2 }} ``` where ``n`` is the length of the input and ``μ`` is the mean. 
- -The unbiased estimate of the population standard deviation is computed by replacing -``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights used: - -* AnalyticWeights: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` -* FrequencyWeights: ``\\frac{1}{\\sum{w} - 1}`` -* ProbabilityWeights: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +The unbiased estimate (when `corrected=true`) of the population standard deviation is +computed by replacing ``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of +weights used: +* `AnalyticWeights`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* `FrequencyWeights`: ``\\frac{1}{\\sum{w} - 1}`` +* `ProbabilityWeights`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` """ -Base.std(v::RealArray, wv::AbstractWeights; mean=nothing, corrected::DepBool=nothing) = - sqrt.(var(v, wv; mean=mean, corrected=depcheck(:std, corrected))) +Base.std(v::RealArray, w::AbstractWeights; mean=nothing, corrected::DepBool=nothing) = + sqrt.(var(v, w; mean=mean, corrected=depcheck(:std, corrected))) Base.stdm(v::RealArray, m::RealArray, dim::Int; corrected::DepBool=nothing) = Base.sqrt!(varm(v, m, dim; corrected=corrected)) -Base.stdm(v::RealArray, wv::AbstractWeights, m::RealArray, dim::Int; +Base.stdm(v::RealArray, w::AbstractWeights, m::RealArray, dim::Int; corrected::DepBool=nothing) = - sqrt.(varm(v, wv, m, dim; corrected=depcheck(:stdm, corrected))) + sqrt.(varm(v, w, m, dim; corrected=depcheck(:stdm, corrected))) -Base.std(v::RealArray, wv::AbstractWeights, dim::Int; mean=nothing, +Base.std(v::RealArray, w::AbstractWeights, dim::Int; mean=nothing, corrected::DepBool=nothing) = - sqrt.(var(v, wv, dim; mean=mean, corrected=depcheck(:std, corrected))) + sqrt.(var(v, w, dim; mean=mean, corrected=depcheck(:std, corrected))) ##### Fused statistics """ - mean_and_var(x, [wv::AbstractWeights], [dim]; corrected=false) -> (mean, var) + mean_and_var(x, [w::AbstractWeights], [dim]; 
corrected=false) -> (mean, var) Return the mean and variance of a real-valued array `x`, optionally over a dimension -`dim`, as a tuple. Observations in `x` can be weighted via `wv`. Finally, bias correction -can be applied to the variance calculation if `corrected=true`. +`dim`, as a tuple. Observations in `x` can be weighted using weight vector `w`. +Finally, bias correction is applied to the variance calculation if `corrected=true`. See [`var`](@ref) documentation for more details. """ function mean_and_var(A::RealArray; corrected::Bool=true) @@ -188,11 +173,11 @@ function mean_and_var(A::RealArray; corrected::Bool=true) end """ - mean_and_std(x, [wv::AbstractWeights], [dim]; corrected=false) -> (mean, std) + mean_and_std(x, [w::AbstractWeights], [dim]; corrected=false) -> (mean, std) Return the mean and standard deviation of a real-valued array `x`, optionally -over a dimension `dim`, as a tuple. A weighting vector `wv` can be specified -to weight the estimates. Finally, bias correction can be applied to the +over a dimension `dim`, as a tuple. A weighting vector `w` can be specified +to weight the estimates. Finally, bias correction is applied to the standard deviation calculation if `corrected=true`. See [`std`](@ref) documentation for more details.
""" @@ -202,14 +187,14 @@ function mean_and_std(A::RealArray; corrected::Bool=true) m, s end -function mean_and_var(A::RealArray, wv::AbstractWeights; corrected::DepBool=nothing) - m = mean(A, wv) - v = varm(A, wv, m; corrected=depcheck(:mean_and_var, corrected)) +function mean_and_var(A::RealArray, w::AbstractWeights; corrected::DepBool=nothing) + m = mean(A, w) + v = varm(A, w, m; corrected=depcheck(:mean_and_var, corrected)) m, v end -function mean_and_std(A::RealArray, wv::AbstractWeights; corrected::DepBool=nothing) - m = mean(A, wv) - s = stdm(A, wv, m; corrected=depcheck(:mean_and_std, corrected)) +function mean_and_std(A::RealArray, w::AbstractWeights; corrected::DepBool=nothing) + m = mean(A, w) + s = stdm(A, w, m; corrected=depcheck(:mean_and_std, corrected)) m, s end @@ -226,16 +211,16 @@ function mean_and_std(A::RealArray, dim::Int; corrected::Bool=true) end -function mean_and_var(A::RealArray, wv::AbstractWeights, dim::Int; +function mean_and_var(A::RealArray, w::AbstractWeights, dim::Int; corrected::DepBool=nothing) - m = mean(A, wv, dim) - v = varm(A, wv, m, dim; corrected=depcheck(:mean_and_var, corrected)) + m = mean(A, w, dim) + v = varm(A, w, m, dim; corrected=depcheck(:mean_and_var, corrected)) m, v end -function mean_and_std(A::RealArray, wv::AbstractWeights, dim::Int; +function mean_and_std(A::RealArray, w::AbstractWeights, dim::Int; corrected::DepBool=nothing) - m = mean(A, wv, dim) - s = stdm(A, wv, m, dim; corrected=depcheck(:mean_and_std, corrected)) + m = mean(A, w, dim) + s = stdm(A, w, m, dim; corrected=depcheck(:mean_and_std, corrected)) m, s end @@ -285,7 +270,7 @@ function _moment3(v::RealArray, wv::AbstractWeights, m::Real) s / sum(wv) end -function _moment4(v::RealArray, m::Real; corrected=false) +function _moment4(v::RealArray, m::Real) n = length(v) s = 0.0 for i = 1:n @@ -348,7 +333,7 @@ function moment(v::RealArray, k::Int, wv::AbstractWeights, m::Real) _momentk(v, k, wv, m) end -moment(v::RealArray, k::Int; corrected=true) = 
moment(v, k, mean(v)) +moment(v::RealArray, k::Int) = moment(v, k, mean(v)) function moment(v::RealArray, k::Int, wv::AbstractWeights) moment(v, k, wv, mean(v, wv)) end diff --git a/src/weights.jl b/src/weights.jl index 73128971a..e336513b2 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -57,7 +57,8 @@ Construct a `Weights` vector with weight values `vs`. A precomputed sum may be provided as `wsum`. The `Weights` type describes a generic weights vector which does not support -bias correction. +all operations possible for [`FrequencyWeights`](@ref), [`AnalyticWeights`](@ref) +and [`ProbabilityWeights`](@ref)" """ Weights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = Weights{S, eltype(vs), V}(vs, s) @@ -73,11 +74,12 @@ weights(vs::RealArray) = Weights(vec(vs)) """ varcorrection(w::Weights, corrected=false) -Returns ``\\frac{1}{\sum w}`` when corrected is false and throws an `ArgumentError` -when corrected is true. +Returns ``\\frac{1}{\\sum w}`` when `corrected=false` and throws an `ArgumentError` +if `corrected=true`. """ -function varcorrection(w::Weights, corrected::Bool=false) - corrected && throw(ArgumentError("Weights does not support bias correction.")) +@inline function varcorrection(w::Weights, corrected::Bool=false) + corrected && throw(ArgumentError("Weights type does not support bias correction: " * + "use FrequencyWeights, AnalyticWeights or ProbabilityWeights if applicable.")) 1 / w.sum end @@ -91,7 +93,8 @@ A precomputed sum may be provided as `wsum`. Analytic weights describe a non-random relative importance (usually between 0 and 1) for each observation. These weights may also be referred to as reliability weights, -precision weights or inverse variance weights. +precision weights or inverse variance weights. These are typically used when the observations +being weighted are aggregate values (e.g., averages) with differing variances. 
""" AnalyticWeights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = AnalyticWeights{S, eltype(vs), V}(vs, s) @@ -108,7 +111,8 @@ aweights(vs::RealArray) = AnalyticWeights(vec(vs)) """ varcorrection(w::AnalyticWeights, corrected=false) -``\\frac{1}{\sum w - \sum {w^2} / \sum w}`` +* `corrected=true`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` +* `corrected=false`: ``\\frac{1}{\\sum w}`` """ @inline function varcorrection(w::AnalyticWeights, corrected::Bool=false) s = w.sum @@ -147,7 +151,8 @@ fweights(vs::RealArray) = FrequencyWeights(vec(vs)) """ varcorrection(w::FrequencyWeights, corrected=false) -``\\frac{1}{\sum{w} - 1}`` +* `corrected=true`: ``\\frac{1}{\\sum{w} - 1}`` +* `corrected=false`: ``\\frac{1}{\\sum w}`` """ @inline function varcorrection(w::FrequencyWeights, corrected::Bool=false) s = w.sum @@ -186,7 +191,8 @@ pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) """ varcorrection(w::ProbabilityWeights, corrected=false) -``\\frac{n}{(n - 1) \sum w}`` where ``n`` equals `count(!iszero, w)` +* `corrected=true`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +* `corrected=false`: ``\\frac{1}{\\sum w}`` """ @inline function varcorrection(w::ProbabilityWeights, corrected::Bool=false) s = w.sum @@ -427,8 +433,8 @@ Base.sum{T<:Number,W<:Real}(A::AbstractArray{T}, w::AbstractWeights{W}, dim::Int Compute the weighted mean of an array `v` with weights `w`. """ function wmean{T<:Number}(v::AbstractArray{T}, w::AbstractVector) - Base.depwarn("wmean is deprecated, use mean(v, fweights(w)) instead.", :wmean) - mean(v, fweights(w)) + Base.depwarn("wmean is deprecated, use mean(v, weights(w)) instead.", :wmean) + mean(v, weights(w)) end Base.mean(v::AbstractArray, w::AbstractWeights) = sum(v, w) / sum(w) @@ -503,9 +509,9 @@ end wmedian(v, w) Compute the weighted median of an array `v` with weights `w`, given as either a -vector or an `AbstractWeights` object/vector. +vector or an `AbstractWeights` vector. 
""" -wmedian(v::RealVector, w::RealVector) = median(v, fweights(w)) +wmedian(v::RealVector, w::RealVector) = median(v, weights(w)) wmedian{W<:Real}(v::RealVector, w::AbstractWeights{W}) = median(v, w) ###### Weighted quantile ##### @@ -595,9 +601,9 @@ quantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::Number) = quantile( wquantile(v, w, p) Compute the `p`th quantile(s) of `v` with weights `w`, given as either a vector -or an `AbstractWeights` object/vector. +or an `AbstractWeights` vector. """ wquantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::RealVector) = quantile(v, w, p) wquantile{W <: Real}(v::RealVector, w::AbstractWeights{W}, p::Number) = quantile(v, w, [p])[1] -wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, fweights(w), p) -wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, fweights(w), [p])[1] +wquantile(v::RealVector, w::RealVector, p::RealVector) = quantile(v, weights(w), p) +wquantile(v::RealVector, w::RealVector, p::Number) = quantile(v, weights(w), [p])[1] diff --git a/test/moments.jl b/test/moments.jl index d9b66ff54..2989a405c 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -6,8 +6,8 @@ weight_funcs = (weights, aweights, fweights, pweights) @testset "Variance and Standard Deviation" begin @testset "Vectors" begin - x = [0.57, 0.10, 0.91, 0.72, 0.46] - w = [3.84, 2.70, 8.29, 8.91, 9.71] + x = [0.57, 0.10, 0.91, 0.72, 0.46, 0.0] + w = [3.84, 2.70, 8.29, 8.91, 9.71, 0.0] @testset "Uncorrected with $f" for f in weight_funcs wv = f(w) diff --git a/test/weights.jl b/test/weights.jl index 75fe0df66..3de5c81e1 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -4,7 +4,7 @@ using Compat import Compat: view @testset "StatsBase.Weights" begin -weight_funcs = (aweights, fweights, pweights) +weight_funcs = (weights, aweights, fweights, pweights) @testset "Construction" begin @testset "$f" for f in weight_funcs From 8d85af7f211088ccf387c17fc4b5502471f2909a Mon Sep 17 00:00:00 2001 From: rofinn 
Date: Thu, 4 May 2017 11:02:08 -0500 Subject: [PATCH 38/50] More doc fixes. --- docs/source/weightvec.rst | 61 ++++++++++++++++++++++++++++++++++----- src/weights.jl | 2 +- 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/docs/source/weightvec.rst b/docs/source/weightvec.rst index 79123c60b..23e36c06e 100644 --- a/docs/source/weightvec.rst +++ b/docs/source/weightvec.rst @@ -61,10 +61,10 @@ The ``AbstractWeights`` type is introduced as the standard way to pass weights, - Statistical functions that utilize weights often need the sum of weights for various purposes. The weight vector maintains the sum of weights, so that it needn't be computed repeatedly each time the sum of weights is needed. -Other AbstractWeights types and bias correction +Variance bias correction ------------------------------------------------- -When computing the weighted uncorrected (when `corrected=false`) sample variance, standard deviation or covariance a of :math:`\frac{1}{\sum{w}}` is used instead of :math:`\frac{1}{n}` where `n` is the number of observations. +When computing the weighted uncorrected (when ``corrected=false``) sample variance, standard deviation or covariance :math:`\frac{1}{\sum{w}}` is used instead of :math:`\frac{1}{n}` (where ``n`` is the number of observations). Example: @@ -74,10 +74,57 @@ vs :math:`s^2 = \frac{1}{n} \sum_{i=1}^n {\left({x_i - m}\right)^2 }` -The unbiased estimate of the population variance, standard deviation or covariance is computed by replacing :math:`\frac{1}{\sum{w}}` with a factor dependent on the type of weights used: +However, unbiased estimates (when ``corrected=true``) are dependent on the types of weights used. All weights presented here are a subtype of ``AbstractWeights``. 
-- ``AnalyticWeights``: :math:`\frac{1}{\sum w - \sum {w^2} / \sum w}` -- ``FrequencyWeights``: :math:`\frac{1}{\sum{w} - 1}` -- ``ProbabilityWeights``: :math:`\frac{n}{(n - 1) \sum w}` where ``n`` equals `count(!iszero, w)` +Weights +~~~~~~~ -These weights can be created with the appropriate constructor (i.e., ``AnalyticWeights(a)``, ``FrequencyWeights(a)``, ``ProbabilityWeights(a)``) or the utility functions ``aweights(a)``, ``fweights(a)`` and ``pweights(a)``. +The `Weights` type describes a generic weights vector which does not support all operations possible for ``FrequencyWeights``, ``AnalyticWeights`` and ``ProbabilityWeights``. + +- ``corrected=true``: ``ArgumentError`` +- ``corrected=false``: :math:`\frac{1}{\sum{w}}` + +AnalyticWeights +~~~~~~~~~~~~~~~~ + +Analytic weights describe a non-random relative importance (usually between 0 and 1) for each observation. These weights may also be referred to as reliability weights, precision weights or inverse variance weights. These are typically used when the observations being weighted are aggregate values (e.g., averages) with differing variances. + +- ``corrected=true``: :math:`\frac{1}{\sum w - \sum {w^2} / \sum w}` +- ``corrected=false``: :math:`\frac{1}{\sum{w}}` + +.. code-block:: julia + + w = AnalyticWeights([0.2, 0.1, 0.3]) + + w = aweights([0.2, 0.1, 0.3]) + + +FrequencyWeights +~~~~~~~~~~~~~~~~~ + +Frequency weights describe the number of times (or frequency) each observation +was observed. These weights may also be referred to as case weights or repeat weights. + +- ``corrected=true``: :math:`\frac{1}{\sum{w} - 1}` +- ``corrected=false``: :math:`\frac{1}{\sum{w}}` + +.. code-block:: julia + + w = FrequencyWeights([2, 1, 3]) + + w = fweights([2, 1, 3]) + + +ProbabilityWeights +~~~~~~~~~~~~~~~~~~~ + +Probability weights represent the inverse of the sampling probability for each observation, providing a correction mechanism for under- or over-sampling certain population groups. 
These weights may also be referred to as sampling weights. + +- ``corrected=true``: :math:`\frac{n}{(n - 1) \sum w}` where ``n`` equals ``count(!iszero, w)`` +- ``corrected=false``: :math:`\frac{1}{\sum{w}}` + +.. code-block:: julia + + w = ProbabilityWeights([0.2, 0.1, 0.3]) + + w = pweights([0.2, 0.1, 0.3]) diff --git a/src/weights.jl b/src/weights.jl index e336513b2..f59dbec94 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -58,7 +58,7 @@ A precomputed sum may be provided as `wsum`. The `Weights` type describes a generic weights vector which does not support all operations possible for [`FrequencyWeights`](@ref), [`AnalyticWeights`](@ref) -and [`ProbabilityWeights`](@ref)" +and [`ProbabilityWeights`](@ref). """ Weights{S<:Real, V<:RealVector}(vs::V, s::S=sum(vs)) = Weights{S, eltype(vs), V}(vs, s) From 05a3cd7b45758b7c70fb0231eeb2cb696f239e59 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 4 May 2017 11:43:12 -0500 Subject: [PATCH 39/50] Removed `fweights` from tests in favour of `weights` (to reduce PR size) or `f` where appropriate.
--- test/counts.jl | 8 ++++---- test/hist.jl | 16 ++++++++-------- test/weights.jl | 4 ++-- test/wsampling.jl | 4 ++-- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/test/counts.jl b/test/counts.jl index b6b7f632d..2fae64eed 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -6,7 +6,7 @@ n = 5000 # 1D integer counts x = rand(1:5, n) -w = fweights(rand(n)) +w = weights(rand(n)) c = counts(x, 5) @test size(c) == (5,) @@ -40,7 +40,7 @@ c0 = Float64[sum(w.values[x .== i]) for i in 1 : 5] x = rand(1:4, n) y = rand(1:5, n) -w = fweights(rand(n)) +w = weights(rand(n)) c = counts(x, y, (4, 5)) @test size(c) == (4, 5) @@ -85,11 +85,11 @@ pm = proportionmap(x) @test pm["b"] ≈ (1/3) @test pm["c"] ≈ (1/6) -cm = countmap(x, fweights(w)) +cm = countmap(x, weights(w)) @test cm["a"] == 5.5 @test cm["b"] == 4.5 @test cm["c"] == 3.5 -pm = proportionmap(x, fweights(w)) +pm = proportionmap(x, weights(w)) @test pm["a"] ≈ (5.5 / 13.5) @test pm["b"] ≈ (4.5 / 13.5) @test pm["c"] ≈ (3.5 / 13.5) diff --git a/test/hist.jl b/test/hist.jl index 3de1ce302..60403a68d 100644 --- a/test/hist.jl +++ b/test/hist.jl @@ -57,19 +57,19 @@ end @test fit(Histogram,(0:99,0:99),nbins=(5,5), closed=:left).weights == diagm([20,20,20,20,20]) # FIXME: closed (all lines in this block): - @test fit(Histogram,0:99,fweights(ones(100)),nbins=5, closed=:left).weights == [20,20,20,20,20] - @test fit(Histogram,0:99,fweights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] - @test fit(Histogram{Int32},0:99,fweights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] - @test fit(Histogram{Float32},0:99,fweights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] + @test fit(Histogram,0:99,weights(ones(100)),nbins=5, closed=:left).weights == [20,20,20,20,20] + @test fit(Histogram,0:99,weights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] + @test fit(Histogram{Int32},0:99,weights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] + 
@test fit(Histogram{Float32},0:99,weights(2*ones(100)),nbins=5, closed=:left).weights == [40,40,40,40,40] end @testset "Histogram element type" begin # FIXME: closed (all lines in this block): - @test eltype(@inferred(fit(Histogram,1:100,fweights(ones(Int,100)),nbins=5, closed=:left)).weights) == Int - @test eltype(@inferred(fit(Histogram{Float32},1:100,fweights(ones(Int,100)),nbins=5, closed=:left)).weights) == Float32 - @test eltype(@inferred(fit(Histogram,1:100,fweights(ones(Float64,100)),nbins=5, closed=:left)).weights) == Float64 - @test eltype(@inferred(fit(Histogram{Float32},1:100,fweights(ones(Float64,100)),nbins=5, closed=:left)).weights) == Float32 + @test eltype(@inferred(fit(Histogram,1:100,weights(ones(Int,100)),nbins=5, closed=:left)).weights) == Int + @test eltype(@inferred(fit(Histogram{Float32},1:100,weights(ones(Int,100)),nbins=5, closed=:left)).weights) == Float32 + @test eltype(@inferred(fit(Histogram,1:100,weights(ones(Float64,100)),nbins=5, closed=:left)).weights) == Float64 + @test eltype(@inferred(fit(Histogram{Float32},1:100,weights(ones(Float64,100)),nbins=5, closed=:left)).weights) == Float32 end diff --git a/test/weights.jl b/test/weights.jl index 3de5c81e1..1e09f0220 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -176,7 +176,7 @@ end @test mean(a, f(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) @test mean(a, f(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) @test mean(a, f(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) - @test_throws ErrorException mean(a, fweights(wt), 4) + @test_throws ErrorException mean(a, f(wt), 4) end end end @@ -258,7 +258,7 @@ end @test_throws MethodError median([4 3 2 1 0], f(wt)) @test_throws MethodError median([[1 2];[4 5];[7 8];[10 11];[13 14]], f(wt)) data = [1, 3, 2, NaN, 2] - @test isnan(median(data, fweights(wt))) + @test isnan(median(data, f(wt))) wt = [1, 2, NaN, 4, 5] @test_throws ErrorException median(data, f(wt)) data = [1, 3, 2, 1, 2] diff --git 
a/test/wsampling.jl b/test/wsampling.jl index d605a868e..31012438a 100644 --- a/test/wsampling.jl +++ b/test/wsampling.jl @@ -35,7 +35,7 @@ end import StatsBase: direct_sample!, alias_sample! n = 10^5 -wv = fweights([0.2, 0.8, 0.4, 0.6]) +wv = weights([0.2, 0.8, 0.4, 0.6]) a = direct_sample!(4:7, wv, zeros(Int, n, 3)) check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=false) @@ -79,7 +79,7 @@ import StatsBase: naive_wsample_norep!, efraimidis_a_wsample_norep!, efraimidis_ares_wsample_norep!, efraimidis_aexpj_wsample_norep! n = 10^5 -wv = fweights([0.2, 0.8, 0.4, 0.6]) +wv = weights([0.2, 0.8, 0.4, 0.6]) a = zeros(Int, 3, n) for j = 1:n From 2370595e9b16d9caf012043298fbb0255d67ce9c Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 4 May 2017 13:38:50 -0500 Subject: [PATCH 40/50] Moved description of different weight types in an Implementations sections and decided to only reference the `var`, `std` and `cov` docstrings. --- docs/source/weightvec.rst | 124 +++++++++++++++----------------------- 1 file changed, 47 insertions(+), 77 deletions(-) diff --git a/docs/source/weightvec.rst b/docs/source/weightvec.rst index 23e36c06e..4e090963b 100644 --- a/docs/source/weightvec.rst +++ b/docs/source/weightvec.rst @@ -5,18 +5,6 @@ Weight Vectors In statistical applications, it is not uncommon to assign weights to samples. To facilitate the use of weight vectors, we introduce the abstract type ``AbstractWeights`` for the purpose of representing weight vectors. -Construction --------------- - -A generic weight vector instance can be constructed using the ``Weights`` constructor or the ``weights`` function: - -.. code-block:: julia - - w = Weights([1., 2., 3.]) - w = Weights([1., 2., 3.], 6.) - - w = weights([1., 2., 3.]) - **Note:** - The weight vector is a light-weight wrapper of the input vector. The input vector is NOT copied during construction. 
@@ -24,107 +12,89 @@ A generic weight vector instance can be constructed using the ``Weights`` constr - The weight vector maintains the sum of weights, which is computed upon construction. If the value of the sum is pre-computed, one can supply it as the second argument to the constructor and save the time of computing the sum again. -Methods ---------- - -Let ``w`` be an instance of ``AbstractWeights``: - -.. function:: eltype(w) - - Get the type of weight values. - -.. function:: length(w) - - Get the length of the weight vector. - -.. function:: isempty(w) - - Test whether ``w`` is empty, *i.e.* ``length(w) == 0``. - -.. function:: values(w) +Implementations +--------------- - Get the vector of weight values. +Several statistical weight types are provided which subtype ``AbstractWeights``. The choice of weights impacts how bias is corrected in several methods. See the ``var``, ``std`` and ``cov`` docstrings for more details. -.. function:: sum(w) +``Weights`` +~~~~~~~~~~~~ - Get the sum of weights. +The ``Weights`` type describes a generic weights vector which does not support all operations possible for ``FrequencyWeights``, ``AnalyticWeights`` and ``ProbabilityWeights``. - :note: The sum of weights is maintained by the weight vector, and thus this function can immediately return the value in ``O(1)`` (without computation). +.. code-block:: julia + w = Weights([1., 2., 3.]) + w = Weights([1., 2., 3.,], 6.) -Why we want an AbstractWeights type ------------------------------------- + w = weights([1., 2., 3.]) -The ``AbstractWeights`` type is introduced as the standard way to pass weights, which has two advantages: -- A different type ``AbstractWeights`` distinguishes the role of the weight vector from other data vectors in the input arguments. -- Statistical functions that utilize weights often need the sum of weights for various purposes. 
The weight vector maintains the sum of weights, so that it needn't be computed repeatedly each time the sum of weights is needed. +``AnalyticWeights`` +~~~~~~~~~~~~~~~~~~~~ +Analytic weights describe a non-random relative importance (usually between 0 and 1) for each observation. These weights may also be referred to as reliability weights, precision weights or inverse variance weights. These are typically used when the observations being weighted are aggregate values (e.g., averages) with differing variances. -Variance bias correction -------------------------------------------------- +.. code-block:: julia -When computing the weighted uncorrected (when ``corrected=false``) sample variance, standard deviation or covariance :math:`\frac{1}{\sum{w}}` is used instead of :math:`\frac{1}{n}` (where ``n`` is the number of observations). + w = AnalyticWeights([0.2, 0.1, 0.3]) + w = aweights([0.2, 0.1, 0.3]) -Example: -:math:`s^2 = \frac{1}{\sum{w}} \sum_{i=1}^n {w_i\left({x_i - m}\right)^2 }` +``FrequencyWeights`` +~~~~~~~~~~~~~~~~~~~~~ -vs +Frequency weights describe the number of times (or frequency) each observation was observed. These weights may also be referred to as case weights or repeat weights. -:math:`s^2 = \frac{1}{n} \sum_{i=1}^n {\left({x_i - m}\right)^2 }` +.. code-block:: julia -However, unbiased estimates (when ``corrected=true``) are dependent on the types of weights used. All weights presented here are a subtype of ``AbstractWeights``. + w = FrequencyWeights([2, 1, 3]) + w = fweights([2, 1, 3]) -Weights -~~~~~~~ -The `Weights` type describes a generic weights vector which does not support all operations possible for ``FrequencyWeights``, ``AnalyticWeights`` and ``ProbabilityWeights``. 
+``ProbabilityWeights`` +~~~~~~~~~~~~~~~~~~~~~~ -- ``corrected=true``: ``ArgumentError`` -- ``corrected=false``: :math:`\frac{1}{\sum{w}}` +Probability weights represent the inverse of the sampling probability for each observation, providing a correction mechanism for under- or over-sampling certain population groups. These weights may also be referred to as sampling weights. -AnalyticWeights -~~~~~~~~~~~~~~~~ +.. code-block:: julia -Analytic weights describe a non-random relative importance (usually between 0 and 1) for each observation. These weights may also be referred to as reliability weights, precision weights or inverse variance weights. These are typically used when the observations being weighted are aggregate values (e.g., averages) with differing variances. + w = ProbabilityWeights([0.2, 0.1, 0.3]) + w = pweights([0.2, 0.1, 0.3]) -- ``corrected=true``: :math:`\frac{1}{\sum w - \sum {w^2} / \sum w}` -- ``corrected=false``: :math:`\frac{1}{\sum{w}}` -.. code-block:: julia +Methods +--------- - w = AnalyticWeights([0.2, 0.1, 0.3]) +Let ``w`` be an instance of ``AbstractWeights``: - w = aweights([0.2, 0.1, 0.3]) +.. function:: eltype(w) + Get the type of weight values. -FrequencyWeights -~~~~~~~~~~~~~~~~~ +.. function:: length(w) -Frequency weights describe the number of times (or frequency) each observation -was observed. These weights may also be referred to as case weights or repeat weights. + Get the length of the weight vector. -- ``corrected=true``: :math:`\frac{1}{\sum{w} - 1}` -- ``corrected=false``: :math:`\frac{1}{\sum{w}}` +.. function:: isempty(w) -.. code-block:: julia + Test whether ``w`` is empty, *i.e.* ``length(w) == 0``. - w = FrequencyWeights([2, 1, 3]) +.. function:: values(w) - w = fweights([2, 1, 3]) + Get the vector of weight values. +.. function:: sum(w) -ProbabilityWeights -~~~~~~~~~~~~~~~~~~~ + Get the sum of weights. 
-Probability weights represent the inverse of the sampling probability for each observation, providing a correction mechanism for under- or over-sampling certain population groups. These weights may also be referred to as sampling weights. + :note: The sum of weights is maintained by the weight vector, and thus this function can immediately return the value in ``O(1)`` (without computation). -- ``corrected=true``: :math:`\frac{n}{(n - 1) \sum w}` where ``n`` equals ``count(!iszero, w)`` -- ``corrected=false``: :math:`\frac{1}{\sum{w}}` -.. code-block:: julia +Why we want an ``AbstractWeights`` type +---------------------------------------- - w = ProbabilityWeights([0.2, 0.1, 0.3]) +The ``AbstractWeights`` type is introduced as the standard way to pass weights, which has two advantages: - w = pweights([0.2, 0.1, 0.3]) +- A different type ``AbstractWeights`` distinguishes the role of the weight vector from other data vectors in the input arguments. +- Statistical functions that utilize weights often need the sum of weights for various purposes. The weight vector maintains the sum of weights, so that it needn't be computed repeatedly each time the sum of weights is needed. From 3f84e718b73833b7c3b04f22820c22720ee785a0 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 4 May 2017 14:16:33 -0500 Subject: [PATCH 41/50] Moved `Weights` description later in the docs. --- docs/source/weightvec.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/source/weightvec.rst b/docs/source/weightvec.rst index 4e090963b..d3ad91f36 100644 --- a/docs/source/weightvec.rst +++ b/docs/source/weightvec.rst @@ -17,19 +17,6 @@ Implementations Several statistical weight types are provided which subtype ``AbstractWeights``. The choice of weights impacts how bias is corrected in several methods. See the ``var``, ``std`` and ``cov`` docstrings for more details. 
-``Weights`` -~~~~~~~~~~~~ - -The ``Weights`` type describes a generic weights vector which does not support all operations possible for ``FrequencyWeights``, ``AnalyticWeights`` and ``ProbabilityWeights``. - -.. code-block:: julia - - w = Weights([1., 2., 3.]) - w = Weights([1., 2., 3.,], 6.) - - w = weights([1., 2., 3.]) - - ``AnalyticWeights`` ~~~~~~~~~~~~~~~~~~~~ @@ -63,6 +50,19 @@ Probability weights represent the inverse of the sampling probability for each o w = pweights([0.2, 0.1, 0.3]) +``Weights`` +~~~~~~~~~~~~ + +The ``Weights`` type describes a generic weights vector which does not support all operations possible for ``FrequencyWeights``, ``AnalyticWeights`` and ``ProbabilityWeights``. + +.. code-block:: julia + + w = Weights([1., 2., 3.]) + w = Weights([1., 2., 3.,], 6.) + + w = weights([1., 2., 3.]) + + Methods --------- From 8bcf4484ab2122e426f74ef759b4e8b2acdd3762 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 4 May 2017 14:23:49 -0500 Subject: [PATCH 42/50] Removed two argument example from weightvec docs. --- docs/source/weightvec.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/weightvec.rst b/docs/source/weightvec.rst index d3ad91f36..cd08b3179 100644 --- a/docs/source/weightvec.rst +++ b/docs/source/weightvec.rst @@ -58,8 +58,6 @@ The ``Weights`` type describes a generic weights vector which does not support a .. code-block:: julia w = Weights([1., 2., 3.]) - w = Weights([1., 2., 3.,], 6.) - w = weights([1., 2., 3.]) From dedb1558e161ac301a4c37fa44c7785c3b651174 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 4 May 2017 15:23:16 -0500 Subject: [PATCH 43/50] Moved description of weight vector benefits to the top of the file. 
--- docs/source/weightvec.rst | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/docs/source/weightvec.rst b/docs/source/weightvec.rst index cd08b3179..7a36478fd 100644 --- a/docs/source/weightvec.rst +++ b/docs/source/weightvec.rst @@ -3,12 +3,14 @@ Weight Vectors ================ -In statistical applications, it is not uncommon to assign weights to samples. To facilitate the use of weight vectors, we introduce the abstract type ``AbstractWeights`` for the purpose of representing weight vectors. +In statistical applications, it is not uncommon to assign weights to samples. To facilitate the use of weight vectors, we introduce the abstract type ``AbstractWeights`` for the purpose of representing weight vectors, which has two advantages: + +- A different type ``AbstractWeights`` distinguishes the role of the weight vector from other data vectors in the input arguments. +- Statistical functions that utilize weights often need the sum of weights for various purposes. The weight vector maintains the sum of weights, so that it needn't be computed repeatedly each time the sum of weights is needed. **Note:** - The weight vector is a light-weight wrapper of the input vector. The input vector is NOT copied during construction. - - The weight vector maintains the sum of weights, which is computed upon construction. If the value of the sum is pre-computed, one can supply it as the second argument to the constructor and save the time of computing the sum again. @@ -87,12 +89,3 @@ Let ``w`` be an instance of ``AbstractWeights``: Get the sum of weights. :note: The sum of weights is maintained by the weight vector, and thus this function can immediately return the value in ``O(1)`` (without computation). 
- - -Why we want an ``AbstractWeights`` type ----------------------------------------- - -The ``AbstractWeights`` type is introduced as the standard way to pass weights, which has two advantages: - -- A different type ``AbstractWeights`` distinguishes the role of the weight vector from other data vectors in the input arguments. -- Statistical functions that utilize weights often need the sum of weights for various purposes. The weight vector maintains the sum of weights, so that it needn't be computed repeatedly each time the sum of weights is needed. From 281654d9b92b03f4c0567bf8bd12f1429c9c199f Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 4 May 2017 15:43:51 -0500 Subject: [PATCH 44/50] Not sure how much this helped, but tried to minimize the amount of `@testset` changes (`cov` was kind of a lost cause). --- test/moments.jl | 457 ++++++++++++++++++++++++------------------------ test/weights.jl | 268 ++++++++++++++-------------- 2 files changed, 358 insertions(+), 367 deletions(-) diff --git a/test/moments.jl b/test/moments.jl index 2989a405c..ca1794198 100644 --- a/test/moments.jl +++ b/test/moments.jl @@ -4,242 +4,239 @@ using Base.Test @testset "StatsBase.Moments" begin weight_funcs = (weights, aweights, fweights, pweights) -@testset "Variance and Standard Deviation" begin - @testset "Vectors" begin - x = [0.57, 0.10, 0.91, 0.72, 0.46, 0.0] - w = [3.84, 2.70, 8.29, 8.91, 9.71, 0.0] - - @testset "Uncorrected with $f" for f in weight_funcs - wv = f(w) - m = mean(x, wv) - - # expected uncorrected output - expected_var = sum(abs2.(x .- m), wv) / sum(wv) - expected_std = sqrt(expected_var) - - @testset "Variance" begin - @test var(x, wv; corrected=false) ≈ expected_var - @test var(x, wv; mean=m, corrected=false) ≈ expected_var - end - - @testset "Standard Deviation" begin - @test std(x, wv; corrected=false) ≈ expected_std - @test std(x, wv; mean=m, corrected=false) ≈ expected_std - end - - @testset "Mean and Variance" begin - (m, v) = mean_and_var(x; 
corrected=false) - @test m == mean(x) - @test v == var(x; corrected=corrected=false) - - (m, v) = mean_and_var(x, wv; corrected=false) - @test m == mean(x, wv) - @test v == var(x, wv; corrected=false) - end - - @testset "Mean and Standard Deviation" begin - (m, s) = mean_and_std(x; corrected=false) - @test m == mean(x) - @test s == std(x; corrected=false) - - (m, s) = mean_and_std(x, wv; corrected=false) - @test m == mean(x, wv) - @test s == std(x, wv; corrected=false) - end +##### weighted var & std + +x = [0.57, 0.10, 0.91, 0.72, 0.46, 0.0] +w = [3.84, 2.70, 8.29, 8.91, 9.71, 0.0] + +@testset "Uncorrected with $f" for f in weight_funcs + wv = f(w) + m = mean(x, wv) + + # expected uncorrected output + expected_var = sum(abs2.(x .- m), wv) / sum(wv) + expected_std = sqrt(expected_var) + + @testset "Variance" begin + @test var(x, wv; corrected=false) ≈ expected_var + @test var(x, wv; mean=m, corrected=false) ≈ expected_var + end + + @testset "Standard Deviation" begin + @test std(x, wv; corrected=false) ≈ expected_std + @test std(x, wv; mean=m, corrected=false) ≈ expected_std + end + + @testset "Mean and Variance" begin + (m, v) = mean_and_var(x; corrected=false) + @test m == mean(x) + @test v == var(x; corrected=corrected=false) + + (m, v) = mean_and_var(x, wv; corrected=false) + @test m == mean(x, wv) + @test v == var(x, wv; corrected=false) + end + + @testset "Mean and Standard Deviation" begin + (m, s) = mean_and_std(x; corrected=false) + @test m == mean(x) + @test s == std(x; corrected=false) + + (m, s) = mean_and_std(x, wv; corrected=false) + @test m == mean(x, wv) + @test s == std(x, wv; corrected=false) + end +end + +# expected corrected output for (weights, aweights, fweights, pweights) +expected_var = [NaN, 0.0694434191182236, 0.05466601256158146, 0.06628969012045285] +expected_std = sqrt(expected_var) + +@testset "Corrected with $(weight_funcs[i])" for i in eachindex(weight_funcs) + wv = weight_funcs[i](w) + m = mean(x, wv) + + @testset "Variance" begin + 
if isa(wv, Weights) + @test_throws ArgumentError var(x, wv; corrected=true) + else + @test var(x, wv; corrected=true) ≈ expected_var[i] + @test var(x, wv; mean=m, corrected=true) ≈ expected_var[i] + end + end + + @testset "Standard Deviation" begin + if isa(wv, Weights) + @test_throws ArgumentError std(x, wv; corrected=true) + else + @test std(x, wv; corrected=true) ≈ expected_std[i] + @test std(x, wv; mean=m, corrected=true) ≈ expected_std[i] + end + end + + @testset "Mean and Variance" begin + (m, v) = mean_and_var(x; corrected=true) + @test m == mean(x) + @test v == var(x; corrected=true) + + if isa(wv, Weights) + @test_throws ArgumentError mean_and_var(x, wv; corrected=true) + else + (m, v) = mean_and_var(x, wv; corrected=true) + @test m == mean(x, wv) + @test v == var(x, wv; corrected=true) + end + end + + @testset "Mean and Standard Deviation" begin + (m, s) = mean_and_std(x; corrected=true) + @test m == mean(x) + @test s == std(x; corrected=true) + + if isa(wv, Weights) + @test_throws ArgumentError mean_and_std(x, wv; corrected=true) + else + (m, s) = mean_and_std(x, wv; corrected=true) + @test m == mean(x, wv) + @test s == std(x, wv; corrected=true) + end + end +end + +x = rand(5, 6) +w1 = rand(5) +w2 = rand(6) + +@testset "Uncorrected with $f" for f in weight_funcs + wv1 = f(w1) + wv2 = f(w2) + m1 = mean(x, wv1, 1) + m2 = mean(x, wv2, 2) + + expected_var1 = sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) + expected_var2 = sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) + expected_std1 = sqrt.(expected_var1) + expected_std2 = sqrt.(expected_var2) + + @testset "Variance" begin + @test var(x, wv1, 1; corrected=false) ≈ expected_var1 + @test var(x, wv2, 2; corrected=false) ≈ expected_var2 + @test var(x, wv1, 1; mean=m1, corrected=false) ≈ expected_var1 + @test var(x, wv2, 2; mean=m2, corrected=false) ≈ expected_var2 + end + + @testset "Standard Deviation" begin + @test std(x, wv1, 1; corrected=false) ≈ expected_std1 + @test std(x, wv2, 2; corrected=false) ≈ expected_std2 + 
@test std(x, wv1, 1; mean=m1, corrected=false) ≈ expected_std1 + @test std(x, wv2, 2; mean=m2, corrected=false) ≈ expected_std2 + end + + @testset "Mean and Variance" begin + for d in 1:2 + (m, v) = mean_and_var(x, d; corrected=false) + @test m == mean(x, d) + @test v == var(x, d; corrected=false) + end + + (m, v) = mean_and_var(x, wv1, 1; corrected=false) + @test m == mean(x, wv1, 1) + @test v == var(x, wv1, 1; corrected=false) + + (m, v) = mean_and_var(x, wv2, 2; corrected=false) + @test m == mean(x, wv2, 2) + @test v == var(x, wv2, 2; corrected=false) + end + + @testset "Mean and Standard Deviation" begin + for d in 1:2 + (m, s) = mean_and_std(x, d; corrected=false) + @test m == mean(x, d) + @test s == std(x, d; corrected=false) + end + + (m, s) = mean_and_std(x, wv1, 1; corrected=false) + @test m == mean(x, wv1, 1) + @test s == std(x, wv1, 1; corrected=false) + + (m, s) = mean_and_std(x, wv2, 2; corrected=false) + @test m == mean(x, wv2, 2) + @test s == std(x, wv2, 2; corrected=false) + end +end + +@testset "Corrected with $f" for f in weight_funcs + wv1 = f(w1) + wv2 = f(w2) + m1 = mean(x, wv1, 1) + m2 = mean(x, wv2, 2) + + if !isa(wv1, Weights) + expected_var1 = sum(abs2.(x .- m1) .* w1, 1) .* StatsBase.varcorrection(wv1, true) + expected_var2 = sum(abs2.(x .- m2) .* w2', 2) .* StatsBase.varcorrection(wv2, true) + expected_std1 = sqrt.(expected_var1) + expected_std2 = sqrt.(expected_var2) + end + + @testset "Variance" begin + if isa(wv1, Weights) + @test_throws ArgumentError var(x, wv1, 1; corrected=true) + else + @test var(x, wv1, 1; corrected=true) ≈ expected_var1 + @test var(x, wv2, 2; corrected=true) ≈ expected_var2 + @test var(x, wv1, 1; mean=m1, corrected=true) ≈ expected_var1 + @test var(x, wv2, 2; mean=m2, corrected=true) ≈ expected_var2 + end + end + + @testset "Standard Deviation" begin + if isa(wv1, Weights) + @test_throws ArgumentError std(x, wv1, 1; corrected=true) + else + @test std(x, wv1, 1; corrected=true) ≈ expected_std1 + @test std(x, wv2, 
2; corrected=true) ≈ expected_std2 + @test std(x, wv1, 1; mean=m1, corrected=true) ≈ expected_std1 + @test std(x, wv2, 2; mean=m2, corrected=true) ≈ expected_std2 end + end - # expected corrected output for (weights, aweights, fweights, pweights) - expected_var = [NaN, 0.0694434191182236, 0.05466601256158146, 0.06628969012045285] - expected_std = sqrt(expected_var) - - @testset "Corrected with $(weight_funcs[i])" for i in eachindex(weight_funcs) - wv = weight_funcs[i](w) - m = mean(x, wv) - - @testset "Variance" begin - if isa(wv, Weights) - @test_throws ArgumentError var(x, wv; corrected=true) - else - @test var(x, wv; corrected=true) ≈ expected_var[i] - @test var(x, wv; mean=m, corrected=true) ≈ expected_var[i] - end - end - - @testset "Standard Deviation" begin - if isa(wv, Weights) - @test_throws ArgumentError std(x, wv; corrected=true) - else - @test std(x, wv; corrected=true) ≈ expected_std[i] - @test std(x, wv; mean=m, corrected=true) ≈ expected_std[i] - end - end - - @testset "Mean and Variance" begin - (m, v) = mean_and_var(x; corrected=true) - @test m == mean(x) - @test v == var(x; corrected=true) - - if isa(wv, Weights) - @test_throws ArgumentError mean_and_var(x, wv; corrected=true) - else - (m, v) = mean_and_var(x, wv; corrected=true) - @test m == mean(x, wv) - @test v == var(x, wv; corrected=true) - end - end - - @testset "Mean and Standard Deviation" begin - (m, s) = mean_and_std(x; corrected=true) - @test m == mean(x) - @test s == std(x; corrected=true) - - if isa(wv, Weights) - @test_throws ArgumentError mean_and_std(x, wv; corrected=true) - else - (m, s) = mean_and_std(x, wv; corrected=true) - @test m == mean(x, wv) - @test s == std(x, wv; corrected=true) - end - end + @testset "Mean and Variance" begin + for d in 1:2 + (m, v) = mean_and_var(x, d; corrected=true) + @test m == mean(x, d) + @test v == var(x, d; corrected=true) + end + + if isa(wv1, Weights) + @test_throws ArgumentError mean_and_var(x, wv1, 1; corrected=true) + else + (m, v) = 
mean_and_var(x, wv1, 1; corrected=true) + @test m == mean(x, wv1, 1) + @test v == var(x, wv1, 1; corrected=true) + + (m, v) = mean_and_var(x, wv2, 2; corrected=true) + @test m == mean(x, wv2, 2) + @test v == var(x, wv2, 2; corrected=true) end end - @testset "Matrices" begin - x = rand(5, 6) - w1 = rand(5) - w2 = rand(6) - - @testset "Uncorrected with $f" for f in weight_funcs - wv1 = f(w1) - wv2 = f(w2) - m1 = mean(x, wv1, 1) - m2 = mean(x, wv2, 2) - - expected_var1 = sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) - expected_var2 = sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) - expected_std1 = sqrt.(expected_var1) - expected_std2 = sqrt.(expected_var2) - - @testset "Variance" begin - @test var(x, wv1, 1; corrected=false) ≈ expected_var1 - @test var(x, wv2, 2; corrected=false) ≈ expected_var2 - @test var(x, wv1, 1; mean=m1, corrected=false) ≈ expected_var1 - @test var(x, wv2, 2; mean=m2, corrected=false) ≈ expected_var2 - end - - @testset "Standard Deviation" begin - @test std(x, wv1, 1; corrected=false) ≈ expected_std1 - @test std(x, wv2, 2; corrected=false) ≈ expected_std2 - @test std(x, wv1, 1; mean=m1, corrected=false) ≈ expected_std1 - @test std(x, wv2, 2; mean=m2, corrected=false) ≈ expected_std2 - end - - @testset "Mean and Variance" begin - for d in 1:2 - (m, v) = mean_and_var(x, d; corrected=false) - @test m == mean(x, d) - @test v == var(x, d; corrected=false) - end - - (m, v) = mean_and_var(x, wv1, 1; corrected=false) - @test m == mean(x, wv1, 1) - @test v == var(x, wv1, 1; corrected=false) - - (m, v) = mean_and_var(x, wv2, 2; corrected=false) - @test m == mean(x, wv2, 2) - @test v == var(x, wv2, 2; corrected=false) - end - - @testset "Mean and Standard Deviation" begin - for d in 1:2 - (m, s) = mean_and_std(x, d; corrected=false) - @test m == mean(x, d) - @test s == std(x, d; corrected=false) - end - - (m, s) = mean_and_std(x, wv1, 1; corrected=false) - @test m == mean(x, wv1, 1) - @test s == std(x, wv1, 1; corrected=false) - - (m, s) = mean_and_std(x, wv2, 2; 
corrected=false) - @test m == mean(x, wv2, 2) - @test s == std(x, wv2, 2; corrected=false) - end + + @testset "Mean and Standard Deviation" begin + for d in 1:2 + (m, s) = mean_and_std(x, d; corrected=true) + @test m == mean(x, d) + @test s == std(x, d; corrected=true) end - @testset "Corrected with $f" for f in weight_funcs - wv1 = f(w1) - wv2 = f(w2) - m1 = mean(x, wv1, 1) - m2 = mean(x, wv2, 2) - - if !isa(wv1, Weights) - expected_var1 = sum(abs2.(x .- m1) .* w1, 1) .* StatsBase.varcorrection(wv1, true) - expected_var2 = sum(abs2.(x .- m2) .* w2', 2) .* StatsBase.varcorrection(wv2, true) - expected_std1 = sqrt.(expected_var1) - expected_std2 = sqrt.(expected_var2) - end - - @testset "Variance" begin - if isa(wv1, Weights) - @test_throws ArgumentError var(x, wv1, 1; corrected=true) - else - @test var(x, wv1, 1; corrected=true) ≈ expected_var1 - @test var(x, wv2, 2; corrected=true) ≈ expected_var2 - @test var(x, wv1, 1; mean=m1, corrected=true) ≈ expected_var1 - @test var(x, wv2, 2; mean=m2, corrected=true) ≈ expected_var2 - end - end - - @testset "Standard Deviation" begin - if isa(wv1, Weights) - @test_throws ArgumentError std(x, wv1, 1; corrected=true) - else - @test std(x, wv1, 1; corrected=true) ≈ expected_std1 - @test std(x, wv2, 2; corrected=true) ≈ expected_std2 - @test std(x, wv1, 1; mean=m1, corrected=true) ≈ expected_std1 - @test std(x, wv2, 2; mean=m2, corrected=true) ≈ expected_std2 - end - end - - @testset "Mean and Variance" begin - for d in 1:2 - (m, v) = mean_and_var(x, d; corrected=true) - @test m == mean(x, d) - @test v == var(x, d; corrected=true) - end - - if isa(wv1, Weights) - @test_throws ArgumentError mean_and_var(x, wv1, 1; corrected=true) - else - (m, v) = mean_and_var(x, wv1, 1; corrected=true) - @test m == mean(x, wv1, 1) - @test v == var(x, wv1, 1; corrected=true) - - (m, v) = mean_and_var(x, wv2, 2; corrected=true) - @test m == mean(x, wv2, 2) - @test v == var(x, wv2, 2; corrected=true) - end - end - - @testset "Mean and Standard 
Deviation" begin - for d in 1:2 - (m, s) = mean_and_std(x, d; corrected=true) - @test m == mean(x, d) - @test s == std(x, d; corrected=true) - end - - if isa(wv1, Weights) - @test_throws ArgumentError mean_and_std(x, wv1, 1; corrected=true) - else - (m, s) = mean_and_std(x, wv1, 1; corrected=true) - @test m == mean(x, wv1, 1) - @test s == std(x, wv1, 1; corrected=true) - - (m, s) = mean_and_std(x, wv2, 2; corrected=true) - @test m == mean(x, wv2, 2) - @test s == std(x, wv2, 2; corrected=true) - end - end + if isa(wv1, Weights) + @test_throws ArgumentError mean_and_std(x, wv1, 1; corrected=true) + else + (m, s) = mean_and_std(x, wv1, 1; corrected=true) + @test m == mean(x, wv1, 1) + @test s == std(x, wv1, 1; corrected=true) + + (m, s) = mean_and_std(x, wv2, 2; corrected=true) + @test m == mean(x, wv2, 2) + @test s == std(x, wv2, 2; corrected=true) end end end diff --git a/test/weights.jl b/test/weights.jl index 1e09f0220..da590d4f1 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -6,178 +6,172 @@ import Compat: view @testset "StatsBase.Weights" begin weight_funcs = (weights, aweights, fweights, pweights) -@testset "Construction" begin - @testset "$f" for f in weight_funcs - @test isa(f([1, 2, 3]), AbstractWeights{Int}) - @test isa(f([1., 2., 3.]), AbstractWeights{Float64}) - @test isa(f([1 2 3; 4 5 6]), AbstractWeights{Int}) - - @test isempty(f(Float64[])) - @test size(f([1, 2, 3])) == (3,) - - w = [1., 2., 3.] 
- wv = f(w) - @test eltype(wv) === Float64 - @test length(wv) === 3 - @test values(wv) === w - @test sum(wv) === 6.0 - @test !isempty(wv) - - b = trues(3) - bv = f(b) - @test eltype(bv) === Bool - @test length(bv) === 3 - @test values(bv) === b - @test sum(bv) === 3 - @test !isempty(bv) - - ba = BitArray([true, false, true]) - sa = sparsevec([1., 0., 2.]) - - @test sum(ba, wv) === 4.0 - @test sum(sa, wv) === 7.0 - end +# Construction +@testset "$f" for f in weight_funcs + @test isa(f([1, 2, 3]), AbstractWeights{Int}) + @test isa(f([1., 2., 3.]), AbstractWeights{Float64}) + @test isa(f([1 2 3; 4 5 6]), AbstractWeights{Int}) + + @test isempty(f(Float64[])) + @test size(f([1, 2, 3])) == (3,) + + w = [1., 2., 3.] + wv = f(w) + @test eltype(wv) === Float64 + @test length(wv) === 3 + @test values(wv) === w + @test sum(wv) === 6.0 + @test !isempty(wv) + + b = trues(3) + bv = f(b) + @test eltype(bv) === Bool + @test length(bv) === 3 + @test values(bv) === b + @test sum(bv) === 3 + @test !isempty(bv) + + ba = BitArray([true, false, true]) + sa = sparsevec([1., 0., 2.]) + + @test sum(ba, wv) === 4.0 + @test sum(sa, wv) === 7.0 end -@testset "Sum" begin - x = [6., 8., 9.] - w = [2., 3., 4.] - p = [1. 2. ; 3. 4.] - q = [1., 2., 3., 4.] +## wsum +x = [6., 8., 9.] +w = [2., 3., 4.] +p = [1. 2. ; 3. 4.] +q = [1., 2., 3., 4.] 
- @test wsum(Float64[], Float64[]) === 0.0 - @test wsum(x, w) === 72.0 - @test wsum(p, q) === 29.0 +@test wsum(Float64[], Float64[]) === 0.0 +@test wsum(x, w) === 72.0 +@test wsum(p, q) === 29.0 - @testset "Along dimensions" begin - @test wsum(x, w, 1) == [72.0] +## wsum along dimension +@test wsum(x, w, 1) == [72.0] - x = rand(6, 8) - w1 = rand(6) - w2 = rand(8) +x = rand(6, 8) +w1 = rand(6) +w2 = rand(8) - @test size(wsum(x, w1, 1)) == (1, 8) - @test size(wsum(x, w2, 2)) == (6, 1) +@test size(wsum(x, w1, 1)) == (1, 8) +@test size(wsum(x, w2, 2)) == (6, 1) - @test wsum(x, w1, 1) ≈ sum(x .* w1, 1) - @test wsum(x, w2, 2) ≈ sum(x .* w2', 2) +@test wsum(x, w1, 1) ≈ sum(x .* w1, 1) +@test wsum(x, w2, 2) ≈ sum(x .* w2', 2) - x = rand(6, 5, 4) - w1 = rand(6) - w2 = rand(5) - w3 = rand(4) +x = rand(6, 5, 4) +w1 = rand(6) +w2 = rand(5) +w3 = rand(4) - @test size(wsum(x, w1, 1)) == (1, 5, 4) - @test size(wsum(x, w2, 2)) == (6, 1, 4) - @test size(wsum(x, w3, 3)) == (6, 5, 1) +@test size(wsum(x, w1, 1)) == (1, 5, 4) +@test size(wsum(x, w2, 2)) == (6, 1, 4) +@test size(wsum(x, w3, 3)) == (6, 5, 1) - @test wsum(x, w1, 1) ≈ sum(x .* w1, 1) - @test wsum(x, w2, 2) ≈ sum(x .* w2', 2) - @test wsum(x, w3, 3) ≈ sum(x .* reshape(w3, 1, 1, 4), 3) +@test wsum(x, w1, 1) ≈ sum(x .* w1, 1) +@test wsum(x, w2, 2) ≈ sum(x .* w2', 2) +@test wsum(x, w3, 3) ≈ sum(x .* reshape(w3, 1, 1, 4), 3) - v = view(x, 2:4, :, :) +v = view(x, 2:4, :, :) - @test wsum(v, w1[1:3], 1) ≈ sum(v .* w1[1:3], 1) - @test wsum(v, w2, 2) ≈ sum(v .* w2', 2) - @test wsum(v, w3, 3) ≈ sum(v .* reshape(w3, 1, 1, 4), 3) - end +@test wsum(v, w1[1:3], 1) ≈ sum(v .* w1[1:3], 1) +@test wsum(v, w2, 2) ≈ sum(v .* w2', 2) +@test wsum(v, w3, 3) ≈ sum(v .* reshape(w3, 1, 1, 4), 3) - @testset "Arrays with non-BlasReal elements" begin - x = rand(1:100, 6, 8) - w1 = rand(6) - w2 = rand(8) +## wsum for Arrays with non-BlasReal elements +x = rand(1:100, 6, 8) +w1 = rand(6) +w2 = rand(8) - @test wsum(x, w1, 1) ≈ sum(x .* w1, 1) - @test 
wsum(x, w2, 2) ≈ sum(x .* w2', 2) - end +@test wsum(x, w1, 1) ≈ sum(x .* w1, 1) +@test wsum(x, w2, 2) ≈ sum(x .* w2', 2) - @testset "In place" begin - x = rand(6) - w = rand(6) +## wsum! +x = rand(6) +w = rand(6) - r = ones(1) - @test wsum!(r, x, w, 1; init=true) === r - @test r ≈ [dot(x, w)] +r = ones(1) +@test wsum!(r, x, w, 1; init=true) === r +@test r ≈ [dot(x, w)] - r = ones(1) - @test wsum!(r, x, w, 1; init=false) === r - @test r ≈ [dot(x, w) + 1.0] +r = ones(1) +@test wsum!(r, x, w, 1; init=false) === r +@test r ≈ [dot(x, w) + 1.0] - x = rand(6, 8) - w1 = rand(6) - w2 = rand(8) +x = rand(6, 8) +w1 = rand(6) +w2 = rand(8) - r = ones(1, 8) - @test wsum!(r, x, w1, 1; init=true) === r - @test r ≈ sum(x .* w1, 1) +r = ones(1, 8) +@test wsum!(r, x, w1, 1; init=true) === r +@test r ≈ sum(x .* w1, 1) - r = ones(1, 8) - @test wsum!(r, x, w1, 1; init=false) === r - @test r ≈ sum(x .* w1, 1) .+ 1.0 +r = ones(1, 8) +@test wsum!(r, x, w1, 1; init=false) === r +@test r ≈ sum(x .* w1, 1) .+ 1.0 - r = ones(6) - @test wsum!(r, x, w2, 2; init=true) === r - @test r ≈ sum(x .* w2', 2) +r = ones(6) +@test wsum!(r, x, w2, 2; init=true) === r +@test r ≈ sum(x .* w2', 2) - r = ones(6) - @test wsum!(r, x, w2, 2; init=false) === r - @test r ≈ sum(x .* w2', 2) .+ 1.0 +r = ones(6) +@test wsum!(r, x, w2, 2; init=false) === r +@test r ≈ sum(x .* w2', 2) .+ 1.0 - x = rand(8, 6, 5) - w1 = rand(8) - w2 = rand(6) - w3 = rand(5) +x = rand(8, 6, 5) +w1 = rand(8) +w2 = rand(6) +w3 = rand(5) - r = ones(1, 6, 5) - @test wsum!(r, x, w1, 1; init=true) === r - @test r ≈ sum(x .* w1, 1) +r = ones(1, 6, 5) +@test wsum!(r, x, w1, 1; init=true) === r +@test r ≈ sum(x .* w1, 1) - r = ones(1, 6, 5) - @test wsum!(r, x, w1, 1; init=false) === r - @test r ≈ sum(x .* w1, 1) .+ 1.0 +r = ones(1, 6, 5) +@test wsum!(r, x, w1, 1; init=false) === r +@test r ≈ sum(x .* w1, 1) .+ 1.0 - r = ones(8, 1, 5) - @test wsum!(r, x, w2, 2; init=true) === r - @test r ≈ sum(x .* w2', 2) +r = ones(8, 1, 5) +@test wsum!(r, x, w2, 
2; init=true) === r +@test r ≈ sum(x .* w2', 2) - r = ones(8, 1, 5) - @test wsum!(r, x, w2, 2; init=false) === r - @test r ≈ sum(x .* w2', 2) .+ 1.0 +r = ones(8, 1, 5) +@test wsum!(r, x, w2, 2; init=false) === r +@test r ≈ sum(x .* w2', 2) .+ 1.0 - r = ones(8, 6) - @test wsum!(r, x, w3, 3; init=true) === r - @test r ≈ sum(x .* reshape(w3, (1, 1, 5)), 3) +r = ones(8, 6) +@test wsum!(r, x, w3, 3; init=true) === r +@test r ≈ sum(x .* reshape(w3, (1, 1, 5)), 3) - r = ones(8, 6) - @test wsum!(r, x, w3, 3; init=false) === r - @test r ≈ sum(x .* reshape(w3, (1, 1, 5)), 3) .+ 1.0 - end -end +r = ones(8, 6) +@test wsum!(r, x, w3, 3; init=false) === r +@test r ≈ sum(x .* reshape(w3, (1, 1, 5)), 3) .+ 1.0 -@testset "Sum and mean syntax" begin - a = reshape(1.0:27.0, 3, 3, 3) +## the sum and mean syntax +a = reshape(1.0:27.0, 3, 3, 3) - @testset "Sum $f" for f in weight_funcs - @test sum([1.0, 2.0, 3.0], f([1.0, 0.5, 0.5])) ≈ 3.5 - @test sum(1:3, f([1.0, 1.0, 0.5])) ≈ 4.5 +@testset "Sum $f" for f in weight_funcs + @test sum([1.0, 2.0, 3.0], f([1.0, 0.5, 0.5])) ≈ 3.5 + @test sum(1:3, f([1.0, 1.0, 0.5])) ≈ 4.5 - for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) - @test sum(a, f(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) - @test sum(a, f(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) - @test sum(a, f(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) - end + for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) + @test sum(a, f(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1) + @test sum(a, f(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2) + @test sum(a, f(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3) end +end - @testset "Mean $f" for f in weight_funcs - @test mean([1:3;], f([1.0, 1.0, 0.5])) ≈ 1.8 - @test mean(1:3, f([1.0, 1.0, 0.5])) ≈ 1.8 +@testset "Mean $f" for f in weight_funcs + @test mean([1:3;], f([1.0, 1.0, 0.5])) ≈ 1.8 + @test mean(1:3, f([1.0, 1.0, 0.5])) ≈ 1.8 - for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) - 
@test mean(a, f(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) - @test mean(a, f(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) - @test mean(a, f(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) - @test_throws ErrorException mean(a, f(wt), 4) - end + for wt in ([1.0, 1.0, 1.0], [1.0, 0.2, 0.0], [0.2, 0.0, 1.0]) + @test mean(a, f(wt), 1) ≈ sum(a.*reshape(wt, length(wt), 1, 1), 1)/sum(wt) + @test mean(a, f(wt), 2) ≈ sum(a.*reshape(wt, 1, length(wt), 1), 2)/sum(wt) + @test mean(a, f(wt), 3) ≈ sum(a.*reshape(wt, 1, 1, length(wt)), 3)/sum(wt) + @test_throws ErrorException mean(a, f(wt), 4) end end From 5bdf58b987230f48cd6cfd9b722410cb5ab51755 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 4 May 2017 15:51:31 -0500 Subject: [PATCH 45/50] Added comment about unsupported bias correction for the `Weights` type in `var`, `std` and `cov` docstrings. --- src/cov.jl | 1 + src/moments.jl | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/src/cov.jl b/src/cov.jl index 98c6b942a..7668652a8 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -58,6 +58,7 @@ matrix (`corrected=false`) is computed by multiplying `scattermat(X, w)` by * `AnalyticWeights`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` * `FrequencyWeights`: ``\\frac{1}{\\sum{w} - 1}`` * `ProbabilityWeights`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +* `Weights`: `ArgumentError` (bias correction not supported) """ cov diff --git a/src/moments.jl b/src/moments.jl index e2e324b64..9781c9686 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -16,6 +16,7 @@ the population variance is computed by replacing * `AnalyticWeights`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` * `FrequencyWeights`: ``\\frac{1}{\\sum{w} - 1}`` * `ProbabilityWeights`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +* `Weights`: `ArgumentError` (bias correction not supported) """ Base.varm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) = 
_moment2(v, w, m; corrected=depcheck(:varm, corrected)) @@ -35,6 +36,7 @@ replacing ``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights * `AnalyticWeights`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` * `FrequencyWeights`: ``\\frac{1}{\\sum{w} - 1}`` * `ProbabilityWeights`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +* `Weights`: `ArgumentError` (bias correction not supported) """ function Base.var(v::RealArray, w::AbstractWeights; mean=nothing, corrected::DepBool=nothing) @@ -122,6 +124,7 @@ dependent on the type of weights used: * `AnalyticWeights`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` * `FrequencyWeights`: ``\\frac{1}{\\sum{w} - 1}`` * `ProbabilityWeights`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +* `Weights`: `ArgumentError` (bias correction not supported) """ Base.stdm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) = sqrt(varm(v, w, m, corrected=depcheck(:stdm, corrected))) @@ -142,6 +145,7 @@ weights used: * `AnalyticWeights`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}`` * `FrequencyWeights`: ``\\frac{1}{\\sum{w} - 1}`` * `ProbabilityWeights`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)` +* `Weights`: `ArgumentError` (bias correction not supported) """ Base.std(v::RealArray, w::AbstractWeights; mean=nothing, corrected::DepBool=nothing) = sqrt.(var(v, w; mean=mean, corrected=depcheck(:std, corrected))) From b9916678f695c9bb5ddc679d9db219c73bbbef3d Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 4 May 2017 15:58:57 -0500 Subject: [PATCH 46/50] Removed deprecation tests and corresponding hacks. 
--- test/deprecates.jl | 205 --------------------------------------------- 1 file changed, 205 deletions(-) delete mode 100644 test/deprecates.jl diff --git a/test/deprecates.jl b/test/deprecates.jl deleted file mode 100644 index e3827f387..000000000 --- a/test/deprecates.jl +++ /dev/null @@ -1,205 +0,0 @@ -using StatsBase -using Base.Test -using Compat -import Compat: view - -@testset "StatsBase.Deprecates" begin - -@testset "Deprecates WeightVec and weights" begin - @test isa(weights([1, 2, 3]), WeightVec{Int}) - @test isa(weights([1., 2., 3.]), WeightVec{Float64}) - @test isa(weights([1 2 3; 4 5 6]), WeightVec{Int}) - - @test isa(WeightVec([1, 2, 3], 6), WeightVec{Int}) - - @test isempty(weights(Float64[])) - @test size(weights([1, 2, 3])) == (3,) - - w = [1., 2., 3.] - wv = weights(w) - @test eltype(wv) === Float64 - @test length(wv) === 3 - @test values(wv) === w - @test sum(wv) === 6.0 - @test !isempty(wv) - - b = trues(3) - bv = weights(b) - @test eltype(bv) === Bool - @test length(bv) === 3 - @test values(bv) === b - @test sum(bv) === 3 - @test !isempty(bv) - - ba = BitArray([true, false, true]) - sa = sparsevec([1., 0., 2.]) - - @test sum(ba, wv) === 4.0 - @test sum(sa, wv) === 7.0 -end - -@testset "Moments" begin - @testset "Vectors" begin - x = rand(10) - wv = weights(rand(10)) - m = mean(x, wv) - - @testset "var" begin - @test var(x, wv) ≈ sum(abs2.(x .- m), wv) ./ sum(wv) - @test var(x, wv; mean=0) ≈ sum(abs2.(x), wv) ./ sum(wv) - @test var(x, wv; mean=1.0) ≈ sum(abs2.(x .- 1.0), wv) ./ sum(wv) - end - - @testset "std" begin - @test std(x, wv) ≈ sqrt(var(x, wv)) - @test std(x, wv; mean=0) ≈ sqrt(var(x, wv; mean=0)) - @test std(x, wv; mean=1.0) ≈ sqrt(var(x, wv; mean=1.0)) - end - - @testset "mean_and_var" begin - (m, v) = mean_and_var(x) - @test m == mean(x) - @test v == var(x) - - (m, v) = mean_and_var(x, wv) - @test m == mean(x, wv) - @test v == var(x, wv) - end - - @testset "mean_and_std" begin - (m, s) = mean_and_std(x) - @test m == mean(x) - @test 
s == std(x) - - (m, s) = mean_and_std(x, wv) - @test m == mean(x, wv) - @test s == std(x, wv) - end - end - - @testset "Matrices" begin - x = rand(5, 6) - w1 = rand(5) - w2 = rand(6) - wv1 = weights(w1) - wv2 = weights(w2) - m1 = mean(x, wv1, 1) - m2 = mean(x, wv2, 2) - - @testset "var" begin - @test var(x, wv1, 1; mean=0) ≈ sum(abs2.(x) .* w1, 1) ./ sum(wv1) - @test var(x, wv2, 2; mean=0) ≈ sum(abs2.(x) .* w2', 2) ./ sum(wv2) - @test var(x, wv1, 1; mean=m1) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) - @test var(x, wv2, 2; mean=m2) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) - @test var(x, wv1, 1) ≈ sum(abs2.(x .- m1) .* w1, 1) ./ sum(wv1) - @test var(x, wv2, 2) ≈ sum(abs2.(x .- m2) .* w2', 2) ./ sum(wv2) - end - - @testset "std" begin - @test std(x, wv1, 1) ≈ sqrt.(var(x, wv1, 1)) - @test std(x, wv2, 2) ≈ sqrt.(var(x, wv2, 2)) - @test std(x, wv1, 1; mean=0) ≈ sqrt.(var(x, wv1, 1; mean=0)) - @test std(x, wv2, 2; mean=0) ≈ sqrt.(var(x, wv2, 2; mean=0)) - @test std(x, wv1, 1; mean=m1) ≈ sqrt.(var(x, wv1, 1; mean=m1)) - @test std(x, wv2, 2; mean=m2) ≈ sqrt.(var(x, wv2, 2; mean=m2)) - end - - @testset "mean_and_var" begin - for d in 1:2 - (m, v) = mean_and_var(x, d) - @test m == mean(x, d) - @test v == var(x, d) - end - - (m, v) = mean_and_var(x, wv1, 1) - @test m == mean(x, wv1, 1) - @test v == var(x, wv1, 1) - - (m, v) = mean_and_var(x, wv2, 2) - @test m == mean(x, wv2, 2) - @test v == var(x, wv2, 2) - end - - @testset "mean_and_std" begin - for d in 1:2 - (m, s) = mean_and_std(x, d) - @test m == mean(x, d) - @test s == std(x, d) - end - - (m, s) = mean_and_std(x, wv1, 1) - @test m == mean(x, wv1, 1) - @test s == std(x, wv1, 1) - - (m, s) = mean_and_std(x, wv2, 2) - @test m == mean(x, wv2, 2) - @test s == std(x, wv2, 2) - end - end -end - -@testset "Covariance" begin - X = randn(3, 8) - - Z1 = X .- mean(X, 1) - Z2 = X .- mean(X, 2) - - w1 = rand(3) - w2 = rand(8) - - wv1 = weights(w1) - wv2 = weights(w2) - - Z1w = X .- mean(X, wv1, 1) - Z2w = X .- mean(X, wv2, 2) - - 
S1 = Z1'Z1 - S2 = Z2 * Z2' - - Sz1 = X'X - Sz2 = X * X' - - S1w = Z1w' * diagm(w1) * Z1w - S2w = Z2w * diagm(w2) * Z2w' - - Sz1w = X' * diagm(w1) * X - Sz2w = X * diagm(w2) * X' - - @testset "cov" begin - @test cov(X, wv1) ≈ S1w ./ sum(wv1) - @test cov(X, wv2, 2) ≈ S2w ./ sum(wv2) - - @test Base.covm(X, 0, wv1) ≈ Sz1w ./ sum(wv1) - @test Base.covm(X, 0, wv2, 2) ≈ Sz2w ./ sum(wv2) - - @test Base.covm(X, mean(X, wv1, 1), wv1) ≈ S1w ./ sum(wv1) - @test Base.covm(X, mean(X, wv2, 2), wv2, 2) ≈ S2w ./ sum(wv2) - - @test Base.covm(X, zeros(1,8), wv1) ≈ Sz1w ./ sum(wv1) - @test Base.covm(X, zeros(3), wv2, 2) ≈ Sz2w ./ sum(wv2) - end - - @testset "mean_and_cov" begin - (m, C) = mean_and_cov(X, 1) - @test m == mean(X, 1) - @test C == cov(X, 1) - - (m, C) = mean_and_cov(X, 2) - @test m == mean(X, 2) - @test C == cov(X, 2) - - (m, C) = mean_and_cov(X, wv1) - @test m == mean(X, wv1, 1) - @test C == cov(X, wv1, 1) - - (m, C) = mean_and_cov(X, wv1, 1) - @test m == mean(X, wv1, 1) - @test C == cov(X, wv1, 1) - - (m, C) = mean_and_cov(X, wv2, 2) - @test m == mean(X, wv2, 2) - @test C == cov(X, wv2, 2) - end -end - -end # @testset "StatsBase.Deprecates" From 926678e7bb036b846232a9d40332808949f4430b Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 4 May 2017 16:10:56 -0500 Subject: [PATCH 47/50] Removed more deprecation test hacks and convert `wv` -> `w` in the appropriate docs. --- docs/source/cov.rst | 4 ++-- docs/source/scalarstats.rst | 14 +++++++------- test/runtests.jl | 12 ------------ 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/docs/source/cov.rst b/docs/source/cov.rst index 64655ab67..048fc3653 100644 --- a/docs/source/cov.rst +++ b/docs/source/cov.rst @@ -24,11 +24,11 @@ This package implements functions for computing scatter matrix, as well as weigh Weighted scatter matrix. The weights are given by a weight vector ``wv`` of type ``AbstractWeights`` (see :ref:`weightvec`). -.. function:: cov(X, wv[; vardim=..., mean=..., corrected=...]) +.. 
function:: cov(X, w[; vardim=..., mean=..., corrected=...]) Weighted covariance matrix. - **Note:** By default, the covariance is normalized by the sum of weights, that is, ``cov(X, wv)`` is equal to ``scatter(X, wv) / sum(wv)``. However, if ``corrected`` is set to ``true`` then the appropriate bias correction is used for that `wv`. + **Note:** By default, the covariance is normalized by the sum of weights, that is, ``cov(X, w)`` is equal to ``scatter(X, w) / sum(w)``. However, if ``corrected`` is set to ``true`` then the appropriate bias correction is used for that `w`. .. function:: mean_and_cov(x[, wv][; vardim=..., corrected=...]) diff --git a/docs/source/scalarstats.rst b/docs/source/scalarstats.rst index ad8617879..3d5c36ec8 100644 --- a/docs/source/scalarstats.rst +++ b/docs/source/scalarstats.rst @@ -6,33 +6,33 @@ The package implements functions for computing various statistics over an array Moments --------- -.. function:: var(x, wv[; mean=...]) +.. function:: var(x, w[; mean=..., corrected=...]) Compute weighted variance. One can set the keyword argument ``mean``, which can be either ``nothing`` (to compute the mean value within the function), ``0``, or a pre-computed mean value. - **Note:** the result is normalized by ``sum(wv)`` without correction. + **Note:** the result is normalized by ``sum(w)`` without correction unless ``corrected=true``. -.. function:: var(x, wv, dim[; mean=...]) +.. function:: var(x, w, dim[; mean=..., corrected=...]) Weighted variance along a specific dimension. -.. function:: std(x, wv[; mean=...]) +.. function:: std(x, w[; mean=..., corrected=...]) Compute weighted standard deviation. One can set the keyword argument ``mean``, which can be either ``nothing`` (to compute the mean value within the function), ``0``, or a pre-computed mean value. -.. function:: std(x, wv, dim[; mean=...]) +.. function:: std(x, w, dim[; mean=..., corrected=...]) Weighted standard deviation along a specific dimension. -.. 
function:: mean_and_var(x[, wv][, dim]) +.. function:: mean_and_var(x[, w][, dim][; corrected=...]) Jointly compute the mean and variance of ``x``. -.. function:: mean_and_std(x[, wv][, dim]) +.. function:: mean_and_std(x[, w][, dim][; corrected=...]) Jointly compute the mean and standard deviation of ``x``. diff --git a/test/runtests.jl b/test/runtests.jl index 54527b77e..e3bea13bd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,13 +1,5 @@ using StatsBase -opts = Base.JLOptions() -depwarns = isdefined(opts, :depwarn) ? opts.depwarn != 0 : true -test_deprecates = if haskey(ENV, "TEST_DEPRECATES") - lowercase(ENV["TEST_DEPRECATES"]) == "true" -else - false -end - tests = ["weights", "moments", "scalarstats", @@ -25,10 +17,6 @@ tests = ["weights", "statmodels"]#, #"statquiz"] -if !depwarns || test_deprecates - push!(tests, "deprecates") -end - println("Running tests:") for t in tests From 85ace2a4a6090549449c6da916eed19eeec0df34 Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 5 May 2017 05:13:39 -0500 Subject: [PATCH 48/50] Missing depcheck on a `stdm` call. --- src/moments.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/moments.jl b/src/moments.jl index 9781c9686..de9d89666 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -151,7 +151,7 @@ Base.std(v::RealArray, w::AbstractWeights; mean=nothing, corrected::DepBool=noth sqrt.(var(v, w; mean=mean, corrected=depcheck(:std, corrected))) Base.stdm(v::RealArray, m::RealArray, dim::Int; corrected::DepBool=nothing) = - Base.sqrt!(varm(v, m, dim; corrected=corrected)) + Base.sqrt!(varm(v, m, dim; corrected=depcheck(:stdm, corrected))) Base.stdm(v::RealArray, w::AbstractWeights, m::RealArray, dim::Int; corrected::DepBool=nothing) = From a0a2ad6933641e0ed4a39e9c45328b5dcaf605f0 Mon Sep 17 00:00:00 2001 From: rofinn Date: Sat, 6 May 2017 16:21:14 -0500 Subject: [PATCH 49/50] Updated the rst scalarstats and cov docs with the updated docstrings. 
--- docs/source/cov.rst | 13 +++++++--- docs/source/scalarstats.rst | 47 ++++++++++++++++++++++++++----------- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/docs/source/cov.rst b/docs/source/cov.rst index 048fc3653..96a67e940 100644 --- a/docs/source/cov.rst +++ b/docs/source/cov.rst @@ -26,10 +26,17 @@ This package implements functions for computing scatter matrix, as well as weigh .. function:: cov(X, w[; vardim=..., mean=..., corrected=...]) - Weighted covariance matrix. + Compute the weighted covariance matrix. Similar to ``var`` and ``std`` the biased covariance matrix (``corrected=false``) is computed by multiplying ``scattermat(X, w)`` by :math:`\frac{1}{\sum{w}}` to normalize. + However, the unbiased covariance matrix (``corrected=true``) is dependent on the type of weights used: - **Note:** By default, the covariance is normalized by the sum of weights, that is, ``cov(X, w)`` is equal to ``scatter(X, w) / sum(w)``. However, if ``corrected`` is set to ``true`` then the appropriate bias correction is used for that `w`. + - ``AnalyticWeights``: :math:`\frac{1}{\sum w - \sum {w^2} / \sum w}` + - ``FrequencyWeights``: :math:`\frac{1}{\sum{w} - 1}` + - ``ProbabilityWeights``: :math:`\frac{n}{(n - 1) \sum w}` where ``n`` equals ``count(!iszero, w)`` + - ``Weights``: ``ArgumentError`` (bias correction not supported) .. function:: mean_and_cov(x[, wv][; vardim=..., corrected=...]) - Jointly compute the mean and covariance of ``x``. + Jointly compute the mean and covariance matrix as a tuple. + A weighting vector `wv` can be specified. `vardim` that designates whether the variables are columns in the matrix (`1`) or rows (`2`). + Finally, bias correction is applied to the covariance calculation if ``corrected=true``. + See ``cov`` documentation for more details. 
diff --git a/docs/source/scalarstats.rst b/docs/source/scalarstats.rst index 3d5c36ec8..2c6e2b4ec 100644 --- a/docs/source/scalarstats.rst +++ b/docs/source/scalarstats.rst @@ -6,35 +6,54 @@ The package implements functions for computing various statistics over an array Moments --------- -.. function:: var(x, w[; mean=..., corrected=...]) +.. function:: var(x, w, [dim][; mean=..., corrected=...]) - Compute weighted variance. + Compute the variance of a real-valued array ``x``, optionally over a dimension ``dim``. + Observations in ``x`` are weighted using weight vector ``w``. + The uncorrected (when ``corrected=false``) sample variance is defined as: - One can set the keyword argument ``mean``, which can be either ``nothing`` (to compute the mean value within the function), ``0``, or a pre-computed mean value. + :math:`\frac{1}{\sum{w}} \sum_{i=1}^n {w_i\left({x_i - m}\right)^2 }` - **Note:** the result is normalized by ``sum(w)`` without correction unless ``corrected=true``. + where ``n`` is the length of the input and ``m`` is the mean. + The unbiased estimate (when ``corrected=true``) of the population variance is computed by + replacing :math:`\frac{1}{\sum{w}}` with a factor dependent on the type of weights used: -.. function:: var(x, w, dim[; mean=..., corrected=...]) + - ``AnalyticWeights``: :math:`\frac{1}{\sum w - \sum {w^2} / \sum w}` + - ``FrequencyWeights``: :math:`\frac{1}{\sum{w} - 1}` + - ``ProbabilityWeights``: :math:`\frac{n}{(n - 1) \sum w}` where ``n`` equals ``count(!iszero, w)`` + - ``Weights``: ``ArgumentError`` (bias correction not supported) - Weighted variance along a specific dimension. +.. function:: std(v, w, [dim][; mean=..., corrected=...]) -.. function:: std(x, w[; mean=..., corrected=...]) + Compute the standard deviation of a real-valued array ``x``, optionally over a dimension ``dim``. + Observations in ``x`` are weighted using weight vector ``w``. 
+ The uncorrected (when ``corrected=false``) sample standard deviation is defined as: - Compute weighted standard deviation. + :math:`\sqrt{\frac{1}{\sum{w}} \sum_{i=1}^n {w_i\left({x_i - m}\right)^2 }}` - One can set the keyword argument ``mean``, which can be either ``nothing`` (to compute the mean value within the function), ``0``, or a pre-computed mean value. + where ``n`` is the length of the input and ``m`` is the mean. + The unbiased estimate (when ``corrected=true``) of the population standard deviation is + computed by replacing :math:`\frac{1}{\sum{w}}` with a factor dependent on the type of + weights used: -.. function:: std(x, w, dim[; mean=..., corrected=...]) - - Weighted standard deviation along a specific dimension. + - ``AnalyticWeights``: :math:`\frac{1}{\sum w - \sum {w^2} / \sum w}` + - ``FrequencyWeights``: :math:`\frac{1}{\sum{w} - 1}` + - ``ProbabilityWeights``: :math:`\frac{n}{(n - 1) \sum w}` where ``n`` equals ``count(!iszero, w)`` + - ``Weights``: ``ArgumentError`` (bias correction not supported) .. function:: mean_and_var(x[, w][, dim][; corrected=...]) - Jointly compute the mean and variance of ``x``. + Jointly compute the mean and variance of a real-valued array ``x``, optionally over a dimension ``dim``, as a tuple. + Observations in ``x`` can be weighted using weight vector ``w``. + Finally, bias correction is applied to the variance calculation if ``corrected=true``. + See ``var`` documentation for more details. .. function:: mean_and_std(x[, w][, dim][; corrected=...]) - Jointly compute the mean and standard deviation of ``x``. + Jointly compute the mean and standard deviation of a real-valued array `x`, optionally over a dimension `dim`, as a tuple. + A weighting vector `w` can be specified to weight the estimates. + Finally, bias correction is applied to the standard deviation calculation if `corrected=true`. + See ``std`` documentation for more details. ..
function:: skewness(x[, wv]) From 6087b7f8386aca1fe19b6f4f71babfe4ea698a12 Mon Sep 17 00:00:00 2001 From: rofinn Date: Sat, 6 May 2017 17:39:38 -0500 Subject: [PATCH 50/50] More rst docstring updates. --- docs/source/cov.rst | 10 +++++----- docs/source/scalarstats.rst | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/source/cov.rst b/docs/source/cov.rst index 96a67e940..4e4339f34 100644 --- a/docs/source/cov.rst +++ b/docs/source/cov.rst @@ -29,14 +29,14 @@ This package implements functions for computing scatter matrix, as well as weigh Compute the weighted covariance matrix. Similar to ``var`` and ``std`` the biased covariance matrix (``corrected=false``) is computed by multiplying ``scattermat(X, w)`` by :math:`\frac{1}{\sum{w}}` to normalize. However, the unbiased covariance matrix (``corrected=true``) is dependent on the type of weights used: - - ``AnalyticWeights``: :math:`\frac{1}{\sum w - \sum {w^2} / \sum w}` - - ``FrequencyWeights``: :math:`\frac{1}{\sum{w} - 1}` - - ``ProbabilityWeights``: :math:`\frac{n}{(n - 1) \sum w}` where ``n`` equals ``count(!iszero, w)`` - - ``Weights``: ``ArgumentError`` (bias correction not supported) + * ``AnalyticWeights``: :math:`\frac{1}{\sum w - \sum {w^2} / \sum w}` + * ``FrequencyWeights``: :math:`\frac{1}{\sum{w} - 1}` + * ``ProbabilityWeights``: :math:`\frac{n}{(n - 1) \sum w}` where ``n`` equals ``count(!iszero, w)`` + * ``Weights``: ``ArgumentError`` (bias correction not supported) .. function:: mean_and_cov(x[, wv][; vardim=..., corrected=...]) Jointly compute the mean and covariance matrix as a tuple. - A weighting vector `wv` can be specified. `vardim` that designates whether the variables are columns in the matrix (`1`) or rows (`2`). + A weighting vector ``wv`` can be specified. ``vardim`` designates whether the variables are columns in the matrix (``1``) or rows (``2``). Finally, bias correction is applied to the covariance calculation if ``corrected=true``.
See ``cov`` documentation for more details. diff --git a/docs/source/scalarstats.rst b/docs/source/scalarstats.rst index 2c6e2b4ec..82100f54b 100644 --- a/docs/source/scalarstats.rst +++ b/docs/source/scalarstats.rst @@ -18,10 +18,10 @@ Moments The unbiased estimate (when ``corrected=true``) of the population variance is computed by replacing :math:`\frac{1}{\sum{w}}` with a factor dependent on the type of weights used: - - ``AnalyticWeights``: :math:`\frac{1}{\sum w - \sum {w^2} / \sum w}` - - ``FrequencyWeights``: :math:`\frac{1}{\sum{w} - 1}` - - ``ProbabilityWeights``: :math:`\frac{n}{(n - 1) \sum w}` where ``n`` equals ``count(!iszero, w)`` - - ``Weights``: ``ArgumentError`` (bias correction not supported) + * ``AnalyticWeights``: :math:`\frac{1}{\sum w - \sum {w^2} / \sum w}` + * ``FrequencyWeights``: :math:`\frac{1}{\sum{w} - 1}` + * ``ProbabilityWeights``: :math:`\frac{n}{(n - 1) \sum w}` where ``n`` equals ``count(!iszero, w)`` + * ``Weights``: ``ArgumentError`` (bias correction not supported) .. function:: std(v, w, [dim][; mean=..., corrected=...]) @@ -36,10 +36,10 @@ Moments computed by replacing :math:`\frac{1}{\sum{w}}` with a factor dependent on the type of weights used: - - ``AnalyticWeights``: :math:`\frac{1}{\sum w - \sum {w^2} / \sum w}` - - ``FrequencyWeights``: :math:`\frac{1}{\sum{w} - 1}` - - ``ProbabilityWeights``: :math:`\frac{n}{(n - 1) \sum w}` where ``n`` equals ``count(!iszero, w)`` - - ``Weights``: ``ArgumentError`` (bias correction not supported) + * ``AnalyticWeights``: :math:`\frac{1}{\sum w - \sum {w^2} / \sum w}` + * ``FrequencyWeights``: :math:`\frac{1}{\sum{w} - 1}` + * ``ProbabilityWeights``: :math:`\frac{n}{(n - 1) \sum w}` where ``n`` equals ``count(!iszero, w)`` + * ``Weights``: ``ArgumentError`` (bias correction not supported) .. function:: mean_and_var(x[, w][, dim][; corrected=...]) @@ -50,9 +50,9 @@ Moments .. 
function:: mean_and_std(x[, w][, dim][; corrected=...]) - Jointly compute the mean and standard deviation of a real-valued array `x`, optionally over a dimension `dim`, as a tuple. - A weighting vector `w` can be specified to weight the estimates. - Finally, bias correction is applied to the standard deviation calculation if `corrected=true`. + Jointly compute the mean and standard deviation of a real-valued array ``x``, optionally over a dimension ``dim``, as a tuple. + A weighting vector ``w`` can be specified to weight the estimates. + Finally, bias correction is applied to the standard deviation calculation if ``corrected=true``. See ``std`` documentation for more details. .. function:: skewness(x[, wv])