From a7fb967495ab68dff11562277e01ad3e7f0f855b Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Mon, 22 May 2023 22:56:03 +0200 Subject: [PATCH] Add OrderStatistic and JointOrderStatistics distributions (#1668) * Add OrderStatistic distribution * Add JointOrderStatistics distribution * Simplify implementation of logpdf * Bug fix * Add complementary methods * Avoid code duplication * Repair cquantile * Add OrderStatistic tests * Fix cquantile and add comments * Actually repair cquantile function * Add more tests * Remove suffix * Remove scrU * Use correction for multiple tests * Simplify code * Use correct distribution name * Use larger sample size * Ensure return type is constant * Swap i and j * Ensure eltype is returned * Add JointOrderStatistics tests * Add reference * Link between docstrings * Unify OrderStatistic tests * More stringently compute number of checks * Update testset name * Increase number of draws * Set seed in testset * Add an order statistics docs page * Rename rank variables to `rank` and `ranks` * Make arg checking more efficient * Support tuple of ranks * Apply suggestions from code review Co-authored-by: David Widmann * Use Fill * Reduce chances of type-instability * Ensure type-stability and use GammaMTSampler * Update docstrings * Apply suggestions from code review Co-authored-by: David Widmann * Add tests that check for empty ranks * Add comment explaining choice of gamma sampler * Apply suggestions from code review Co-authored-by: David Widmann * Test for 0 density out of support * Return -Inf for logpdf if out of support --------- Co-authored-by: David Widmann --- docs/make.jl | 1 + docs/src/order_statistics.md | 16 ++ src/Distributions.jl | 2 + src/multivariate/jointorderstatistics.jl | 168 ++++++++++++++++ src/multivariates.jl | 1 + src/univariate/orderstatistic.jl | 108 ++++++++++ src/univariates.jl | 2 + test/multivariate/jointorderstatistics.jl | 232 ++++++++++++++++++++++ test/runtests.jl | 2 + test/univariate/orderstatistic.jl | 232 ++++++++++++++++++++++ 10 files changed, 764 insertions(+) create mode 100644 docs/src/order_statistics.md create mode 100644 src/multivariate/jointorderstatistics.jl create mode 100644 src/univariate/orderstatistic.jl create mode 100644 test/multivariate/jointorderstatistics.jl create mode 100644 test/univariate/orderstatistic.jl diff --git a/docs/make.jl b/docs/make.jl index 39e51d4db6..f95b3f60b3 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -17,6 +17,7 @@ makedocs( "reshape.md", "cholesky.md", "mixture.md", + "order_statistics.md", "convolution.md", "fit.md", "extends.md", diff --git a/docs/src/order_statistics.md b/docs/src/order_statistics.md new file mode 100644 index 0000000000..cd2bad58bd --- /dev/null +++ b/docs/src/order_statistics.md @@ -0,0 +1,16 @@ +# Order Statistics + +The $i$th [Order Statistic](https://en.wikipedia.org/wiki/Order_statistic) of a random sample of size $n$ from a univeriate distribution is the $i$th element after sorting in increasing order. +As a special case, the first and $n$th order statistics are the minimum and maximum of the sample, while for odd $n$, the $\lceil \frac{n}{2} \rceil$th entry is the sample median. + +Given any univariate distribution and the sample size $n$, we can construct the distribution of its $i$th order statistic: + +```@docs +OrderStatistic +``` + +If we are interested in more than one order statistic, for continuous univariate distributions we can also construct the joint distribution of order statistics: + +```@docs +JointOrderStatistics +``` diff --git a/src/Distributions.jl b/src/Distributions.jl index 489f6435ac..1e2d580c55 100644 --- a/src/Distributions.jl +++ b/src/Distributions.jl @@ -108,6 +108,7 @@ export IsoNormal, IsoNormalCanon, JohnsonSU, + JointOrderStatistics, Kolmogorov, KSDist, KSOneSided, @@ -144,6 +145,7 @@ export Normal, NormalCanon, NormalInverseGaussian, + OrderStatistic, Pareto, PGeneralizedGaussian, SkewedExponentialPower, diff --git a/src/multivariate/jointorderstatistics.jl b/src/multivariate/jointorderstatistics.jl new file mode 100644 index 0000000000..71c3f93bf3 --- /dev/null +++ b/src/multivariate/jointorderstatistics.jl @@ -0,0 +1,168 @@ +# Implementation based on chapters 2-4 of +# Arnold, Barry C., Narayanaswamy Balakrishnan, and Haikady Navada Nagaraja. +# A first course in order statistics. Society for Industrial and Applied Mathematics, 2008. + +""" + JointOrderStatistics <: ContinuousMultivariateDistribution + +The joint distribution of a subset of order statistics from a sample from a continuous +univariate distribution. + + JointOrderStatistics( + dist::ContinuousUnivariateDistribution, + n::Int, + ranks=Base.OneTo(n); + check_args::Bool=true, + ) + +Construct the joint distribution of order statistics for the specified `ranks` from an IID +sample of size `n` from `dist`. + +The ``i``th order statistic of a sample is the ``i``th element of the sorted sample. +For example, the 1st order statistic is the sample minimum, while the ``n``th order +statistic is the sample maximum. + +`ranks` must be a sorted vector or tuple of unique `Int`s between 1 and `n`. + +For a single order statistic, use [`OrderStatistic`](@ref) instead. + +## Examples + +```julia +JointOrderStatistics(Normal(), 10) # Product(fill(Normal(), 10)) restricted to ordered vectors +JointOrderStatistics(Cauchy(), 10, 2:9) # joint distribution of all but the extrema +JointOrderStatistics(Cauchy(), 10, (1, 10)) # joint distribution of only the extrema +``` +""" +struct JointOrderStatistics{ + D<:ContinuousUnivariateDistribution,R<:Union{AbstractVector{Int},Tuple{Int,Vararg{Int}}} +} <: ContinuousMultivariateDistribution + dist::D + n::Int + ranks::R + function JointOrderStatistics( + dist::ContinuousUnivariateDistribution, + n::Int, + ranks::Union{AbstractVector{Int},Tuple{Int,Vararg{Int}}}=Base.OneTo(n); + check_args::Bool=true, + ) + @check_args( + JointOrderStatistics, + (n, n ≥ 1, "`n` must be a positive integer."), + ( + ranks, + _are_ranks_valid(ranks, n), + "`ranks` must be a sorted vector or tuple of unique integers between 1 and `n`.", + ), + ) + return new{typeof(dist),typeof(ranks)}(dist, n, ranks) + end +end + +_islesseq(x, y) = isless(x, y) || isequal(x, y) + +function _are_ranks_valid(ranks, n) + # this is equivalent to but faster than + # issorted(ranks) && allunique(ranks) + !isempty(ranks) && first(ranks) ≥ 1 && last(ranks) ≤ n && issorted(ranks; lt=_islesseq) +end +function _are_ranks_valid(ranks::AbstractRange, n) + !isempty(ranks) && first(ranks) ≥ 1 && last(ranks) ≤ n && step(ranks) > 0 +end + +length(d::JointOrderStatistics) = length(d.ranks) +function insupport(d::JointOrderStatistics, x::AbstractVector) + length(d) == length(x) || return false + xi, state = iterate(x) # at least one element! + dist = d.dist + insupport(dist, xi) || return false + while (xj_state = iterate(x, state)) !== nothing + xj, state = xj_state + xj ≥ xi && insupport(dist, xj) || return false + xi = xj + end + return true +end +minimum(d::JointOrderStatistics) = Fill(minimum(d.dist), length(d)) +maximum(d::JointOrderStatistics) = Fill(maximum(d.dist), length(d)) + +params(d::JointOrderStatistics) = tuple(params(d.dist)..., d.n, d.ranks) +partype(d::JointOrderStatistics) = partype(d.dist) +Base.eltype(::Type{<:JointOrderStatistics{D}}) where {D} = Base.eltype(D) +Base.eltype(d::JointOrderStatistics) = eltype(d.dist) + +function logpdf(d::JointOrderStatistics, x::AbstractVector{<:Real}) + n = d.n + ranks = d.ranks + lp = loglikelihood(d.dist, x) + T = typeof(lp) + lp += loggamma(T(n + 1)) + if length(ranks) == n + issorted(x) && return lp + return oftype(lp, -Inf) + end + i = first(ranks) + xᵢ = first(x) + if i > 1 # _marginalize_range(d.dist, 0, i, -Inf, xᵢ, T) + lp += (i - 1) * logcdf(d.dist, xᵢ) - loggamma(T(i)) + end + for (j, xⱼ) in Iterators.drop(zip(ranks, x), 1) + xⱼ < xᵢ && return oftype(lp, -Inf) + lp += _marginalize_range(d.dist, i, j, xᵢ, xⱼ, T) + i = j + xᵢ = xⱼ + end + if i < n # _marginalize_range(d.dist, i, n + 1, xᵢ, Inf, T) + lp += (n - i) * logccdf(d.dist, xᵢ) - loggamma(T(n - i + 1)) + end + return lp +end + +# given ∏ₖf(xₖ), marginalize all xₖ for i < k < j +function _marginalize_range(dist, i, j, xᵢ, xⱼ, T) + k = j - i - 1 + k == 0 && return zero(T) + return k * T(logdiffcdf(dist, xⱼ, xᵢ)) - loggamma(T(k + 1)) +end + +function _rand!(rng::AbstractRNG, d::JointOrderStatistics, x::AbstractVector{<:Real}) + n = d.n + if n == length(d.ranks) # ranks == 1:n + # direct method, slower than inversion method for large `n` and distributions with + # fast quantile function or that use inversion sampling + rand!(rng, d.dist, x) + sort!(x) + else + # use exponential generation method with inversion, where for gaps in the ranks, we + # use the fact that the sum Y of k IID variables xₘ ~ Exp(1) is Y ~ Gamma(k, 1). + # Lurie, D., and H. O. Hartley. "Machine-generation of order statistics for Monte + # Carlo computations." The American Statistician 26.1 (1972): 26-27. + # this is slow if length(d.ranks) is close to n and quantile for d.dist is expensive, + # but this branch is probably taken when length(d.ranks) is small or much smaller than n. + T = typeof(one(eltype(x))) + s = zero(eltype(x)) + i = 0 + for (m, j) in zip(eachindex(x), d.ranks) + k = j - i + if k > 1 + # specify GammaMTSampler directly to avoid unnecessarily checking the shape + # parameter again and because it has been benchmarked to be the fastest for + # shape k ≥ 1 and scale 1 + s += T(rand(rng, GammaMTSampler(Gamma{T}(T(k), T(1))))) + else + s += randexp(rng, T) + end + i = j + x[m] = s + end + j = n + 1 + k = j - i + if k > 1 + s += T(rand(rng, GammaMTSampler(Gamma{T}(T(k), T(1))))) + else + s += randexp(rng, T) + end + x .= quantile.(d.dist, x ./ s) + end + return x +end diff --git a/src/multivariates.jl b/src/multivariates.jl index 477c78ba5a..7a6f926a65 100644 --- a/src/multivariates.jl +++ b/src/multivariates.jl @@ -112,6 +112,7 @@ end for fname in ["dirichlet.jl", "multinomial.jl", "dirichletmultinomial.jl", + "jointorderstatistics.jl", "mvnormal.jl", "mvnormalcanon.jl", "mvlognormal.jl", diff --git a/src/univariate/orderstatistic.jl b/src/univariate/orderstatistic.jl new file mode 100644 index 0000000000..1a7055ef91 --- /dev/null +++ b/src/univariate/orderstatistic.jl @@ -0,0 +1,108 @@ +# Implementation based on chapters 2-4 of +# Arnold, Barry C., Narayanaswamy Balakrishnan, and Haikady Navada Nagaraja. +# A first course in order statistics. Society for Industrial and Applied Mathematics, 2008. + +""" + OrderStatistic{D<:UnivariateDistribution,S<:ValueSupport} <: UnivariateDistribution{S} + +The distribution of an order statistic from IID samples from a univariate distribution. + + OrderStatistic(dist::UnivariateDistribution, n::Int, rank::Int; check_args::Bool=true) + +Construct the distribution of the `rank` ``=i``th order statistic from `n` independent +samples from `dist`. + +The ``i``th order statistic of a sample is the ``i``th element of the sorted sample. +For example, the 1st order statistic is the sample minimum, while the ``n``th order +statistic is the sample maximum. + +If ``f`` is the probability density (mass) function of `dist` with distribution function +``F``, then the probability density function ``g`` of the order statistic for continuous +`dist` is +```math +g(x; n, i) = {n \\choose i} [F(x)]^{i-1} [1 - F(x)]^{n-i} f(x), +``` +and the probability mass function ``g`` of the order statistic for discrete `dist` is +```math +g(x; n, i) = \\sum_{k=i}^n {n \\choose k} \\left( [F(x)]^k [1 - F(x)]^{n-k} - [F(x_-)]^k [1 - F(x_-)]^{n-k} \\right), +``` +where ``x_-`` is the largest element in the support of `dist` less than ``x``. + +For the joint distribution of a subset of order statistics, use +[`JointOrderStatistics`](@ref) instead. + +## Examples + +```julia +OrderStatistic(Cauchy(), 10, 1) # distribution of the sample minimum +OrderStatistic(DiscreteUniform(10), 10, 10) # distribution of the sample maximum +OrderStatistic(Gamma(1, 1), 11, 5) # distribution of the sample median +``` +""" +struct OrderStatistic{D<:UnivariateDistribution,S<:ValueSupport} <: + UnivariateDistribution{S} + dist::D + n::Int + rank::Int + function OrderStatistic( + dist::UnivariateDistribution, n::Int, rank::Int; check_args::Bool=true + ) + @check_args(OrderStatistic, 1 ≤ rank ≤ n) + return new{typeof(dist),value_support(typeof(dist))}(dist, n, rank) + end +end + +minimum(d::OrderStatistic) = minimum(d.dist) +maximum(d::OrderStatistic) = maximum(d.dist) +insupport(d::OrderStatistic, x::Real) = insupport(d.dist, x) + +params(d::OrderStatistic) = tuple(params(d.dist)..., d.n, d.rank) +partype(d::OrderStatistic) = partype(d.dist) +Base.eltype(::Type{<:OrderStatistic{D}}) where {D} = Base.eltype(D) +Base.eltype(d::OrderStatistic) = eltype(d.dist) + +# distribution of the ith order statistic from an IID uniform distribution, with CDF Uᵢₙ(x) +function _uniform_orderstatistic(d::OrderStatistic) + n = d.n + rank = d.rank + return Beta{Int}(rank, n - rank + 1) +end + +function logpdf(d::OrderStatistic, x::Real) + b = _uniform_orderstatistic(d) + p = cdf(d.dist, x) + if value_support(typeof(d)) === Continuous + return logpdf(b, p) + logpdf(d.dist, x) + else + return logdiffcdf(b, p, p - pdf(d.dist, x)) + end +end + +for f in (:logcdf, :logccdf, :cdf, :ccdf) + @eval begin + function $f(d::OrderStatistic, x::Real) + b = _uniform_orderstatistic(d) + return $f(b, cdf(d.dist, x)) + end + end +end + +for f in (:quantile, :cquantile) + @eval begin + function $f(d::OrderStatistic, p::Real) + # since cdf is Fᵢₙ(x) = Uᵢₙ(Fₓ(x)), and Uᵢₙ is invertible and increasing, we + # have Fₓ(x) = Uᵢₙ⁻¹(Fᵢₙ(x)). then quantile function is + # Qᵢₙ(p) = inf{x: p ≤ Fᵢₙ(x)} = inf{x: Uᵢₙ⁻¹(p) ≤ Fₓ(x)} = Qₓ(Uᵢₙ⁻¹(p)) + b = _uniform_orderstatistic(d) + return quantile(d.dist, $f(b, p)) + end + end +end + +function rand(rng::AbstractRNG, d::OrderStatistic) + # inverse transform sampling. Since quantile function is Qₓ(Uᵢₙ⁻¹(p)), we draw a random + # variable from Uᵢₙ and pass it through the quantile function of `d.dist` + T = eltype(d.dist) + b = _uniform_orderstatistic(d) + return T(quantile(d.dist, rand(rng, b))) +end diff --git a/src/univariates.jl b/src/univariates.jl index 16d1a2e71e..726fd8e429 100644 --- a/src/univariates.jl +++ b/src/univariates.jl @@ -731,3 +731,5 @@ end for dname in continuous_distributions include(joinpath("univariate", "continuous", "$(dname).jl")) end + +include(joinpath("univariate", "orderstatistic.jl")) diff --git a/test/multivariate/jointorderstatistics.jl b/test/multivariate/jointorderstatistics.jl new file mode 100644 index 0000000000..d5d65a752e --- /dev/null +++ b/test/multivariate/jointorderstatistics.jl @@ -0,0 +1,232 @@ +using Distributions, LinearAlgebra, Random, SpecialFunctions, Statistics, Test + +@testset "JointOrderStatistics" begin + Random.seed!(123) + + @testset "check_args" begin + dist = Normal() + JointOrderStatistics(dist, 2, 1:2) + JointOrderStatistics(dist, 3, 2:3) + JointOrderStatistics(dist, 5, [2, 3]) + @test_throws DomainError JointOrderStatistics(dist, 0, 1:2) + @test_throws DomainError JointOrderStatistics(dist, 2, 2:3) + @test_throws DomainError JointOrderStatistics(dist, 3, 0:3) + @test_throws DomainError JointOrderStatistics(dist, 5, 3:-1:2) + @test_throws DomainError JointOrderStatistics(dist, 5, 2:1:1) + @test_throws DomainError JointOrderStatistics(dist, 0, [1, 2]) + @test_throws DomainError JointOrderStatistics(dist, 2, [2, 3]) + @test_throws DomainError JointOrderStatistics(dist, 3, [0, 1, 2, 3]) + @test_throws DomainError JointOrderStatistics(dist, 5, Int[]) + @test_throws DomainError JointOrderStatistics(dist, 5, (3, 2)) + @test_throws DomainError JointOrderStatistics(dist, 5, (3, 3)) + JointOrderStatistics(dist, 0, 1:2; check_args=false) + JointOrderStatistics(dist, 2, 2:3; check_args=false) + JointOrderStatistics(dist, 3, 0:3; check_args=false) + JointOrderStatistics(dist, 5, 3:-1:2; check_args=false) + JointOrderStatistics(dist, 5, 2:1:1; check_args=false) + JointOrderStatistics(dist, 0, [1, 2]; check_args=false) + JointOrderStatistics(dist, 2, [2, 3]; check_args=false) + JointOrderStatistics(dist, 3, [0, 1, 2, 3]; check_args=false) + JointOrderStatistics(dist, 5, Int[]; check_args=false) + JointOrderStatistics(dist, 5, (3, 2); check_args=false) + JointOrderStatistics(dist, 5, (3, 3); check_args=false) + end + + @testset for T in [Float32, Float64], + dist in [Uniform(T(2), T(10)), Exponential(T(10)), Normal(T(100), T(10))], + n in [16, 40], + r in [ + 1:n, + ([i, j] for j in 2:n for i in 1:min(10, j - 1))..., + vcat(2:4, (n - 10):(n - 5)), + (2, n ÷ 2, n - 5), + ] + + d = JointOrderStatistics(dist, n, r) + + @testset "basic" begin + @test d isa JointOrderStatistics + @test d.dist === dist + @test d.n === n + @test d.ranks === r + @test length(d) == length(r) + @test params(d) == (params(dist)..., d.n, d.ranks) + @test partype(d) === partype(dist) + @test eltype(d) === eltype(dist) + + length(r) == n && @test JointOrderStatistics(dist, n) == d + end + + @testset "support" begin + @test minimum(d) == fill(minimum(dist), length(r)) + @test maximum(d) == fill(maximum(dist), length(r)) + x = sort(rand(dist, length(r))) + x2 = sort(rand(dist, length(r) + 1)) + @test insupport(d, x) + if length(x) > 1 + @test !insupport(d, reverse(x)) + @test !insupport(d, x[1:(end - 1)]) + end + @test !insupport(d, x2) + @test !insupport(d, fill(NaN, length(x))) + end + + @testset "pdf/logpdf" begin + x = convert(Vector{T}, sort(rand(dist, length(r)))) + @test @inferred(logpdf(d, x)) isa T + @test @inferred(pdf(d, x)) isa T + + if length(r) == 1 + @test logpdf(d, x) ≈ logpdf(OrderStatistic(dist, n, r[1]), x[1]) + @test pdf(d, x) ≈ pdf(OrderStatistic(dist, n, r[1]), x[1]) + elseif length(r) == 2 + i, j = r + xi, xj = x + lc = T( + logfactorial(n) - logfactorial(i - 1) - logfactorial(n - j) - + logfactorial(j - i - 1), + ) + lp = ( + lc + + (i - 1) * logcdf(dist, xi) + + (n - j) * logccdf(dist, xj) + + (j - i - 1) * logdiffcdf(dist, xj, xi) + + logpdf(dist, xi) + + logpdf(dist, xj) + ) + @test logpdf(d, x) ≈ lp + @test pdf(d, x) ≈ exp(lp) + elseif collect(r) == 1:n + @test logpdf(d, x) ≈ sum(Base.Fix1(logpdf, d.dist), x) + loggamma(T(n + 1)) + @test pdf(d, x) ≈ exp(logpdf(d, x)) + end + + @testset "no density for vectors out of support" begin + # check unsorted vectors have 0 density + x2 = copy(x) + x2[1], x2[2] = x2[2], x2[1] + @test logpdf(d, x2) == T(-Inf) + @test pdf(d, x2) == zero(T) + + x3 = copy(x) + x3[end-1], x3[end] = x3[end], x3[end-1] + @test logpdf(d, x3) == T(-Inf) + @test pdf(d, x3) == zero(T) + + # check out of support of original distribution + if islowerbounded(dist) + x4 = copy(x) + x4[1] = minimum(dist) - 1 + @test logpdf(d, x4) == T(-Inf) + @test pdf(d, x4) == zero(T) + end + end + end + end + + @testset "rand" begin + @testset for T in [Float32, Float64] + dist = Uniform(T(-2), T(1)) + d = JointOrderStatistics(dist, 10, 1:10) + S = typeof(rand(dist)) + + v = rand(d) + @test v isa Vector{S} + @test insupport(d, v) + @test size(v) == (10,) + + rng = Random.default_rng() + Random.seed!(rng, 42) + x = @inferred(rand(rng, d, 20)) + @test x isa Matrix{S} + @test size(x) == (10, 20) + @test all(xi -> insupport(d, xi), eachcol(x)) + + Random.seed!(rng, 42) + x2 = rand(rng, d, 20) + @test x2 == x + end + + ndraws = 300_000 + dists = [Uniform(), Exponential()] + + @testset "marginal mean and standard deviation" begin + n = 20 + rs = [1:n, [1, n], vcat(1:7, 12:17)] + @testset for dist in dists, r in rs + d = JointOrderStatistics(dist, n, r) + x = rand(d, ndraws) + @test all(xi -> insupport(d, xi), eachcol(x)) + + m = mean(x; dims=2) + v = var(x; mean=m, dims=2) + if dist isa Uniform + # Arnold (2008). A first course in order statistics. eq 2.2.20-21 + m_exact = r ./ (n + 1) + v_exact = @. (m_exact * (1 - m_exact) / (n + 2)) + elseif dist isa Exponential + # Arnold (2008). A first course in order statistics. eq 4.6.6-7 + m_exact = [sum(k -> inv(n - k + 1), 1:i) for i in r] + v_exact = [sum(k -> inv((n - k + 1)^2), 1:i) for i in r] + end + # compute asymptotic sample standard deviation + mean_std = @. sqrt(v_exact / ndraws) + m4 = dropdims(mapslices(xi -> moment(xi, 4), x; dims=2); dims=2) + var_std = @. sqrt((m4 - v_exact^2) / ndraws) + + nchecks = length(r) + α = (0.01 / nchecks) / 2 # multiple correction + tol = quantile(Normal(), 1 - α) + for k in eachindex(m, m_exact, v, v_exact, mean_std, var_std) + @test m[k] ≈ m_exact[k] atol = (tol * mean_std[k]) + @test v[k] ≈ v_exact[k] atol = (tol * var_std[k]) + end + end + end + + @testset "pairwise correlations" begin + n = 100 + rs = [ # good mixture of r values with gaps and no gaps + 1:n, + vcat(1:10, (div(n, 2) - 5):(div(n, 2) + 5), (n - 9):n), + vcat(10:20, (n - 19):(n - 10)), + (1, n), + ] + + nchecks = length(dists) * sum(rs) do r + m = length(r) + return div(m * (m - 1), 2) + end + α = (0.01 / nchecks) / 2 # multiple correction + tol = quantile(Normal(), 1 - α) / sqrt(ndraws) + + @testset for dist in dists, r in rs + d = JointOrderStatistics(dist, n, r) + x = rand(d, ndraws) + @test all(xi -> insupport(d, xi), eachcol(x)) + + m = length(r) + + xcor = cor(x; dims=2) + if dist isa Uniform + # Arnold (2008). A first course in order statistics. Eq 2.3.16 + s = @. n - r + 1 + xcor_exact = Symmetric(sqrt.((r .* collect(s)') ./ (collect(r)' .* s))) + elseif dist isa Exponential + # Arnold (2008). A first course in order statistics. Eq 4.6.8 + v = [sum(k -> inv((n - k + 1)^2), 1:i) for i in r] + xcor_exact = Symmetric(sqrt.(v ./ v')) + end + for ii in 1:m, ji in (ii + 1):m + i = r[ii] + j = r[ji] + ρ = xcor[ii, ji] + ρ_exact = xcor_exact[ii, ji] + # use variance-stabilizing transformation, recommended in §3.6 of + # Van der Vaart, A. W. (2000). Asymptotic statistics (Vol. 3). + @test atanh(ρ) ≈ atanh(ρ_exact) atol = tol + end + end + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 9c8f5b3a4b..cb724278b4 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -88,6 +88,8 @@ const tests = [ "univariate/continuous/skewedexponentialpower", "univariate/discrete/discreteuniform", "univariate/continuous/tdist", + "univariate/orderstatistic", + "multivariate/jointorderstatistics", "multivariate/product", "eachvariate", "univariate/continuous/triangular", diff --git a/test/univariate/orderstatistic.jl b/test/univariate/orderstatistic.jl new file mode 100644 index 0000000000..5cd65cb856 --- /dev/null +++ b/test/univariate/orderstatistic.jl @@ -0,0 +1,232 @@ +using Test, Distributions +using Random +using StatsBase + +@testset "OrderStatistic" begin + @testset "basic" begin + for dist in [Uniform(), Normal(), DiscreteUniform(10)], n in [1, 2, 10], i in 1:n + d = OrderStatistic(dist, n, i) + @test d isa OrderStatistic + if dist isa DiscreteUnivariateDistribution + @test d isa DiscreteUnivariateDistribution + else + @test d isa ContinuousUnivariateDistribution + end + @test d.dist === dist + @test d.n == n + @test d.rank == i + end + @test_throws ArgumentError OrderStatistic(Normal(), 0, 1) + OrderStatistic(Normal(), 0, 1; check_args=false) + @test_throws ArgumentError OrderStatistic(Normal(), 10, 11) + OrderStatistic(Normal(), 10, 11; check_args=false) + @test_throws ArgumentError OrderStatistic(Normal(), 10, 0) + OrderStatistic(Normal(), 10, 0; check_args=false) + end + + @testset "params" begin + for dist in [Uniform(), Normal(), DiscreteUniform(10)], n in [1, 2, 10], i in 1:n + d = OrderStatistic(dist, n, i) + @test params(d) == (params(dist)..., n, i) + @test partype(d) === partype(dist) + end + end + + @testset "support" begin + n = 10 + for i in 1:10 + d1 = OrderStatistic(Uniform(), n, i) + @test minimum(d1) == 0 + @test maximum(d1) == 1 + @test insupport(d1, 0) + @test insupport(d1, 0.5) + @test insupport(d1, 1) + @test !insupport(d1, -eps()) + @test !insupport(d1, 1 + eps()) + @test !hasfinitesupport(d1) + @test islowerbounded(d1) + @test isupperbounded(d1) + + d2 = OrderStatistic(Normal(), n, i) + @test minimum(d2) == -Inf + @test maximum(d2) == Inf + @test insupport(d2, -Inf) + @test insupport(d2, 0) + @test insupport(d2, Inf) + @test !hasfinitesupport(d2) + @test !islowerbounded(d2) + @test !isupperbounded(d2) + + d3 = OrderStatistic(DiscreteUniform(1, 10), n, i) + @test minimum(d3) == 1 + @test maximum(d3) == 10 + @test insupport(d3, 1) + @test insupport(d3, 5) + @test insupport(d3, 10) + @test !insupport(d3, 0) + @test !insupport(d3, 11) + @test hasfinitesupport(d3) + @test islowerbounded(d3) + @test isupperbounded(d3) + end + end + + @testset "pdf/logpdf" begin + @testset "continuous" begin + # test against the exact formula computed using BigFloats + @testset for T in (Float32, Float64) + @testset for dist in + [Uniform(T(-2), T(1)), Normal(T(3), T(2)), Exponential(T(10))], + n in [1, 10, 100], + i in 1:n + + d = OrderStatistic(dist, n, i) + c = factorial(big(n)) / factorial(big(i - 1)) / factorial(big(n - i)) + # since density is concentrated around the i/n quantile, sample a point + # nearby it + x = quantile(dist, clamp(i / n + (rand() - 1//2) / 10, 0, 1)) + p = cdf(dist, big(x)) + pdf_exp = c * p^(i - 1) * (1 - p)^(n - i) * pdf(dist, big(x)) + @test @inferred(T, pdf(d, x)) ≈ T(pdf_exp) + @test @inferred(T, logpdf(d, x)) ≈ T(log(pdf_exp)) + end + end + end + @testset "discrete" begin + # test check that the pdf is the difference of the CDF at adjacent points + @testset for dist in + [DiscreteUniform(10, 30), Poisson(100.0), Binomial(20, 0.3)], + n in [1, 10, 100], + i in 1:n + + d = OrderStatistic(dist, n, i) + xs = quantile(dist, 0.01):quantile(dist, 0.99) + for x in xs + p = @inferred pdf(d, x) + lp = @inferred logpdf(d, x) + @test lp ≈ logdiffcdf(d, x, x - 1) + @test p ≈ exp(lp) + end + end + end + end + + @testset "distribution normalizes to 1" begin + @testset for dist in [ + Uniform(-2, 1), + Normal(2, 3), + Exponential(5), + DiscreteUniform(10, 40), + Poisson(100), + ], + n in [1, 10, 20], + i in 1:n + + d = OrderStatistic(dist, n, i) + Distributions.expectation(one, d) ≈ 1 + end + end + + @testset "cdf/logcdf/ccdf/logccdf/quantile/cquantile" begin + # test against the exact formula computed using BigFloats + @testset for T in (Float32, Float64) + dists = [ + (Uniform(T(-2), T(1)), Uniform(big(-2), big(1))), + (Normal(T(3), T(2)), Normal(big(3), big(2))), + (Exponential(T(10)), Exponential(big(10))), + (DiscreteUniform(1, 10), DiscreteUniform(1, 10)), + (Poisson(T(20)), Poisson(big(20))), + ] + @testset for (dist, bigdist) in dists, n in [10, 100], i in 1:n + dist isa DiscreteDistribution && T !== Float64 && continue + d = OrderStatistic(dist, n, i) + # since density is concentrated around the i/n quantile, sample a point + # nearby it + x = quantile(dist, clamp(i / n + (rand() - 1//2) / 10, 1e-4, 1 - 1e-4)) + p = cdf(bigdist, big(x)) + cdf_exp = sum(i:n) do j + c = binomial(big(n), big(j)) + return c * p^j * (1 - p)^(n - j) + end + @test @inferred(T, cdf(d, x)) ≈ T(cdf_exp) + @test cdf(d, maximum(d)) ≈ one(T) + @test cdf(d, minimum(d) - 1) ≈ zero(T) + @test @inferred(T, logcdf(d, x)) ≈ T(log(cdf_exp)) + @test logcdf(d, maximum(d)) ≈ zero(T) + @test logcdf(d, minimum(d) - 1) ≈ -Inf + @test @inferred(T, ccdf(d, x)) ≈ T(1 - cdf_exp) + @test ccdf(d, maximum(d)) ≈ zero(T) + @test ccdf(d, minimum(d) - 1) ≈ one(T) + @test @inferred(T, logccdf(d, x)) ≈ T(log(1 - cdf_exp)) + @test logccdf(d, maximum(d)) ≈ -Inf + @test logccdf(d, minimum(d) - 1) ≈ zero(T) + q = cdf(d, x) + if dist isa DiscreteDistribution + # for discrete distributions, tiny numerical error can cause the wrong + # integer value to be returned. + q -= sqrt(eps(T)) + end + xq = @inferred(T, quantile(d, q)) + xqc = @inferred(T, cquantile(d, 1 - q)) + @test xq ≈ xqc + @test isapprox(xq, T(x); atol=1e-4) || + (dist isa DiscreteDistribution && xq < x) + end + end + end + + @testset "rand" begin + @testset for T in [Float32, Float64], + dist in [Uniform(T(-2), T(1)), Normal(T(1), T(2))] + + d = OrderStatistic(dist, 10, 5) + rng = Random.default_rng() + Random.seed!(rng, 42) + x = @inferred(rand(rng, d)) + xs = @inferred(rand(rng, d, 10)) + S = eltype(rand(dist)) + @test typeof(x) === S + @test eltype(xs) === S + @test length(xs) == 10 + + Random.seed!(rng, 42) + x2 = rand(rng, d) + xs2 = rand(rng, d, 10) + @test x2 == x + @test xs2 == xs + end + + ndraws = 100_000 + nchecks = 4 * 2 * 111 # NOTE: update if the below number of tests changes + α = (0.01 / nchecks) / 2 # multiple correction + tol = quantile(Normal(), 1 - α) + + @testset for dist in [Uniform(), Exponential(), Poisson(20), Binomial(20, 0.3)] + @testset for n in [1, 10, 100], i in 1:n + d = OrderStatistic(dist, n, i) + x = rand(d, ndraws) + m, v = mean_and_var(x) + if dist isa Uniform + # Arnold (2008). A first course in order statistics. Eqs 2.2.20-21 + m_exact = i / (n + 1) + v_exact = m_exact * (1 - m_exact) / (n + 2) + elseif dist isa Exponential + # Arnold (2008). A first course in order statistics. Eqs 4.6.6-7 + m_exact = sum(k -> inv(n - k + 1), 1:i) + v_exact = sum(k -> inv((n - k + 1)^2), 1:i) + elseif dist isa DiscreteUnivariateDistribution + # estimate mean and variance with explicit sum, Eqs 3.2.6-7 from + # Arnold (2008). A first course in order statistics. + xs = 0:quantile(dist, 0.9999) + m_exact = sum(x -> ccdf(d, x), xs) + v_exact = 2 * sum(x -> x * ccdf(d, x), xs) + m_exact - m_exact^2 + end + # compute asymptotic sample standard deviation + mean_std = sqrt(v_exact / ndraws) + var_std = sqrt((moment(x, 4) - v_exact^2) / ndraws) + @test m ≈ m_exact atol = (tol * mean_std) + @test v ≈ v_exact atol = (tol * var_std) + end + end + end +end