Add OrderStatistic and JointOrderStatistics distributions (#1668)

* Add OrderStatistic distribution * Add JointOrderStatistics distribution * Simplify implementation of logpdf * Bug fix * Add complementary methods * Avoid code duplication * Repair cquantile * Add OrderStatistic tests * Fix cquantile and add comments * Actually repair cquantile function * Add more tests * Remove suffix * Remove scrU * Use correction for multiple tests * Simplify code * Use correct distribution name * Use larger sample size * Ensure return type is constant * Swap i and j * Ensure eltype is returned * Add JointOrderStatistics tests * Add reference * Link between docstrings * Unify OrderStatistic tests * More stringently compute number of checks * Update testset name * Increase number of draws * Set seed in testset * Add an order statistics docs page * Rename rank variables to `rank` and `ranks` * Make arg checking more efficient * Support tuple of ranks * Apply suggestions from code review Co-authored-by: David Widmann <devmotion@users.noreply.github.com> * Use Fill * Reduce chances of type-instability * Ensure type-stability and use GammaMTSampler * Update docstrings * Apply suggestions from code review Co-authored-by: David Widmann <devmotion@users.noreply.github.com> * Add tests that check for empty ranks * Add comment explaining choice of gamma sampler * Apply suggestions from code review Co-authored-by: David Widmann <devmotion@users.noreply.github.com> * Test for 0 density out of support * Return -Inf for logpdf if out of support --------- Co-authored-by: David Widmann <devmotion@users.noreply.github.com>
JuliaStats · May 22, 2023 · a7fb967 · a7fb967
1 parent 87f323a
commit a7fb967
Show file tree

Hide file tree

Showing 10 changed files with 764 additions and 0 deletions.
diff --git a/docs/make.jl b/docs/make.jl
@@ -17,6 +17,7 @@ makedocs(
         "reshape.md",
         "cholesky.md",
         "mixture.md",
+        "order_statistics.md",
         "convolution.md",
         "fit.md",
         "extends.md",

diff --git a/docs/src/order_statistics.md b/docs/src/order_statistics.md
@@ -0,0 +1,16 @@
+# Order Statistics
+
+The $i$th [Order Statistic](https://en.wikipedia.org/wiki/Order_statistic) of a random sample of size $n$ from a univeriate distribution is the $i$th element after sorting in increasing order.
+As a special case, the first and $n$th order statistics are the minimum and maximum of the sample, while for odd $n$, the $\lceil \frac{n}{2} \rceil$th entry is the sample median.
+
+Given any univariate distribution and the sample size $n$, we can construct the distribution of its $i$th order statistic:
+
+```@docs
+OrderStatistic
+```
+
+If we are interested in more than one order statistic, for continuous univariate distributions we can also construct the joint distribution of order statistics:
+
+```@docs
+JointOrderStatistics
+```
diff --git a/src/Distributions.jl b/src/Distributions.jl
@@ -108,6 +108,7 @@ export
     IsoNormal,
     IsoNormalCanon,
     JohnsonSU,
+    JointOrderStatistics,
     Kolmogorov,
     KSDist,
     KSOneSided,
@@ -144,6 +145,7 @@ export
     Normal,
     NormalCanon,
     NormalInverseGaussian,
+    OrderStatistic,
     Pareto,
     PGeneralizedGaussian,
     SkewedExponentialPower,

diff --git a/src/multivariate/jointorderstatistics.jl b/src/multivariate/jointorderstatistics.jl
@@ -0,0 +1,168 @@
+# Implementation based on chapters 2-4 of
+# Arnold, Barry C., Narayanaswamy Balakrishnan, and Haikady Navada Nagaraja.
+# A first course in order statistics. Society for Industrial and Applied Mathematics, 2008.
+
+"""
+    JointOrderStatistics <: ContinuousMultivariateDistribution
+
+The joint distribution of a subset of order statistics from a sample from a continuous
+univariate distribution.
+
+    JointOrderStatistics(
+        dist::ContinuousUnivariateDistribution,
+        n::Int,
+        ranks=Base.OneTo(n);
+        check_args::Bool=true,
+    )
+
+Construct the joint distribution of order statistics for the specified `ranks` from an IID
+sample of size `n` from `dist`.
+
+The ``i``th order statistic of a sample is the ``i``th element of the sorted sample.
+For example, the 1st order statistic is the sample minimum, while the ``n``th order
+statistic is the sample maximum.
+
+`ranks` must be a sorted vector or tuple of unique `Int`s between 1 and `n`.
+
+For a single order statistic, use [`OrderStatistic`](@ref) instead.
+
+## Examples
+
+```julia
+JointOrderStatistics(Normal(), 10)           # Product(fill(Normal(), 10)) restricted to ordered vectors
+JointOrderStatistics(Cauchy(), 10, 2:9)      # joint distribution of all but the extrema
+JointOrderStatistics(Cauchy(), 10, (1, 10))  # joint distribution of only the extrema
+```
+"""
+struct JointOrderStatistics{
+    D<:ContinuousUnivariateDistribution,R<:Union{AbstractVector{Int},Tuple{Int,Vararg{Int}}}
+} <: ContinuousMultivariateDistribution
+    dist::D
+    n::Int
+    ranks::R
+    function JointOrderStatistics(
+        dist::ContinuousUnivariateDistribution,
+        n::Int,
+        ranks::Union{AbstractVector{Int},Tuple{Int,Vararg{Int}}}=Base.OneTo(n);
+        check_args::Bool=true,
+    )
+        @check_args(
+            JointOrderStatistics,
+            (n, n ≥ 1, "`n` must be a positive integer."),
+            (
+                ranks,
+                _are_ranks_valid(ranks, n),
+                "`ranks` must be a sorted vector or tuple of unique integers between 1 and `n`.",
+            ),
+        )
+        return new{typeof(dist),typeof(ranks)}(dist, n, ranks)
+    end
+end
+
+_islesseq(x, y) = isless(x, y) || isequal(x, y)
+
+function _are_ranks_valid(ranks, n)
+    # this is equivalent to but faster than
+    # issorted(ranks) && allunique(ranks)
+    !isempty(ranks) && first(ranks) ≥ 1 && last(ranks) ≤ n && issorted(ranks; lt=_islesseq)
+end
+function _are_ranks_valid(ranks::AbstractRange, n)
+    !isempty(ranks) && first(ranks) ≥ 1 && last(ranks) ≤ n && step(ranks) > 0
+end
+
+length(d::JointOrderStatistics) = length(d.ranks)
+function insupport(d::JointOrderStatistics, x::AbstractVector)
+    length(d) == length(x) || return false
+    xi, state = iterate(x) # at least one element!
+    dist = d.dist
+    insupport(dist, xi) || return false
+    while (xj_state = iterate(x, state)) !== nothing
+        xj, state = xj_state
+        xj ≥ xi && insupport(dist, xj) || return false
+        xi = xj
+    end
+    return true
+end
+minimum(d::JointOrderStatistics) = Fill(minimum(d.dist), length(d))
+maximum(d::JointOrderStatistics) = Fill(maximum(d.dist), length(d))
+
+params(d::JointOrderStatistics) = tuple(params(d.dist)..., d.n, d.ranks)
+partype(d::JointOrderStatistics) = partype(d.dist)
+Base.eltype(::Type{<:JointOrderStatistics{D}}) where {D} = Base.eltype(D)
+Base.eltype(d::JointOrderStatistics) = eltype(d.dist)
+
+function logpdf(d::JointOrderStatistics, x::AbstractVector{<:Real})
+    n = d.n
+    ranks = d.ranks
+    lp = loglikelihood(d.dist, x)
+    T = typeof(lp)
+    lp += loggamma(T(n + 1))
+    if length(ranks) == n
+        issorted(x) && return lp
+        return oftype(lp, -Inf)
+    end
+    i = first(ranks)
+    xᵢ = first(x)
+    if i > 1  # _marginalize_range(d.dist, 0, i, -Inf, xᵢ, T)
+        lp += (i - 1) * logcdf(d.dist, xᵢ) - loggamma(T(i))
+    end
+    for (j, xⱼ) in Iterators.drop(zip(ranks, x), 1)
+        xⱼ < xᵢ && return oftype(lp, -Inf)
+        lp += _marginalize_range(d.dist, i, j, xᵢ, xⱼ, T)
+        i = j
+        xᵢ = xⱼ
+    end
+    if i < n  # _marginalize_range(d.dist, i, n + 1, xᵢ, Inf, T)
+        lp += (n - i) * logccdf(d.dist, xᵢ) - loggamma(T(n - i + 1))
+    end
+    return lp
+end
+
+# given ∏ₖf(xₖ), marginalize all xₖ for i < k < j
+function _marginalize_range(dist, i, j, xᵢ, xⱼ, T)
+    k = j - i - 1
+    k == 0 && return zero(T)
+    return k * T(logdiffcdf(dist, xⱼ, xᵢ)) - loggamma(T(k + 1))
+end
+
+function _rand!(rng::AbstractRNG, d::JointOrderStatistics, x::AbstractVector{<:Real})
+    n = d.n
+    if n == length(d.ranks)  # ranks == 1:n
+        # direct method, slower than inversion method for large `n` and distributions with
+        # fast quantile function or that use inversion sampling
+        rand!(rng, d.dist, x)
+        sort!(x)
+    else
+        # use exponential generation method with inversion, where for gaps in the ranks, we
+        # use the fact that the sum Y of k IID variables xₘ ~ Exp(1) is Y ~ Gamma(k, 1).
+        # Lurie, D., and H. O. Hartley. "Machine-generation of order statistics for Monte
+        # Carlo computations." The American Statistician 26.1 (1972): 26-27.
+        # this is slow if length(d.ranks) is close to n and quantile for d.dist is expensive,
+        # but this branch is probably taken when length(d.ranks) is small or much smaller than n.
+        T = typeof(one(eltype(x)))
+        s = zero(eltype(x))
+        i = 0
+        for (m, j) in zip(eachindex(x), d.ranks)
+            k = j - i
+            if k > 1
+                # specify GammaMTSampler directly to avoid unnecessarily checking the shape
+                # parameter again and because it has been benchmarked to be the fastest for
+                # shape k ≥ 1 and scale 1
+                s += T(rand(rng, GammaMTSampler(Gamma{T}(T(k), T(1)))))
+            else
+                s += randexp(rng, T)
+            end
+            i = j
+            x[m] = s
+        end
+        j = n + 1
+        k = j - i
+        if k > 1
+            s += T(rand(rng, GammaMTSampler(Gamma{T}(T(k), T(1)))))
+        else
+            s += randexp(rng, T)
+        end
+        x .= quantile.(d.dist, x ./ s)
+    end
+    return x
+end
diff --git a/src/multivariates.jl b/src/multivariates.jl
@@ -112,6 +112,7 @@ end
 for fname in ["dirichlet.jl",
               "multinomial.jl",
               "dirichletmultinomial.jl",
+              "jointorderstatistics.jl",
               "mvnormal.jl",
               "mvnormalcanon.jl",
               "mvlognormal.jl",

diff --git a/src/univariate/orderstatistic.jl b/src/univariate/orderstatistic.jl
@@ -0,0 +1,108 @@
+# Implementation based on chapters 2-4 of
+# Arnold, Barry C., Narayanaswamy Balakrishnan, and Haikady Navada Nagaraja.
+# A first course in order statistics. Society for Industrial and Applied Mathematics, 2008.
+
+"""
+    OrderStatistic{D<:UnivariateDistribution,S<:ValueSupport} <: UnivariateDistribution{S}
+
+The distribution of an order statistic from IID samples from a univariate distribution.
+
+    OrderStatistic(dist::UnivariateDistribution, n::Int, rank::Int; check_args::Bool=true)
+
+Construct the distribution of the `rank` ``=i``th order statistic from `n` independent
+samples from `dist`.
+
+The ``i``th order statistic of a sample is the ``i``th element of the sorted sample.
+For example, the 1st order statistic is the sample minimum, while the ``n``th order
+statistic is the sample maximum.
+
+If ``f`` is the probability density (mass) function of `dist` with distribution function
+``F``, then the probability density function ``g`` of the order statistic for continuous
+`dist` is
+```math
+g(x; n, i) = {n \\choose i} [F(x)]^{i-1} [1 - F(x)]^{n-i} f(x),
+```
+and the probability mass function ``g`` of the order statistic for discrete `dist` is
+```math
+g(x; n, i) = \\sum_{k=i}^n {n \\choose k} \\left( [F(x)]^k [1 - F(x)]^{n-k} - [F(x_-)]^k [1 - F(x_-)]^{n-k} \\right),
+```
+where ``x_-`` is the largest element in the support of `dist` less than ``x``.
+
+For the joint distribution of a subset of order statistics, use
+[`JointOrderStatistics`](@ref) instead.
+
+## Examples
+
+```julia
+OrderStatistic(Cauchy(), 10, 1)              # distribution of the sample minimum
+OrderStatistic(DiscreteUniform(10), 10, 10)  # distribution of the sample maximum
+OrderStatistic(Gamma(1, 1), 11, 5)           # distribution of the sample median
+```
+"""
+struct OrderStatistic{D<:UnivariateDistribution,S<:ValueSupport} <:
+       UnivariateDistribution{S}
+    dist::D
+    n::Int
+    rank::Int
+    function OrderStatistic(
+        dist::UnivariateDistribution, n::Int, rank::Int; check_args::Bool=true
+    )
+        @check_args(OrderStatistic, 1 ≤ rank ≤ n)
+        return new{typeof(dist),value_support(typeof(dist))}(dist, n, rank)
+    end
+end
+
+minimum(d::OrderStatistic) = minimum(d.dist)
+maximum(d::OrderStatistic) = maximum(d.dist)
+insupport(d::OrderStatistic, x::Real) = insupport(d.dist, x)
+
+params(d::OrderStatistic) = tuple(params(d.dist)..., d.n, d.rank)
+partype(d::OrderStatistic) = partype(d.dist)
+Base.eltype(::Type{<:OrderStatistic{D}}) where {D} = Base.eltype(D)
+Base.eltype(d::OrderStatistic) = eltype(d.dist)
+
+# distribution of the ith order statistic from an IID uniform distribution, with CDF Uᵢₙ(x)
+function _uniform_orderstatistic(d::OrderStatistic)
+    n = d.n
+    rank = d.rank
+    return Beta{Int}(rank, n - rank + 1)
+end
+
+function logpdf(d::OrderStatistic, x::Real)
+    b = _uniform_orderstatistic(d)
+    p = cdf(d.dist, x)
+    if value_support(typeof(d)) === Continuous
+        return logpdf(b, p) + logpdf(d.dist, x)
+    else
+        return logdiffcdf(b, p, p - pdf(d.dist, x))
+    end
+end
+
+for f in (:logcdf, :logccdf, :cdf, :ccdf)
+    @eval begin
+        function $f(d::OrderStatistic, x::Real)
+            b = _uniform_orderstatistic(d)
+            return $f(b, cdf(d.dist, x))
+        end
+    end
+end
+
+for f in (:quantile, :cquantile)
+    @eval begin
+        function $f(d::OrderStatistic, p::Real)
+            # since cdf is Fᵢₙ(x) = Uᵢₙ(Fₓ(x)), and Uᵢₙ is invertible and increasing, we
+            # have Fₓ(x) = Uᵢₙ⁻¹(Fᵢₙ(x)). then quantile function is
+            # Qᵢₙ(p) = inf{x: p ≤ Fᵢₙ(x)} = inf{x: Uᵢₙ⁻¹(p) ≤ Fₓ(x)} = Qₓ(Uᵢₙ⁻¹(p))
+            b = _uniform_orderstatistic(d)
+            return quantile(d.dist, $f(b, p))
+        end
+    end
+end
+
+function rand(rng::AbstractRNG, d::OrderStatistic)
+    # inverse transform sampling. Since quantile function is Qₓ(Uᵢₙ⁻¹(p)), we draw a random
+    # variable from Uᵢₙ and pass it through the quantile function of `d.dist`
+    T = eltype(d.dist)
+    b = _uniform_orderstatistic(d)
+    return T(quantile(d.dist, rand(rng, b)))
+end
diff --git a/src/univariates.jl b/src/univariates.jl
@@ -731,3 +731,5 @@ end
 for dname in continuous_distributions
     include(joinpath("univariate", "continuous", "$(dname).jl"))
 end
+
+include(joinpath("univariate", "orderstatistic.jl"))