Skip to content

Commit

Permalink
Merge branch 'main' into compathelper/new_version/2023-05-03-01-15-23…
Browse files Browse the repository at this point in the history
…-035-01843517865
  • Loading branch information
smishr authored Aug 3, 2023
2 parents 1d534cc + 735613e commit 6383e51
Show file tree
Hide file tree
Showing 12 changed files with 151 additions and 97 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Survey"
uuid = "c1a98b4d-6cd2-47ec-b9e9-69b59c35373c"
authors = ["Ayush Patnaik <ayushpatnaik@gmail.com>"]
version = "0.2.0"
version = "0.3.0"

[deps]
AlgebraOfGraphics = "cbdf2221-f076-402e-a563-3d30da359d67"
Expand Down
4 changes: 3 additions & 1 deletion docs/src/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ Private = false
AbstractSurveyDesign
SurveyDesign
ReplicateDesign
BootstrapReplicates
JackknifeReplicates
load_data
bootweights
jackknifeweights
jackknife_variance
variance
mean
total
quantile
Expand Down
5 changes: 3 additions & 2 deletions src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ using Missings

include("SurveyDesign.jl")
include("bootstrap.jl")
include("jackknife.jl")
include("mean.jl")
include("quantile.jl")
include("total.jl")
Expand All @@ -25,17 +26,17 @@ include("boxplot.jl")
include("show.jl")
include("ratio.jl")
include("by.jl")
include("jackknife.jl")

export load_data
export AbstractSurveyDesign, SurveyDesign, ReplicateDesign
export BootstrapReplicates, JackknifeReplicates
export dim, colnames, dimnames
export mean, total, quantile
export plot
export hist, sturges, freedman_diaconis
export boxplot
export bootweights
export ratio
export jackknifeweights, jackknife_variance
export jackknifeweights, variance

end
93 changes: 60 additions & 33 deletions src/SurveyDesign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -123,46 +123,71 @@ struct SurveyDesign <: AbstractSurveyDesign
end
end

"""
InferenceMethod
Abstract type for inference methods.
"""
abstract type InferenceMethod end

"""
BootstrapReplicates <: InferenceMethod
Type for the bootstrap replicates method. For more details, see [`bootweights`](@ref).
"""
struct BootstrapReplicates <: InferenceMethod
replicates::UInt
end

"""
JackknifeReplicates <: InferenceMethod
Type for the Jackknife replicates method. For more details, see [`jackknifeweights`](@ref).
"""
struct JackknifeReplicates <: InferenceMethod
replicates::UInt
end

"""
ReplicateDesign <: AbstractSurveyDesign
Survey design obtained by replicating an original design using [`bootweights`](@ref). If
replicate weights are available, then they can be used to directly create a `ReplicateDesign`.
Survey design obtained by replicating an original design using an inference method like [`bootweights`](@ref) or [`jackknifeweights`](@ref). If
replicate weights are available, then they can be used to directly create a `ReplicateDesign` object.
# Constructors
```julia
ReplicateDesign(
ReplicateDesign{ReplicateType}(
data::AbstractDataFrame,
replicate_weights::Vector{Symbol};
clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
strata::Union{Nothing,Symbol} = nothing,
popsize::Union{Nothing,Symbol} = nothing,
weights::Union{Nothing,Symbol} = nothing
)
) where {ReplicateType <: InferenceMethod}
ReplicateDesign(
ReplicateDesign{ReplicateType}(
data::AbstractDataFrame,
replicate_weights::UnitRange{Int};
clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
strata::Union{Nothing,Symbol} = nothing,
popsize::Union{Nothing,Symbol} = nothing,
weights::Union{Nothing,Symbol} = nothing
)
) where {ReplicateType <: InferenceMethod}
ReplicateDesign(
ReplicateDesign{ReplicateType}(
data::AbstractDataFrame,
replicate_weights::Regex;
clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
strata::Union{Nothing,Symbol} = nothing,
popsize::Union{Nothing,Symbol} = nothing,
weights::Union{Nothing,Symbol} = nothing
)
) where {ReplicateType <: InferenceMethod}
```
# Arguments
The constructor has the same arguments as [`SurveyDesign`](@ref). The only additional argument is `replicate_weights`, which can
`ReplicateType` must be one of the supported inference types; currently the package supports [`BootstrapReplicates`](@ref) and [`JackknifeReplicates`](@ref). The constructor has the same arguments as [`SurveyDesign`](@ref). The only additional argument is `replicate_weights`, which can
be of one of the following types.
- `Vector{Symbol}`: In this case, each `Symbol` in the vector should represent a column of `data` containing the replicate weights.
Expand All @@ -173,15 +198,15 @@ All the columns containing the replicate weights will be renamed to the form `re
# Examples
Here is an example where the [`bootweights`](@ref) function is used to create a `ReplicateDesign`.
Here is an example where the [`bootweights`](@ref) function is used to create a `ReplicateDesign{BootstrapReplicates}`.
```jldoctest replicate-design; setup = :(using Survey, CSV, DataFrames)
julia> apistrat = load_data("apistrat");
julia> dstrat = SurveyDesign(apistrat; strata=:stype, weights=:pw);
julia> bootstrat = bootweights(dstrat; replicates=1000) # creating a ReplicateDesign using bootweights
ReplicateDesign:
ReplicateDesign{BootstrapReplicates}:
data: 200×1044 DataFrame
strata: stype
[E, E, E … H]
Expand Down Expand Up @@ -210,8 +235,8 @@ julia> CSV.write("apistrat_withreplicates.csv", bootstrat.data);
We can now pass the replicate weights directly to the `ReplicateDesign` constructor, either as a `Vector{Symbol}`, a `UnitRange` or a `Regex`.
```jldoctest replicate-design
julia> bootstrat_direct = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), [Symbol("r_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw)
ReplicateDesign:
julia> bootstrat_direct = ReplicateDesign{BootstrapReplicates}(CSV.read("apistrat_withreplicates.csv", DataFrame), [Symbol("r_"*string(replicate)) for replicate in 1:1000]; strata=:stype, weights=:pw)
ReplicateDesign{BootstrapReplicates}:
data: 200×1044 DataFrame
strata: stype
[E, E, E … H]
Expand All @@ -223,8 +248,8 @@ allprobs: [0.0226, 0.0226, 0.0226 … 0.0662]
type: bootstrap
replicates: 1000
julia> bootstrat_unitrange = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), UnitRange(45:1044);strata=:stype, weights=:pw)
ReplicateDesign:
julia> bootstrat_unitrange = ReplicateDesign{BootstrapReplicates}(CSV.read("apistrat_withreplicates.csv", DataFrame), UnitRange(45:1044);strata=:stype, weights=:pw)
ReplicateDesign{BootstrapReplicates}:
data: 200×1044 DataFrame
strata: stype
[E, E, E … H]
Expand All @@ -236,8 +261,8 @@ allprobs: [0.0226, 0.0226, 0.0226 … 0.0662]
type: bootstrap
replicates: 1000
julia> bootstrat_regex = ReplicateDesign(CSV.read("apistrat_withreplicates.csv", DataFrame), r"r_\\d";strata=:stype, weights=:pw)
ReplicateDesign:
julia> bootstrat_regex = ReplicateDesign{BootstrapReplicates}(CSV.read("apistrat_withreplicates.csv", DataFrame), r"r_\\d";strata=:stype, weights=:pw)
ReplicateDesign{BootstrapReplicates}:
data: 200×1044 DataFrame
strata: stype
[E, E, E … H]
Expand All @@ -252,7 +277,7 @@ replicates: 1000
```
"""
struct ReplicateDesign <: AbstractSurveyDesign
struct ReplicateDesign{ReplicateType} <: AbstractSurveyDesign
data::AbstractDataFrame
cluster::Symbol
popsize::Symbol
Expand All @@ -264,9 +289,10 @@ struct ReplicateDesign <: AbstractSurveyDesign
type::String
replicates::UInt
replicate_weights::Vector{Symbol}
inference_method::ReplicateType

# default constructor
function ReplicateDesign(
function ReplicateDesign{ReplicateType}(
data::DataFrame,
cluster::Symbol,
popsize::Symbol,
Expand All @@ -277,21 +303,21 @@ struct ReplicateDesign <: AbstractSurveyDesign
pps::Bool,
type::String,
replicates::UInt,
replicate_weights::Vector{Symbol}
)
new(data, cluster, popsize, sampsize, strata, weights, allprobs,
pps, type, replicates, replicate_weights)
replicate_weights::Vector{Symbol},
) where {ReplicateType <: InferenceMethod}
new{ReplicateType}(data, cluster, popsize, sampsize, strata, weights, allprobs,
pps, type, replicates, replicate_weights, ReplicateType(replicates))
end

# constructor with given replicate_weights
function ReplicateDesign(
function ReplicateDesign{ReplicateType}(
data::AbstractDataFrame,
replicate_weights::Vector{Symbol};
clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
strata::Union{Nothing,Symbol} = nothing,
popsize::Union{Nothing,Symbol} = nothing,
weights::Union{Nothing,Symbol} = nothing
)
) where {ReplicateType <: InferenceMethod}
# rename the replicate weights if needed
rename!(data, [replicate_weights[index] => "replicate_"*string(index) for index in 1:length(replicate_weights)])

Expand All @@ -303,7 +329,7 @@ struct ReplicateDesign <: AbstractSurveyDesign
popsize=popsize,
weights=weights
)
new(
new{ReplicateType}(
base_design.data,
base_design.cluster,
base_design.popsize,
Expand All @@ -314,20 +340,21 @@ struct ReplicateDesign <: AbstractSurveyDesign
base_design.pps,
"bootstrap",
length(replicate_weights),
replicate_weights
replicate_weights,
ReplicateType(length(replicate_weights))
)
end

# replicate weights given as a range of columns
ReplicateDesign(
ReplicateDesign{ReplicateType}(
data::AbstractDataFrame,
replicate_weights::UnitRange{Int};
clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
strata::Union{Nothing,Symbol} = nothing,
popsize::Union{Nothing,Symbol} = nothing,
weights::Union{Nothing,Symbol} = nothing
) =
ReplicateDesign(
) where {ReplicateType <: InferenceMethod} =
ReplicateDesign{ReplicateType}(
data,
Symbol.(names(data)[replicate_weights]);
clusters=clusters,
Expand All @@ -337,15 +364,15 @@ struct ReplicateDesign <: AbstractSurveyDesign
)

# replicate weights given as regular expression
ReplicateDesign(
ReplicateDesign{ReplicateType}(
data::AbstractDataFrame,
replicate_weights::Regex;
clusters::Union{Nothing,Symbol,Vector{Symbol}} = nothing,
strata::Union{Nothing,Symbol} = nothing,
popsize::Union{Nothing,Symbol} = nothing,
weights::Union{Nothing,Symbol} = nothing
) =
ReplicateDesign(
) where {ReplicateType <: InferenceMethod} =
ReplicateDesign{ReplicateType}(
data,
Symbol.(names(data)[findall(name -> occursin(replicate_weights, name), names(data))]);
clusters=clusters,
Expand Down
51 changes: 47 additions & 4 deletions src/bootstrap.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Use bootweights to create replicate weights using Rao-Wu bootstrap. The function accepts a `SurveyDesign` and returns a `ReplicateDesign` which has additional columns for replicate weights.
Use bootweights to create replicate weights using Rao-Wu bootstrap. The function accepts a `SurveyDesign` and returns a `ReplicateDesign{BootstrapReplicates}` which has additional columns for replicate weights.
```jldoctest
julia> using Random
Expand All @@ -9,7 +9,7 @@ julia> apiclus1 = load_data("apiclus1");
julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, popsize=:fpc);
julia> bootweights(dclus1; replicates=1000, rng=MersenneTwister(111)) # choose a seed for deterministic results
ReplicateDesign:
ReplicateDesign{BootstrapReplicates}:
data: 183×1044 DataFrame
strata: none
cluster: dnum
Expand All @@ -20,6 +20,7 @@ weights: [50.4667, 50.4667, 50.4667 … 50.4667]
allprobs: [0.0198, 0.0198, 0.0198 … 0.0198]
type: bootstrap
replicates: 1000
```
"""
function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwister(1234))
Expand All @@ -37,7 +38,7 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis
substrata_dfs[h] = cluster_sorted
end
df = reduce(vcat, substrata_dfs)
return ReplicateDesign(
return ReplicateDesign{BootstrapReplicates}(
df,
design.cluster,
design.popsize,
Expand All @@ -48,10 +49,52 @@ function bootweights(design::SurveyDesign; replicates = 4000, rng = MersenneTwis
design.pps,
"bootstrap",
UInt(replicates),
[Symbol("replicate_"*string(replicate)) for replicate in 1:replicates]
[Symbol("replicate_"*string(replicate)) for replicate in 1:replicates],
)
end

"""
variance(x::Symbol, func::Function, design::ReplicateDesign{BootstrapReplicates})
Use replicate weights to compute the standard error of the estimated mean using the bootstrap method. The variance is calculated using the formula
```math
\\hat{V}(\\hat{\\theta}) = \\dfrac{1}{R}\\sum_{i = 1}^R(\\theta_i - \\hat{\\theta})^2
```
where above ``R`` is the number of replicate weights, ``\\theta_i`` is the estimator computed using the ``i``th set of replicate weights, and ``\\hat{\\theta}`` is the estimator computed using the original weights.
```jldoctest
julia> using Survey, StatsBase;
julia> apiclus1 = load_data("apiclus1");
julia> dclus1 = SurveyDesign(apiclus1; clusters = :dnum, weights = :pw);
julia> bclus1 = dclus1 |> bootweights;
julia> weightedmean(x, y) = mean(x, weights(y));
julia> variance(:api00, weightedmean, bclus1)
1×2 DataFrame
Row │ estimator SE
│ Float64 Float64
─────┼────────────────────
1 │ 644.169 23.4107
```
"""
function variance(x::Symbol, func::Function, design::ReplicateDesign{BootstrapReplicates})
θ̂ = func(design.data[!, x], design.data[!, design.weights])
θ̂t = [
func(design.data[!, x], design.data[!, "replicate_"*string(i)]) for
i = 1:design.replicates
]
variance = sum((θ̂t .- θ̂) .^ 2) / design.replicates
return DataFrame(estimator = θ̂, SE = sqrt(variance))
end

function _bootweights_cluster_sorted!(cluster_sorted,
cluster_weights, cluster_sorted_designcluster, replicates, rng)

Expand Down
Loading

0 comments on commit 6383e51

Please sign in to comment.