From 49cccb11ddcf1418a17da3a4278cee1096ec0dfa Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Thu, 22 Jan 2015 13:43:33 -0500 Subject: [PATCH 01/74] [WIP] contrast coding flexibility --- src/DataFrames.jl | 6 ++ src/statsmodels/contrasts.jl | 184 +++++++++++++++++++++++++++++++++++ src/statsmodels/formula.jl | 40 +++++--- test/contrasts.jl | 33 +++++++ 4 files changed, 247 insertions(+), 16 deletions(-) create mode 100644 src/statsmodels/contrasts.jl create mode 100644 test/contrasts.jl diff --git a/src/DataFrames.jl b/src/DataFrames.jl index ee00817651..a58d011ce5 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -35,6 +35,7 @@ export @~, @wsv_str, AbstractDataFrame, + AbstractContrast, DataFrame, DataFrameRow, Formula, @@ -43,6 +44,9 @@ export @~, ModelFrame, ModelMatrix, SubDataFrame, + SumContrast, + TreatmentContrast, + HelmertContrast, aggregate, by, @@ -51,6 +55,7 @@ export @~, combine, complete_cases, complete_cases!, + contrast_matrix, deleterows!, describe, eachcol, @@ -109,6 +114,7 @@ for (dir, filename) in [ ("abstractdataframe", "sort.jl"), ("dataframe", "sort.jl"), + ("statsmodels", "contrasts.jl"), ("statsmodels", "formula.jl"), ("statsmodels", "statsmodel.jl"), diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl new file mode 100644 index 0000000000..9a003e39c9 --- /dev/null +++ b/src/statsmodels/contrasts.jl @@ -0,0 +1,184 @@ +## Specifying contrast coding for categorical data. General strategy is to +## define types for each kind of contrast coding (treatment, sum, normalized sum, +## Helmert, etc.). Constructing object with data will wrap the data and +## calculate the contrast matrix. Objects will provide methods for generating +## contrast coded matrix for ModelMatrix construction, and term names. +## + + +## Okay what's the goal here? Just to be able to call the `cols` and +## `termnames` functions on the column in the DataFrame and have it work +## automagically. +## +## The operations that need to happen are +## * contrast matrix given data (or number of contrasts) +## * make contrast-coded design matrix columns +## * make column names +## +## What about from the user's point of view? How do you say that you want to +## use a particular kind of contrast? Replace or assign the contrast wrapped +## data to the parent data frame. +## +## OKay, but having a container type is going to be a huge pain in the ass: +## you have to duplicate all the PooledDataArray functionality so that it can +## be used interchangeably. I mean, maybe not, if it's enough to just write +## down all the methods that need to be delegated and shove them all through +## a macro (although I suspect it's not quite that simple). +## +## So instead the right place to specify this is in the ModelFrame. Perhaps +## a Dict{Symbol,T<:AbstractContrast}, initialized to empty, and checked on +## construction of ModelMatrix. Only change would be to change the cols() calls +## in the ModelMatrix constructor to take the contrasts into account, and the +## call to coefnames +## +## Other issue: what about specifying a contrast matrix manually? Maybe have +## a type called ManualContrast that has fields for the matrix and the name? +## And if a matrix is passed to the ModelFrame then it'll first wrap it in the +## contrast type, then wrap the data, then call cols. This seems needlessly +## complicated to say the least. Better migth just be to add a version of the +## `cols` function that takes a Contrast or a matrix and does the right thing, +## and can still benefit from the multiple dispatch. +## +## +## +## So I'll have two kinds of types: +## ContrastedDataArray - container for data, parametrized by kind of contrast +## ContrastTreatment <: AbstractContrast - For dispatching methods for making +## specific contrast matrix, contrast columns, and names. +## +## AbstractContrast provides interface +## +## contrast_matrix(::AbstractContrast, v::PooledDataVector, ...) - construct +## the contrast matrix for this contrast type and data. Might optionally +## provide other arguments (like the base level, whether to make it sparse, +## etc.). The columns of the contrast matrix correspond to the columsn that +## will be put into the design matrix, and the rows correspond to the levels of +## the input data. + + +abstract AbstractContrast + +## contrast_matrix{T<:AbstractContrast}(C::Type{T}, v::PooledDataArray, args...; kwargs...) = +## contrast_matrix(C(), length(levels(v)), args...; kwargs...) + +## Allow specification of contrast Types (e.g. `SumContrast`) in addition to +## instantiated contrast objects (e.g. `SumContrast(base=1)`). +contrast_matrix{T<:AbstractContrast}(C::Type{T}, args...; kwargs...) = + contrast_matrix(C(), args...; kwargs...) + +## Generic generation of columns for design matrix based on a contrast matrix +function cols(v::PooledDataVector, contr_mat::Matrix) + ## validate provided matrix + n = length(levels(v)) + ## number of columns has to be < n-1 + dims = size(contr_mat) + if dims[2] >= n + error("Too many columns in contrast matrix: $(dims[2]) (only $n levels)") + elseif dims[1] < n + error("Not enough rows in contrast matrix: $(dims[1]) ($n levels)") + end + + return contr_mat[v.refs, :] +end + +## Default contrast is treatment: +cols(v::PooledDataVector) = cols(v, TreatmentContrast) +## Make contrast columns from contrast object: +cols(v::PooledDataVector, C::AbstractContrast) = cols(v, contrast_matrix(C,v)) +## Make contrast columsn from contrast _type_: +cols{T<:AbstractContrast}(v::PooledDataVector, C::Type{T}) = cols(v, C()) + + +## Default names for contrasts are just the level of the PDV used to construct. +termnames(term::Symbol, +function contrast_names{T<:AbstractContrast}(C::Type{T}, v::PooledDataVector) + levs = levels(v) + return levs[2:end] +end + + + +################################################################################ +## Treatment (dummy-coded) contrast +################################################################################ + +type TreatmentContrast <: AbstractContrast + sparse::Bool + base::Integer +end + +TreatmentContrast(; sparse::Bool=false, base::Integer=1) = TreatmentContrast(sparse, base) + +function contrast_matrix(C::TreatmentContrast, v::PooledDataVector) + n = length(levels(v)) # number of levels for contrast + if n < 2 error("not enought degrees of freedom to define contrasts") end + + contr = C.sparse ? speye(n) : eye(n) + + ## Drop the base level column from the contrast matrix + if !(1 <= C.base <= n) error("base = $(C.base) is not allowed for n = $n") end + contr[:,vcat(1:(C.base-1),(C.base+1):end)] +end + +################################################################################ +## Sum-coded contrast +## +## -1 for base level and +1 for contrast level. +################################################################################ + +type SumContrast <: AbstractContrast + base::Integer +end + +SumContrast(; base::Integer=1) = SumContrast(base) + +function contrast_matrix(C::SumContrast, v::PooledDataVector) + n = length(levels(v)) + if n < 2 error("not enought degrees of freedom to define contrasts") end + if !(1 <= C.base <= n) error("base = $(C.base) is not allowed for n = $n") end + contr = eye(n)[:, [1:(C.base-1), (C.base+1):end]] + contr[C.base, :] = -1 + return contr +end + +################################################################################ +## Helmert-coded contrast +## +## -1 for each of n levels below contrast, n for contrast level, and 0 above. +## Produces something like: +## +## [-1 -1 -1 +## 1 -1 -1 +## 0 2 -1 +## 0 0 3] +## +## Interpretation is that the nth contrast column is the difference between +## level n+1 and the average of levels 1:n +################################################################################ + +type HelmertContrast <: AbstractContrast + base::Integer +end + +HelmertContrast(; base::Integer=1) = HelmertContrast(base) + +function contrast_matrix(C::HelmertContrast, v::PooledDataVector) + n = length(levels(v)) + if n < 2 error("not enought degrees of freedom to define contrasts") end + if !(1 <= C.base <= n) error("base = $(C.base) is not allowed for n = $n") end + + contr = zeros(Integer, n, n-1) + for i in 1:n-1 + contr[1:i, i] = -1 + contr[i+1, i] = i + end + + return contr[[C.base, 1:(C.base-1), (C.base+1):end], :] +end + + +################################################################################ + +function contrast_matrix(::AbstractContrast, args...) + error("Contrast not implemented") +end diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index bf0ec22e87..6bcaeff6f4 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -37,6 +37,9 @@ type ModelFrame df::AbstractDataFrame terms::Terms msng::BitArray + ## mapping from df keys to contrasts. Rather than specify types allowed, + ## leave that to cols() to check. Allows more seamless later extension + contrasts::Dict{Any, Any} end type ModelMatrix{T <: @compat(Union{Float32, Float64})} @@ -236,32 +239,22 @@ function dropUnusedLevels!(da::PooledDataArray) end dropUnusedLevels!(x) = x -function ModelFrame(trms::Terms, d::AbstractDataFrame) +function ModelFrame(trms::Terms, d::AbstractDataFrame; + contrasts::Dict{Symbol, Type} = Dict{Symbol, Type}()) df, msng = na_omit(DataFrame(map(x -> d[x], trms.eterms))) names!(df, convert(Vector{Symbol}, map(string, trms.eterms))) for c in eachcol(df) dropUnusedLevels!(c[2]) end - ModelFrame(df, trms, msng) + ModelFrame(df, trms, msng, contrasts) end -ModelFrame(f::Formula, d::AbstractDataFrame) = ModelFrame(Terms(f), d) -ModelFrame(ex::Expr, d::AbstractDataFrame) = ModelFrame(Formula(ex), d) +ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...) +ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...) function StatsBase.model_response(mf::ModelFrame) mf.terms.response || error("Model formula one-sided") convert(Array, mf.df[round(Bool, mf.terms.factors[:, 1])][:, 1]) end -function contr_treatment(n::Integer, contrasts::Bool, sparse::Bool, base::Integer) - if n < 2 error("not enought degrees of freedom to define contrasts") end - contr = sparse ? speye(n) : eye(n) .== 1. - if !contrasts return contr end - if !(1 <= base <= n) error("base = $base is not allowed for n = $n") end - contr[:,vcat(1:(base-1),(base+1):end)] -end -contr_treatment(n::Integer,contrasts::Bool,sparse::Bool) = contr_treatment(n,contrasts,sparse,1) -contr_treatment(n::Integer,contrasts::Bool) = contr_treatment(n,contrasts,false,1) -contr_treatment(n::Integer) = contr_treatment(n,true,false,1) -cols(v::PooledDataVector) = contr_treatment(length(v.pool))[v.refs,:] cols(v::DataVector) = convert(Vector{Float64}, v.data) cols(v::Vector) = convert(Vector{Float64}, v) @@ -303,7 +296,22 @@ function ModelMatrix(mf::ModelFrame) ## if the factor doesn't occur in the fetrms rows = vec(sum(ff, 2) .!= 0) ff = ff[rows, :] - cc = [cols(col) for col in columns(mf.df[:, rows])] + ## cc = [cols(col) for col in columns(mf.df[:, rows])] + + ## construct model matrix columns from each of the factors, checking for + ## contrasts that have been manually specified. Categorical data + ## (PooledDataArray) will expand to a matrix with multiple columns, one + ## or each column of the contrast matrix, either specified in the ModelFrame + ## or the default TreatmentContrast. + cc = Any[] + for (name, x) in eachcol(mf.df[:,rows]) + if name in keys(mf.contrasts) + push!(cc, cols(x, mf.contrasts[name])) + else + push!(cc, cols(x)) + end + end + for j in 1:size(ff,2) trm = cc[round(Bool, ff[:, j])] push!(aa, trm) diff --git a/test/contrasts.jl b/test/contrasts.jl new file mode 100644 index 0000000000..cd7ad3b445 --- /dev/null +++ b/test/contrasts.jl @@ -0,0 +1,33 @@ +## module TestContrasts + +using Base.Test +using DataFrames + +x = @pdata [1, 2, 3, 1] + +@test contrast_matrix(TreatmentContrast, x) == [false false + true false + false true] + +@test contrast_matrix(TreatmentContrast(base=2), x) == [true false + false false + false true] + +@test contrast_matrix(SumContrast, x) == [-1 -1 + 1 0 + 0 1] + +@test contrast_matrix(SumContrast(base=2), x) == [1 0 + -1 -1 + 0 1] + +@test contrast_matrix(HelmertContrast, x) == [-1 -1 + 1 -1 + 0 2] + +@test contrast_matrix(HelmertContrast(base=2), x) == [1 -1 + -1 -1 + 0 2] + + +## end From ef415f7c4dbeae689475b732f5348c7eadd5e243 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 19 Sep 2015 21:41:43 -0400 Subject: [PATCH 02/74] actually store matrix in contrast type --- src/DataFrames.jl | 1 - src/statsmodels/contrasts.jl | 185 ++++++++++++----------------------- 2 files changed, 63 insertions(+), 123 deletions(-) diff --git a/src/DataFrames.jl b/src/DataFrames.jl index a58d011ce5..977209b770 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -55,7 +55,6 @@ export @~, combine, complete_cases, complete_cases!, - contrast_matrix, deleterows!, describe, eachcol, diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 9a003e39c9..c45aec4edf 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -1,146 +1,83 @@ -## Specifying contrast coding for categorical data. General strategy is to -## define types for each kind of contrast coding (treatment, sum, normalized sum, -## Helmert, etc.). Constructing object with data will wrap the data and -## calculate the contrast matrix. Objects will provide methods for generating -## contrast coded matrix for ModelMatrix construction, and term names. +## Specify contrasts for coding categorical data in model matrix. Contrast types +## are constructed with data and other options, creating a subtype of +## AbstractContrast. AbstractContrast provides the interface for creating modle +## matrix columns and coefficient names ## - - -## Okay what's the goal here? Just to be able to call the `cols` and -## `termnames` functions on the column in the DataFrame and have it work -## automagically. -## -## The operations that need to happen are -## * contrast matrix given data (or number of contrasts) -## * make contrast-coded design matrix columns -## * make column names -## -## What about from the user's point of view? How do you say that you want to -## use a particular kind of contrast? Replace or assign the contrast wrapped -## data to the parent data frame. -## -## OKay, but having a container type is going to be a huge pain in the ass: -## you have to duplicate all the PooledDataArray functionality so that it can -## be used interchangeably. I mean, maybe not, if it's enough to just write -## down all the methods that need to be delegated and shove them all through -## a macro (although I suspect it's not quite that simple). -## -## So instead the right place to specify this is in the ModelFrame. Perhaps -## a Dict{Symbol,T<:AbstractContrast}, initialized to empty, and checked on -## construction of ModelMatrix. Only change would be to change the cols() calls -## in the ModelMatrix constructor to take the contrasts into account, and the -## call to coefnames -## -## Other issue: what about specifying a contrast matrix manually? Maybe have -## a type called ManualContrast that has fields for the matrix and the name? -## And if a matrix is passed to the ModelFrame then it'll first wrap it in the -## contrast type, then wrap the data, then call cols. This seems needlessly -## complicated to say the least. Better migth just be to add a version of the -## `cols` function that takes a Contrast or a matrix and does the right thing, -## and can still benefit from the multiple dispatch. -## -## -## -## So I'll have two kinds of types: -## ContrastedDataArray - container for data, parametrized by kind of contrast -## ContrastTreatment <: AbstractContrast - For dispatching methods for making -## specific contrast matrix, contrast columns, and names. -## -## AbstractContrast provides interface -## -## contrast_matrix(::AbstractContrast, v::PooledDataVector, ...) - construct -## the contrast matrix for this contrast type and data. Might optionally -## provide other arguments (like the base level, whether to make it sparse, -## etc.). The columns of the contrast matrix correspond to the columsn that -## will be put into the design matrix, and the rows correspond to the levels of -## the input data. - +## ModelFrame will hold a Dict{Symbol, T<:AbstractContrast} that maps column +## names to contrasts. ModelMatrix will check this dict when evaluating terms, +## falling back to a default for any categorical data without a specified +## contrast. abstract AbstractContrast -## contrast_matrix{T<:AbstractContrast}(C::Type{T}, v::PooledDataArray, args...; kwargs...) = -## contrast_matrix(C(), length(levels(v)), args...; kwargs...) - -## Allow specification of contrast Types (e.g. `SumContrast`) in addition to -## instantiated contrast objects (e.g. `SumContrast(base=1)`). -contrast_matrix{T<:AbstractContrast}(C::Type{T}, args...; kwargs...) = - contrast_matrix(C(), args...; kwargs...) - -## Generic generation of columns for design matrix based on a contrast matrix -function cols(v::PooledDataVector, contr_mat::Matrix) - ## validate provided matrix - n = length(levels(v)) - ## number of columns has to be < n-1 - dims = size(contr_mat) - if dims[2] >= n - error("Too many columns in contrast matrix: $(dims[2]) (only $n levels)") - elseif dims[1] < n - error("Not enough rows in contrast matrix: $(dims[1]) ($n levels)") - end - - return contr_mat[v.refs, :] -end - -## Default contrast is treatment: -cols(v::PooledDataVector) = cols(v, TreatmentContrast) -## Make contrast columns from contrast object: -cols(v::PooledDataVector, C::AbstractContrast) = cols(v, contrast_matrix(C,v)) -## Make contrast columsn from contrast _type_: -cols{T<:AbstractContrast}(v::PooledDataVector, C::Type{T}) = cols(v, C()) +termnames(term::Symbol, col::Any, contrast::AbstractContrast) = contrast.termnames +function cols(v::PooledDataVector, contrast::AbstractContrast) + ## make sure the levels of the contrast matrix and the categorical data + ## are the same by constructing a re-indexing vector. Indexing into + ## reindex with v.refs will give the corresponding row number of the + ## contrast matrix + reindex = [findfirst(l .== contrast.levels) for l in levels(v)] -## Default names for contrasts are just the level of the PDV used to construct. -termnames(term::Symbol, -function contrast_names{T<:AbstractContrast}(C::Type{T}, v::PooledDataVector) - levs = levels(v) - return levs[2:end] + return contrast.matrix[reindex[v.refs], :] end - +Base.call{T<: AbstractContrast}(Type{T}, v::DataVector, args...; kwargs...) = + Base.call(Type{T}, pool(v), args...; kwargs...) ################################################################################ ## Treatment (dummy-coded) contrast ################################################################################ type TreatmentContrast <: AbstractContrast - sparse::Bool base::Integer + matrix::Array{Any,2} + termnames::Vector{Any} + levels::Vector{Any} end -TreatmentContrast(; sparse::Bool=false, base::Integer=1) = TreatmentContrast(sparse, base) +function TreatmentContrast(v::PooledDataVector; base::Integer=1) + lvls = levels(v) -function contrast_matrix(C::TreatmentContrast, v::PooledDataVector) - n = length(levels(v)) # number of levels for contrast + n = length(lvls) if n < 2 error("not enought degrees of freedom to define contrasts") end - contr = C.sparse ? speye(n) : eye(n) + not_base = [1:(base-1), (base+1):n] + tnames = lvls[ not_base ] + mat = eye(n)[:, not_base] - ## Drop the base level column from the contrast matrix - if !(1 <= C.base <= n) error("base = $(C.base) is not allowed for n = $n") end - contr[:,vcat(1:(C.base-1),(C.base+1):end)] + return TreatmentContrast(base, mat, tnames, lvls) end + ################################################################################ ## Sum-coded contrast ## ## -1 for base level and +1 for contrast level. ################################################################################ -type SumContrast <: AbstractContrast +type SumContrast <: AbstractContrast base::Integer + matrix::Array{Any,2} + termnames::Vector{Any} + levels::Vector{Any} end -SumContrast(; base::Integer=1) = SumContrast(base) +function SumContrast(v::PooledDataVector; base::Integer=1) + lvls = levels(v) -function contrast_matrix(C::SumContrast, v::PooledDataVector) - n = length(levels(v)) + n = length(lvls) if n < 2 error("not enought degrees of freedom to define contrasts") end - if !(1 <= C.base <= n) error("base = $(C.base) is not allowed for n = $n") end - contr = eye(n)[:, [1:(C.base-1), (C.base+1):end]] - contr[C.base, :] = -1 - return contr + + not_base = [1:(base-1), (base+1):n] + tnames = lvls[ not_base ] + mat = eye(n)[:, not_base] + mat[base, :] = -1 + + return SumContrast(base, mat, tnames, lvls) end + ################################################################################ ## Helmert-coded contrast ## @@ -154,31 +91,35 @@ end ## ## Interpretation is that the nth contrast column is the difference between ## level n+1 and the average of levels 1:n +## +## Has the nice property of each column in the resulting model matrix being +## orthogonal and with mean 0. ################################################################################ -type HelmertContrast <: AbstractContrast +type HelmertContrast <: AbstractContrast base::Integer + matrix::Array{Any,2} + termnames::Vector{Any} + levels::Vector{Any} end -HelmertContrast(; base::Integer=1) = HelmertContrast(base) +function HelmertContrast(v::PooledDataVector; base::Integer=1) + lvls = levels(v) -function contrast_matrix(C::HelmertContrast, v::PooledDataVector) - n = length(levels(v)) + n = length(lvls) if n < 2 error("not enought degrees of freedom to define contrasts") end - if !(1 <= C.base <= n) error("base = $(C.base) is not allowed for n = $n") end + if !(1 <= base <= n) error("base = $(base) is not allowed for n = $n") end + + not_base = [1:(base-1), (base+1):n] + tnames = lvls[ not_base ] - contr = zeros(Integer, n, n-1) + mat = zeros(n, n-1) for i in 1:n-1 - contr[1:i, i] = -1 - contr[i+1, i] = i + mat[1:i, i] = -1 + mat[i+1, i] = i end - return contr[[C.base, 1:(C.base-1), (C.base+1):end], :] -end - - -################################################################################ + ## re-shuffle the rows such that base is the all -1.0 row (currently first) + mat = mat[[C.base, 1:(C.base-1), (C.base+1):end], :] -function contrast_matrix(::AbstractContrast, args...) - error("Contrast not implemented") end From 45f704f425a2535e0e5fe3397c25f79342c578f9 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 20 Sep 2015 16:25:13 -0400 Subject: [PATCH 03/74] fix concat syntax, add defaults for cols/termnames, notes, and overloaded call --- src/statsmodels/contrasts.jl | 37 ++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index c45aec4edf..363b41a6bd 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -4,13 +4,15 @@ ## matrix columns and coefficient names ## ## ModelFrame will hold a Dict{Symbol, T<:AbstractContrast} that maps column -## names to contrasts. ModelMatrix will check this dict when evaluating terms, -## falling back to a default for any categorical data without a specified -## contrast. +## names to contrasts. +## +## ModelMatrix will check this dict when evaluating terms, falling back to a +## default for any categorical data without a specified contrast. abstract AbstractContrast -termnames(term::Symbol, col::Any, contrast::AbstractContrast) = contrast.termnames +termnames(term::Symbol, col::Any, contrast::AbstractContrast) = + ["$term - $name" for name in contrast.termnames] function cols(v::PooledDataVector, contrast::AbstractContrast) ## make sure the levels of the contrast matrix and the categorical data @@ -19,16 +21,31 @@ function cols(v::PooledDataVector, contrast::AbstractContrast) ## contrast matrix reindex = [findfirst(l .== contrast.levels) for l in levels(v)] + ## TODO: add kwarg for full-rank contrasts (e.g., when intercept isn't + ## specified in a model frame). + return contrast.matrix[reindex[v.refs], :] end -Base.call{T<: AbstractContrast}(Type{T}, v::DataVector, args...; kwargs...) = - Base.call(Type{T}, pool(v), args...; kwargs...) +## Default contrast is treatment coding: +cols(v::PooledDataVector) = cols(v, TreatmentContrast(v)) +termnames(term::Symbol, col::PooledDataArray) = termnames(term, col, TreatmentContrast(col)) + + +## Constructing a contrast from a non-pooled data vector will first pool it +## +## (NOT SURE THIS IS GOOD: constructing columns also depends on levels, so for +## _that_ to work would need to somehow hold onto pooled data...) +Base.call{T<: AbstractContrast}(C::Type{T}, v::DataVector, args...; kwargs...) = + Base.call(C, pool(v), args...; kwargs...) ################################################################################ ## Treatment (dummy-coded) contrast ################################################################################ +## TODO: factor out some of this repetition between different contrast types +## using metaprogramming + type TreatmentContrast <: AbstractContrast base::Integer matrix::Array{Any,2} @@ -42,7 +59,7 @@ function TreatmentContrast(v::PooledDataVector; base::Integer=1) n = length(lvls) if n < 2 error("not enought degrees of freedom to define contrasts") end - not_base = [1:(base-1), (base+1):n] + not_base = [1:(base-1); (base+1):n] tnames = lvls[ not_base ] mat = eye(n)[:, not_base] @@ -69,7 +86,7 @@ function SumContrast(v::PooledDataVector; base::Integer=1) n = length(lvls) if n < 2 error("not enought degrees of freedom to define contrasts") end - not_base = [1:(base-1), (base+1):n] + not_base = [1:(base-1); (base+1):n] tnames = lvls[ not_base ] mat = eye(n)[:, not_base] mat[base, :] = -1 @@ -110,7 +127,7 @@ function HelmertContrast(v::PooledDataVector; base::Integer=1) if n < 2 error("not enought degrees of freedom to define contrasts") end if !(1 <= base <= n) error("base = $(base) is not allowed for n = $n") end - not_base = [1:(base-1), (base+1):n] + not_base = [1:(base-1); (base+1):n] tnames = lvls[ not_base ] mat = zeros(n, n-1) @@ -120,6 +137,6 @@ function HelmertContrast(v::PooledDataVector; base::Integer=1) end ## re-shuffle the rows such that base is the all -1.0 row (currently first) - mat = mat[[C.base, 1:(C.base-1), (C.base+1):end], :] + mat = mat[[base; 1:(base-1); (base+1):end], :] end From d921818d00b9b7001a902485b28f593c5cc1d66f Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 20 Sep 2015 16:26:38 -0400 Subject: [PATCH 04/74] evaluate contrast _type_ passed in dict --- src/statsmodels/formula.jl | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 6bcaeff6f4..1e5ef0a5d3 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -239,12 +239,29 @@ function dropUnusedLevels!(da::PooledDataArray) end dropUnusedLevels!(x) = x +## Goal here is to allow specification of _either_ a "naked" contrast type, +## or an instantiated contrast object itself. This might be achieved in a more +## julian way by overloading call for c::AbstractContrast to just return c. +evaluateContrast(c::AbstractContrast, col::AbstractDataVector) = c +evaluateContrast{C <: AbstractContrast}(c::Type{C}, col::AbstractDataVector) = C(col) + function ModelFrame(trms::Terms, d::AbstractDataFrame; - contrasts::Dict{Symbol, Type} = Dict{Symbol, Type}()) + contrasts::Dict = Dict()) df, msng = na_omit(DataFrame(map(x -> d[x], trms.eterms))) names!(df, convert(Vector{Symbol}, map(string, trms.eterms))) for c in eachcol(df) dropUnusedLevels!(c[2]) end - ModelFrame(df, trms, msng, contrasts) + + ## TODO: set default contrasts for categorical columns without specified + ## contrasts. This is to ensure that we store how to construct model matrix + ## columsn for ALL variables where the levels might change for prediction + ## with new data (e.g. if only a subset of the levels are present) + + ## evaluate any naked contrasts types based on data (creating + ## contrast matrices, term names, and levels to store) + evaledContrasts = [ col => evaluateContrast(contr, df[col]) + for (col, contr) in contrasts ] + + ModelFrame(df, trms, msng, evaledContrasts) end ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...) From 25334cfff11f4f1d834f9ac317f53fe96425dfd4 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 20 Sep 2015 16:26:58 -0400 Subject: [PATCH 05/74] update coefnames to be more like cols construction --- src/statsmodels/formula.jl | 53 +++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 1e5ef0a5d3..aadb30c9dc 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -322,7 +322,7 @@ function ModelMatrix(mf::ModelFrame) ## or the default TreatmentContrast. cc = Any[] for (name, x) in eachcol(mf.df[:,rows]) - if name in keys(mf.contrasts) + if haskey(mf.contrasts, name) push!(cc, cols(x, mf.contrasts[name])) else push!(cc, cols(x)) @@ -338,39 +338,56 @@ function ModelMatrix(mf::ModelFrame) end termnames(term::Symbol, col) = [string(term)] -function termnames(term::Symbol, col::PooledDataArray) - levs = levels(col) - [string(term, " - ", levs[i]) for i in 2:length(levs)] + +function expandtermnames(term::Vector) + if length(term) == 1 + return term[1] + else + return foldr((a,b) -> vec([string(lev1, " & ", lev2) for + lev1 in a, + lev2 in b]), + term) + end end function coefnames(mf::ModelFrame) + # Generate individual evaluation-term names. Each value in etnames is a + # vector of the names for each column that the term evaluates to. (Just a + # single column for numeric terms, and one per contrast for categorical) + etnames = Dict(); + for term in mf.terms.eterms + if haskey(mf.contrasts, term) + etnames[term] = termnames(term, mf.df[term], mf.contrasts[term]) + else + etnames[term] = termnames(term, mf.df[term]) + end + end + + # For each _term_, pull out a vector of individual evaluation termnames that + # go into it. + tnames = Vector{Vector{Compat.UTF8String}}() if mf.terms.intercept - vnames = Compat.UTF8String["(Intercept)"] - else - vnames = Compat.UTF8String[] + push!(tnames, push!([], ["(Intercept)"])) end - # Need to only include active levels for term in mf.terms.terms if isa(term, Expr) if term.head == :call && term.args[1] == :| continue # skip random-effects terms elseif term.head == :call && term.args[1] == :& - ## for an interaction term, combine term names pairwise, - ## starting with rightmost terms - append!(vnames, - foldr((a,b) -> - vec([string(lev1, " & ", lev2) for - lev1 in a, - lev2 in b]), - map(x -> termnames(x, mf.df[x]), term.args[2:end]))) + push!(tnames, map(x -> etnames[x], term.args[2:end])) else error("unrecognized term $term") end else - append!(vnames, termnames(term, mf.df[term])) + # for "main effect" terms, push length-1 vector which holds all + push!(tnames, push!([], etnames[term])) end end - return vnames + + # Finally, expand each term names (analogously with expandcols, creating + # the names of each of the interaction columns), concatenate, and return. + return mapreduce(expandtermnames, vcat, tnames) + end function Formula(t::Terms) From 66ded8e12517fe30f2b0a606e09fc1f22d99c24f Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 20 Sep 2015 23:56:18 -0400 Subject: [PATCH 06/74] fix helmert coding constructor --- src/statsmodels/contrasts.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 363b41a6bd..477be35b58 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -139,4 +139,5 @@ function HelmertContrast(v::PooledDataVector; base::Integer=1) ## re-shuffle the rows such that base is the all -1.0 row (currently first) mat = mat[[base; 1:(base-1); (base+1):end], :] + return HelmertContrast(base, mat, tnames, lvls) end From 6495dd066d1c54379cbca6036bb16ea98314714c Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 20 Sep 2015 23:56:43 -0400 Subject: [PATCH 07/74] modify ModelFrame contrasts in place --- src/DataFrames.jl | 1 + src/statsmodels/formula.jl | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 977209b770..139c57f1ad 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -55,6 +55,7 @@ export @~, combine, complete_cases, complete_cases!, + contrast!, deleterows!, describe, eachcol, diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index aadb30c9dc..8124b35e9d 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -267,6 +267,14 @@ end ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...) ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...) +## modify contrasts in place +function contrast!(mf::ModelFrame; kwargs...) + new_contrasts = [ col => evaluateContrast(contr, mf.df[col]) + for (col, contr) in kwargs ] + mf.contrasts = merge(mf.contrasts, new_contrasts) + return mf +end + function StatsBase.model_response(mf::ModelFrame) mf.terms.response || error("Model formula one-sided") convert(Array, mf.df[round(Bool, mf.terms.factors[:, 1])][:, 1]) From 1e8bc792e850f1e21b8e5570fa507d258f495e4b Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 20 Sep 2015 23:56:54 -0400 Subject: [PATCH 08/74] tests for contrasts --- test/contrasts.jl | 80 ++++++++++++++++++++++++++++++----------------- test/runtests.jl | 3 +- 2 files changed, 54 insertions(+), 29 deletions(-) diff --git a/test/contrasts.jl b/test/contrasts.jl index cd7ad3b445..d63621af10 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -1,33 +1,57 @@ -## module TestContrasts +module TestContrasts using Base.Test using DataFrames -x = @pdata [1, 2, 3, 1] -@test contrast_matrix(TreatmentContrast, x) == [false false - true false - false true] - -@test contrast_matrix(TreatmentContrast(base=2), x) == [true false - false false - false true] - -@test contrast_matrix(SumContrast, x) == [-1 -1 - 1 0 - 0 1] - -@test contrast_matrix(SumContrast(base=2), x) == [1 0 - -1 -1 - 0 1] - -@test contrast_matrix(HelmertContrast, x) == [-1 -1 - 1 -1 - 0 2] - -@test contrast_matrix(HelmertContrast(base=2), x) == [1 -1 - -1 -1 - 0 2] - - -## end +d = DataFrame(x = @pdata( [:a, :b, :c, :a, :a, :b] )) + +mf = ModelFrame(Formula(Nothing(), :x), d) + +@test ModelMatrix(mf).m == [1 0 0 + 1 1 0 + 1 0 1 + 1 0 0 + 1 0 0 + 1 1 0] +@test coefnames(mf) == ["(Intercept)"; "x - b"; "x - c"] + +contrast!(mf, x = SumContrast) +@test ModelMatrix(mf).m == [1 -1 -1 + 1 1 0 + 1 0 1 + 1 -1 -1 + 1 -1 -1 + 1 1 0] +@test coefnames(mf) == ["(Intercept)"; "x - b"; "x - c"] + +## change base level of contrast +contrast!(mf, x = SumContrast(d[:x]; base = 2)) +@test ModelMatrix(mf).m == [1 1 0 + 1 -1 -1 + 1 0 1 + 1 1 0 + 1 1 0 + 1 -1 -1] +@test coefnames(mf) == ["(Intercept)"; "x - a"; "x - c"] + +contrast!(mf, x = HelmertContrast) +@test ModelMatrix(mf).m == [1 -1 -1 + 1 1 -1 + 1 0 2 + 1 -1 -1 + 1 -1 -1 + 1 1 -1] +@test coefnames(mf) == ["(Intercept)"; "x - b"; "x - c"] + +## test for missing data (and when it clobbers one of the levels) +d[3, :x] = NA +mf = ModelFrame(Formula(Nothing(), :x), d, contrasts = [:x => SumContrast]) +@test ModelMatrix(mf).m == [1 -1 + 1 1 + 1 -1 + 1 -1 + 1 1] +@test coefnames(mf) == ["(Intercept)"; "x - b"] + +end diff --git a/test/runtests.jl b/test/runtests.jl index bfc53ca433..10dccb74e4 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -27,7 +27,8 @@ my_tests = ["utils.jl", "iteration.jl", "duplicates.jl", "show.jl", - "statsmodel.jl"] + "statsmodel.jl", + "contrasts.jl"] println("Running tests:") From 919409602d0fe68e003fba09a7478cb6e36d28e5 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 21 Sep 2015 00:42:38 -0400 Subject: [PATCH 09/74] set default contrasts, copy on predict, and test --- src/statsmodels/formula.jl | 7 ++++++- src/statsmodels/statsmodel.jl | 2 +- test/statsmodel.jl | 3 +-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 8124b35e9d..cbc40b0821 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -251,10 +251,15 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame; names!(df, convert(Vector{Symbol}, map(string, trms.eterms))) for c in eachcol(df) dropUnusedLevels!(c[2]) end - ## TODO: set default contrasts for categorical columns without specified + ## set default contrasts for categorical columns without specified ## contrasts. This is to ensure that we store how to construct model matrix ## columsn for ALL variables where the levels might change for prediction ## with new data (e.g. if only a subset of the levels are present) + for term in trms.eterms + if isa(df[term], PooledDataArray) && !haskey(contrasts, term) + contrasts[term] = TreatmentContrast + end + end ## evaluate any naked contrasts types based on data (creating ## contrast matrices, term names, and levels to store) diff --git a/src/statsmodels/statsmodel.jl b/src/statsmodels/statsmodel.jl index 2457a02cda..c4fb9887cb 100644 --- a/src/statsmodels/statsmodel.jl +++ b/src/statsmodels/statsmodel.jl @@ -78,7 +78,7 @@ function StatsBase.predict(mm::DataFrameRegressionModel, df::AbstractDataFrame; # term is not found in the DataFrame and we don't want to remove elements with missing y) newTerms = remove_response(mm.mf.terms) # create new model frame/matrix - mf = ModelFrame(newTerms, df) + mf = ModelFrame(newTerms, df; contrasts = mm.mf.contrasts) newX = ModelMatrix(mf).m yp = predict(mm, newX; kwargs...) out = DataArray(eltype(yp), size(df, 1)) diff --git a/test/statsmodel.jl b/test/statsmodel.jl index e0d16b3331..4e0a5a529c 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -67,8 +67,7 @@ m2 = fit(DummyMod, f2, d) @test coeftable(m2).rownms == ["(Intercept)", "x1p - 6", "x1p - 7", "x1p - 8"] ## predict w/ new data missing levels -## FAILS: mismatch between number of model matrix columns -## @test predict(m2, d[2:4, :]) == predict(m2)[2:4] +@test predict(m2, d[2:4, :]) == predict(m2)[2:4] ## Another dummy model type to test fall-through show method From 83efff3781136f5542b04a215cffb8134e15b969 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 21 Sep 2015 08:42:42 -0400 Subject: [PATCH 10/74] remove Base.call for 0.3 compatibility --- src/statsmodels/contrasts.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 477be35b58..82d2baa3fa 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -36,8 +36,8 @@ termnames(term::Symbol, col::PooledDataArray) = termnames(term, col, TreatmentCo ## ## (NOT SURE THIS IS GOOD: constructing columns also depends on levels, so for ## _that_ to work would need to somehow hold onto pooled data...) -Base.call{T<: AbstractContrast}(C::Type{T}, v::DataVector, args...; kwargs...) = - Base.call(C, pool(v), args...; kwargs...) +## Base.call{T<: AbstractContrast}(C::Type{T}, v::DataVector, args...; kwargs...) = +## Base.call(C, pool(v), args...; kwargs...) ################################################################################ ## Treatment (dummy-coded) contrast From 78dee18c337034ea4bac5cfb7a2e2794cce81fc6 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 21 Sep 2015 08:58:42 -0400 Subject: [PATCH 11/74] ...and fix push!([], ...) --- src/statsmodels/formula.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index cbc40b0821..be165fa9b4 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -380,7 +380,7 @@ function coefnames(mf::ModelFrame) # go into it. tnames = Vector{Vector{Compat.UTF8String}}() if mf.terms.intercept - push!(tnames, push!([], ["(Intercept)"])) + push!(tnames, push!(Any[], ["(Intercept)"])) end for term in mf.terms.terms if isa(term, Expr) @@ -393,7 +393,7 @@ function coefnames(mf::ModelFrame) end else # for "main effect" terms, push length-1 vector which holds all - push!(tnames, push!([], etnames[term])) + push!(tnames, push!(Any[], etnames[term])) end end From 7371fe7e0ccd7e3c8fb3f3baa2d726603af1fb08 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 21 Sep 2015 19:37:19 -0400 Subject: [PATCH 12/74] code review: type of matrix, spacing, validation --- src/statsmodels/contrasts.jl | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 82d2baa3fa..7a3db6f239 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -19,7 +19,7 @@ function cols(v::PooledDataVector, contrast::AbstractContrast) ## are the same by constructing a re-indexing vector. Indexing into ## reindex with v.refs will give the corresponding row number of the ## contrast matrix - reindex = [findfirst(l .== contrast.levels) for l in levels(v)] + reindex = [findfirst(contrast.levels, l) for l in levels(v)] ## TODO: add kwarg for full-rank contrasts (e.g., when intercept isn't ## specified in a model frame). @@ -48,7 +48,7 @@ termnames(term::Symbol, col::PooledDataArray) = termnames(term, col, TreatmentCo type TreatmentContrast <: AbstractContrast base::Integer - matrix::Array{Any,2} + matrix::Matrix{Float64} termnames::Vector{Any} levels::Vector{Any} end @@ -57,11 +57,12 @@ function TreatmentContrast(v::PooledDataVector; base::Integer=1) lvls = levels(v) n = length(lvls) - if n < 2 error("not enought degrees of freedom to define contrasts") end + n > 1 || error("not enough degrees of freedom to define contrasts") + (1 <= base <= n) || error("base = $(base) is not allowed for n = $n") not_base = [1:(base-1); (base+1):n] - tnames = lvls[ not_base ] mat = eye(n)[:, not_base] + tnames = lvls[not_base] return TreatmentContrast(base, mat, tnames, lvls) end @@ -75,7 +76,7 @@ end type SumContrast <: AbstractContrast base::Integer - matrix::Array{Any,2} + matrix::Matrix{Float64} termnames::Vector{Any} levels::Vector{Any} end @@ -84,12 +85,13 @@ function SumContrast(v::PooledDataVector; base::Integer=1) lvls = levels(v) n = length(lvls) - if n < 2 error("not enought degrees of freedom to define contrasts") end + n > 1 || error("not enough degrees of freedom to define contrasts") + (1 <= base <= n) || error("base = $(base) is not allowed for n = $n") not_base = [1:(base-1); (base+1):n] - tnames = lvls[ not_base ] mat = eye(n)[:, not_base] mat[base, :] = -1 + tnames = lvls[not_base] return SumContrast(base, mat, tnames, lvls) end @@ -115,7 +117,7 @@ end type HelmertContrast <: AbstractContrast base::Integer - matrix::Array{Any,2} + matrix::Matrix{Float64} termnames::Vector{Any} levels::Vector{Any} end @@ -124,11 +126,11 @@ function HelmertContrast(v::PooledDataVector; base::Integer=1) lvls = levels(v) n = length(lvls) - if n < 2 error("not enought degrees of freedom to define contrasts") end - if !(1 <= base <= n) error("base = $(base) is not allowed for n = $n") end + n > 1 || error("not enough degrees of freedom to define contrasts") + 1 <= base <= n || error("base = $(base) is not allowed for n = $n") not_base = [1:(base-1); (base+1):n] - tnames = lvls[ not_base ] + tnames = lvls[not_base] mat = zeros(n, n-1) for i in 1:n-1 From 2808f70349dd292c449765e00ca4a9a4d8fdbd07 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 21 Sep 2015 19:37:52 -0400 Subject: [PATCH 13/74] factor out contrast matrix construction highlights parallel structure in different types --- src/statsmodels/contrasts.jl | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 7a3db6f239..60149b253c 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -61,12 +61,15 @@ function TreatmentContrast(v::PooledDataVector; base::Integer=1) (1 <= base <= n) || error("base = $(base) is not allowed for n = $n") not_base = [1:(base-1); (base+1):n] - mat = eye(n)[:, not_base] tnames = lvls[not_base] + mat = contrast_matrix(TreatmentContrast, n, base) + return TreatmentContrast(base, mat, tnames, lvls) end +contrast_matrix(::Type{TreatmentContrast}, n, base) = eye(n)[:, [1:(base-1); (base+1):n]] + ################################################################################ ## Sum-coded contrast @@ -89,13 +92,20 @@ function SumContrast(v::PooledDataVector; base::Integer=1) (1 <= base <= n) || error("base = $(base) is not allowed for n = $n") not_base = [1:(base-1); (base+1):n] - mat = eye(n)[:, not_base] - mat[base, :] = -1 tnames = lvls[not_base] + mat = contrast_matrix(SumContrast, n, base) + return SumContrast(base, mat, tnames, lvls) end +function contrast_matrix(::Type{SumContrast}, n, base) + not_base = [1:(base-1); (base+1):n] + mat = eye(n)[:, not_base] + mat[base, :] = -1 + return mat +end + ################################################################################ ## Helmert-coded contrast @@ -132,6 +142,12 @@ function HelmertContrast(v::PooledDataVector; base::Integer=1) not_base = [1:(base-1); (base+1):n] tnames = lvls[not_base] + mat = contrast_matrix(HelmertContrast, n, base) + + return HelmertContrast(base, mat, tnames, lvls) +end + +function contrast_matrix(::Type{HelmertContrast}, n, base) mat = zeros(n, n-1) for i in 1:n-1 mat[1:i, i] = -1 @@ -140,6 +156,6 @@ function HelmertContrast(v::PooledDataVector; base::Integer=1) ## re-shuffle the rows such that base is the all -1.0 row (currently first) mat = mat[[base; 1:(base-1); (base+1):end], :] - - return HelmertContrast(base, mat, tnames, lvls) + return mat end + From 20cbb00953ca2f088a3ad270879e3da7f606a44a Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 21 Sep 2015 20:06:39 -0400 Subject: [PATCH 14/74] code generation to factor out contrast type boilerplate --- src/statsmodels/contrasts.jl | 100 ++++++++++++----------------------- 1 file changed, 35 insertions(+), 65 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 60149b253c..82dbdb5c5f 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -34,40 +34,54 @@ termnames(term::Symbol, col::PooledDataArray) = termnames(term, col, TreatmentCo ## Constructing a contrast from a non-pooled data vector will first pool it ## -## (NOT SURE THIS IS GOOD: constructing columns also depends on levels, so for -## _that_ to work would need to somehow hold onto pooled data...) +## Also requires a cols method for non-PooledDataArray column... ## Base.call{T<: AbstractContrast}(C::Type{T}, v::DataVector, args...; kwargs...) = ## Base.call(C, pool(v), args...; kwargs...) -################################################################################ -## Treatment (dummy-coded) contrast -################################################################################ -## TODO: factor out some of this repetition between different contrast types -## using metaprogramming +## Making a contrast type T only requires that there be a method for +## contrast_matrix(T, v::PooledDataArray). The rest is boilerplate. +## +for contrastType in [:TreatmentContrast, :SumContrast, :HelmertContrast] + @eval begin + type $contrastType <: AbstractContrast + base::Integer + matrix::Matrix{Float64} + termnames::Vector{Any} + levels::Vector{Any} + end + + function $contrastType(v::PooledDataVector; base::Integer=1) + lvls = levels(v) + + n = length(lvls) + n > 1 || error("not enough degrees of freedom to define contrasts") + (1 <= base <= n) || error("base = $(base) is not allowed for n = $n") + + not_base = [1:(base-1); (base+1):n] + tnames = lvls[not_base] + + mat = contrast_matrix($contrastType, n, base) -type TreatmentContrast <: AbstractContrast - base::Integer - matrix::Matrix{Float64} - termnames::Vector{Any} - levels::Vector{Any} + return $contrastType(base, mat, tnames, lvls) + end + end end -function TreatmentContrast(v::PooledDataVector; base::Integer=1) - lvls = levels(v) +## Could write this as a macro, too, so that people can register their own +## contrast types easily, without having to write out this boilerplate...the +## downside of that would be that they'd be locked in to a particular set of +## fields...although they could always just write the boilerplate themselves... - n = length(lvls) - n > 1 || error("not enough degrees of freedom to define contrasts") - (1 <= base <= n) || error("base = $(base) is not allowed for n = $n") - not_base = [1:(base-1); (base+1):n] - tnames = lvls[not_base] - mat = contrast_matrix(TreatmentContrast, n, base) - return TreatmentContrast(base, mat, tnames, lvls) end +################################################################################ +## Treatment (dummy-coded) contrast +################################################################################ + contrast_matrix(::Type{TreatmentContrast}, n, base) = eye(n)[:, [1:(base-1); (base+1):n]] @@ -77,28 +91,6 @@ contrast_matrix(::Type{TreatmentContrast}, n, base) = eye(n)[:, [1:(base-1); (ba ## -1 for base level and +1 for contrast level. ################################################################################ -type SumContrast <: AbstractContrast - base::Integer - matrix::Matrix{Float64} - termnames::Vector{Any} - levels::Vector{Any} -end - -function SumContrast(v::PooledDataVector; base::Integer=1) - lvls = levels(v) - - n = length(lvls) - n > 1 || error("not enough degrees of freedom to define contrasts") - (1 <= base <= n) || error("base = $(base) is not allowed for n = $n") - - not_base = [1:(base-1); (base+1):n] - tnames = lvls[not_base] - - mat = contrast_matrix(SumContrast, n, base) - - return SumContrast(base, mat, tnames, lvls) -end - function contrast_matrix(::Type{SumContrast}, n, base) not_base = [1:(base-1); (base+1):n] mat = eye(n)[:, not_base] @@ -125,28 +117,6 @@ end ## orthogonal and with mean 0. ################################################################################ -type HelmertContrast <: AbstractContrast - base::Integer - matrix::Matrix{Float64} - termnames::Vector{Any} - levels::Vector{Any} -end - -function HelmertContrast(v::PooledDataVector; base::Integer=1) - lvls = levels(v) - - n = length(lvls) - n > 1 || error("not enough degrees of freedom to define contrasts") - 1 <= base <= n || error("base = $(base) is not allowed for n = $n") - - not_base = [1:(base-1); (base+1):n] - tnames = lvls[not_base] - - mat = contrast_matrix(HelmertContrast, n, base) - - return HelmertContrast(base, mat, tnames, lvls) -end - function contrast_matrix(::Type{HelmertContrast}, n, base) mat = zeros(n, n-1) for i in 1:n-1 From 13c4ffd66f6475bfa3a3272356fcde33be6e6698 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 21 Sep 2015 23:45:23 -0400 Subject: [PATCH 15/74] stray end --- src/statsmodels/contrasts.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 82dbdb5c5f..f965ecc2d9 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -73,11 +73,6 @@ end ## downside of that would be that they'd be locked in to a particular set of ## fields...although they could always just write the boilerplate themselves... - - - -end - ################################################################################ ## Treatment (dummy-coded) contrast ################################################################################ From fe05a55fe8f1c4b46cdfad51a94c8bfe89d1faf8 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Wed, 23 Sep 2015 21:38:46 -0400 Subject: [PATCH 16/74] parametrize contrast types by eltype of data --- src/statsmodels/contrasts.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index f965ecc2d9..320ff08a16 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -44,14 +44,14 @@ termnames(term::Symbol, col::PooledDataArray) = termnames(term, col, TreatmentCo ## for contrastType in [:TreatmentContrast, :SumContrast, :HelmertContrast] @eval begin - type $contrastType <: AbstractContrast + type $contrastType{T} <: AbstractContrast base::Integer matrix::Matrix{Float64} - termnames::Vector{Any} - levels::Vector{Any} + termnames::Vector{T} + levels::Vector{T} end - function $contrastType(v::PooledDataVector; base::Integer=1) + function $contrastType{T}(v::PooledDataVector{T}; base::Integer=1) lvls = levels(v) n = length(lvls) @@ -63,7 +63,7 @@ for contrastType in [:TreatmentContrast, :SumContrast, :HelmertContrast] mat = contrast_matrix($contrastType, n, base) - return $contrastType(base, mat, tnames, lvls) + return $contrastType{T}(base, mat, tnames, lvls) end end end From e444a411cefd7547dd3e3d57cf69a3acaf8d3427 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Wed, 23 Sep 2015 23:47:48 -0400 Subject: [PATCH 17/74] find non-redundant columns for promoting to full rank --- src/statsmodels/formula.jl | 63 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index be165fa9b4..12d5ab7e57 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -239,6 +239,69 @@ function dropUnusedLevels!(da::PooledDataArray) end dropUnusedLevels!(x) = x +## whether or not a column of a particular type can be "promoted" to full rank +## (only true of factors) +can_promote(::PooledDataArray) = true +can_promote(::Any) = false + +## Check for non-redundancy of columns. For instance, if x is a factor with two +## levels, it should be expanded into two columns in y~0+x but only one column +## in y~1+x. The default is the rank-reduced form (contrasts for n levels only +## produce n-1 columns). In general, an evaluation term x within a term +## x&y&... needs to be "promoted" to full rank if y&... hasn't already been +## included (either explicitly in the Terms or implicitly by promoting another +## term like z in z&y&...). +## +## This function returns a boolean matrix that says whether each evaluation term +## of each term needs to be promoted. +function check_non_redundancy(trms, df) + + ## This can be checked using the .factors field of the terms, which is an + ## evaluation terms x terms matrix. + (n_eterms, n_terms) = size(trms.factors) + to_promote = falses(n_eterms, n_terms) + encountered_columns = Vector{eltype(trms.factors)}[] + + if trms.intercept + push!(encountered_columns, zeros(eltype(trms.factors), n_eterms)) + end + + for i_term in 1:n_terms + for i_eterm in 1:n_eterms + ## only need to check eterms that are included and can be promoted + ## (e.g., categorical variables that expand to multiple mm columns) + if round(Bool, trms.factors[i_eterm, i_term]) && can_promote(df[trms.eterms[i_eterm]]) + dropped = trms.factors[:,i_term] + dropped[i_eterm] = 0 + ## short circuiting check for whether the version of this term + ## with the current eterm dropped has been encountered already + ## (and hence does not need to be expanded + is_redundant = false + for enc in encountered_columns + if dropped == enc + is_redundant = true + break + end + end + ## more concisely, but with non-short-circuiting any: + ##is_redundant = any([dropped == enc for enc in encountered_columns]) + + if !is_redundant + to_promote[i_eterm, i_term] = true + push!(encountered_columns, dropped) + end + + end + ## once we've checked all the eterms in this term, add it to the list + ## of encountered terms/columns + end + push!(encountered_columns, trms.factors[:, i_term]) + end + + return to_promote + +end + ## Goal here is to allow specification of _either_ a "naked" contrast type, ## or an instantiated contrast object itself. This might be achieved in a more ## julian way by overloading call for c::AbstractContrast to just return c. From a93a4e18fc2e78632f79885435aaac48f4f9c326 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 26 Sep 2015 13:23:28 -0400 Subject: [PATCH 18/74] full-rank dummy contrasts type --- src/statsmodels/contrasts.jl | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 320ff08a16..0d79d49a98 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -74,7 +74,38 @@ end ## fields...although they could always just write the boilerplate themselves... ################################################################################ -## Treatment (dummy-coded) contrast +## Dummy contrasts (full rank) +## +## Needed when a term is non-redundant with lower-order terms (e.g., in ~0+x vs. +## ~1+x, or in the interactions terms in ~1+x+x&y vs. ~1+x+y+x&y. In the +## non-redundant cases, we can (probably) expand x into length(levels(x)) +## columns without creating a non-identifiable model matrix (unless the user +## has done something dumb in specifying the model, which we can't do much about +## anyway). +################################################################################ + +## Dummy contrasts have no base level (since all levels produce a column) +type DummyContrast{T} <: AbstractContrast + matrix::Matrix{Float64} + termnames::Vector{T} + levels::Vector{T} +end + +function DummyContrast{T}(v::PooledDataVector{T}) + lvls = levels(v) + mat = eye(Float64, length(lvls)) + + DummyContrast(mat, lvls, lvls) +end + +## Default for promoting contrasts to full rank is to convert to dummy contrasts +promote_contrast(C::AbstractContrast) = DummyContrast(eye(Float64, length(C.levels)), C.levels, C.levels) + + + + +################################################################################ +## Treatment (rank-reduced dummy-coded) contrast ################################################################################ contrast_matrix(::Type{TreatmentContrast}, n, base) = eye(n)[:, [1:(base-1); (base+1):n]] From a0672e76135e7a0186c46e05c8a0b9ee7b945dc2 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 3 Oct 2015 21:29:16 -0400 Subject: [PATCH 19/74] include non_redundants in model frame --- src/statsmodels/formula.jl | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 12d5ab7e57..e3efde5c94 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -40,6 +40,9 @@ type ModelFrame ## mapping from df keys to contrasts. Rather than specify types allowed, ## leave that to cols() to check. Allows more seamless later extension contrasts::Dict{Any, Any} + ## An eterms x terms matrix which is true for terms that need to be "promoted" + ## to full rank in constructing a model matrx + non_redundant_terms::Matrix{Bool} end type ModelMatrix{T <: @compat(Union{Float32, Float64})} @@ -254,7 +257,7 @@ can_promote(::Any) = false ## ## This function returns a boolean matrix that says whether each evaluation term ## of each term needs to be promoted. -function check_non_redundancy(trms, df) +function check_non_redundancy(trms::Terms, df::AbstractDataFrame) ## This can be checked using the .factors field of the terms, which is an ## evaluation terms x terms matrix. @@ -328,8 +331,10 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame; ## contrast matrices, term names, and levels to store) evaledContrasts = [ col => evaluateContrast(contr, df[col]) for (col, contr) in contrasts ] + ## Check whether or not + non_redundants = check_non_redundancy(trms, df) - ModelFrame(df, trms, msng, evaledContrasts) + ModelFrame(df, trms, msng, evaledContrasts, non_redundants) end ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...) @@ -350,6 +355,15 @@ end cols(v::DataVector) = convert(Vector{Float64}, v.data) cols(v::Vector) = convert(Vector{Float64}, v) +## construct model matrix columns from model frame + name (checks for contrasts) +function cols(name::Symbol, mf::ModelFrame; non_redundant::Bool = false) + if haskey(mf.contrasts, name) + cols(mf.df[name], + non_redundant ? promote_contrast(mf.contrasts[name]) : mf.contrasts[name]) + else + cols(mf.df[name]) + end +end function isfe(ex::Expr) # true for fixed-effects terms if ex.head != :call error("Non-call expression encountered") end From d5909138abbfcf00e857776c468eab87483102c4 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 3 Oct 2015 21:31:57 -0400 Subject: [PATCH 20/74] only evaluate contrasts for columns in the df --- src/statsmodels/formula.jl | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index e3efde5c94..3c7d35b5d8 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -311,26 +311,28 @@ end evaluateContrast(c::AbstractContrast, col::AbstractDataVector) = c evaluateContrast{C <: AbstractContrast}(c::Type{C}, col::AbstractDataVector) = C(col) +needsContrasts(::PooledDataArray) = true +needsContrasts(::Any) = false + function ModelFrame(trms::Terms, d::AbstractDataFrame; contrasts::Dict = Dict()) df, msng = na_omit(DataFrame(map(x -> d[x], trms.eterms))) names!(df, convert(Vector{Symbol}, map(string, trms.eterms))) for c in eachcol(df) dropUnusedLevels!(c[2]) end - ## set default contrasts for categorical columns without specified - ## contrasts. This is to ensure that we store how to construct model matrix - ## columsn for ALL variables where the levels might change for prediction - ## with new data (e.g. if only a subset of the levels are present) - for term in trms.eterms - if isa(df[term], PooledDataArray) && !haskey(contrasts, term) - contrasts[term] = TreatmentContrast - end + ## Set up contrasts: + ## Combine actual DF columns and contrast types if necessary to compute the + ## actual contrasts matrices, levels, and term names (using TreatmentContrast + ## as the default) + evaledContrasts = Dict() + for term,col in eachcol(df) + needsContrasts(col) || continue + evaledContrasts[term] = evaluateContrast(haskey(contrasts, term) ? + contrasts[term] : + TreatmentContrast, + col) end - ## evaluate any naked contrasts types based on data (creating - ## contrast matrices, term names, and levels to store) - evaledContrasts = [ col => evaluateContrast(contr, df[col]) - for (col, contr) in contrasts ] ## Check whether or not non_redundants = check_non_redundancy(trms, df) @@ -341,12 +343,14 @@ ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...) ## modify contrasts in place -function contrast!(mf::ModelFrame; kwargs...) +function contrast!(mf::ModelFrame, new_contrasts::Dict) new_contrasts = [ col => evaluateContrast(contr, mf.df[col]) - for (col, contr) in kwargs ] + for (col, contr) + in filter((k,v)->haskey(mf.df, k), new_contrasts) ] mf.contrasts = merge(mf.contrasts, new_contrasts) return mf end +contrast!(mf::ModelFrame; kwargs...) = contrast!(mf, kwargs) function StatsBase.model_response(mf::ModelFrame) mf.terms.response || error("Model formula one-sided") From 1c7610493c4ee570684896f8f069df582add2e2b Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 3 Oct 2015 21:32:33 -0400 Subject: [PATCH 21/74] clean up mm constructor using mf cols function --- src/statsmodels/formula.jl | 41 +++++++++++++++----------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 3c7d35b5d8..899887bf5d 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -400,35 +400,26 @@ function ModelMatrix(mf::ModelFrame) trms = mf.terms aa = Any[Any[ones(size(mf.df,1), @compat(Int(trms.intercept)))]] asgn = zeros(Int, @compat(Int(trms.intercept))) - fetrms = Bool[isfe(t) for t in trms.terms] - if trms.response unshift!(fetrms, false) end - ff = trms.factors[:, fetrms] + fe_terms = Bool[isfe(t) for t in trms.terms] + if trms.response unshift!(fe_terms, false) end + ff = trms.factors[:, fe_terms] ## need to be cautious here to avoid evaluating cols for a factor with many levels - ## if the factor doesn't occur in the fetrms - rows = vec(sum(ff, 2) .!= 0) - ff = ff[rows, :] - ## cc = [cols(col) for col in columns(mf.df[:, rows])] - - ## construct model matrix columns from each of the factors, checking for - ## contrasts that have been manually specified. Categorical data - ## (PooledDataArray) will expand to a matrix with multiple columns, one - ## or each column of the contrast matrix, either specified in the ModelFrame - ## or the default TreatmentContrast. - cc = Any[] - for (name, x) in eachcol(mf.df[:,rows]) - if haskey(mf.contrasts, name) - push!(cc, cols(x, mf.contrasts[name])) - else - push!(cc, cols(x)) - end - end + ## if the factor doesn't occur in the fe_terms + eterms_included = Bool[x != 0 for x in sum(ff, 2)] + ff = ff[eterms_included, :] - for j in 1:size(ff,2) - trm = cc[round(Bool, ff[:, j])] + cc = [cols(eterm, mf) for eterm in trms.eterms[eterms_included]] + + ## Iterate over terms, pulling out the necessary eterms into a vector + ## and recording the columns in the model matrix that will correspond to + ## this term + for i_term in 1:size(ff,2) + trm = cc[round(Bool, ff[:, i_term])] push!(aa, trm) - asgn = vcat(asgn, fill(j, nc(trm))) + asgn = vcat(asgn, fill(i_term, nc(trm))) end - ModelMatrix{Float64}(hcat([expandcols(t) for t in aa]...), asgn) + return ModelMatrix{Float64}(hcat([expandcols(t) for t in aa]...), asgn) + end termnames(term::Symbol, col) = [string(term)] From 62a9cf70be68c581e39e7b07897d0391bfa824be Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 4 Oct 2015 12:07:05 -0400 Subject: [PATCH 22/74] fix iterator syntax and add missing Dict for kwargs --- src/statsmodels/formula.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 899887bf5d..f25ba825b1 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -325,7 +325,7 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame; ## actual contrasts matrices, levels, and term names (using TreatmentContrast ## as the default) evaledContrasts = Dict() - for term,col in eachcol(df) + for (term, col) in eachcol(df) needsContrasts(col) || continue evaledContrasts[term] = evaluateContrast(haskey(contrasts, term) ? contrasts[term] : @@ -345,12 +345,12 @@ ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), ## modify contrasts in place function contrast!(mf::ModelFrame, new_contrasts::Dict) new_contrasts = [ col => evaluateContrast(contr, mf.df[col]) - for (col, contr) - in filter((k,v)->haskey(mf.df, k), new_contrasts) ] + for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ] + mf.contrasts = merge(mf.contrasts, new_contrasts) return mf end -contrast!(mf::ModelFrame; kwargs...) = contrast!(mf, kwargs) +contrast!(mf::ModelFrame; kwargs...) = contrast!(mf, Dict(kwargs)) function StatsBase.model_response(mf::ModelFrame) mf.terms.response || error("Model formula one-sided") From 5d3bb94f4dda7fafbf0909add58caaf5202473b5 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 4 Oct 2015 12:07:42 -0400 Subject: [PATCH 23/74] handle non-redundant columns right in model matrix --- src/statsmodels/formula.jl | 58 +++++++++++++++++++++------------ test/formula.jl | 67 ++++++++++++++++++++++++++------------ 2 files changed, 83 insertions(+), 42 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index f25ba825b1..86825b9fb9 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -397,28 +397,44 @@ function nc(trm::Vector) end function ModelMatrix(mf::ModelFrame) - trms = mf.terms - aa = Any[Any[ones(size(mf.df,1), @compat(Int(trms.intercept)))]] - asgn = zeros(Int, @compat(Int(trms.intercept))) - fe_terms = Bool[isfe(t) for t in trms.terms] - if trms.response unshift!(fe_terms, false) end - ff = trms.factors[:, fe_terms] - ## need to be cautious here to avoid evaluating cols for a factor with many levels - ## if the factor doesn't occur in the fe_terms - eterms_included = Bool[x != 0 for x in sum(ff, 2)] - ff = ff[eterms_included, :] - - cc = [cols(eterm, mf) for eterm in trms.eterms[eterms_included]] - - ## Iterate over terms, pulling out the necessary eterms into a vector - ## and recording the columns in the model matrix that will correspond to - ## this term - for i_term in 1:size(ff,2) - trm = cc[round(Bool, ff[:, i_term])] - push!(aa, trm) - asgn = vcat(asgn, fill(i_term, nc(trm))) + + ## Map eval. term name + redundancy bool to cached model matrix columns + eterm_cols = Dict{Tuple{Symbol,Bool}, Array{Float64}}() + ## Accumulator for each term's vector of eval. term columns. + mm_cols = Vector{Array{Float64}}[] + mf.terms.intercept && push!(mm_cols, Matrix{Float64}[ones(Float64, size(mf.df,1), 1)]) + factors = round(Bool, mf.terms.factors) + + ## turn each term into a vector of mm columns for its eval. terms, using + ## "promoted" full-rank versions of categorical columns for non-redundant + ## eval. terms: + for (i_term, term) in enumerate(mf.terms.terms) + ## Skip non-fixed-effect terms + isfe(term) || continue + term_cols = Array{Float64}[] + ## adjust term index if there's a response in the formula (mf.terms.terms + ## lists only non-response terms, but mf.terms.factors and .non_reundant_terms + ## has first column for the response if it's present) + i_term += mf.terms.response + ## Pull out the eval terms, and the non-redundancy flags for this term + eterms = mf.terms.eterms[factors[:, i_term]] + non_redundant = mf.non_redundant_terms[factors[:, i_term], i_term] + ## Get cols for each eval term (either previously generated, or generating + ## and storing as necessary) + for et_and_nr in zip(eterms, non_redundant) + haskey(eterm_cols, et_and_nr) || + setindex!(eterm_cols, + cols(et_and_nr[1], mf, non_redundant=et_and_nr[2]), + et_and_nr) + push!(term_cols, eterm_cols[et_and_nr]) + end + push!(mm_cols, term_cols) end - return ModelMatrix{Float64}(hcat([expandcols(t) for t in aa]...), asgn) + + mm = hcat([expandcols(tc) for tc in mm_cols]...) + mm_col_term_nums = vcat([fill(i_term, nc(tc)) for (i_term,tc) in enumerate(mm_cols)]...) + + ModelMatrix{Float64}(mm, mm_col_term_nums) end diff --git a/test/formula.jl b/test/formula.jl index d3e343ac2e..fba17b0c5a 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -297,7 +297,7 @@ module TestFormula f = y ~ x1 & x2 & x3 mf = ModelFrame(f, df) mm = ModelMatrix(mf) - @test mm.m[:, 2:end] == vcat(zeros(1, 3), diagm(x2[2:end].*x3[2:end])) + @test mm.m[:, 2:end] == diagm(x2.*x3) #test_group("Column groups in formulas") ## set_group was removed in The Great Purge (55e47cd) @@ -314,27 +314,30 @@ module TestFormula ## @test mm.model == [ones(4) x1 x3 x2 x1.*x2 x3.*x2] ## Interactions between three PDA columns - df = DataFrame(y=1:27, - x1 = PooledDataArray(vec([x for x in 1:3, y in 4:6, z in 7:9])), - x2 = PooledDataArray(vec([y for x in 1:3, y in 4:6, z in 7:9])), - x3 = PooledDataArray(vec([z for x in 1:3, y in 4:6, z in 7:9]))) - f = y ~ x1 & x2 & x3 - mf = ModelFrame(f, df) - @test coefnames(mf)[2:end] == - vec([string("x1 - ", x, " & x2 - ", y, " & x3 - ", z) for - x in 2:3, - y in 5:6, - z in 8:9]) + ## + ## FAILS: behavior is wrong when no lower-order terms (1+x1+x2+x1&x2...) + ## + ## df = DataFrame(y=1:27, + ## x1 = PooledDataArray(vec([x for x in 1:3, y in 4:6, z in 7:9])), + ## x2 = PooledDataArray(vec([y for x in 1:3, y in 4:6, z in 7:9])), + ## x3 = PooledDataArray(vec([z for x in 1:3, y in 4:6, z in 7:9]))) + ## f = y ~ x1 & x2 & x3 + ## mf = ModelFrame(f, df) + ## @test coefnames(mf)[2:end] == + ## vec([string("x1 - ", x, " & x2 - ", y, " & x3 - ", z) for + ## x in 2:3, + ## y in 5:6, + ## z in 8:9]) - mm = ModelMatrix(mf) - @test mm.m[:,2] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 5) .* (df[:x3].==8) - @test mm.m[:,3] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 5) .* (df[:x3].==8) - @test mm.m[:,4] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 6) .* (df[:x3].==8) - @test mm.m[:,5] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 6) .* (df[:x3].==8) - @test mm.m[:,6] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 5) .* (df[:x3].==9) - @test mm.m[:,7] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 5) .* (df[:x3].==9) - @test mm.m[:,8] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 6) .* (df[:x3].==9) - @test mm.m[:,9] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 6) .* (df[:x3].==9) + ## mm = ModelMatrix(mf) + ## @test mm.m[:,2] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 5) .* (df[:x3].==8) + ## @test mm.m[:,3] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 5) .* (df[:x3].==8) + ## @test mm.m[:,4] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 6) .* (df[:x3].==8) + ## @test mm.m[:,5] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 6) .* (df[:x3].==8) + ## @test mm.m[:,6] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 5) .* (df[:x3].==9) + ## @test mm.m[:,7] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 5) .* (df[:x3].==9) + ## @test mm.m[:,8] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 6) .* (df[:x3].==9) + ## @test mm.m[:,9] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 6) .* (df[:x3].==9) ## Distributive property of :& over :+ df = deepcopy(d) @@ -365,4 +368,26 @@ module TestFormula mm = ModelMatrix(mf) @test mm.m[:, 2] == d[complete_cases(d), :x1m] +## Promote non-redundant categorical terms to full rank + +d = DataFrame(x = rep([:a, :b], times = 4), + y = rep([:c, :d], times = 2, each = 2), + z = rep([:e, :f], each = 4)) +[pool!(d, name) for name in names(d)] +cs = [name => SumContrast for name in names(d)] +d[:n] = 1.:8 + + +## No intercept +mf = ModelFrame(n ~ 0 + x, d[1:2,:], contrasts=cs) +@test ModelMatrix(mf).m == [1 0; 0 1] + +## No first-order term for interaction +mf = ModelFrame(n ~ 1 + x + x&y, d[1:4, :], contrasts=cs) +@test ModelMatrix(mf).m[:, 2:end] == [-1 -1 0 + 1 0 -1 + -1 1 0 + 1 0 1] + + end From efb93fe11cebd01d477b92ae5bfdbdab5cdc8fa5 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 4 Oct 2015 21:28:56 -0400 Subject: [PATCH 24/74] termnames checking for non-redundancy, plus tests --- src/statsmodels/formula.jl | 65 +++++++++++++++++++------------------- test/formula.jl | 12 +++++++ 2 files changed, 45 insertions(+), 32 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 86825b9fb9..8083bad7ba 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -438,7 +438,17 @@ function ModelMatrix(mf::ModelFrame) end + + termnames(term::Symbol, col) = [string(term)] +function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false) + if haskey(mf.contrasts, term) + termnames(term, mf.df[term], + non_redundant ? promote_contrast(mf.contrasts[term]) : mf.contrasts[term]) + else + termnames(term, mf.df[term]) + end +end function expandtermnames(term::Vector) if length(term) == 1 @@ -452,42 +462,33 @@ function expandtermnames(term::Vector) end function coefnames(mf::ModelFrame) - # Generate individual evaluation-term names. Each value in etnames is a - # vector of the names for each column that the term evaluates to. (Just a - # single column for numeric terms, and one per contrast for categorical) - etnames = Dict(); - for term in mf.terms.eterms - if haskey(mf.contrasts, term) - etnames[term] = termnames(term, mf.df[term], mf.contrasts[term]) - else - etnames[term] = termnames(term, mf.df[term]) - end - end + ## strategy mirrors ModelMatrx constructor: + eterm_names = Dict{Tuple{Symbol,Bool}, Vector{Compat.UTF8String}}() + term_names = Vector{Vector{Compat.UTF8String}}[] + mf.terms.intercept && push!(term_names, Vector[Compat.UTF8String["(Intercept)"]]) - # For each _term_, pull out a vector of individual evaluation termnames that - # go into it. - tnames = Vector{Vector{Compat.UTF8String}}() - if mf.terms.intercept - push!(tnames, push!(Any[], ["(Intercept)"])) - end - for term in mf.terms.terms - if isa(term, Expr) - if term.head == :call && term.args[1] == :| - continue # skip random-effects terms - elseif term.head == :call && term.args[1] == :& - push!(tnames, map(x -> etnames[x], term.args[2:end])) - else - error("unrecognized term $term") - end - else - # for "main effect" terms, push length-1 vector which holds all - push!(tnames, push!(Any[], etnames[term])) + factors = round(Bool, mf.terms.factors) + + for (i_term, term) in enumerate(mf.terms.terms) + isfe(term) || continue + ## names for columns for eval terms + names = Vector{Compat.UTF8String}[] + + i_term += mf.terms.response + eterms = mf.terms.eterms[factors[:, i_term]] + non_redundant = mf.non_redundant_terms[factors[:, i_term], i_term] + + for et_and_nr in zip(eterms, non_redundant) + haskey(eterm_names, et_and_nr) || + setindex!(eterm_names, + termnames(et_and_nr[1], mf, non_redundant=et_and_nr[2]), + et_and_nr) + push!(names, eterm_names[et_and_nr]) end + push!(term_names, names) end - # Finally, expand each term names (analogously with expandcols, creating - # the names of each of the interaction columns), concatenate, and return. - return mapreduce(expandtermnames, vcat, tnames) + mapreduce(expandtermnames, vcat, term_names) end diff --git a/test/formula.jl b/test/formula.jl index fba17b0c5a..84e60001c0 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -381,6 +381,7 @@ d[:n] = 1.:8 ## No intercept mf = ModelFrame(n ~ 0 + x, d[1:2,:], contrasts=cs) @test ModelMatrix(mf).m == [1 0; 0 1] +@test coefnames(mf) == ["x - a", "x - b"] ## No first-order term for interaction mf = ModelFrame(n ~ 1 + x + x&y, d[1:4, :], contrasts=cs) @@ -388,6 +389,17 @@ mf = ModelFrame(n ~ 1 + x + x&y, d[1:4, :], contrasts=cs) 1 0 -1 -1 1 0 1 0 1] +@test coefnames(mf) == ["(Intercept)", "x - b", "x - a & y - d", "x - b & y - d"] + +## When both terms of interaction are non-redundant: +mf = ModelFrame(n ~ 0 + x&y, d[1:4, :], contrasts=cs) +@test ModelMatrix(mf).m == [1 0 0 0 + 0 1 0 0 + 0 0 1 0 + 0 0 0 1] +@test coefnames(mf) == ["x - a & y - c", "x - b & y - c", + "x - a & y - d", "x - b & y - d"] + end From bdb2119d432c1322d3567ff1f5b9e3f38a8a3611 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 4 Oct 2015 22:09:21 -0400 Subject: [PATCH 25/74] default contrast is managed in formula.jl/ModelFrame --- src/statsmodels/contrasts.jl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 0d79d49a98..35e4832dc3 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -27,11 +27,6 @@ function cols(v::PooledDataVector, contrast::AbstractContrast) return contrast.matrix[reindex[v.refs], :] end -## Default contrast is treatment coding: -cols(v::PooledDataVector) = cols(v, TreatmentContrast(v)) -termnames(term::Symbol, col::PooledDataArray) = termnames(term, col, TreatmentContrast(col)) - - ## Constructing a contrast from a non-pooled data vector will first pool it ## ## Also requires a cols method for non-PooledDataArray column... From 9eb4570084ac7569b586bbb284dce9be27ce19d3 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 5 Oct 2015 00:07:51 -0400 Subject: [PATCH 26/74] introduce ContrastMatrix type --- src/statsmodels/contrasts.jl | 108 ++++++++++++++++------------------- src/statsmodels/formula.jl | 5 +- test/contrasts.jl | 2 +- 3 files changed, 53 insertions(+), 62 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 35e4832dc3..a44636dd23 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -1,73 +1,70 @@ ## Specify contrasts for coding categorical data in model matrix. Contrast types -## are constructed with data and other options, creating a subtype of -## AbstractContrast. AbstractContrast provides the interface for creating modle -## matrix columns and coefficient names +## are a subtype of AbstractContrast. ContrastMatrix types hold a contrast +## matrix, levels, and term names and provide the interface for creating model +## matrix columns and coefficient names. ## -## ModelFrame will hold a Dict{Symbol, T<:AbstractContrast} that maps column +## Contrast types themselves can be instantiated to provide containers for +## contrast settings (currently, just the base level). +## +## ModelFrame will hold a Dict{Symbol, ContrastMatrix} that maps column ## names to contrasts. ## ## ModelMatrix will check this dict when evaluating terms, falling back to a ## default for any categorical data without a specified contrast. +## +## TODO: implement contrast types in Formula/Terms abstract AbstractContrast -termnames(term::Symbol, col::Any, contrast::AbstractContrast) = +## Contrast + Data = ContrastMatrix +type ContrastMatrix{T} + matrix::Matrix{Float64} + termnames::Vector{T} + levels::Vector{T} + contrasts::AbstractContrast +end + +function ContrastMatrix{T}(C::AbstractContrast, lvls::Vector{T}) + n = length(lvls) + n > 1 || error("not enough degrees of freedom to define contrasts") + (1 <= C.base <= n) || error("base = $(base) is not allowed for n = $n") + + not_base = [1:(C.base-1); (C.base+1):n] + tnames = lvls[not_base] + + mat = contrast_matrix(C, n) + + ContrastMatrix(mat, tnames, lvls, C) +end + +ContrastMatrix(C::AbstractContrast, v::PooledDataArray) = ContrastMatrix(C, levels(v)) + + +termnames(term::Symbol, col::Any, contrast::ContrastMatrix) = ["$term - $name" for name in contrast.termnames] -function cols(v::PooledDataVector, contrast::AbstractContrast) +function cols(v::PooledDataVector, contrast::ContrastMatrix) ## make sure the levels of the contrast matrix and the categorical data ## are the same by constructing a re-indexing vector. Indexing into ## reindex with v.refs will give the corresponding row number of the ## contrast matrix reindex = [findfirst(contrast.levels, l) for l in levels(v)] - - ## TODO: add kwarg for full-rank contrasts (e.g., when intercept isn't - ## specified in a model frame). - return contrast.matrix[reindex[v.refs], :] end -## Constructing a contrast from a non-pooled data vector will first pool it -## -## Also requires a cols method for non-PooledDataArray column... -## Base.call{T<: AbstractContrast}(C::Type{T}, v::DataVector, args...; kwargs...) = -## Base.call(C, pool(v), args...; kwargs...) - - ## Making a contrast type T only requires that there be a method for ## contrast_matrix(T, v::PooledDataArray). The rest is boilerplate. ## for contrastType in [:TreatmentContrast, :SumContrast, :HelmertContrast] @eval begin - type $contrastType{T} <: AbstractContrast + type $contrastType <: AbstractContrast base::Integer - matrix::Matrix{Float64} - termnames::Vector{T} - levels::Vector{T} - end - - function $contrastType{T}(v::PooledDataVector{T}; base::Integer=1) - lvls = levels(v) - - n = length(lvls) - n > 1 || error("not enough degrees of freedom to define contrasts") - (1 <= base <= n) || error("base = $(base) is not allowed for n = $n") - - not_base = [1:(base-1); (base+1):n] - tnames = lvls[not_base] - - mat = contrast_matrix($contrastType, n, base) - - return $contrastType{T}(base, mat, tnames, lvls) end + ## default base level is 1: + $contrastType(; base::Integer=1) = $contrastType(base) end end -## Could write this as a macro, too, so that people can register their own -## contrast types easily, without having to write out this boilerplate...the -## downside of that would be that they'd be locked in to a particular set of -## fields...although they could always just write the boilerplate themselves... - ################################################################################ ## Dummy contrasts (full rank) ## @@ -80,21 +77,15 @@ end ################################################################################ ## Dummy contrasts have no base level (since all levels produce a column) -type DummyContrast{T} <: AbstractContrast - matrix::Matrix{Float64} - termnames::Vector{T} - levels::Vector{T} +type DummyContrast <: AbstractContrast end -function DummyContrast{T}(v::PooledDataVector{T}) - lvls = levels(v) - mat = eye(Float64, length(lvls)) - - DummyContrast(mat, lvls, lvls) -end +ContrastMatrix{T}(C::DummyContrast, lvls::Vector{T}) = ContrastMatrix(eye(Float64, length(lvls)), lvls, lvls, C) ## Default for promoting contrasts to full rank is to convert to dummy contrasts -promote_contrast(C::AbstractContrast) = DummyContrast(eye(Float64, length(C.levels)), C.levels, C.levels) +## promote_contrast(C::AbstractContrast) = DummyContrast(eye(Float64, length(C.levels)), C.levels, C.levels) + +promote_contrast(C::ContrastMatrix) = ContrastMatrix(DummyContrast(), C.levels) @@ -103,7 +94,7 @@ promote_contrast(C::AbstractContrast) = DummyContrast(eye(Float64, length(C.leve ## Treatment (rank-reduced dummy-coded) contrast ################################################################################ -contrast_matrix(::Type{TreatmentContrast}, n, base) = eye(n)[:, [1:(base-1); (base+1):n]] +contrast_matrix(C::TreatmentContrast, n) = eye(n)[:, [1:(C.base-1); (C.base+1):n]] ################################################################################ @@ -112,14 +103,13 @@ contrast_matrix(::Type{TreatmentContrast}, n, base) = eye(n)[:, [1:(base-1); (ba ## -1 for base level and +1 for contrast level. ################################################################################ -function contrast_matrix(::Type{SumContrast}, n, base) - not_base = [1:(base-1); (base+1):n] +function contrast_matrix(C::SumContrast, n) + not_base = [1:(C.base-1); (C.base+1):n] mat = eye(n)[:, not_base] - mat[base, :] = -1 + mat[C.base, :] = -1 return mat end - ################################################################################ ## Helmert-coded contrast ## @@ -138,7 +128,7 @@ end ## orthogonal and with mean 0. ################################################################################ -function contrast_matrix(::Type{HelmertContrast}, n, base) +function contrast_matrix(C::HelmertContrast, n) mat = zeros(n, n-1) for i in 1:n-1 mat[1:i, i] = -1 @@ -146,7 +136,7 @@ function contrast_matrix(::Type{HelmertContrast}, n, base) end ## re-shuffle the rows such that base is the all -1.0 row (currently first) - mat = mat[[base; 1:(base-1); (base+1):end], :] + mat = mat[[C.base; 1:(C.base-1); (C.base+1):end], :] return mat end diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 8083bad7ba..4558affc9e 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -308,8 +308,9 @@ end ## Goal here is to allow specification of _either_ a "naked" contrast type, ## or an instantiated contrast object itself. This might be achieved in a more ## julian way by overloading call for c::AbstractContrast to just return c. -evaluateContrast(c::AbstractContrast, col::AbstractDataVector) = c -evaluateContrast{C <: AbstractContrast}(c::Type{C}, col::AbstractDataVector) = C(col) +evaluateContrast(c::AbstractContrast, col::AbstractDataVector) = ContrastMatrix(c, col) +evaluateContrast{C <: AbstractContrast}(c::Type{C}, col::AbstractDataVector) = ContrastMatrix(c(), col) +evaluateContrast(c::ContrastMatrix, col::AbstractDataVector) = c needsContrasts(::PooledDataArray) = true needsContrasts(::Any) = false diff --git a/test/contrasts.jl b/test/contrasts.jl index d63621af10..77af3629a8 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -26,7 +26,7 @@ contrast!(mf, x = SumContrast) @test coefnames(mf) == ["(Intercept)"; "x - b"; "x - c"] ## change base level of contrast -contrast!(mf, x = SumContrast(d[:x]; base = 2)) +contrast!(mf, x = SumContrast(base = 2)) @test ModelMatrix(mf).m == [1 1 0 1 -1 -1 1 0 1 From 03303ef62a0a66d158b1e778f20bdd890b342e48 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 9 Oct 2015 18:43:12 -0400 Subject: [PATCH 27/74] add @compat on Dicts --- src/statsmodels/formula.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 4558affc9e..5998791823 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -400,7 +400,7 @@ end function ModelMatrix(mf::ModelFrame) ## Map eval. term name + redundancy bool to cached model matrix columns - eterm_cols = Dict{Tuple{Symbol,Bool}, Array{Float64}}() + eterm_cols = @compat Dict{Tuple{Symbol,Bool}, Array{Float64}}() ## Accumulator for each term's vector of eval. term columns. mm_cols = Vector{Array{Float64}}[] mf.terms.intercept && push!(mm_cols, Matrix{Float64}[ones(Float64, size(mf.df,1), 1)]) @@ -464,7 +464,7 @@ end function coefnames(mf::ModelFrame) ## strategy mirrors ModelMatrx constructor: - eterm_names = Dict{Tuple{Symbol,Bool}, Vector{Compat.UTF8String}}() + eterm_names = @compat Dict{Tuple{Symbol,Bool}, Vector{Compat.UTF8String}}() term_names = Vector{Vector{Compat.UTF8String}}[] mf.terms.intercept && push!(term_names, Vector[Compat.UTF8String["(Intercept)"]]) From b2c0475c08a16d4bdc66e7089e4ffd8451a2b2c0 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 30 Apr 2016 00:23:38 -0400 Subject: [PATCH 28/74] optionally specify levels and base level --- src/statsmodels/contrasts.jl | 62 +++++++++++++++++++++++++++--------- test/contrasts.jl | 13 +++++++- 2 files changed, 59 insertions(+), 16 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index a44636dd23..8059ad51ec 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -25,17 +25,37 @@ type ContrastMatrix{T} end function ContrastMatrix{T}(C::AbstractContrast, lvls::Vector{T}) - n = length(lvls) - n > 1 || error("not enough degrees of freedom to define contrasts") - (1 <= C.base <= n) || error("base = $(base) is not allowed for n = $n") - not_base = [1:(C.base-1); (C.base+1):n] + ## if levels are defined on C, use those, validating that they line up. + ## what does that mean? + ## + ## C.levels == lvls (best case) + ## C.levels < lvls (will leave out some data...okay? will have empty ROWS) + ## C.levels > lvls (will have empty columns. not okay.) + ## empty intersection (worst case) + c_lvls = get(C.levels, lvls) + missing_lvls = setdiff(c_lvls, lvls) + isempty(missing_lvls) || error("Contrast levels not found in data: ", missing_lvls) + + n = length(c_lvls) + n > 1 || error("not enough degrees of freedom to define contrasts") + + ## find index of base level. use C.base, then C.baseind, then default (1). + if isnull(C.base) + baseind = get(C.baseind, 1) + (1 <= baseind <= n) || error("base = $(baseind) is not allowed for $n levels") + else + baseind = findfirst(c_lvls, get(C.base)) + baseind > 0 || error("Base level $(C.base) not found in levels") + end + + not_base = [1:(baseind-1); (baseind+1):n] tnames = lvls[not_base] - mat = contrast_matrix(C, n) + mat = contrast_matrix(C, baseind, n) ContrastMatrix(mat, tnames, lvls, C) -end +end ContrastMatrix(C::AbstractContrast, v::PooledDataArray) = ContrastMatrix(C, levels(v)) @@ -52,16 +72,28 @@ function cols(v::PooledDataVector, contrast::ContrastMatrix) return contrast.matrix[reindex[v.refs], :] end + +nullify(x::Nullable) = x +nullify(x) = Nullable(x) + ## Making a contrast type T only requires that there be a method for ## contrast_matrix(T, v::PooledDataArray). The rest is boilerplate. ## for contrastType in [:TreatmentContrast, :SumContrast, :HelmertContrast] @eval begin type $contrastType <: AbstractContrast - base::Integer + base::Nullable{Any} + baseind::Nullable{Integer} + levels::Nullable{Vector} end - ## default base level is 1: - $contrastType(; base::Integer=1) = $contrastType(base) + ## constructor with optional keyword arguments, defaulting to Nullables + $contrastType(; + base=Nullable{Any}(), + baseind=Nullable{Integer}(), + levels=Nullable{Vector}()) = + $contrastType(nullify(base), + nullify(baseind), + nullify(levels)) end end @@ -94,7 +126,7 @@ promote_contrast(C::ContrastMatrix) = ContrastMatrix(DummyContrast(), C.levels) ## Treatment (rank-reduced dummy-coded) contrast ################################################################################ -contrast_matrix(C::TreatmentContrast, n) = eye(n)[:, [1:(C.base-1); (C.base+1):n]] +contrast_matrix(C::TreatmentContrast, baseind, n) = eye(n)[:, [1:(baseind-1); (baseind+1):n]] ################################################################################ @@ -103,10 +135,10 @@ contrast_matrix(C::TreatmentContrast, n) = eye(n)[:, [1:(C.base-1); (C.base+1):n ## -1 for base level and +1 for contrast level. ################################################################################ -function contrast_matrix(C::SumContrast, n) - not_base = [1:(C.base-1); (C.base+1):n] +function contrast_matrix(C::SumContrast, baseind, n) + not_base = [1:(baseind-1); (baseind+1):n] mat = eye(n)[:, not_base] - mat[C.base, :] = -1 + mat[baseind, :] = -1 return mat end @@ -128,7 +160,7 @@ end ## orthogonal and with mean 0. ################################################################################ -function contrast_matrix(C::HelmertContrast, n) +function contrast_matrix(C::HelmertContrast, baseind, n) mat = zeros(n, n-1) for i in 1:n-1 mat[1:i, i] = -1 @@ -136,7 +168,7 @@ function contrast_matrix(C::HelmertContrast, n) end ## re-shuffle the rows such that base is the all -1.0 row (currently first) - mat = mat[[C.base; 1:(C.base-1); (C.base+1):end], :] + mat = mat[[baseind; 1:(baseind-1); (baseind+1):end], :] return mat end diff --git a/test/contrasts.jl b/test/contrasts.jl index 77af3629a8..87697d3dd6 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -26,7 +26,7 @@ contrast!(mf, x = SumContrast) @test coefnames(mf) == ["(Intercept)"; "x - b"; "x - c"] ## change base level of contrast -contrast!(mf, x = SumContrast(base = 2)) +contrast!(mf, x = SumContrast(baseind = 2)) @test ModelMatrix(mf).m == [1 1 0 1 -1 -1 1 0 1 @@ -35,6 +35,17 @@ contrast!(mf, x = SumContrast(base = 2)) 1 -1 -1] @test coefnames(mf) == ["(Intercept)"; "x - a"; "x - c"] +## change base level of contrast +contrast!(mf, x = SumContrast(base = :b)) +@test ModelMatrix(mf).m == [1 1 0 + 1 -1 -1 + 1 0 1 + 1 1 0 + 1 1 0 + 1 -1 -1] +@test coefnames(mf) == ["(Intercept)"; "x - a"; "x - c"] + + contrast!(mf, x = HelmertContrast) @test ModelMatrix(mf).m == [1 -1 -1 1 1 -1 From c3eae6e3a4c0c60e05727c99ef5072f1a962b70f Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 30 Apr 2016 16:27:05 -0400 Subject: [PATCH 29/74] actually used specified levels (+tests) --- src/statsmodels/contrasts.jl | 4 ++-- test/contrasts.jl | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 8059ad51ec..b9a026e114 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -50,11 +50,11 @@ function ContrastMatrix{T}(C::AbstractContrast, lvls::Vector{T}) end not_base = [1:(baseind-1); (baseind+1):n] - tnames = lvls[not_base] + tnames = c_lvls[not_base] mat = contrast_matrix(C, baseind, n) - ContrastMatrix(mat, tnames, lvls, C) + ContrastMatrix(mat, tnames, c_lvls, C) end ContrastMatrix(C::AbstractContrast, v::PooledDataArray) = ContrastMatrix(C, levels(v)) diff --git a/test/contrasts.jl b/test/contrasts.jl index 87697d3dd6..dea56dbe05 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -45,6 +45,27 @@ contrast!(mf, x = SumContrast(base = :b)) 1 -1 -1] @test coefnames(mf) == ["(Intercept)"; "x - a"; "x - c"] +## change levels of contrast +contrast!(mf, x = SumContrast(levels = [:c, :b, :a])) +@test ModelMatrix(mf).m == [1 0 1 + 1 1 0 + 1 -1 -1 + 1 0 1 + 1 0 1 + 1 1 0] +@test coefnames(mf) == ["(Intercept)"; "x - b"; "x - a"] + + +## change levels and base level of contrast +contrast!(mf, x = SumContrast(levels = [:c, :b, :a], base = :a)) +@test ModelMatrix(mf).m == [1 -1 -1 + 1 0 1 + 1 1 0 + 1 -1 -1 + 1 -1 -1 + 1 0 1] +@test coefnames(mf) == ["(Intercept)"; "x - c"; "x - b"] + contrast!(mf, x = HelmertContrast) @test ModelMatrix(mf).m == [1 -1 -1 From 1e05fd0d4a0c88d32b3e4a4f6001ab336b705f3c Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 30 Apr 2016 16:50:34 -0400 Subject: [PATCH 30/74] actually, don't allow baseind in Contrast(...) --- src/statsmodels/contrasts.jl | 12 ++---------- test/contrasts.jl | 10 ---------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index b9a026e114..e71e3def87 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -41,13 +41,8 @@ function ContrastMatrix{T}(C::AbstractContrast, lvls::Vector{T}) n > 1 || error("not enough degrees of freedom to define contrasts") ## find index of base level. use C.base, then C.baseind, then default (1). - if isnull(C.base) - baseind = get(C.baseind, 1) - (1 <= baseind <= n) || error("base = $(baseind) is not allowed for $n levels") - else - baseind = findfirst(c_lvls, get(C.base)) - baseind > 0 || error("Base level $(C.base) not found in levels") - end + baseind = isnull(C.base) ? 1 : findfirst(c_lvls, get(C.base)) + baseind > 0 || error("Base level $(C.base) not found in levels") not_base = [1:(baseind-1); (baseind+1):n] tnames = c_lvls[not_base] @@ -83,16 +78,13 @@ for contrastType in [:TreatmentContrast, :SumContrast, :HelmertContrast] @eval begin type $contrastType <: AbstractContrast base::Nullable{Any} - baseind::Nullable{Integer} levels::Nullable{Vector} end ## constructor with optional keyword arguments, defaulting to Nullables $contrastType(; base=Nullable{Any}(), - baseind=Nullable{Integer}(), levels=Nullable{Vector}()) = $contrastType(nullify(base), - nullify(baseind), nullify(levels)) end end diff --git a/test/contrasts.jl b/test/contrasts.jl index dea56dbe05..8ec792e154 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -25,16 +25,6 @@ contrast!(mf, x = SumContrast) 1 1 0] @test coefnames(mf) == ["(Intercept)"; "x - b"; "x - c"] -## change base level of contrast -contrast!(mf, x = SumContrast(baseind = 2)) -@test ModelMatrix(mf).m == [1 1 0 - 1 -1 -1 - 1 0 1 - 1 1 0 - 1 1 0 - 1 -1 -1] -@test coefnames(mf) == ["(Intercept)"; "x - a"; "x - c"] - ## change base level of contrast contrast!(mf, x = SumContrast(base = :b)) @test ModelMatrix(mf).m == [1 1 0 From 0b87e93481a373128306dfaeb46e26e4c4d4b07a Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 30 Apr 2016 17:23:21 -0400 Subject: [PATCH 31/74] docstrings for contrasts --- src/statsmodels/contrasts.jl | 124 +++++++++++++++++++++++++---------- 1 file changed, 88 insertions(+), 36 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index e71e3def87..e2f55fd1dc 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -14,6 +14,28 @@ ## ## TODO: implement contrast types in Formula/Terms + +""" + AbstractContrast(; base::Any=NULL, levels::Vector=NULL) + +Interface to describe contrast coding schemes for categorical variables. + +Concrete subtypes of `AbstractContrast` describe a particular way of converting a +categorical column in a `DataFrame` into numeric columns in a `ModelMatrix`. Each +instantiation optionally includes the levels to generate columns for and the base +level. If not specified these will be taken from the data when a `ContrastMatrix` is +generated (during `ModelFrame` construction). + +# Arguments + +* `levels::Nullable{Vector}=NULL`: If specified, will be checked against data when + generating a `ContrastMatrix`. Levels that are specified here but missing in the + data will result in an error, because this would lead to empty columns in the + resulting ModelMatrix. +* `base::Nullable{Any}=NULL`: The base level of the contrast. The + actual interpretation of this depends on the particular contrast type, but in + general it can be thought of as a "reference" level. +""" abstract AbstractContrast ## Contrast + Data = ContrastMatrix @@ -24,6 +46,14 @@ type ContrastMatrix{T} contrasts::AbstractContrast end +""" + ContrastMatrix{T}(::AbstractContrast, levels::Vector{T}) + +Instantiate contrasts matrix for given data (categorical levels) + +If levels are specified in the AbstractContrast, those will be used, and likewise +for the base level (which defaults to the first level). +""" function ContrastMatrix{T}(C::AbstractContrast, lvls::Vector{T}) ## if levels are defined on C, use those, validating that they line up. @@ -58,6 +88,12 @@ ContrastMatrix(C::AbstractContrast, v::PooledDataArray) = ContrastMatrix(C, leve termnames(term::Symbol, col::Any, contrast::ContrastMatrix) = ["$term - $name" for name in contrast.termnames] +""" + cols(v::PooledDataVector, contrast::ContrastMatrix) + +Construct `ModelMatrix` columns based on specified contrasts, ensuring that +levels align properly. +""" function cols(v::PooledDataVector, contrast::ContrastMatrix) ## make sure the levels of the contrast matrix and the categorical data ## are the same by constructing a re-indexing vector. Indexing into @@ -89,19 +125,19 @@ for contrastType in [:TreatmentContrast, :SumContrast, :HelmertContrast] end end -################################################################################ -## Dummy contrasts (full rank) -## -## Needed when a term is non-redundant with lower-order terms (e.g., in ~0+x vs. -## ~1+x, or in the interactions terms in ~1+x+x&y vs. ~1+x+y+x&y. In the -## non-redundant cases, we can (probably) expand x into length(levels(x)) -## columns without creating a non-identifiable model matrix (unless the user -## has done something dumb in specifying the model, which we can't do much about -## anyway). -################################################################################ +""" + DummyContrast -## Dummy contrasts have no base level (since all levels produce a column) +One indicator (1 or 0) column for each level, __including__ the base level. + +Needed internally when a term is non-redundant with lower-order terms (e.g., in +~0+x vs. ~1+x, or in the interactions terms in ~1+x+x&y vs. ~1+x+y+x&y. In the +non-redundant cases, we can (probably) expand x into length(levels(x)) columns +without creating a non-identifiable model matrix (unless the user has done +something dumb in specifying the model, which we can't do much about anyway). +""" type DummyContrast <: AbstractContrast +## Dummy contrasts have no base level (since all levels produce a column) end ContrastMatrix{T}(C::DummyContrast, lvls::Vector{T}) = ContrastMatrix(eye(Float64, length(lvls)), lvls, lvls, C) @@ -109,23 +145,36 @@ ContrastMatrix{T}(C::DummyContrast, lvls::Vector{T}) = ContrastMatrix(eye(Float6 ## Default for promoting contrasts to full rank is to convert to dummy contrasts ## promote_contrast(C::AbstractContrast) = DummyContrast(eye(Float64, length(C.levels)), C.levels, C.levels) +"Promote a contrast to full rank" promote_contrast(C::ContrastMatrix) = ContrastMatrix(DummyContrast(), C.levels) -################################################################################ -## Treatment (rank-reduced dummy-coded) contrast -################################################################################ +""" + TreatmentContrast + +One indicator column (1 or 0) for each non-base level. + +The default in R. Columns have non-zero mean and are collinear with an intercept +column (and lower-order columns for interactions) but are orthogonal to each other. +""" +TreatmentContrast contrast_matrix(C::TreatmentContrast, baseind, n) = eye(n)[:, [1:(baseind-1); (baseind+1):n]] -################################################################################ -## Sum-coded contrast -## -## -1 for base level and +1 for contrast level. -################################################################################ +""" + SumContrast + +Column for level `x` of column `col` is 1 where `col .== x` and -1 where +`col .== base`. + +Produces mean-0 (centered) columns _only_ when all levels are equally frequent. +But with more than two levels, the generated columns are guaranteed to be non- +orthogonal (so beware of collinearity). +""" +SumContrast function contrast_matrix(C::SumContrast, baseind, n) not_base = [1:(baseind-1); (baseind+1):n] @@ -134,23 +183,26 @@ function contrast_matrix(C::SumContrast, baseind, n) return mat end -################################################################################ -## Helmert-coded contrast -## -## -1 for each of n levels below contrast, n for contrast level, and 0 above. -## Produces something like: -## -## [-1 -1 -1 -## 1 -1 -1 -## 0 2 -1 -## 0 0 3] -## -## Interpretation is that the nth contrast column is the difference between -## level n+1 and the average of levels 1:n -## -## Has the nice property of each column in the resulting model matrix being -## orthogonal and with mean 0. -################################################################################ +""" + HelmertContrast + +Produces contrast columns with -1 for each of n levels below contrast, n for +contrast level, and 0 above. Produces something like: + +``` +[-1 -1 -1 + 1 -1 -1 + 0 2 -1 + 0 0 3] +``` + +This is a good choice when you have more than two levels that are equally frequent. +Interpretation is that the nth contrast column is the difference between +level n+1 and the average of levels 1 to n. When balanced, it has the +nice property of each column in the resulting model matrix being orthogonal and +with mean 0. +""" +HelmertContrast function contrast_matrix(C::HelmertContrast, baseind, n) mat = zeros(n, n-1) From ca5be00747393d48160a2240c1aa96a234ee891f Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 2 May 2016 22:12:10 -0400 Subject: [PATCH 32/74] settled on behavior for mismatched levels --- src/statsmodels/contrasts.jl | 13 +++++++------ test/contrasts.jl | 6 ++++++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index e2f55fd1dc..a0e65324c3 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -60,17 +60,18 @@ function ContrastMatrix{T}(C::AbstractContrast, lvls::Vector{T}) ## what does that mean? ## ## C.levels == lvls (best case) - ## C.levels < lvls (will leave out some data...okay? will have empty ROWS) - ## C.levels > lvls (will have empty columns. not okay.) - ## empty intersection (worst case) + ## data levels missing from contrast: would generate empty/undefined rows. + ## better to filter data frame first + ## contrast levels missing from data: would have empty columns, generate a + ## rank-deficient model matrix. c_lvls = get(C.levels, lvls) - missing_lvls = setdiff(c_lvls, lvls) - isempty(missing_lvls) || error("Contrast levels not found in data: ", missing_lvls) + mismatched_lvls = symdiff(c_lvls, lvls) + isempty(mismatched_lvls) || error("Contrast levels not found in data or vice-versa: ", mismatched_lvls) n = length(c_lvls) n > 1 || error("not enough degrees of freedom to define contrasts") - ## find index of base level. use C.base, then C.baseind, then default (1). + ## find index of base level. use C.base, then default (1). baseind = isnull(C.base) ? 1 : findfirst(c_lvls, get(C.base)) baseind > 0 || error("Base level $(C.base) not found in levels") diff --git a/test/contrasts.jl b/test/contrasts.jl index 8ec792e154..9508eb3209 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -56,7 +56,13 @@ contrast!(mf, x = SumContrast(levels = [:c, :b, :a], base = :a)) 1 0 1] @test coefnames(mf) == ["(Intercept)"; "x - c"; "x - b"] +## restricting to only a subset of levels +@test_throws ErrorException contrast!(mf, x = SumContrast(levels = [:a, :b])) +## asking for levels that are not in the data raises an error +@test_throws ErrorException contrast!(mf, x = SumContrast(levels = [:a, :b, :c, :d])) + +## Helmert coded contrasts contrast!(mf, x = HelmertContrast) @test ModelMatrix(mf).m == [1 -1 -1 1 1 -1 From 74eed8356bd70e0c997b39946c0d25297df13759 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 2 May 2016 22:29:45 -0400 Subject: [PATCH 33/74] coerce levels when constructing ContrastMatrix --- src/statsmodels/contrasts.jl | 6 +++--- test/contrasts.jl | 21 ++++++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index a0e65324c3..583d7b6e6d 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -51,7 +51,7 @@ end Instantiate contrasts matrix for given data (categorical levels) -If levels are specified in the AbstractContrast, those will be used, and likewise +If levels are specified in the `AbstractContrast`, those will be used, and likewise for the base level (which defaults to the first level). """ function ContrastMatrix{T}(C::AbstractContrast, lvls::Vector{T}) @@ -64,7 +64,7 @@ function ContrastMatrix{T}(C::AbstractContrast, lvls::Vector{T}) ## better to filter data frame first ## contrast levels missing from data: would have empty columns, generate a ## rank-deficient model matrix. - c_lvls = get(C.levels, lvls) + c_lvls = convert(typeof(lvls), get(C.levels, lvls)) mismatched_lvls = symdiff(c_lvls, lvls) isempty(mismatched_lvls) || error("Contrast levels not found in data or vice-versa: ", mismatched_lvls) @@ -72,7 +72,7 @@ function ContrastMatrix{T}(C::AbstractContrast, lvls::Vector{T}) n > 1 || error("not enough degrees of freedom to define contrasts") ## find index of base level. use C.base, then default (1). - baseind = isnull(C.base) ? 1 : findfirst(c_lvls, get(C.base)) + baseind = isnull(C.base) ? 1 : findfirst(c_lvls, convert(eltype(lvls), get(C.base))) baseind > 0 || error("Base level $(C.base) not found in levels") not_base = [1:(baseind-1); (baseind+1):n] diff --git a/test/contrasts.jl b/test/contrasts.jl index 9508eb3209..6cddb4bed4 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -56,12 +56,6 @@ contrast!(mf, x = SumContrast(levels = [:c, :b, :a], base = :a)) 1 0 1] @test coefnames(mf) == ["(Intercept)"; "x - c"; "x - b"] -## restricting to only a subset of levels -@test_throws ErrorException contrast!(mf, x = SumContrast(levels = [:a, :b])) - -## asking for levels that are not in the data raises an error -@test_throws ErrorException contrast!(mf, x = SumContrast(levels = [:a, :b, :c, :d])) - ## Helmert coded contrasts contrast!(mf, x = HelmertContrast) @test ModelMatrix(mf).m == [1 -1 -1 @@ -72,7 +66,12 @@ contrast!(mf, x = HelmertContrast) 1 1 -1] @test coefnames(mf) == ["(Intercept)"; "x - b"; "x - c"] -## test for missing data (and when it clobbers one of the levels) +## Types for contrast levels are coerced to data levels when constructing +## ContrastMatrix +contrast!(mf, x = SumContrast(levels = ["a", "b", "c"])) +@test mf.contrasts[:x].levels == levels(d[:x]) + +## Missing data is handled gracefully, dropping columns when a level is lost d[3, :x] = NA mf = ModelFrame(Formula(Nothing(), :x), d, contrasts = [:x => SumContrast]) @test ModelMatrix(mf).m == [1 -1 @@ -82,4 +81,12 @@ mf = ModelFrame(Formula(Nothing(), :x), d, contrasts = [:x => SumContrast]) 1 1] @test coefnames(mf) == ["(Intercept)"; "x - b"] +## Things that are bad to do: +## Applying a contrast that only has a subset of data levels: +@test_throws ErrorException contrast!(mf, x = SumContrast(levels = [:a, :b])) +## Applying a contrast that expects levels not found in data: +@test_throws ErrorException contrast!(mf, x = SumContrast(levels = [:a, :b, :c, :d])) + + + end From 5415c3b7a884708b4b6add10babfe21ebee974cb Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 2 May 2016 22:32:21 -0400 Subject: [PATCH 34/74] move cols and termnames to be w/ friends --- src/statsmodels/contrasts.jl | 19 ------------------- src/statsmodels/formula.jl | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 583d7b6e6d..54a4b37245 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -86,25 +86,6 @@ end ContrastMatrix(C::AbstractContrast, v::PooledDataArray) = ContrastMatrix(C, levels(v)) -termnames(term::Symbol, col::Any, contrast::ContrastMatrix) = - ["$term - $name" for name in contrast.termnames] - -""" - cols(v::PooledDataVector, contrast::ContrastMatrix) - -Construct `ModelMatrix` columns based on specified contrasts, ensuring that -levels align properly. -""" -function cols(v::PooledDataVector, contrast::ContrastMatrix) - ## make sure the levels of the contrast matrix and the categorical data - ## are the same by constructing a re-indexing vector. Indexing into - ## reindex with v.refs will give the corresponding row number of the - ## contrast matrix - reindex = [findfirst(contrast.levels, l) for l in levels(v)] - return contrast.matrix[reindex[v.refs], :] -end - - nullify(x::Nullable) = x nullify(x) = Nullable(x) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 5998791823..0e397ede25 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -370,6 +370,22 @@ function cols(name::Symbol, mf::ModelFrame; non_redundant::Bool = false) end end +""" + cols(v::PooledDataVector, contrast::ContrastMatrix) + +Construct `ModelMatrix` columns based on specified contrasts, ensuring that +levels align properly. +""" +function cols(v::PooledDataVector, contrast::ContrastMatrix) + ## make sure the levels of the contrast matrix and the categorical data + ## are the same by constructing a re-indexing vector. Indexing into + ## reindex with v.refs will give the corresponding row number of the + ## contrast matrix + reindex = [findfirst(contrast.levels, l) for l in levels(v)] + return contrast.matrix[reindex[v.refs], :] +end + + function isfe(ex::Expr) # true for fixed-effects terms if ex.head != :call error("Non-call expression encountered") end ex.args[1] != :| @@ -450,6 +466,9 @@ function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false) termnames(term, mf.df[term]) end end +termnames(term::Symbol, col::Any, contrast::ContrastMatrix) = + ["$term - $name" for name in contrast.termnames] + function expandtermnames(term::Vector) if length(term) == 1 From b09b8478926f81df9699c9615cbd305990f95e32 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 2 May 2016 22:58:25 -0400 Subject: [PATCH 35/74] use "x: a" instead of "x - a" for termnames --- src/statsmodels/formula.jl | 2 +- test/contrasts.jl | 26 +++++++++++++------------- test/formula.jl | 12 ++++++------ test/statsmodel.jl | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 0e397ede25..a718e89b58 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -467,7 +467,7 @@ function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false) end end termnames(term::Symbol, col::Any, contrast::ContrastMatrix) = - ["$term - $name" for name in contrast.termnames] + ["$term: $name" for name in contrast.termnames] function expandtermnames(term::Vector) diff --git a/test/contrasts.jl b/test/contrasts.jl index 6cddb4bed4..f2f57cfec3 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -14,7 +14,7 @@ mf = ModelFrame(Formula(Nothing(), :x), d) 1 0 0 1 0 0 1 1 0] -@test coefnames(mf) == ["(Intercept)"; "x - b"; "x - c"] +@test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] contrast!(mf, x = SumContrast) @test ModelMatrix(mf).m == [1 -1 -1 @@ -23,7 +23,7 @@ contrast!(mf, x = SumContrast) 1 -1 -1 1 -1 -1 1 1 0] -@test coefnames(mf) == ["(Intercept)"; "x - b"; "x - c"] +@test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] ## change base level of contrast contrast!(mf, x = SumContrast(base = :b)) @@ -33,7 +33,7 @@ contrast!(mf, x = SumContrast(base = :b)) 1 1 0 1 1 0 1 -1 -1] -@test coefnames(mf) == ["(Intercept)"; "x - a"; "x - c"] +@test coefnames(mf) == ["(Intercept)"; "x: a"; "x: c"] ## change levels of contrast contrast!(mf, x = SumContrast(levels = [:c, :b, :a])) @@ -43,7 +43,7 @@ contrast!(mf, x = SumContrast(levels = [:c, :b, :a])) 1 0 1 1 0 1 1 1 0] -@test coefnames(mf) == ["(Intercept)"; "x - b"; "x - a"] +@test coefnames(mf) == ["(Intercept)"; "x: b"; "x: a"] ## change levels and base level of contrast @@ -54,7 +54,7 @@ contrast!(mf, x = SumContrast(levels = [:c, :b, :a], base = :a)) 1 -1 -1 1 -1 -1 1 0 1] -@test coefnames(mf) == ["(Intercept)"; "x - c"; "x - b"] +@test coefnames(mf) == ["(Intercept)"; "x: c"; "x: b"] ## Helmert coded contrasts contrast!(mf, x = HelmertContrast) @@ -64,7 +64,7 @@ contrast!(mf, x = HelmertContrast) 1 -1 -1 1 -1 -1 1 1 -1] -@test coefnames(mf) == ["(Intercept)"; "x - b"; "x - c"] +@test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] ## Types for contrast levels are coerced to data levels when constructing ## ContrastMatrix @@ -73,13 +73,13 @@ contrast!(mf, x = SumContrast(levels = ["a", "b", "c"])) ## Missing data is handled gracefully, dropping columns when a level is lost d[3, :x] = NA -mf = ModelFrame(Formula(Nothing(), :x), d, contrasts = [:x => SumContrast]) -@test ModelMatrix(mf).m == [1 -1 - 1 1 - 1 -1 - 1 -1 - 1 1] -@test coefnames(mf) == ["(Intercept)"; "x - b"] +mf_missing = ModelFrame(Formula(Nothing(), :x), d, contrasts = [:x => SumContrast]) +@test ModelMatrix(mf_missing).m == [1 -1 + 1 1 + 1 -1 + 1 -1 + 1 1] +@test coefnames(mf_missing) == ["(Intercept)"; "x: b"] ## Things that are bad to do: ## Applying a contrast that only has a subset of data levels: diff --git a/test/formula.jl b/test/formula.jl index 84e60001c0..f98917009f 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -135,7 +135,7 @@ module TestFormula @test mm.m[:,2] == [0, 1., 0, 0] @test mm.m[:,3] == [0, 0, 1., 0] @test mm.m[:,4] == [0, 0, 0, 1.] - @test coefnames(mf)[2:end] == ["x1p - 6", "x1p - 7", "x1p - 8"] + @test coefnames(mf)[2:end] == ["x1p: 6", "x1p: 7", "x1p: 8"] #test_group("create a design matrix from interactions from two DataFrames") ## this was removed in commit dead4562506badd7e84a2367086f5753fa49bb6a @@ -324,7 +324,7 @@ module TestFormula ## f = y ~ x1 & x2 & x3 ## mf = ModelFrame(f, df) ## @test coefnames(mf)[2:end] == - ## vec([string("x1 - ", x, " & x2 - ", y, " & x3 - ", z) for + ## vec([string("x1: ", x, " & x2: ", y, " & x3: ", z) for ## x in 2:3, ## y in 5:6, ## z in 8:9]) @@ -381,7 +381,7 @@ d[:n] = 1.:8 ## No intercept mf = ModelFrame(n ~ 0 + x, d[1:2,:], contrasts=cs) @test ModelMatrix(mf).m == [1 0; 0 1] -@test coefnames(mf) == ["x - a", "x - b"] +@test coefnames(mf) == ["x: a", "x: b"] ## No first-order term for interaction mf = ModelFrame(n ~ 1 + x + x&y, d[1:4, :], contrasts=cs) @@ -389,7 +389,7 @@ mf = ModelFrame(n ~ 1 + x + x&y, d[1:4, :], contrasts=cs) 1 0 -1 -1 1 0 1 0 1] -@test coefnames(mf) == ["(Intercept)", "x - b", "x - a & y - d", "x - b & y - d"] +@test coefnames(mf) == ["(Intercept)", "x: b", "x: a & y: d", "x: b & y: d"] ## When both terms of interaction are non-redundant: mf = ModelFrame(n ~ 0 + x&y, d[1:4, :], contrasts=cs) @@ -397,8 +397,8 @@ mf = ModelFrame(n ~ 0 + x&y, d[1:4, :], contrasts=cs) 0 1 0 0 0 0 1 0 0 0 0 1] -@test coefnames(mf) == ["x - a & y - c", "x - b & y - c", - "x - a & y - d", "x - b & y - d"] +@test coefnames(mf) == ["x: a & y: c", "x: b & y: c", + "x: a & y: d", "x: b & y: d"] diff --git a/test/statsmodel.jl b/test/statsmodel.jl index 4e0a5a529c..395072ca9d 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -64,7 +64,7 @@ d[:x1p] = PooledDataArray(d[:x1]) f2 = y ~ x1p m2 = fit(DummyMod, f2, d) -@test coeftable(m2).rownms == ["(Intercept)", "x1p - 6", "x1p - 7", "x1p - 8"] +@test coeftable(m2).rownms == ["(Intercept)", "x1p: 6", "x1p: 7", "x1p: 8"] ## predict w/ new data missing levels @test predict(m2, d[2:4, :]) == predict(m2)[2:4] From 04c702ed82403826135ddb77abd05edb38e52f2f Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 2 May 2016 23:13:59 -0400 Subject: [PATCH 36/74] Contrast -> Contrasts --- src/DataFrames.jl | 8 ++-- src/statsmodels/contrasts.jl | 74 ++++++++++++++++++------------------ src/statsmodels/formula.jl | 26 ++++++------- test/contrasts.jl | 24 ++++++------ test/formula.jl | 2 +- 5 files changed, 68 insertions(+), 66 deletions(-) diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 139c57f1ad..864dd7b254 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -35,7 +35,7 @@ export @~, @wsv_str, AbstractDataFrame, - AbstractContrast, + AbstractContrasts, DataFrame, DataFrameRow, Formula, @@ -44,9 +44,9 @@ export @~, ModelFrame, ModelMatrix, SubDataFrame, - SumContrast, - TreatmentContrast, - HelmertContrast, + SumContrasts, + TreatmentContrasts, + HelmertContrasts, aggregate, by, diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 54a4b37245..5411f71840 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -1,60 +1,62 @@ -## Specify contrasts for coding categorical data in model matrix. Contrast types -## are a subtype of AbstractContrast. ContrastMatrix types hold a contrast +## Specify contrasts for coding categorical data in model matrix. Contrasts types +## are a subtype of AbstractContrasts. ContrastsMatrix types hold a contrast ## matrix, levels, and term names and provide the interface for creating model ## matrix columns and coefficient names. ## -## Contrast types themselves can be instantiated to provide containers for +## Contrasts types themselves can be instantiated to provide containers for ## contrast settings (currently, just the base level). ## -## ModelFrame will hold a Dict{Symbol, ContrastMatrix} that maps column +## ModelFrame will hold a Dict{Symbol, ContrastsMatrix} that maps column ## names to contrasts. ## ## ModelMatrix will check this dict when evaluating terms, falling back to a ## default for any categorical data without a specified contrast. ## ## TODO: implement contrast types in Formula/Terms +## +## Dave Kleinschmidt 2015-2016 """ - AbstractContrast(; base::Any=NULL, levels::Vector=NULL) + AbstractContrasts(; base::Any=NULL, levels::Vector=NULL) Interface to describe contrast coding schemes for categorical variables. -Concrete subtypes of `AbstractContrast` describe a particular way of converting a +Concrete subtypes of `AbstractContrasts` describe a particular way of converting a categorical column in a `DataFrame` into numeric columns in a `ModelMatrix`. Each instantiation optionally includes the levels to generate columns for and the base -level. If not specified these will be taken from the data when a `ContrastMatrix` is +level. If not specified these will be taken from the data when a `ContrastsMatrix` is generated (during `ModelFrame` construction). # Arguments * `levels::Nullable{Vector}=NULL`: If specified, will be checked against data when - generating a `ContrastMatrix`. Levels that are specified here but missing in the + generating a `ContrastsMatrix`. Levels that are specified here but missing in the data will result in an error, because this would lead to empty columns in the resulting ModelMatrix. * `base::Nullable{Any}=NULL`: The base level of the contrast. The actual interpretation of this depends on the particular contrast type, but in general it can be thought of as a "reference" level. """ -abstract AbstractContrast +abstract AbstractContrasts -## Contrast + Data = ContrastMatrix -type ContrastMatrix{T} +## Contrasts + Data = ContrastsMatrix +type ContrastsMatrix{T} matrix::Matrix{Float64} termnames::Vector{T} levels::Vector{T} - contrasts::AbstractContrast + contrasts::AbstractContrasts end """ - ContrastMatrix{T}(::AbstractContrast, levels::Vector{T}) + ContrastsMatrix{T}(::AbstractContrasts, levels::Vector{T}) Instantiate contrasts matrix for given data (categorical levels) -If levels are specified in the `AbstractContrast`, those will be used, and likewise +If levels are specified in the `AbstractContrasts`, those will be used, and likewise for the base level (which defaults to the first level). """ -function ContrastMatrix{T}(C::AbstractContrast, lvls::Vector{T}) +function ContrastsMatrix{T}(C::AbstractContrasts, lvls::Vector{T}) ## if levels are defined on C, use those, validating that they line up. ## what does that mean? @@ -66,7 +68,7 @@ function ContrastMatrix{T}(C::AbstractContrast, lvls::Vector{T}) ## rank-deficient model matrix. c_lvls = convert(typeof(lvls), get(C.levels, lvls)) mismatched_lvls = symdiff(c_lvls, lvls) - isempty(mismatched_lvls) || error("Contrast levels not found in data or vice-versa: ", mismatched_lvls) + isempty(mismatched_lvls) || error("Contrasts levels not found in data or vice-versa: ", mismatched_lvls) n = length(c_lvls) n > 1 || error("not enough degrees of freedom to define contrasts") @@ -78,23 +80,23 @@ function ContrastMatrix{T}(C::AbstractContrast, lvls::Vector{T}) not_base = [1:(baseind-1); (baseind+1):n] tnames = c_lvls[not_base] - mat = contrast_matrix(C, baseind, n) + mat = contrasts_matrix(C, baseind, n) - ContrastMatrix(mat, tnames, c_lvls, C) + ContrastsMatrix(mat, tnames, c_lvls, C) end -ContrastMatrix(C::AbstractContrast, v::PooledDataArray) = ContrastMatrix(C, levels(v)) +ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = ContrastsMatrix(C, levels(v)) nullify(x::Nullable) = x nullify(x) = Nullable(x) ## Making a contrast type T only requires that there be a method for -## contrast_matrix(T, v::PooledDataArray). The rest is boilerplate. +## contrasts_matrix(T, v::PooledDataArray). The rest is boilerplate. ## -for contrastType in [:TreatmentContrast, :SumContrast, :HelmertContrast] +for contrastType in [:TreatmentContrasts, :SumContrasts, :HelmertContrasts] @eval begin - type $contrastType <: AbstractContrast + type $contrastType <: AbstractContrasts base::Nullable{Any} levels::Nullable{Vector} end @@ -108,7 +110,7 @@ for contrastType in [:TreatmentContrast, :SumContrast, :HelmertContrast] end """ - DummyContrast + DummyContrasts One indicator (1 or 0) column for each level, __including__ the base level. @@ -118,36 +120,36 @@ non-redundant cases, we can (probably) expand x into length(levels(x)) columns without creating a non-identifiable model matrix (unless the user has done something dumb in specifying the model, which we can't do much about anyway). """ -type DummyContrast <: AbstractContrast +type DummyContrasts <: AbstractContrasts ## Dummy contrasts have no base level (since all levels produce a column) end -ContrastMatrix{T}(C::DummyContrast, lvls::Vector{T}) = ContrastMatrix(eye(Float64, length(lvls)), lvls, lvls, C) +ContrastsMatrix{T}(C::DummyContrasts, lvls::Vector{T}) = ContrastsMatrix(eye(Float64, length(lvls)), lvls, lvls, C) ## Default for promoting contrasts to full rank is to convert to dummy contrasts -## promote_contrast(C::AbstractContrast) = DummyContrast(eye(Float64, length(C.levels)), C.levels, C.levels) +## promote_contrast(C::AbstractContrasts) = DummyContrasts(eye(Float64, length(C.levels)), C.levels, C.levels) "Promote a contrast to full rank" -promote_contrast(C::ContrastMatrix) = ContrastMatrix(DummyContrast(), C.levels) +promote_contrast(C::ContrastsMatrix) = ContrastsMatrix(DummyContrasts(), C.levels) """ - TreatmentContrast + TreatmentContrasts One indicator column (1 or 0) for each non-base level. The default in R. Columns have non-zero mean and are collinear with an intercept column (and lower-order columns for interactions) but are orthogonal to each other. """ -TreatmentContrast +TreatmentContrasts -contrast_matrix(C::TreatmentContrast, baseind, n) = eye(n)[:, [1:(baseind-1); (baseind+1):n]] +contrasts_matrix(C::TreatmentContrasts, baseind, n) = eye(n)[:, [1:(baseind-1); (baseind+1):n]] """ - SumContrast + SumContrasts Column for level `x` of column `col` is 1 where `col .== x` and -1 where `col .== base`. @@ -156,9 +158,9 @@ Produces mean-0 (centered) columns _only_ when all levels are equally frequent. But with more than two levels, the generated columns are guaranteed to be non- orthogonal (so beware of collinearity). """ -SumContrast +SumContrasts -function contrast_matrix(C::SumContrast, baseind, n) +function contrasts_matrix(C::SumContrasts, baseind, n) not_base = [1:(baseind-1); (baseind+1):n] mat = eye(n)[:, not_base] mat[baseind, :] = -1 @@ -166,7 +168,7 @@ function contrast_matrix(C::SumContrast, baseind, n) end """ - HelmertContrast + HelmertContrasts Produces contrast columns with -1 for each of n levels below contrast, n for contrast level, and 0 above. Produces something like: @@ -184,9 +186,9 @@ level n+1 and the average of levels 1 to n. When balanced, it has the nice property of each column in the resulting model matrix being orthogonal and with mean 0. """ -HelmertContrast +HelmertContrasts -function contrast_matrix(C::HelmertContrast, baseind, n) +function contrasts_matrix(C::HelmertContrasts, baseind, n) mat = zeros(n, n-1) for i in 1:n-1 mat[1:i, i] = -1 diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index a718e89b58..bf9203f3ad 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -307,10 +307,10 @@ end ## Goal here is to allow specification of _either_ a "naked" contrast type, ## or an instantiated contrast object itself. This might be achieved in a more -## julian way by overloading call for c::AbstractContrast to just return c. -evaluateContrast(c::AbstractContrast, col::AbstractDataVector) = ContrastMatrix(c, col) -evaluateContrast{C <: AbstractContrast}(c::Type{C}, col::AbstractDataVector) = ContrastMatrix(c(), col) -evaluateContrast(c::ContrastMatrix, col::AbstractDataVector) = c +## julian way by overloading call for c::AbstractContrasts to just return c. +evaluateContrasts(c::AbstractContrasts, col::AbstractDataVector) = ContrastsMatrix(c, col) +evaluateContrasts{C <: AbstractContrasts}(c::Type{C}, col::AbstractDataVector) = ContrastsMatrix(c(), col) +evaluateContrasts(c::ContrastsMatrix, col::AbstractDataVector) = c needsContrasts(::PooledDataArray) = true needsContrasts(::Any) = false @@ -323,15 +323,15 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame; ## Set up contrasts: ## Combine actual DF columns and contrast types if necessary to compute the - ## actual contrasts matrices, levels, and term names (using TreatmentContrast + ## actual contrasts matrices, levels, and term names (using TreatmentContrasts ## as the default) evaledContrasts = Dict() for (term, col) in eachcol(df) needsContrasts(col) || continue - evaledContrasts[term] = evaluateContrast(haskey(contrasts, term) ? - contrasts[term] : - TreatmentContrast, - col) + evaledContrasts[term] = evaluateContrasts(haskey(contrasts, term) ? + contrasts[term] : + TreatmentContrasts, + col) end ## Check whether or not @@ -345,7 +345,7 @@ ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), ## modify contrasts in place function contrast!(mf::ModelFrame, new_contrasts::Dict) - new_contrasts = [ col => evaluateContrast(contr, mf.df[col]) + new_contrasts = [ col => evaluateContrasts(contr, mf.df[col]) for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ] mf.contrasts = merge(mf.contrasts, new_contrasts) @@ -371,12 +371,12 @@ function cols(name::Symbol, mf::ModelFrame; non_redundant::Bool = false) end """ - cols(v::PooledDataVector, contrast::ContrastMatrix) + cols(v::PooledDataVector, contrast::ContrastsMatrix) Construct `ModelMatrix` columns based on specified contrasts, ensuring that levels align properly. """ -function cols(v::PooledDataVector, contrast::ContrastMatrix) +function cols(v::PooledDataVector, contrast::ContrastsMatrix) ## make sure the levels of the contrast matrix and the categorical data ## are the same by constructing a re-indexing vector. Indexing into ## reindex with v.refs will give the corresponding row number of the @@ -466,7 +466,7 @@ function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false) termnames(term, mf.df[term]) end end -termnames(term::Symbol, col::Any, contrast::ContrastMatrix) = +termnames(term::Symbol, col::Any, contrast::ContrastsMatrix) = ["$term: $name" for name in contrast.termnames] diff --git a/test/contrasts.jl b/test/contrasts.jl index f2f57cfec3..1cf51cd6f9 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -16,7 +16,7 @@ mf = ModelFrame(Formula(Nothing(), :x), d) 1 1 0] @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] -contrast!(mf, x = SumContrast) +contrast!(mf, x = SumContrasts) @test ModelMatrix(mf).m == [1 -1 -1 1 1 0 1 0 1 @@ -26,7 +26,7 @@ contrast!(mf, x = SumContrast) @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] ## change base level of contrast -contrast!(mf, x = SumContrast(base = :b)) +contrast!(mf, x = SumContrasts(base = :b)) @test ModelMatrix(mf).m == [1 1 0 1 -1 -1 1 0 1 @@ -36,7 +36,7 @@ contrast!(mf, x = SumContrast(base = :b)) @test coefnames(mf) == ["(Intercept)"; "x: a"; "x: c"] ## change levels of contrast -contrast!(mf, x = SumContrast(levels = [:c, :b, :a])) +contrast!(mf, x = SumContrasts(levels = [:c, :b, :a])) @test ModelMatrix(mf).m == [1 0 1 1 1 0 1 -1 -1 @@ -47,7 +47,7 @@ contrast!(mf, x = SumContrast(levels = [:c, :b, :a])) ## change levels and base level of contrast -contrast!(mf, x = SumContrast(levels = [:c, :b, :a], base = :a)) +contrast!(mf, x = SumContrasts(levels = [:c, :b, :a], base = :a)) @test ModelMatrix(mf).m == [1 -1 -1 1 0 1 1 1 0 @@ -57,7 +57,7 @@ contrast!(mf, x = SumContrast(levels = [:c, :b, :a], base = :a)) @test coefnames(mf) == ["(Intercept)"; "x: c"; "x: b"] ## Helmert coded contrasts -contrast!(mf, x = HelmertContrast) +contrast!(mf, x = HelmertContrasts) @test ModelMatrix(mf).m == [1 -1 -1 1 1 -1 1 0 2 @@ -67,13 +67,13 @@ contrast!(mf, x = HelmertContrast) @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] ## Types for contrast levels are coerced to data levels when constructing -## ContrastMatrix -contrast!(mf, x = SumContrast(levels = ["a", "b", "c"])) +## ContrastsMatrix +contrast!(mf, x = SumContrasts(levels = ["a", "b", "c"])) @test mf.contrasts[:x].levels == levels(d[:x]) ## Missing data is handled gracefully, dropping columns when a level is lost d[3, :x] = NA -mf_missing = ModelFrame(Formula(Nothing(), :x), d, contrasts = [:x => SumContrast]) +mf_missing = ModelFrame(Formula(Nothing(), :x), d, contrasts = Dict(:x => SumContrasts)) @test ModelMatrix(mf_missing).m == [1 -1 1 1 1 -1 @@ -82,10 +82,10 @@ mf_missing = ModelFrame(Formula(Nothing(), :x), d, contrasts = [:x => SumContras @test coefnames(mf_missing) == ["(Intercept)"; "x: b"] ## Things that are bad to do: -## Applying a contrast that only has a subset of data levels: -@test_throws ErrorException contrast!(mf, x = SumContrast(levels = [:a, :b])) -## Applying a contrast that expects levels not found in data: -@test_throws ErrorException contrast!(mf, x = SumContrast(levels = [:a, :b, :c, :d])) +## Applying contrasts that only have a subset of data levels: +@test_throws ErrorException contrast!(mf, x = SumContrasts(levels = [:a, :b])) +## Applying contrasts that expect levels not found in data: +@test_throws ErrorException contrast!(mf, x = SumContrasts(levels = [:a, :b, :c, :d])) diff --git a/test/formula.jl b/test/formula.jl index f98917009f..b5f248f925 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -374,7 +374,7 @@ d = DataFrame(x = rep([:a, :b], times = 4), y = rep([:c, :d], times = 2, each = 2), z = rep([:e, :f], each = 4)) [pool!(d, name) for name in names(d)] -cs = [name => SumContrast for name in names(d)] +cs = [name => SumContrasts for name in names(d)] d[:n] = 1.:8 From eaee627fc5ffa1b1dc6229984869c31c5ea44e78 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 2 May 2016 23:19:55 -0400 Subject: [PATCH 37/74] cols -> modelmat_cols --- src/statsmodels/formula.jl | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index bf9203f3ad..a06e903478 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -38,7 +38,7 @@ type ModelFrame terms::Terms msng::BitArray ## mapping from df keys to contrasts. Rather than specify types allowed, - ## leave that to cols() to check. Allows more seamless later extension + ## leave that to modelmat_cols() to check. Allows more seamless later extension contrasts::Dict{Any, Any} ## An eterms x terms matrix which is true for terms that need to be "promoted" ## to full rank in constructing a model matrx @@ -358,25 +358,25 @@ function StatsBase.model_response(mf::ModelFrame) convert(Array, mf.df[round(Bool, mf.terms.factors[:, 1])][:, 1]) end -cols(v::DataVector) = convert(Vector{Float64}, v.data) -cols(v::Vector) = convert(Vector{Float64}, v) +modelmat_cols(v::DataVector) = convert(Vector{Float64}, v.data) +modelmat_cols(v::Vector) = convert(Vector{Float64}, v) ## construct model matrix columns from model frame + name (checks for contrasts) -function cols(name::Symbol, mf::ModelFrame; non_redundant::Bool = false) +function modelmat_cols(name::Symbol, mf::ModelFrame; non_redundant::Bool = false) if haskey(mf.contrasts, name) - cols(mf.df[name], + modelmat_cols(mf.df[name], non_redundant ? promote_contrast(mf.contrasts[name]) : mf.contrasts[name]) else - cols(mf.df[name]) + modelmat_cols(mf.df[name]) end end """ - cols(v::PooledDataVector, contrast::ContrastsMatrix) + modelmat_cols(v::PooledDataVector, contrast::ContrastsMatrix) Construct `ModelMatrix` columns based on specified contrasts, ensuring that levels align properly. """ -function cols(v::PooledDataVector, contrast::ContrastsMatrix) +function modelmat_cols(v::PooledDataVector, contrast::ContrastsMatrix) ## make sure the levels of the contrast matrix and the categorical data ## are the same by constructing a re-indexing vector. Indexing into ## reindex with v.refs will give the corresponding row number of the @@ -414,6 +414,9 @@ function nc(trm::Vector) end function ModelMatrix(mf::ModelFrame) + ## TODO: this method makes multiple copies of the data in the ModelFrame: + ## first in term_cols (1-2x per evaluation term, depending on redundancy), + ## second in constructing the matrix itself. ## Map eval. term name + redundancy bool to cached model matrix columns eterm_cols = @compat Dict{Tuple{Symbol,Bool}, Array{Float64}}() @@ -441,13 +444,15 @@ function ModelMatrix(mf::ModelFrame) for et_and_nr in zip(eterms, non_redundant) haskey(eterm_cols, et_and_nr) || setindex!(eterm_cols, - cols(et_and_nr[1], mf, non_redundant=et_and_nr[2]), + modelmat_cols(et_and_nr[1], mf, non_redundant=et_and_nr[2]), et_and_nr) push!(term_cols, eterm_cols[et_and_nr]) end push!(mm_cols, term_cols) end + ## TODO: could this be made more efficient by + ## first computing mm_col_term_nums, initializing mm, and directly indexing? mm = hcat([expandcols(tc) for tc in mm_cols]...) mm_col_term_nums = vcat([fill(i_term, nc(tc)) for (i_term,tc) in enumerate(mm_cols)]...) From 0dc6fa3a2aec70f7fcceba56743ed6997aabb400 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 2 May 2016 23:34:04 -0400 Subject: [PATCH 38/74] revise documentation --- src/statsmodels/contrasts.jl | 51 +++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 5411f71840..8264132acf 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -18,8 +18,6 @@ """ - AbstractContrasts(; base::Any=NULL, levels::Vector=NULL) - Interface to describe contrast coding schemes for categorical variables. Concrete subtypes of `AbstractContrasts` describe a particular way of converting a @@ -28,15 +26,48 @@ instantiation optionally includes the levels to generate columns for and the bas level. If not specified these will be taken from the data when a `ContrastsMatrix` is generated (during `ModelFrame` construction). -# Arguments +## Constructors + +For `C <: AbstractContrast`: + +```julia +C() # levels are inferred later +C(levels = ::Vector{Any}) # levels checked against data later +C(base = ::Any) # specify base level +C(levels = ::Vector{Any}, base = ::Any) +``` + +If specified, levels will be checked against data when generating a +`ContrastsMatrix`. Any mismatch will result in an error, because missing data +levels would lead to empty columns in the model matrix, and missing contrast +levels would lead to empty or undefined rows. + +You can also specify the base level of the contrasts. The +actual interpretation of this depends on the particular contrast type, but in +general it can be thought of as a "reference" level. It defaults to the first +level. + +Both `levels` and `base` will be coerced to the type of the data when +constructing a `ContrastsMatrix`. + +## Concrete types + +* `TreatmentContrasts` +* `SumContrasts` +* `HelmertContrasts` + +To implement your own concrete types, implement a constructor and a method for +constructing the actual contrasts matrix that maps from levels to `ModelMatrix` +column values: + +```julia +type MyContrasts <: AbstractContrasts + ... +end + +contrasts_matrix(C::MyContrasts, baseind, n) = ... +``` -* `levels::Nullable{Vector}=NULL`: If specified, will be checked against data when - generating a `ContrastsMatrix`. Levels that are specified here but missing in the - data will result in an error, because this would lead to empty columns in the - resulting ModelMatrix. -* `base::Nullable{Any}=NULL`: The base level of the contrast. The - actual interpretation of this depends on the particular contrast type, but in - general it can be thought of as a "reference" level. """ abstract AbstractContrasts From b3c547ccb1455ce73f4ddc021aaa48cca712ce55 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 7 May 2016 16:34:46 -0400 Subject: [PATCH 39/74] contrast! -> setcontrasts! --- src/DataFrames.jl | 2 +- src/statsmodels/formula.jl | 4 ++-- test/contrasts.jl | 16 ++++++++-------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 864dd7b254..00d9b50b10 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -55,7 +55,7 @@ export @~, combine, complete_cases, complete_cases!, - contrast!, + setcontrasts!, deleterows!, describe, eachcol, diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index a06e903478..263ecdeded 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -344,14 +344,14 @@ ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...) ## modify contrasts in place -function contrast!(mf::ModelFrame, new_contrasts::Dict) +function setcontrasts!(mf::ModelFrame, new_contrasts::Dict) new_contrasts = [ col => evaluateContrasts(contr, mf.df[col]) for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ] mf.contrasts = merge(mf.contrasts, new_contrasts) return mf end -contrast!(mf::ModelFrame; kwargs...) = contrast!(mf, Dict(kwargs)) +setcontrasts!(mf::ModelFrame; kwargs...) = setcontrasts!(mf, Dict(kwargs)) function StatsBase.model_response(mf::ModelFrame) mf.terms.response || error("Model formula one-sided") diff --git a/test/contrasts.jl b/test/contrasts.jl index 1cf51cd6f9..c71463026a 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -16,7 +16,7 @@ mf = ModelFrame(Formula(Nothing(), :x), d) 1 1 0] @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] -contrast!(mf, x = SumContrasts) +setcontrasts!(mf, x = SumContrasts) @test ModelMatrix(mf).m == [1 -1 -1 1 1 0 1 0 1 @@ -26,7 +26,7 @@ contrast!(mf, x = SumContrasts) @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] ## change base level of contrast -contrast!(mf, x = SumContrasts(base = :b)) +setcontrasts!(mf, x = SumContrasts(base = :b)) @test ModelMatrix(mf).m == [1 1 0 1 -1 -1 1 0 1 @@ -36,7 +36,7 @@ contrast!(mf, x = SumContrasts(base = :b)) @test coefnames(mf) == ["(Intercept)"; "x: a"; "x: c"] ## change levels of contrast -contrast!(mf, x = SumContrasts(levels = [:c, :b, :a])) +setcontrasts!(mf, x = SumContrasts(levels = [:c, :b, :a])) @test ModelMatrix(mf).m == [1 0 1 1 1 0 1 -1 -1 @@ -47,7 +47,7 @@ contrast!(mf, x = SumContrasts(levels = [:c, :b, :a])) ## change levels and base level of contrast -contrast!(mf, x = SumContrasts(levels = [:c, :b, :a], base = :a)) +setcontrasts!(mf, x = SumContrasts(levels = [:c, :b, :a], base = :a)) @test ModelMatrix(mf).m == [1 -1 -1 1 0 1 1 1 0 @@ -57,7 +57,7 @@ contrast!(mf, x = SumContrasts(levels = [:c, :b, :a], base = :a)) @test coefnames(mf) == ["(Intercept)"; "x: c"; "x: b"] ## Helmert coded contrasts -contrast!(mf, x = HelmertContrasts) +setcontrasts!(mf, x = HelmertContrasts) @test ModelMatrix(mf).m == [1 -1 -1 1 1 -1 1 0 2 @@ -68,7 +68,7 @@ contrast!(mf, x = HelmertContrasts) ## Types for contrast levels are coerced to data levels when constructing ## ContrastsMatrix -contrast!(mf, x = SumContrasts(levels = ["a", "b", "c"])) +setcontrasts!(mf, x = SumContrasts(levels = ["a", "b", "c"])) @test mf.contrasts[:x].levels == levels(d[:x]) ## Missing data is handled gracefully, dropping columns when a level is lost @@ -83,9 +83,9 @@ mf_missing = ModelFrame(Formula(Nothing(), :x), d, contrasts = Dict(:x => SumCon ## Things that are bad to do: ## Applying contrasts that only have a subset of data levels: -@test_throws ErrorException contrast!(mf, x = SumContrasts(levels = [:a, :b])) +@test_throws ErrorException setcontrasts!(mf, x = SumContrasts(levels = [:a, :b])) ## Applying contrasts that expect levels not found in data: -@test_throws ErrorException contrast!(mf, x = SumContrasts(levels = [:a, :b, :c, :d])) +@test_throws ErrorException setcontrasts!(mf, x = SumContrasts(levels = [:a, :b, :c, :d])) From 1e875cac2f531b5ea610f7a1337e065649da2fc1 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 7 May 2016 17:34:34 -0400 Subject: [PATCH 40/74] test for 1 + x&y when x and y are categorical --- test/formula.jl | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/formula.jl b/test/formula.jl index b5f248f925..709ddf9590 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -400,6 +400,29 @@ mf = ModelFrame(n ~ 0 + x&y, d[1:4, :], contrasts=cs) @test coefnames(mf) == ["x: a & y: c", "x: b & y: c", "x: a & y: d", "x: b & y: d"] +## FAILS: When both terms are non-redundant and intercept is PRESENT +## (not fully redundant). Ideally, would drop last column. Might make sense +## to warn about this, and suggest recoding x and y into a single variable. +# mf = ModelFrame(n ~ 1 + x&y, d[1:4, :], contrasts=cs) +# @test ModelMatrix(mf).m == [1 1 0 0 +# 1 0 1 0 +# 1 0 0 1 +# 1 0 0 0] +# @test coefnames(mf) == ["x: a & y: c", "x: b & y: c", +# "x: a & y: d", "x: b & y: d"] + +## note that R also does not detect this automatically. it's left to glm et al. +## to detect numerically when the model matrix is rank deficient, which is hard +## to do correctly. +# > d = data.frame(x = factor(c(1, 2, 1, 2)), y = factor(c(3, 3, 4, 4))) +# > model.matrix(~ 1 + x:y, d) +# (Intercept) x1:y3 x2:y3 x1:y4 x2:y4 +# 1 1 1 0 0 0 +# 2 1 0 1 0 0 +# 3 1 0 0 1 0 +# 4 1 0 0 0 1 + + end From faa592f86aaa8f80870db1548d825de17bf40fd9 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Wed, 11 May 2016 09:28:29 -0400 Subject: [PATCH 41/74] code review * ## -> # * more comments * clean up extra space * type parameters * more informative error messages --- src/statsmodels/contrasts.jl | 124 +++++++++++++++++------------------ 1 file changed, 61 insertions(+), 63 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 8264132acf..7d754ea600 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -1,40 +1,38 @@ -## Specify contrasts for coding categorical data in model matrix. Contrasts types -## are a subtype of AbstractContrasts. ContrastsMatrix types hold a contrast -## matrix, levels, and term names and provide the interface for creating model -## matrix columns and coefficient names. -## -## Contrasts types themselves can be instantiated to provide containers for -## contrast settings (currently, just the base level). -## -## ModelFrame will hold a Dict{Symbol, ContrastsMatrix} that maps column -## names to contrasts. -## -## ModelMatrix will check this dict when evaluating terms, falling back to a -## default for any categorical data without a specified contrast. -## -## TODO: implement contrast types in Formula/Terms -## -## Dave Kleinschmidt 2015-2016 +# Specify contrasts for coding categorical data in model matrix. Contrasts types +# are a subtype of AbstractContrasts. ContrastsMatrix types hold a contrast +# matrix, levels, and term names and provide the interface for creating model +# matrix columns and coefficient names. +# +# Contrasts types themselves can be instantiated to provide containers for +# contrast settings (currently, just the base level). +# +# ModelFrame will hold a Dict{Symbol, ContrastsMatrix} that maps column +# names to contrasts. +# +# ModelMatrix will check this dict when evaluating terms, falling back to a +# default for any categorical data without a specified contrast. +# +# TODO: implement contrast types in Formula/Terms """ Interface to describe contrast coding schemes for categorical variables. Concrete subtypes of `AbstractContrasts` describe a particular way of converting a -categorical column in a `DataFrame` into numeric columns in a `ModelMatrix`. Each +categorical data vector into numeric columns in a `ModelMatrix`. Each instantiation optionally includes the levels to generate columns for and the base level. If not specified these will be taken from the data when a `ContrastsMatrix` is generated (during `ModelFrame` construction). -## Constructors +# Constructors For `C <: AbstractContrast`: ```julia -C() # levels are inferred later -C(levels = ::Vector{Any}) # levels checked against data later -C(base = ::Any) # specify base level -C(levels = ::Vector{Any}, base = ::Any) +C() # levels are inferred later +C(levels = ::Vector{Any}) # levels checked against data later +C(base = ::Any) # specify base level +C(levels = ::Vector{Any}, base = ::Any) # specify levels and base ``` If specified, levels will be checked against data when generating a @@ -50,15 +48,15 @@ level. Both `levels` and `base` will be coerced to the type of the data when constructing a `ContrastsMatrix`. -## Concrete types +# Concrete types * `TreatmentContrasts` * `SumContrasts` * `HelmertContrasts` -To implement your own concrete types, implement a constructor and a method for -constructing the actual contrasts matrix that maps from levels to `ModelMatrix` -column values: +To implement your own concrete types, implement a constructor and a +`contrast_matrix` method for constructing the actual contrasts matrix +that maps from levels to `ModelMatrix` column values: ```julia type MyContrasts <: AbstractContrasts @@ -71,49 +69,50 @@ contrasts_matrix(C::MyContrasts, baseind, n) = ... """ abstract AbstractContrasts -## Contrasts + Data = ContrastsMatrix -type ContrastsMatrix{T} +# Contrasts + Levels (usually from data) = ContrastsMatrix +type ContrastsMatrix{T, S <: AbstractContrasts} matrix::Matrix{Float64} termnames::Vector{T} levels::Vector{T} - contrasts::AbstractContrasts + contrasts::S end """ - ContrastsMatrix{T}(::AbstractContrasts, levels::Vector{T}) + ContrastsMatrix{T}(C::AbstractContrasts, levels::Vector{T}) -Instantiate contrasts matrix for given data (categorical levels) +Compute contrasts matrix for a given set of categorical data levels. If levels are specified in the `AbstractContrasts`, those will be used, and likewise for the base level (which defaults to the first level). """ -function ContrastsMatrix{T}(C::AbstractContrasts, lvls::Vector{T}) - - ## if levels are defined on C, use those, validating that they line up. - ## what does that mean? - ## - ## C.levels == lvls (best case) - ## data levels missing from contrast: would generate empty/undefined rows. - ## better to filter data frame first - ## contrast levels missing from data: would have empty columns, generate a - ## rank-deficient model matrix. - c_lvls = convert(typeof(lvls), get(C.levels, lvls)) - mismatched_lvls = symdiff(c_lvls, lvls) - isempty(mismatched_lvls) || error("Contrasts levels not found in data or vice-versa: ", mismatched_lvls) - - n = length(c_lvls) - n > 1 || error("not enough degrees of freedom to define contrasts") - - ## find index of base level. use C.base, then default (1). - baseind = isnull(C.base) ? 1 : findfirst(c_lvls, convert(eltype(lvls), get(C.base))) - baseind > 0 || error("Base level $(C.base) not found in levels") +function ContrastsMatrix{T}(C::AbstractContrasts, levels::Vector{T}) + + # if levels are defined on C, use those, validating that they line up. + # what does that mean? either: + # + # 1. C.levels == levels (best case) + # 2. data levels missing from contrast: would generate empty/undefined rows. + # better to filter data frame first + # 3. contrast levels missing from data: would have empty columns, generate a + # rank-deficient model matrix. + c_levels = oftype(levels, get(C.levels, levels)) + mismatched_levels = symdiff(c_levels, levels) + isempty(mismatched_levels) || error("contrasts levels not found in data or vice-versa: $mismatched_levels.\nData levels: $levels.\nContrast levels: $c_levels") + + n = length(c_levels) + n == 0 && error("empty set of levels found (need at least two to compute contrasts).") + n == 1 && error("only one level found: $(c_levels[1]). need at least two to compute contrasts.") not_base = [1:(baseind-1); (baseind+1):n] tnames = c_lvls[not_base] + # find index of base level. use C.base, then default (1). + baseind = isnull(C.base) ? 1 : findfirst(c_levels, convert(eltype(levels), get(C.base))) + baseind > 0 || error("base level $(C.base) not found in levels $c_levels.") + mat = contrasts_matrix(C, baseind, n) - ContrastsMatrix(mat, tnames, c_lvls, C) + ContrastsMatrix(mat, tnames, c_levels, C) end ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = ContrastsMatrix(C, levels(v)) @@ -122,9 +121,8 @@ ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = ContrastsMatrix(C, l nullify(x::Nullable) = x nullify(x) = Nullable(x) -## Making a contrast type T only requires that there be a method for -## contrasts_matrix(T, v::PooledDataArray). The rest is boilerplate. -## +# Making a contrast type T only requires that there be a method for +# contrasts_matrix(T, v::PooledDataArray). The rest is boilerplate. for contrastType in [:TreatmentContrasts, :SumContrasts, :HelmertContrasts] @eval begin type $contrastType <: AbstractContrasts @@ -146,19 +144,19 @@ end One indicator (1 or 0) column for each level, __including__ the base level. Needed internally when a term is non-redundant with lower-order terms (e.g., in -~0+x vs. ~1+x, or in the interactions terms in ~1+x+x&y vs. ~1+x+y+x&y. In the -non-redundant cases, we can (probably) expand x into length(levels(x)) columns +`~0+x` vs. `~1+x`, or in the interactions terms in `~1+x+x&y` vs. `~1+x+y+x&y`. In the +non-redundant cases, we can expand x into `length(levels(x))` columns without creating a non-identifiable model matrix (unless the user has done -something dumb in specifying the model, which we can't do much about anyway). +something foolish in specifying the model, which we can't do much about anyway). """ type DummyContrasts <: AbstractContrasts -## Dummy contrasts have no base level (since all levels produce a column) +# Dummy contrasts have no base level (since all levels produce a column) end ContrastsMatrix{T}(C::DummyContrasts, lvls::Vector{T}) = ContrastsMatrix(eye(Float64, length(lvls)), lvls, lvls, C) -## Default for promoting contrasts to full rank is to convert to dummy contrasts -## promote_contrast(C::AbstractContrasts) = DummyContrasts(eye(Float64, length(C.levels)), C.levels, C.levels) +# Default for promoting contrasts to full rank is to convert to dummy contrasts +# promote_contrast(C::AbstractContrasts) = DummyContrasts(eye(Float64, length(C.levels)), C.levels, C.levels) "Promote a contrast to full rank" promote_contrast(C::ContrastsMatrix) = ContrastsMatrix(DummyContrasts(), C.levels) @@ -226,7 +224,7 @@ function contrasts_matrix(C::HelmertContrasts, baseind, n) mat[i+1, i] = i end - ## re-shuffle the rows such that base is the all -1.0 row (currently first) + # re-shuffle the rows such that base is the all -1.0 row (currently first) mat = mat[[baseind; 1:(baseind-1); (baseind+1):end], :] return mat end From 566adaec96d0eda5fd85d7a96787cf1c34fbc4a8 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Wed, 11 May 2016 09:29:30 -0400 Subject: [PATCH 42/74] generalize termnames generation --- src/statsmodels/contrasts.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 7d754ea600..76e994f183 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -103,12 +103,11 @@ function ContrastsMatrix{T}(C::AbstractContrasts, levels::Vector{T}) n == 0 && error("empty set of levels found (need at least two to compute contrasts).") n == 1 && error("only one level found: $(c_levels[1]). need at least two to compute contrasts.") - not_base = [1:(baseind-1); (baseind+1):n] - tnames = c_lvls[not_base] # find index of base level. use C.base, then default (1). baseind = isnull(C.base) ? 1 : findfirst(c_levels, convert(eltype(levels), get(C.base))) baseind > 0 || error("base level $(C.base) not found in levels $c_levels.") + tnames = termnames(C, c_levels, baseind) mat = contrasts_matrix(C, baseind, n) @@ -117,6 +116,10 @@ end ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = ContrastsMatrix(C, levels(v)) +function termnames(C::AbstractContrasts, levels::Vector, baseind::Integer) + not_base = [1:(baseind-1); (baseind+1):length(levels)] + levels[not_base] +end nullify(x::Nullable) = x nullify(x) = Nullable(x) From 87e06629a5e9813034a93dbc5a76fb0d8a26a266 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Wed, 11 May 2016 22:07:04 -0400 Subject: [PATCH 43/74] rename C->contrasts in ContrastsMatrix constructor --- src/statsmodels/contrasts.jl | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 76e994f183..27a62a2866 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -85,17 +85,17 @@ Compute contrasts matrix for a given set of categorical data levels. If levels are specified in the `AbstractContrasts`, those will be used, and likewise for the base level (which defaults to the first level). """ -function ContrastsMatrix{T}(C::AbstractContrasts, levels::Vector{T}) +function ContrastsMatrix{T,C <: AbstractContrasts}(contrasts::C, levels::Vector{T}) - # if levels are defined on C, use those, validating that they line up. + # if levels are defined on contrasts, use those, validating that they line up. # what does that mean? either: # - # 1. C.levels == levels (best case) + # 1. contrasts.levels == levels (best case) # 2. data levels missing from contrast: would generate empty/undefined rows. # better to filter data frame first # 3. contrast levels missing from data: would have empty columns, generate a # rank-deficient model matrix. - c_levels = oftype(levels, get(C.levels, levels)) + c_levels = oftype(levels, get(contrasts.levels, levels)) mismatched_levels = symdiff(c_levels, levels) isempty(mismatched_levels) || error("contrasts levels not found in data or vice-versa: $mismatched_levels.\nData levels: $levels.\nContrast levels: $c_levels") @@ -103,15 +103,17 @@ function ContrastsMatrix{T}(C::AbstractContrasts, levels::Vector{T}) n == 0 && error("empty set of levels found (need at least two to compute contrasts).") n == 1 && error("only one level found: $(c_levels[1]). need at least two to compute contrasts.") - # find index of base level. use C.base, then default (1). - baseind = isnull(C.base) ? 1 : findfirst(c_levels, convert(eltype(levels), get(C.base))) - baseind > 0 || error("base level $(C.base) not found in levels $c_levels.") + # find index of base level. use contrasts.base, then default (1). + baseind = isnull(contrasts.base) ? + 1 : + findfirst(c_levels, convert(eltype(levels), get(contrasts.base))) + baseind > 0 || error("base level $(contrasts.base) not found in levels $c_levels.") - tnames = termnames(C, c_levels, baseind) + tnames = termnames(contrasts, c_levels, baseind) - mat = contrasts_matrix(C, baseind, n) + mat = contrasts_matrix(contrasts, baseind, n) - ContrastsMatrix(mat, tnames, c_levels, C) + ContrastsMatrix(mat, tnames, c_levels, contrasts) end ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = ContrastsMatrix(C, levels(v)) From c405df59e81a4a663b47d92eefac11360394065a Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Wed, 11 May 2016 22:39:35 -0400 Subject: [PATCH 44/74] type parameters + convert for promote_contrast --- src/statsmodels/contrasts.jl | 14 ++++---------- src/statsmodels/formula.jl | 8 ++++++-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 27a62a2866..21b85bfb83 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -70,7 +70,7 @@ contrasts_matrix(C::MyContrasts, baseind, n) = ... abstract AbstractContrasts # Contrasts + Levels (usually from data) = ContrastsMatrix -type ContrastsMatrix{T, S <: AbstractContrasts} +type ContrastsMatrix{S <: AbstractContrasts, T} matrix::Matrix{Float64} termnames::Vector{T} levels::Vector{T} @@ -85,7 +85,7 @@ Compute contrasts matrix for a given set of categorical data levels. If levels are specified in the `AbstractContrasts`, those will be used, and likewise for the base level (which defaults to the first level). """ -function ContrastsMatrix{T,C <: AbstractContrasts}(contrasts::C, levels::Vector{T}) +function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::Vector) # if levels are defined on contrasts, use those, validating that they line up. # what does that mean? either: @@ -160,14 +160,8 @@ end ContrastsMatrix{T}(C::DummyContrasts, lvls::Vector{T}) = ContrastsMatrix(eye(Float64, length(lvls)), lvls, lvls, C) -# Default for promoting contrasts to full rank is to convert to dummy contrasts -# promote_contrast(C::AbstractContrasts) = DummyContrasts(eye(Float64, length(C.levels)), C.levels, C.levels) - -"Promote a contrast to full rank" -promote_contrast(C::ContrastsMatrix) = ContrastsMatrix(DummyContrasts(), C.levels) - - - +"Promote contrasts matrix to full rank version" +Base.convert(::Type{ContrastsMatrix{DummyContrasts}}, C::ContrastsMatrix) = ContrastsMatrix(DummyContrasts(), C.levels) """ TreatmentContrasts diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 263ecdeded..d01a9f59cb 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -364,7 +364,9 @@ modelmat_cols(v::Vector) = convert(Vector{Float64}, v) function modelmat_cols(name::Symbol, mf::ModelFrame; non_redundant::Bool = false) if haskey(mf.contrasts, name) modelmat_cols(mf.df[name], - non_redundant ? promote_contrast(mf.contrasts[name]) : mf.contrasts[name]) + non_redundant ? + ContrastsMatrix{DummyContrasts}(mf.contrasts[name]) : + mf.contrasts[name]) else modelmat_cols(mf.df[name]) end @@ -466,7 +468,9 @@ termnames(term::Symbol, col) = [string(term)] function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false) if haskey(mf.contrasts, term) termnames(term, mf.df[term], - non_redundant ? promote_contrast(mf.contrasts[term]) : mf.contrasts[term]) + non_redundant ? + ContrastsMatrix{DummyContrasts}(mf.contrasts[term]) : + mf.contrasts[term]) else termnames(term, mf.df[term]) end From bdcca01c3952e363d5c67adb24d174b1c18c79e7 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Wed, 11 May 2016 22:42:33 -0400 Subject: [PATCH 45/74] restrict type in model frame contrasts dict --- src/statsmodels/formula.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index d01a9f59cb..6927b1567a 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -37,9 +37,8 @@ type ModelFrame df::AbstractDataFrame terms::Terms msng::BitArray - ## mapping from df keys to contrasts. Rather than specify types allowed, - ## leave that to modelmat_cols() to check. Allows more seamless later extension - contrasts::Dict{Any, Any} + ## mapping from df keys to contrasts matrices + contrasts::Dict{Symbol, ContrastsMatrix} ## An eterms x terms matrix which is true for terms that need to be "promoted" ## to full rank in constructing a model matrx non_redundant_terms::Matrix{Bool} From eb44d54428e5ee64e7905d63c81692fcd01e1051 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Wed, 11 May 2016 22:51:20 -0400 Subject: [PATCH 46/74] clean up check_non_redundancy --- src/statsmodels/formula.jl | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 6927b1567a..9b24772fcb 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -258,8 +258,6 @@ can_promote(::Any) = false ## of each term needs to be promoted. function check_non_redundancy(trms::Terms, df::AbstractDataFrame) - ## This can be checked using the .factors field of the terms, which is an - ## evaluation terms x terms matrix. (n_eterms, n_terms) = size(trms.factors) to_promote = falses(n_eterms, n_terms) encountered_columns = Vector{eltype(trms.factors)}[] @@ -272,36 +270,23 @@ function check_non_redundancy(trms::Terms, df::AbstractDataFrame) for i_eterm in 1:n_eterms ## only need to check eterms that are included and can be promoted ## (e.g., categorical variables that expand to multiple mm columns) - if round(Bool, trms.factors[i_eterm, i_term]) && can_promote(df[trms.eterms[i_eterm]]) + if Bool(trms.factors[i_eterm, i_term]) && can_promote(df[trms.eterms[i_eterm]]) dropped = trms.factors[:,i_term] dropped[i_eterm] = 0 - ## short circuiting check for whether the version of this term - ## with the current eterm dropped has been encountered already - ## (and hence does not need to be expanded - is_redundant = false - for enc in encountered_columns - if dropped == enc - is_redundant = true - break - end - end - ## more concisely, but with non-short-circuiting any: - ##is_redundant = any([dropped == enc for enc in encountered_columns]) - if !is_redundant + if findfirst(encountered_columns, dropped) == 0 to_promote[i_eterm, i_term] = true push!(encountered_columns, dropped) end end - ## once we've checked all the eterms in this term, add it to the list - ## of encountered terms/columns end + ## once we've checked all the eterms in this term, add it to the list + ## of encountered terms/columns push!(encountered_columns, trms.factors[:, i_term]) end return to_promote - end ## Goal here is to allow specification of _either_ a "naked" contrast type, @@ -333,7 +318,6 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame; col) end - ## Check whether or not non_redundants = check_non_redundancy(trms, df) ModelFrame(df, trms, msng, evaledContrasts, non_redundants) From dc261c2183a594ca1014a0d8bb60895dc51f3161 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Wed, 11 May 2016 22:54:25 -0400 Subject: [PATCH 47/74] replace needsContrasts and can_promote with is_categorical --- src/statsmodels/formula.jl | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 9b24772fcb..c2bf3b2a37 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -241,10 +241,8 @@ function dropUnusedLevels!(da::PooledDataArray) end dropUnusedLevels!(x) = x -## whether or not a column of a particular type can be "promoted" to full rank -## (only true of factors) -can_promote(::PooledDataArray) = true -can_promote(::Any) = false +is_categorical(::PooledDataArray) = true +is_categorical(::Any) = false ## Check for non-redundancy of columns. For instance, if x is a factor with two ## levels, it should be expanded into two columns in y~0+x but only one column @@ -270,7 +268,7 @@ function check_non_redundancy(trms::Terms, df::AbstractDataFrame) for i_eterm in 1:n_eterms ## only need to check eterms that are included and can be promoted ## (e.g., categorical variables that expand to multiple mm columns) - if Bool(trms.factors[i_eterm, i_term]) && can_promote(df[trms.eterms[i_eterm]]) + if Bool(trms.factors[i_eterm, i_term]) && is_categorical(df[trms.eterms[i_eterm]]) dropped = trms.factors[:,i_term] dropped[i_eterm] = 0 @@ -296,9 +294,6 @@ evaluateContrasts(c::AbstractContrasts, col::AbstractDataVector) = ContrastsMatr evaluateContrasts{C <: AbstractContrasts}(c::Type{C}, col::AbstractDataVector) = ContrastsMatrix(c(), col) evaluateContrasts(c::ContrastsMatrix, col::AbstractDataVector) = c -needsContrasts(::PooledDataArray) = true -needsContrasts(::Any) = false - function ModelFrame(trms::Terms, d::AbstractDataFrame; contrasts::Dict = Dict()) df, msng = na_omit(DataFrame(map(x -> d[x], trms.eterms))) @@ -311,7 +306,7 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame; ## as the default) evaledContrasts = Dict() for (term, col) in eachcol(df) - needsContrasts(col) || continue + is_categorical(col) || continue evaledContrasts[term] = evaluateContrasts(haskey(contrasts, term) ? contrasts[term] : TreatmentContrasts, From 41068358b8a818e16553bcda248d4392fe9507d6 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Wed, 11 May 2016 23:08:11 -0400 Subject: [PATCH 48/74] explicit tuple and clearer if/index --- src/statsmodels/formula.jl | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index c2bf3b2a37..ad2857b259 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -421,12 +421,11 @@ function ModelMatrix(mf::ModelFrame) non_redundant = mf.non_redundant_terms[factors[:, i_term], i_term] ## Get cols for each eval term (either previously generated, or generating ## and storing as necessary) - for et_and_nr in zip(eterms, non_redundant) - haskey(eterm_cols, et_and_nr) || - setindex!(eterm_cols, - modelmat_cols(et_and_nr[1], mf, non_redundant=et_and_nr[2]), - et_and_nr) - push!(term_cols, eterm_cols[et_and_nr]) + for (et, nr) in zip(eterms, non_redundant) + if ! haskey(eterm_cols, (et, nr)) + eterm_cols[(et, nr)] = modelmat_cols(et, mf, non_redundant=nr) + end + push!(term_cols, eterm_cols[(et, nr)]) end push!(mm_cols, term_cols) end @@ -485,12 +484,11 @@ function coefnames(mf::ModelFrame) eterms = mf.terms.eterms[factors[:, i_term]] non_redundant = mf.non_redundant_terms[factors[:, i_term], i_term] - for et_and_nr in zip(eterms, non_redundant) - haskey(eterm_names, et_and_nr) || - setindex!(eterm_names, - termnames(et_and_nr[1], mf, non_redundant=et_and_nr[2]), - et_and_nr) - push!(names, eterm_names[et_and_nr]) + for (et, nr) in zip(eterms, non_redundant) + if !haskey(eterm_names, (et, nr)) + eterm_names[(et, nr)] = termnames(et, mf, non_redundant=nr) + end + push!(names, eterm_names[(et, nr)]) end push!(term_names, names) end From 34402bfed7c9a4b68be8f879445c27cad914be7b Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Thu, 12 May 2016 00:13:20 -0400 Subject: [PATCH 49/74] evaluateContrasts -> ContrastMatrix methods --- src/statsmodels/contrasts.jl | 5 +++++ src/statsmodels/formula.jl | 20 +++++++------------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 21b85bfb83..6fee944723 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -117,6 +117,11 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::Vector) end ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = ContrastsMatrix(C, levels(v)) +ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, col::PooledDataArray) = ContrastsMatrix(c(), col) +ContrastsMatrix(c::ContrastsMatrix, col::PooledDataArray) = + isempty(symdiff(c.levels, levels(col))) ? + c : + error("mismatch between levels in ContrastsMatrix and data:\nData levels: $(levels(col))\nContrast levels $(c.levels)") function termnames(C::AbstractContrasts, levels::Vector, baseind::Integer) not_base = [1:(baseind-1); (baseind+1):length(levels)] diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index ad2857b259..b1d1225e28 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -287,12 +287,8 @@ function check_non_redundancy(trms::Terms, df::AbstractDataFrame) return to_promote end -## Goal here is to allow specification of _either_ a "naked" contrast type, -## or an instantiated contrast object itself. This might be achieved in a more -## julian way by overloading call for c::AbstractContrasts to just return c. -evaluateContrasts(c::AbstractContrasts, col::AbstractDataVector) = ContrastsMatrix(c, col) -evaluateContrasts{C <: AbstractContrasts}(c::Type{C}, col::AbstractDataVector) = ContrastsMatrix(c(), col) -evaluateContrasts(c::ContrastsMatrix, col::AbstractDataVector) = c + +const DEFAULT_CONTRASTS = TreatmentContrasts function ModelFrame(trms::Terms, d::AbstractDataFrame; contrasts::Dict = Dict()) @@ -307,10 +303,10 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame; evaledContrasts = Dict() for (term, col) in eachcol(df) is_categorical(col) || continue - evaledContrasts[term] = evaluateContrasts(haskey(contrasts, term) ? - contrasts[term] : - TreatmentContrasts, - col) + evaledContrasts[term] = ContrastsMatrix(haskey(contrasts, term) ? + contrasts[term] : + DEFAULT_CONTRASTS, + col) end non_redundants = check_non_redundancy(trms, df) @@ -323,7 +319,7 @@ ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), ## modify contrasts in place function setcontrasts!(mf::ModelFrame, new_contrasts::Dict) - new_contrasts = [ col => evaluateContrasts(contr, mf.df[col]) + new_contrasts = [ col => ContrastsMatrix(contr, mf.df[col]) for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ] mf.contrasts = merge(mf.contrasts, new_contrasts) @@ -436,7 +432,6 @@ function ModelMatrix(mf::ModelFrame) mm_col_term_nums = vcat([fill(i_term, nc(tc)) for (i_term,tc) in enumerate(mm_cols)]...) ModelMatrix{Float64}(mm, mm_col_term_nums) - end @@ -494,7 +489,6 @@ function coefnames(mf::ModelFrame) end mapreduce(expandtermnames, vcat, term_names) - end function Formula(t::Terms) From 109df416fea7038da7937f7d9dfadfc6d8ec771e Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Thu, 12 May 2016 00:14:03 -0400 Subject: [PATCH 50/74] pass contrasts in DataFramesModel fitting --- src/statsmodels/statsmodel.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/statsmodels/statsmodel.jl b/src/statsmodels/statsmodel.jl index c4fb9887cb..b1bfb298a2 100644 --- a/src/statsmodels/statsmodel.jl +++ b/src/statsmodels/statsmodel.jl @@ -48,8 +48,8 @@ for (modeltype, dfmodeltype) in ((:StatisticalModel, DataFrameStatisticalModel), (:RegressionModel, DataFrameRegressionModel)) @eval begin function StatsBase.fit{T<:$modeltype}(::Type{T}, f::Formula, df::AbstractDataFrame, - args...; kwargs...) - mf = ModelFrame(f, df) + args...; contrasts::Dict = Dict(), kwargs...) + mf = ModelFrame(f, df, contrasts=contrasts) mm = ModelMatrix(mf) y = model_response(mf) $dfmodeltype(fit(T, mm.m, y, args...; kwargs...), mf, mm) From c26d2ec5f696885d008637af74041a71896e515a Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Thu, 12 May 2016 11:05:40 -0400 Subject: [PATCH 51/74] contrastsmatrix constructor fix and test allow contrast levels to be missing in data when calling ContrastsMatrix(::ContrastsMatrix, ::PooledDataArray) (to support predict on subset of data). add test for case when error is expected --- src/statsmodels/contrasts.jl | 14 ++++++++++++-- test/statsmodel.jl | 5 +++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 6fee944723..b797738197 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -116,12 +116,22 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::Vector) ContrastsMatrix(mat, tnames, c_levels, contrasts) end +# Methods for constructing ContrastsMatrix from data. These are called in +# ModelFrame constructor and setcontrasts!. + ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = ContrastsMatrix(C, levels(v)) +# instantiate Type{C<:AbstractContrasts} ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, col::PooledDataArray) = ContrastsMatrix(c(), col) +# given an existing ContrastsMatrix, check that all of the levels present in the +# data are present in the contrasts. Note that this behavior is different from the +# ContrastsMatrix constructor, which requires that the levels be exactly the same. +# This method exists to support things like `predict` that can operate on new data +# which may contain only a subset of the original data's levels. Checking here +# (instead of in `modelmat_cols`) allows an informative error message. ContrastsMatrix(c::ContrastsMatrix, col::PooledDataArray) = - isempty(symdiff(c.levels, levels(col))) ? + isempty(setdiff(levels(col), c.levels)) ? c : - error("mismatch between levels in ContrastsMatrix and data:\nData levels: $(levels(col))\nContrast levels $(c.levels)") + error("there are levels in data that are not in ContrastsMatrix: $(setdiff(levels(col), c.levels))\n Data levels: $(levels(col))\n Contrast levels $(c.levels)") function termnames(C::AbstractContrasts, levels::Vector, baseind::Integer) not_base = [1:(baseind-1); (baseind+1):length(levels)] diff --git a/test/statsmodel.jl b/test/statsmodel.jl index 395072ca9d..eb2a85dec1 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -69,6 +69,11 @@ m2 = fit(DummyMod, f2, d) ## predict w/ new data missing levels @test predict(m2, d[2:4, :]) == predict(m2)[2:4] +## predict w/ new data with _extra_ levels (throws an error) +d3 = deepcopy(d) +d3[1, :x1] = 0 +d3[:x1p] = PooledDataArray(d3[:x1]) +@test_throws ErrorException predict(m2, d3) ## Another dummy model type to test fall-through show method immutable DummyModTwo <: RegressionModel From 7493d9a76f46362aff76a87ee197b22bb47f53ab Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 13 May 2016 19:24:51 -0400 Subject: [PATCH 52/74] more tests: redundancy --- test/contrasts.jl | 24 ++++++------- test/formula.jl | 87 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 91 insertions(+), 20 deletions(-) diff --git a/test/contrasts.jl b/test/contrasts.jl index c71463026a..8957331fa0 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -25,7 +25,7 @@ setcontrasts!(mf, x = SumContrasts) 1 1 0] @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] -## change base level of contrast +# change base level of contrast setcontrasts!(mf, x = SumContrasts(base = :b)) @test ModelMatrix(mf).m == [1 1 0 1 -1 -1 @@ -35,7 +35,7 @@ setcontrasts!(mf, x = SumContrasts(base = :b)) 1 -1 -1] @test coefnames(mf) == ["(Intercept)"; "x: a"; "x: c"] -## change levels of contrast +# change levels of contrast setcontrasts!(mf, x = SumContrasts(levels = [:c, :b, :a])) @test ModelMatrix(mf).m == [1 0 1 1 1 0 @@ -46,7 +46,7 @@ setcontrasts!(mf, x = SumContrasts(levels = [:c, :b, :a])) @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: a"] -## change levels and base level of contrast +# change levels and base level of contrast setcontrasts!(mf, x = SumContrasts(levels = [:c, :b, :a], base = :a)) @test ModelMatrix(mf).m == [1 -1 -1 1 0 1 @@ -56,7 +56,7 @@ setcontrasts!(mf, x = SumContrasts(levels = [:c, :b, :a], base = :a)) 1 0 1] @test coefnames(mf) == ["(Intercept)"; "x: c"; "x: b"] -## Helmert coded contrasts +# Helmert coded contrasts setcontrasts!(mf, x = HelmertContrasts) @test ModelMatrix(mf).m == [1 -1 -1 1 1 -1 @@ -66,12 +66,12 @@ setcontrasts!(mf, x = HelmertContrasts) 1 1 -1] @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] -## Types for contrast levels are coerced to data levels when constructing -## ContrastsMatrix +# Types for contrast levels are converted to data levels when constructing +# ContrastsMatrix setcontrasts!(mf, x = SumContrasts(levels = ["a", "b", "c"])) @test mf.contrasts[:x].levels == levels(d[:x]) -## Missing data is handled gracefully, dropping columns when a level is lost +# Missing data is handled gracefully, dropping columns when a level is lost d[3, :x] = NA mf_missing = ModelFrame(Formula(Nothing(), :x), d, contrasts = Dict(:x => SumContrasts)) @test ModelMatrix(mf_missing).m == [1 -1 @@ -81,12 +81,12 @@ mf_missing = ModelFrame(Formula(Nothing(), :x), d, contrasts = Dict(:x => SumCon 1 1] @test coefnames(mf_missing) == ["(Intercept)"; "x: b"] -## Things that are bad to do: -## Applying contrasts that only have a subset of data levels: +# Things that are bad to do: +# Applying contrasts that only have a subset of data levels: @test_throws ErrorException setcontrasts!(mf, x = SumContrasts(levels = [:a, :b])) -## Applying contrasts that expect levels not found in data: +# Applying contrasts that expect levels not found in data: @test_throws ErrorException setcontrasts!(mf, x = SumContrasts(levels = [:a, :b, :c, :d])) - - +# Asking for base level that's not found in data +@test_throws ErrorException setcontrasts!(mf, x = SumContrasts(base = :e)) end diff --git a/test/formula.jl b/test/formula.jl index 709ddf9590..6aa4c1b379 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -379,27 +379,98 @@ d[:n] = 1.:8 ## No intercept -mf = ModelFrame(n ~ 0 + x, d[1:2,:], contrasts=cs) -@test ModelMatrix(mf).m == [1 0; 0 1] +mf = ModelFrame(n ~ 0 + x, d, contrasts=cs) +@test ModelMatrix(mf).m == [1 0 + 0 1 + 1 0 + 0 1 + 1 0 + 0 1 + 1 0 + 0 1] @test coefnames(mf) == ["x: a", "x: b"] ## No first-order term for interaction -mf = ModelFrame(n ~ 1 + x + x&y, d[1:4, :], contrasts=cs) -@test ModelMatrix(mf).m[:, 2:end] == [-1 -1 0 - 1 0 -1 - -1 1 0 - 1 0 1] +mf = ModelFrame(n ~ 1 + x + x&y, d, contrasts=cs) +@test ModelMatrix(mf).m[:, 2:end] == [-1 -1 0 + 1 0 -1 + -1 1 0 + 1 0 1 + -1 -1 0 + 1 0 -1 + -1 1 0 + 1 0 1] @test coefnames(mf) == ["(Intercept)", "x: b", "x: a & y: d", "x: b & y: d"] ## When both terms of interaction are non-redundant: -mf = ModelFrame(n ~ 0 + x&y, d[1:4, :], contrasts=cs) +mf = ModelFrame(n ~ 0 + x&y, d, contrasts=cs) @test ModelMatrix(mf).m == [1 0 0 0 + 0 1 0 0 + 0 0 1 0 + 0 0 0 1 + 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1] @test coefnames(mf) == ["x: a & y: c", "x: b & y: c", "x: a & y: d", "x: b & y: d"] +# only a three-way interaction: every term is promoted. +mf = ModelFrame(n ~ 0 + x&y&z, d, contrasts=cs) +@test ModelMatrix(mf).m == eye(8) + +# two two-way interactions, with no lower-order term. both are promoted in +# first (both x and y), but only the old term (x) in the second (because +# dropping x gives z which isn't found elsewhere, but dropping z gives x +# which is found (implicitly) in the promoted interaction x&y). +mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs) +@test ModelMatrix(mf).m == [1 0 0 0 -1 0 + 0 1 0 0 0 -1 + 0 0 1 0 -1 0 + 0 0 0 1 0 -1 + 1 0 0 0 1 0 + 0 1 0 0 0 1 + 0 0 1 0 1 0 + 0 0 0 1 0 1] +@test coefnames(mf) == ["x: a & y: c", "x: b & y: c", + "x: a & y: d", "x: b & y: d", + "x: a & z: f", "x: b & z: f"] + +# ...and adding a three-way interaction, only the shared term (x) is promoted. +# this is because dropping x gives y&z which isn't present, but dropping y or z +# gives x&z or x&z respectively, which are both present. +mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs) +@test ModelMatrix(mf).m == [1 0 0 0 -1 0 1 0 + 0 1 0 0 0 -1 0 1 + 0 0 1 0 -1 0 -1 0 + 0 0 0 1 0 -1 0 -1 + 1 0 0 0 1 0 -1 0 + 0 1 0 0 0 1 0 -1 + 0 0 1 0 1 0 1 0 + 0 0 0 1 0 1 0 1] +@test coefnames(mf) == ["x: a & y: c", "x: b & y: c", + "x: a & y: d", "x: b & y: d", + "x: a & z: f", "x: b & z: f", + "x: a & y: d & z: f", "x: b & y: d & z: f"] + +# two two-way interactions, with common lower-order term. the common term x is +# promoted in both (along with lower-order term), because in every case, when +# x is dropped, the remaining terms (1, y, and z) aren't present elsewhere. +mf = ModelFrame(n ~ 0 + x + x&y + x&z, d, contrasts=cs) +@test ModelMatrix(mf).m == [1 0 -1 0 -1 0 + 0 1 0 -1 0 -1 + 1 0 1 0 -1 0 + 0 1 0 1 0 -1 + 1 0 -1 0 1 0 + 0 1 0 -1 0 1 + 1 0 1 0 1 0 + 0 1 0 1 0 1] +@test coefnames(mf) == ["x: a", "x: b", + "x: a & y: d", "x: b & y: d", + "x: a & z: f", "x: b & z: f"] + + + ## FAILS: When both terms are non-redundant and intercept is PRESENT ## (not fully redundant). Ideally, would drop last column. Might make sense ## to warn about this, and suggest recoding x and y into a single variable. From 433bfd2cfa0fed9929a5c94dab7b88936d5a4988 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 22 Jul 2016 15:01:27 -0400 Subject: [PATCH 53/74] use get for Nullable contrasts.base --- src/statsmodels/contrasts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index b797738197..5c5f9492d5 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -107,7 +107,7 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::Vector) baseind = isnull(contrasts.base) ? 1 : findfirst(c_levels, convert(eltype(levels), get(contrasts.base))) - baseind > 0 || error("base level $(contrasts.base) not found in levels $c_levels.") + baseind > 0 || error("base level $(get(contrasts.base)) not found in levels $c_levels.") tnames = termnames(contrasts, c_levels, baseind) From 83b6f9639361f754d5de665d57de72ab68180365 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 22 Jul 2016 15:01:38 -0400 Subject: [PATCH 54/74] [WIP] documenting contrasts in formulas.md --- docs/src/man/formulas.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/src/man/formulas.md b/docs/src/man/formulas.md index a8221def7e..844d49d234 100644 --- a/docs/src/man/formulas.md +++ b/docs/src/man/formulas.md @@ -33,5 +33,18 @@ If you would like to specify both main effects and an interaction term at once, mm = ModelMatrix(ModelFrame(Z ~ X*Y, df)) ``` +You can control how categorical variables (e.g., `PooledDataArray` columns) are converted to `ModelMatrix` columns by specifying _contrasts_ when you construct a `ModelFrame`: + +```julia +mm = ModelMatrix(ModelFrame(Z ~ X*Y, df, contrasts = Dict(:X => HelmertContrasts))) +``` + +Contrasts can also be modified in an existing `ModelFrame`: + +```julia +mf = ModelFrame(Z ~ X*Y, df) +contrasts!(mf, X = HelmertContrasts) +``` + The construction of model matrices makes it easy to formulate complex statistical models. These are used to good effect by the [GLM Package.](https://github.com/JuliaStats/GLM.jl) From f51562826e46996ac021e772f2f1bd3e5527794c Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 22 Jul 2016 16:10:18 -0400 Subject: [PATCH 55/74] rep -> Compat.repeat --- test/formula.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/formula.jl b/test/formula.jl index 6aa4c1b379..fbdf1a5615 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -1,6 +1,7 @@ module TestFormula using Base.Test using DataFrames + using Compat # TODO: # - grouped variables in formulas with interactions @@ -370,9 +371,9 @@ module TestFormula ## Promote non-redundant categorical terms to full rank -d = DataFrame(x = rep([:a, :b], times = 4), - y = rep([:c, :d], times = 2, each = 2), - z = rep([:e, :f], each = 4)) +d = DataFrame(x = Compat.repeat([:a, :b], outer = 4), + y = Compat.repeat([:c, :d], inner = 2, outer = 2), + z = Compat.repeat([:e, :f], inner = 4)) [pool!(d, name) for name in names(d)] cs = [name => SumContrasts for name in names(d)] d[:n] = 1.:8 From 5410e21fd5f86d52c8549740e88672189c533626 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 22 Jul 2016 20:37:33 -0400 Subject: [PATCH 56/74] fix Nothing failure on nightlies --- test/contrasts.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/contrasts.jl b/test/contrasts.jl index 8957331fa0..8063aa8de3 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -6,7 +6,7 @@ using DataFrames d = DataFrame(x = @pdata( [:a, :b, :c, :a, :a, :b] )) -mf = ModelFrame(Formula(Nothing(), :x), d) +mf = ModelFrame(Formula(nothing, :x), d) @test ModelMatrix(mf).m == [1 0 0 1 1 0 @@ -73,7 +73,7 @@ setcontrasts!(mf, x = SumContrasts(levels = ["a", "b", "c"])) # Missing data is handled gracefully, dropping columns when a level is lost d[3, :x] = NA -mf_missing = ModelFrame(Formula(Nothing(), :x), d, contrasts = Dict(:x => SumContrasts)) +mf_missing = ModelFrame(Formula(nothing, :x), d, contrasts = Dict(:x => SumContrasts)) @test ModelMatrix(mf_missing).m == [1 -1 1 1 1 -1 From 95c515dfa4eafa7b0161fb2b95a0048b12739d46 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 22 Jul 2016 21:05:51 -0400 Subject: [PATCH 57/74] use throw(ArgumentError()) instead of error() --- src/statsmodels/contrasts.jl | 17 ++++++++++++----- test/contrasts.jl | 6 +++--- test/statsmodel.jl | 2 +- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 5c5f9492d5..539d6c1e10 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -97,17 +97,24 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::Vector) # rank-deficient model matrix. c_levels = oftype(levels, get(contrasts.levels, levels)) mismatched_levels = symdiff(c_levels, levels) - isempty(mismatched_levels) || error("contrasts levels not found in data or vice-versa: $mismatched_levels.\nData levels: $levels.\nContrast levels: $c_levels") + if ! isempty(mismatched_levels) + throw(ArgumentError("contrasts levels not found in data or vice-versa: $mismatched_levels.\nData levels: $levels.\nContrast levels: $c_levels")) + end n = length(c_levels) - n == 0 && error("empty set of levels found (need at least two to compute contrasts).") - n == 1 && error("only one level found: $(c_levels[1]). need at least two to compute contrasts.") + if n == 0 + throw(ArgumentError("empty set of levels found (need at least two to compute contrasts).")) + elseif n == 1 + throw(ArgumentError("only one level found: $(c_levels[1]) (need at least two to compute contrasts).")) + end # find index of base level. use contrasts.base, then default (1). baseind = isnull(contrasts.base) ? 1 : findfirst(c_levels, convert(eltype(levels), get(contrasts.base))) - baseind > 0 || error("base level $(get(contrasts.base)) not found in levels $c_levels.") + if baseind < 1 + throw(ArgumentError("base level $(get(contrasts.base)) not found in levels $c_levels.")) + end tnames = termnames(contrasts, c_levels, baseind) @@ -131,7 +138,7 @@ ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, col::PooledDataArray) = Cont ContrastsMatrix(c::ContrastsMatrix, col::PooledDataArray) = isempty(setdiff(levels(col), c.levels)) ? c : - error("there are levels in data that are not in ContrastsMatrix: $(setdiff(levels(col), c.levels))\n Data levels: $(levels(col))\n Contrast levels $(c.levels)") + throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: $(setdiff(levels(col), c.levels))\n Data levels: $(levels(col))\n Contrast levels $(c.levels)")) function termnames(C::AbstractContrasts, levels::Vector, baseind::Integer) not_base = [1:(baseind-1); (baseind+1):length(levels)] diff --git a/test/contrasts.jl b/test/contrasts.jl index 8063aa8de3..028a7d7f08 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -83,10 +83,10 @@ mf_missing = ModelFrame(Formula(nothing, :x), d, contrasts = Dict(:x => SumContr # Things that are bad to do: # Applying contrasts that only have a subset of data levels: -@test_throws ErrorException setcontrasts!(mf, x = SumContrasts(levels = [:a, :b])) +@test_throws ArgumentError setcontrasts!(mf, x = SumContrasts(levels = [:a, :b])) # Applying contrasts that expect levels not found in data: -@test_throws ErrorException setcontrasts!(mf, x = SumContrasts(levels = [:a, :b, :c, :d])) +@test_throws ArgumentError setcontrasts!(mf, x = SumContrasts(levels = [:a, :b, :c, :d])) # Asking for base level that's not found in data -@test_throws ErrorException setcontrasts!(mf, x = SumContrasts(base = :e)) +@test_throws ArgumentError setcontrasts!(mf, x = SumContrasts(base = :e)) end diff --git a/test/statsmodel.jl b/test/statsmodel.jl index eb2a85dec1..0cc2ab5ba3 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -73,7 +73,7 @@ m2 = fit(DummyMod, f2, d) d3 = deepcopy(d) d3[1, :x1] = 0 d3[:x1p] = PooledDataArray(d3[:x1]) -@test_throws ErrorException predict(m2, d3) +@test_throws ArgumentError predict(m2, d3) ## Another dummy model type to test fall-through show method immutable DummyModTwo <: RegressionModel From 8798f3bbc723b85399acf421d70bfbde63fff04d Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 25 Jul 2016 21:36:42 -0400 Subject: [PATCH 58/74] rename contrasts --- docs/src/man/formulas.md | 4 ++-- src/DataFrames.jl | 6 +++--- src/statsmodels/contrasts.jl | 36 +++++++++++++++++++----------------- src/statsmodels/formula.jl | 8 ++++---- test/contrasts.jl | 20 ++++++++++---------- test/formula.jl | 2 +- 6 files changed, 39 insertions(+), 37 deletions(-) diff --git a/docs/src/man/formulas.md b/docs/src/man/formulas.md index 844d49d234..2fb149f586 100644 --- a/docs/src/man/formulas.md +++ b/docs/src/man/formulas.md @@ -36,14 +36,14 @@ mm = ModelMatrix(ModelFrame(Z ~ X*Y, df)) You can control how categorical variables (e.g., `PooledDataArray` columns) are converted to `ModelMatrix` columns by specifying _contrasts_ when you construct a `ModelFrame`: ```julia -mm = ModelMatrix(ModelFrame(Z ~ X*Y, df, contrasts = Dict(:X => HelmertContrasts))) +mm = ModelMatrix(ModelFrame(Z ~ X*Y, df, contrasts = Dict(:X => HelmertCoding))) ``` Contrasts can also be modified in an existing `ModelFrame`: ```julia mf = ModelFrame(Z ~ X*Y, df) -contrasts!(mf, X = HelmertContrasts) +contrasts!(mf, X = HelmertCoding) ``` The construction of model matrices makes it easy to formulate complex statistical models. These are used to good effect by the [GLM Package.](https://github.com/JuliaStats/GLM.jl) diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 00d9b50b10..0f27457c35 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -44,9 +44,9 @@ export @~, ModelFrame, ModelMatrix, SubDataFrame, - SumContrasts, - TreatmentContrasts, - HelmertContrasts, + EffectsCoding, + DummyCoding, + HelmertCoding, aggregate, by, diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 539d6c1e10..c6821ccc37 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -50,9 +50,9 @@ constructing a `ContrastsMatrix`. # Concrete types -* `TreatmentContrasts` -* `SumContrasts` -* `HelmertContrasts` +* `DummyCoding` +* `EffectsCoding` +* `HelmertCoding` To implement your own concrete types, implement a constructor and a `contrast_matrix` method for constructing the actual contrasts matrix @@ -150,7 +150,7 @@ nullify(x) = Nullable(x) # Making a contrast type T only requires that there be a method for # contrasts_matrix(T, v::PooledDataArray). The rest is boilerplate. -for contrastType in [:TreatmentContrasts, :SumContrasts, :HelmertContrasts] +for contrastType in [:DummyCoding, :EffectsCoding, :HelmertCoding] @eval begin type $contrastType <: AbstractContrasts base::Nullable{Any} @@ -166,7 +166,7 @@ for contrastType in [:TreatmentContrasts, :SumContrasts, :HelmertContrasts] end """ - DummyContrasts + FullDummyCoding One indicator (1 or 0) column for each level, __including__ the base level. @@ -176,30 +176,32 @@ non-redundant cases, we can expand x into `length(levels(x))` columns without creating a non-identifiable model matrix (unless the user has done something foolish in specifying the model, which we can't do much about anyway). """ -type DummyContrasts <: AbstractContrasts +type FullDummyCoding <: AbstractContrasts # Dummy contrasts have no base level (since all levels produce a column) end -ContrastsMatrix{T}(C::DummyContrasts, lvls::Vector{T}) = ContrastsMatrix(eye(Float64, length(lvls)), lvls, lvls, C) +ContrastsMatrix{T}(C::FullDummyCoding, lvls::Vector{T}) = + ContrastsMatrix(eye(Float64, length(lvls)), lvls, lvls, C) "Promote contrasts matrix to full rank version" -Base.convert(::Type{ContrastsMatrix{DummyContrasts}}, C::ContrastsMatrix) = ContrastsMatrix(DummyContrasts(), C.levels) +Base.convert(::Type{ContrastsMatrix{FullDummyCoding}}, C::ContrastsMatrix) = + ContrastsMatrix(FullDummyCoding(), C.levels) """ - TreatmentContrasts + DummyCoding One indicator column (1 or 0) for each non-base level. The default in R. Columns have non-zero mean and are collinear with an intercept column (and lower-order columns for interactions) but are orthogonal to each other. """ -TreatmentContrasts +DummyCoding -contrasts_matrix(C::TreatmentContrasts, baseind, n) = eye(n)[:, [1:(baseind-1); (baseind+1):n]] +contrasts_matrix(C::DummyCoding, baseind, n) = eye(n)[:, [1:(baseind-1); (baseind+1):n]] """ - SumContrasts + EffectsCoding Column for level `x` of column `col` is 1 where `col .== x` and -1 where `col .== base`. @@ -208,9 +210,9 @@ Produces mean-0 (centered) columns _only_ when all levels are equally frequent. But with more than two levels, the generated columns are guaranteed to be non- orthogonal (so beware of collinearity). """ -SumContrasts +EffectsCoding -function contrasts_matrix(C::SumContrasts, baseind, n) +function contrasts_matrix(C::EffectsCoding, baseind, n) not_base = [1:(baseind-1); (baseind+1):n] mat = eye(n)[:, not_base] mat[baseind, :] = -1 @@ -218,7 +220,7 @@ function contrasts_matrix(C::SumContrasts, baseind, n) end """ - HelmertContrasts + HelmertCoding Produces contrast columns with -1 for each of n levels below contrast, n for contrast level, and 0 above. Produces something like: @@ -236,9 +238,9 @@ level n+1 and the average of levels 1 to n. When balanced, it has the nice property of each column in the resulting model matrix being orthogonal and with mean 0. """ -HelmertContrasts +HelmertCoding -function contrasts_matrix(C::HelmertContrasts, baseind, n) +function contrasts_matrix(C::HelmertCoding, baseind, n) mat = zeros(n, n-1) for i in 1:n-1 mat[1:i, i] = -1 diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 2b6a4ff774..4bb31925df 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -278,7 +278,7 @@ function check_non_redundancy!(trms::Terms, df::AbstractDataFrame) end -const DEFAULT_CONTRASTS = TreatmentContrasts +const DEFAULT_CONTRASTS = DummyCoding function ModelFrame(trms::Terms, d::AbstractDataFrame; contrasts::Dict = Dict()) @@ -288,7 +288,7 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame; ## Set up contrasts: ## Combine actual DF columns and contrast types if necessary to compute the - ## actual contrasts matrices, levels, and term names (using TreatmentContrasts + ## actual contrasts matrices, levels, and term names (using DummyCoding ## as the default) evaledContrasts = Dict() for (term, col) in eachcol(df) @@ -341,7 +341,7 @@ function modelmat_cols(name::Symbol, mf::ModelFrame; non_redundant::Bool = false if haskey(mf.contrasts, name) modelmat_cols(mf.df[name], non_redundant ? - ContrastsMatrix{DummyContrasts}(mf.contrasts[name]) : + ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) : mf.contrasts[name]) else modelmat_cols(mf.df[name]) @@ -485,7 +485,7 @@ function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false) if haskey(mf.contrasts, term) termnames(term, mf.df[term], non_redundant ? - ContrastsMatrix{DummyContrasts}(mf.contrasts[term]) : + ContrastsMatrix{FullDummyCoding}(mf.contrasts[term]) : mf.contrasts[term]) else termnames(term, mf.df[term]) diff --git a/test/contrasts.jl b/test/contrasts.jl index 028a7d7f08..9be0e526f0 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -16,7 +16,7 @@ mf = ModelFrame(Formula(nothing, :x), d) 1 1 0] @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] -setcontrasts!(mf, x = SumContrasts) +setcontrasts!(mf, x = EffectsCoding) @test ModelMatrix(mf).m == [1 -1 -1 1 1 0 1 0 1 @@ -26,7 +26,7 @@ setcontrasts!(mf, x = SumContrasts) @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] # change base level of contrast -setcontrasts!(mf, x = SumContrasts(base = :b)) +setcontrasts!(mf, x = EffectsCoding(base = :b)) @test ModelMatrix(mf).m == [1 1 0 1 -1 -1 1 0 1 @@ -36,7 +36,7 @@ setcontrasts!(mf, x = SumContrasts(base = :b)) @test coefnames(mf) == ["(Intercept)"; "x: a"; "x: c"] # change levels of contrast -setcontrasts!(mf, x = SumContrasts(levels = [:c, :b, :a])) +setcontrasts!(mf, x = EffectsCoding(levels = [:c, :b, :a])) @test ModelMatrix(mf).m == [1 0 1 1 1 0 1 -1 -1 @@ -47,7 +47,7 @@ setcontrasts!(mf, x = SumContrasts(levels = [:c, :b, :a])) # change levels and base level of contrast -setcontrasts!(mf, x = SumContrasts(levels = [:c, :b, :a], base = :a)) +setcontrasts!(mf, x = EffectsCoding(levels = [:c, :b, :a], base = :a)) @test ModelMatrix(mf).m == [1 -1 -1 1 0 1 1 1 0 @@ -57,7 +57,7 @@ setcontrasts!(mf, x = SumContrasts(levels = [:c, :b, :a], base = :a)) @test coefnames(mf) == ["(Intercept)"; "x: c"; "x: b"] # Helmert coded contrasts -setcontrasts!(mf, x = HelmertContrasts) +setcontrasts!(mf, x = HelmertCoding) @test ModelMatrix(mf).m == [1 -1 -1 1 1 -1 1 0 2 @@ -68,12 +68,12 @@ setcontrasts!(mf, x = HelmertContrasts) # Types for contrast levels are converted to data levels when constructing # ContrastsMatrix -setcontrasts!(mf, x = SumContrasts(levels = ["a", "b", "c"])) +setcontrasts!(mf, x = EffectsCoding(levels = ["a", "b", "c"])) @test mf.contrasts[:x].levels == levels(d[:x]) # Missing data is handled gracefully, dropping columns when a level is lost d[3, :x] = NA -mf_missing = ModelFrame(Formula(nothing, :x), d, contrasts = Dict(:x => SumContrasts)) +mf_missing = ModelFrame(Formula(nothing, :x), d, contrasts = Dict(:x => EffectsCoding)) @test ModelMatrix(mf_missing).m == [1 -1 1 1 1 -1 @@ -83,10 +83,10 @@ mf_missing = ModelFrame(Formula(nothing, :x), d, contrasts = Dict(:x => SumContr # Things that are bad to do: # Applying contrasts that only have a subset of data levels: -@test_throws ArgumentError setcontrasts!(mf, x = SumContrasts(levels = [:a, :b])) +@test_throws ArgumentError setcontrasts!(mf, x = EffectsCoding(levels = [:a, :b])) # Applying contrasts that expect levels not found in data: -@test_throws ArgumentError setcontrasts!(mf, x = SumContrasts(levels = [:a, :b, :c, :d])) +@test_throws ArgumentError setcontrasts!(mf, x = EffectsCoding(levels = [:a, :b, :c, :d])) # Asking for base level that's not found in data -@test_throws ArgumentError setcontrasts!(mf, x = SumContrasts(base = :e)) +@test_throws ArgumentError setcontrasts!(mf, x = EffectsCoding(base = :e)) end diff --git a/test/formula.jl b/test/formula.jl index fbdf1a5615..c2d16762d8 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -375,7 +375,7 @@ d = DataFrame(x = Compat.repeat([:a, :b], outer = 4), y = Compat.repeat([:c, :d], inner = 2, outer = 2), z = Compat.repeat([:e, :f], inner = 4)) [pool!(d, name) for name in names(d)] -cs = [name => SumContrasts for name in names(d)] +cs = [name => EffectsCoding for name in names(d)] d[:n] = 1.:8 From d7ce429f84f691c724730f302fa65c284ef88fde Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 25 Jul 2016 23:04:25 -0400 Subject: [PATCH 59/74] wrap long error message lines --- src/statsmodels/contrasts.jl | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index c6821ccc37..612fcd0447 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -98,14 +98,19 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::Vector) c_levels = oftype(levels, get(contrasts.levels, levels)) mismatched_levels = symdiff(c_levels, levels) if ! isempty(mismatched_levels) - throw(ArgumentError("contrasts levels not found in data or vice-versa: $mismatched_levels.\nData levels: $levels.\nContrast levels: $c_levels")) + throw(ArgumentError("contrasts levels not found in data or vice-versa: " * + "$mismatched_levels." * + "\n Data levels: $levels."* + "\n Contrast levels: $c_levels")) end n = length(c_levels) if n == 0 - throw(ArgumentError("empty set of levels found (need at least two to compute contrasts).")) + throw(ArgumentError("empty set of levels found (need at least two to compute " * + "contrasts).")) elseif n == 1 - throw(ArgumentError("only one level found: $(c_levels[1]) (need at least two to compute contrasts).")) + throw(ArgumentError("only one level found: $(c_levels[1]) (need at least two to " * + "compute contrasts).")) end # find index of base level. use contrasts.base, then default (1). @@ -126,9 +131,10 @@ end # Methods for constructing ContrastsMatrix from data. These are called in # ModelFrame constructor and setcontrasts!. -ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = ContrastsMatrix(C, levels(v)) -# instantiate Type{C<:AbstractContrasts} -ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, col::PooledDataArray) = ContrastsMatrix(c(), col) +ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = + ContrastsMatrix(C, levels(v)) +ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, col::PooledDataArray) = + ContrastsMatrix(c(), col) # given an existing ContrastsMatrix, check that all of the levels present in the # data are present in the contrasts. Note that this behavior is different from the # ContrastsMatrix constructor, which requires that the levels be exactly the same. @@ -138,7 +144,10 @@ ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, col::PooledDataArray) = Cont ContrastsMatrix(c::ContrastsMatrix, col::PooledDataArray) = isempty(setdiff(levels(col), c.levels)) ? c : - throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: $(setdiff(levels(col), c.levels))\n Data levels: $(levels(col))\n Contrast levels $(c.levels)")) + throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: "* + "$(setdiff(levels(col), c.levels))" * + "\n Data levels: $(levels(col))" * + "\n Contrast levels $(c.levels)")) function termnames(C::AbstractContrasts, levels::Vector, baseind::Integer) not_base = [1:(baseind-1); (baseind+1):length(levels)] From 86326248d6e16d1c90f2bcbbefc25af0cd498777 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 26 Jul 2016 21:46:48 -0400 Subject: [PATCH 60/74] add ContrastsCoding --- src/DataFrames.jl | 1 + src/statsmodels/contrasts.jl | 52 ++++++++++++++++++++++++++++++++---- test/contrasts.jl | 17 ++++++++++++ 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 0f27457c35..1b1d56c722 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -47,6 +47,7 @@ export @~, EffectsCoding, DummyCoding, HelmertCoding, + ContrastsCoding, aggregate, by, diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 612fcd0447..e43b6ead5c 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -53,17 +53,27 @@ constructing a `ContrastsMatrix`. * `DummyCoding` * `EffectsCoding` * `HelmertCoding` +* `ContrastsCoding` -To implement your own concrete types, implement a constructor and a -`contrast_matrix` method for constructing the actual contrasts matrix -that maps from levels to `ModelMatrix` column values: +The last coding type, `ContrastsCoding`, provides a way to manually specify a +contrasts matrix. For a variable `x` with k levels, a contrasts matrix `M` is a +k by k-1 matrix, that maps the k levels onto k-1 model matrix columns. +Specifically, let ``X^*`` be the full-rank indicator matrix for `x`, where +``X^*_{i,j} = 1`` if `x[i]` is level `j`, and 0 otherwise. Then the model matrix +columns generated by the contrasts matrix `M` is ``X = X^*M``. + +To implement your own `AbstractContrasts` type, implement a constructor, a +`contrast_matrix` method for constructing the actual contrasts matrix that maps +from levels to `ModelMatrix` column values, and (optionally) a `termnames` +method: ```julia -type MyContrasts <: AbstractContrasts +type MyCoding <: AbstractContrasts ... end -contrasts_matrix(C::MyContrasts, baseind, n) = ... +contrasts_matrix(C::MyCoding, baseind, n) = ... +termnames(C::MyCoding, levels, baseind) = ... ``` """ @@ -261,3 +271,35 @@ function contrasts_matrix(C::HelmertCoding, baseind, n) return mat end +""" + ContrastsCoding + +Coding by manual specification of contrasts matrix. For k levels, the contrasts +must be a k by k-1 Matrix. +""" +type ContrastsCoding <: AbstractContrasts + mat::Matrix + base::Nullable{Any} + levels::Nullable{Vector} + + function ContrastsCoding(mat, base, levels) + if !isnull(levels) + check_contrasts_size(mat, length(get(levels))) + end + new(mat, base, levels) + end +end + +check_contrasts_size(mat::Matrix, n_lev) = + size(mat) == (n_lev, n_lev-1) || + throw(ArgumentError("contrasts matrix wrong size for $n_lev levels. " * + "Expected $((n_lev, n_lev-1)), got $(size(mat))")) + +## constructor with optional keyword arguments, defaulting to Nullables +ContrastsCoding(mat::Matrix; base=Nullable{Any}(), levels=Nullable{Vector}()) = + ContrastsCoding(mat, nullify(base), nullify(levels)) + +function contrasts_matrix(C::ContrastsCoding, baseind, n) + check_contrasts_size(C.mat, n) + C.mat +end diff --git a/test/contrasts.jl b/test/contrasts.jl index 9be0e526f0..a6ea79e312 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -89,4 +89,21 @@ mf_missing = ModelFrame(Formula(nothing, :x), d, contrasts = Dict(:x => EffectsC # Asking for base level that's not found in data @test_throws ArgumentError setcontrasts!(mf, x = EffectsCoding(base = :e)) +# Manually specified contrasts +contrasts = [0 1 + -1 -.5 + 1 -.5] +setcontrasts!(mf, x = ContrastsCoding(contrasts)) +@test ModelMatrix(mf).m == [1 0 1 + 1 -1 -.5 + 1 1 -.5 + 1 0 1 + 1 0 1 + 1 -1 -.5] + +# throw argument error if number of levels mismatches +@test_throws ArgumentError setcontrasts!(mf, x = ContrastsCoding(contrasts[1:2, :])) +@test_throws ArgumentError setcontrasts!(mf, x = ContrastsCoding(hcat(contrasts, contrasts))) + + end From 42c5b6f42eca5ba824f786acbe74eddf1bcdb2a1 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 29 Jul 2016 22:32:14 -0400 Subject: [PATCH 61/74] update docstrings --- src/statsmodels/contrasts.jl | 92 +++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index e43b6ead5c..fdd5119ce9 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -40,12 +40,11 @@ If specified, levels will be checked against data when generating a levels would lead to empty columns in the model matrix, and missing contrast levels would lead to empty or undefined rows. -You can also specify the base level of the contrasts. The -actual interpretation of this depends on the particular contrast type, but in -general it can be thought of as a "reference" level. It defaults to the first -level. +You can also specify the base level of the contrasts. The actual interpretation +of this depends on the particular contrast type, but in general it can be +thought of as a "reference" level. It defaults to the first level. -Both `levels` and `base` will be coerced to the type of the data when +Both `levels` and `base` will be converted to the type of the data when constructing a `ContrastsMatrix`. # Concrete types @@ -60,10 +59,10 @@ contrasts matrix. For a variable `x` with k levels, a contrasts matrix `M` is a k by k-1 matrix, that maps the k levels onto k-1 model matrix columns. Specifically, let ``X^*`` be the full-rank indicator matrix for `x`, where ``X^*_{i,j} = 1`` if `x[i]` is level `j`, and 0 otherwise. Then the model matrix -columns generated by the contrasts matrix `M` is ``X = X^*M``. +columns generated by the contrasts matrix `M` are ``X = X^* M``. To implement your own `AbstractContrasts` type, implement a constructor, a -`contrast_matrix` method for constructing the actual contrasts matrix that maps +`contrasts_matrix` method for constructing the actual contrasts matrix that maps from levels to `ModelMatrix` column values, and (optionally) a `termnames` method: @@ -80,17 +79,17 @@ termnames(C::MyCoding, levels, baseind) = ... abstract AbstractContrasts # Contrasts + Levels (usually from data) = ContrastsMatrix -type ContrastsMatrix{S <: AbstractContrasts, T} +type ContrastsMatrix{C <: AbstractContrasts, T} matrix::Matrix{Float64} termnames::Vector{T} levels::Vector{T} - contrasts::S + contrasts::C end """ - ContrastsMatrix{T}(C::AbstractContrasts, levels::Vector{T}) + ContrastsMatrix{C<:AbstractContrasts}(contrasts::C, levels::Vector) -Compute contrasts matrix for a given set of categorical data levels. +Compute contrasts matrix for given data levels. If levels are specified in the `AbstractContrasts`, those will be used, and likewise for the base level (which defaults to the first level). @@ -185,9 +184,10 @@ for contrastType in [:DummyCoding, :EffectsCoding, :HelmertCoding] end """ - FullDummyCoding + FullDummyCoding() -One indicator (1 or 0) column for each level, __including__ the base level. +Coding that generates one indicator (1 or 0) column for each level, +__including__ the base level. Needed internally when a term is non-redundant with lower-order terms (e.g., in `~0+x` vs. `~1+x`, or in the interactions terms in `~1+x+x&y` vs. `~1+x+y+x&y`. In the @@ -207,12 +207,16 @@ Base.convert(::Type{ContrastsMatrix{FullDummyCoding}}, C::ContrastsMatrix) = ContrastsMatrix(FullDummyCoding(), C.levels) """ - DummyCoding + DummyCoding([base[, levels]]) -One indicator column (1 or 0) for each non-base level. +Contrast coding that generates one indicator column (1 or 0) for each non-base level. -The default in R. Columns have non-zero mean and are collinear with an intercept -column (and lower-order columns for interactions) but are orthogonal to each other. +Columns have non-zero mean and are collinear with an intercept column (and +lower-order columns for interactions) but are orthogonal to each other. In a +regression model, dummy coding leads to an intercept that is the mean of the +dependent variable for base level. + +Also known as "treatment coding" (`contr.treatment` in R) or "one-hot encoding". """ DummyCoding @@ -220,14 +224,24 @@ contrasts_matrix(C::DummyCoding, baseind, n) = eye(n)[:, [1:(baseind-1); (basein """ - EffectsCoding + EffectsCoding([base[, levels]]) + +Contrast coding that generates columns that code each non-base level as the +deviation from the base level. For each non-base level `x` of `variable`, a +column is generated with 1 where `variable .== x` and -1 where `col .== base`. -Column for level `x` of column `col` is 1 where `col .== x` and -1 where -`col .== base`. +`EffectsCoding` is like `DummyCoding`, but using -1 for the base level instead +of 0. -Produces mean-0 (centered) columns _only_ when all levels are equally frequent. -But with more than two levels, the generated columns are guaranteed to be non- -orthogonal (so beware of collinearity). +When all levels are equally frequent, effects coding generates model matrix +columns that are mean centered (have mean 0). For more than two levels the +generated columns are not orthogonal. In a regression model with an +effects-coded variable, the intercept corresponds to the grand mean. + +Also known as "sum coding" (`contr.sum` in R) or "simple coding" (SPSS). Note +though that the default in R and SPSS is to use the _last_ level as the base. +Here we use the _first_ level as the base, for consistency with other coding +schemes. """ EffectsCoding @@ -239,23 +253,27 @@ function contrasts_matrix(C::EffectsCoding, baseind, n) end """ - HelmertCoding + HelmertCoding([base[, levels]]) -Produces contrast columns with -1 for each of n levels below contrast, n for -contrast level, and 0 above. Produces something like: +Contrasts that code each level as the difference from the average of the lower +levels. -``` -[-1 -1 -1 - 1 -1 -1 - 0 2 -1 - 0 0 3] +For each non-base level, Helmert coding generates a columns with -1 for each of +n levels below, n for that level, and 0 above. + +# Examples + +```julia +julia> ContrastsMatrix(HelmertCoding(), collect(1:4)).matrix +4x3 Array{Float64,2}: + -1.0 -1.0 -1.0 + 1.0 -1.0 -1.0 + 0.0 2.0 -1.0 + 0.0 0.0 3.0 ``` -This is a good choice when you have more than two levels that are equally frequent. -Interpretation is that the nth contrast column is the difference between -level n+1 and the average of levels 1 to n. When balanced, it has the -nice property of each column in the resulting model matrix being orthogonal and -with mean 0. +When all levels are equally frequent, Helmert coding generates columns that are +mean-centered (mean 0) and orthogonal. """ HelmertCoding @@ -272,7 +290,7 @@ function contrasts_matrix(C::HelmertCoding, baseind, n) end """ - ContrastsCoding + ContrastsCoding(mat::Matrix[, base[, levels]]) Coding by manual specification of contrasts matrix. For k levels, the contrasts must be a k by k-1 Matrix. From ea3ff236d9ef66eb6cda6e4c8d026bdfc24ace38 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 29 Jul 2016 22:32:24 -0400 Subject: [PATCH 62/74] explicit test for dummy coding --- test/contrasts.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/contrasts.jl b/test/contrasts.jl index a6ea79e312..4397b5d562 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -8,6 +8,7 @@ d = DataFrame(x = @pdata( [:a, :b, :c, :a, :a, :b] )) mf = ModelFrame(Formula(nothing, :x), d) +# Dummy coded contrasts by default: @test ModelMatrix(mf).m == [1 0 0 1 1 0 1 0 1 @@ -16,6 +17,10 @@ mf = ModelFrame(Formula(nothing, :x), d) 1 1 0] @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] +mmm = ModelMatrix(mf).m +setcontrasts!(mf, x = DummyCoding) +@test ModelMatrix(mf).m == mmm + setcontrasts!(mf, x = EffectsCoding) @test ModelMatrix(mf).m == [1 -1 -1 1 1 0 From 4108bef106f274def15b3d7d9c1053e0046156a5 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 29 Jul 2016 23:34:20 -0400 Subject: [PATCH 63/74] vector -> abstractvector --- src/statsmodels/contrasts.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index fdd5119ce9..b15b08feea 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -87,14 +87,14 @@ type ContrastsMatrix{C <: AbstractContrasts, T} end """ - ContrastsMatrix{C<:AbstractContrasts}(contrasts::C, levels::Vector) + ContrastsMatrix{C<:AbstractContrasts}(contrasts::C, levels::AbstractVector) Compute contrasts matrix for given data levels. If levels are specified in the `AbstractContrasts`, those will be used, and likewise for the base level (which defaults to the first level). """ -function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::Vector) +function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::AbstractVector) # if levels are defined on contrasts, use those, validating that they line up. # what does that mean? either: @@ -158,7 +158,7 @@ ContrastsMatrix(c::ContrastsMatrix, col::PooledDataArray) = "\n Data levels: $(levels(col))" * "\n Contrast levels $(c.levels)")) -function termnames(C::AbstractContrasts, levels::Vector, baseind::Integer) +function termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer) not_base = [1:(baseind-1); (baseind+1):length(levels)] levels[not_base] end From 666647f29d6765b2bbb31a1b4ce38652f89d05bb Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 29 Jul 2016 23:34:31 -0400 Subject: [PATCH 64/74] one-line summaries of coding schemes --- src/statsmodels/contrasts.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index b15b08feea..bf7939cd59 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -49,10 +49,11 @@ constructing a `ContrastsMatrix`. # Concrete types -* `DummyCoding` -* `EffectsCoding` -* `HelmertCoding` -* `ContrastsCoding` +* `DummyCoding` - Code each non-base level as a 0-1 indicator column. +* `EffectsCoding` - Code each non-base level as 1, and base as -1. +* `HelmertCoding` - Code each non-base level as the difference from the mean of + the lower levels +* `ContrastsCoding` - Manually specify contrasts matrix The last coding type, `ContrastsCoding`, provides a way to manually specify a contrasts matrix. For a variable `x` with k levels, a contrasts matrix `M` is a From 527e2480fc8f18105afc35832c6c392746aa7914 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 29 Jul 2016 23:53:43 -0400 Subject: [PATCH 65/74] replace ternary operator with explicit if --- src/statsmodels/contrasts.jl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index bf7939cd59..c053356d7b 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -151,13 +151,15 @@ ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, col::PooledDataArray) = # This method exists to support things like `predict` that can operate on new data # which may contain only a subset of the original data's levels. Checking here # (instead of in `modelmat_cols`) allows an informative error message. -ContrastsMatrix(c::ContrastsMatrix, col::PooledDataArray) = - isempty(setdiff(levels(col), c.levels)) ? - c : - throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: "* - "$(setdiff(levels(col), c.levels))" * - "\n Data levels: $(levels(col))" * - "\n Contrast levels $(c.levels)")) +function ContrastsMatrix(c::ContrastsMatrix, col::PooledDataArray) + if !isempty(setdiff(levels(col), c.levels)) + throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: "* + "$(setdiff(levels(col), c.levels))" * + "\n Data levels: $(levels(col))" * + "\n Contrast levels $(c.levels)")) + end + return c +end function termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer) not_base = [1:(baseind-1); (baseind+1):length(levels)] From 3c4a14437586340bfcfd88aecea669f179906be8 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 29 Jul 2016 23:48:58 -0400 Subject: [PATCH 66/74] do not convert levels on contrasts to data type --- src/statsmodels/contrasts.jl | 11 ++++++----- test/contrasts.jl | 6 ++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index c053356d7b..b373d3b6aa 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -44,9 +44,6 @@ You can also specify the base level of the contrasts. The actual interpretation of this depends on the particular contrast type, but in general it can be thought of as a "reference" level. It defaults to the first level. -Both `levels` and `base` will be converted to the type of the data when -constructing a `ContrastsMatrix`. - # Concrete types * `DummyCoding` - Code each non-base level as a 0-1 indicator column. @@ -105,7 +102,11 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::AbstractV # better to filter data frame first # 3. contrast levels missing from data: would have empty columns, generate a # rank-deficient model matrix. - c_levels = oftype(levels, get(contrasts.levels, levels)) + c_levels = get(contrasts.levels, levels) + if eltype(c_levels) != eltype(levels) + throw(ArgumentError("mismatching levels types: got $(eltype(levels)), expected " * + "$(eltype(c_levels)) based on contrasts levels.")) + end mismatched_levels = symdiff(c_levels, levels) if ! isempty(mismatched_levels) throw(ArgumentError("contrasts levels not found in data or vice-versa: " * @@ -126,7 +127,7 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::AbstractV # find index of base level. use contrasts.base, then default (1). baseind = isnull(contrasts.base) ? 1 : - findfirst(c_levels, convert(eltype(levels), get(contrasts.base))) + findfirst(c_levels, get(contrasts.base)) if baseind < 1 throw(ArgumentError("base level $(get(contrasts.base)) not found in levels $c_levels.")) end diff --git a/test/contrasts.jl b/test/contrasts.jl index 4397b5d562..b8a7599665 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -71,10 +71,8 @@ setcontrasts!(mf, x = HelmertCoding) 1 1 -1] @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] -# Types for contrast levels are converted to data levels when constructing -# ContrastsMatrix -setcontrasts!(mf, x = EffectsCoding(levels = ["a", "b", "c"])) -@test mf.contrasts[:x].levels == levels(d[:x]) +# Mismatching types of data and contrasts levels throws an error: +@test_throws ArgumentError setcontrasts!(mf, x = EffectsCoding(levels = ["a", "b", "c"])) # Missing data is handled gracefully, dropping columns when a level is lost d[3, :x] = NA From 709740c9178dc94d37ca9edfb929d0757ccfcee2 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 30 Jul 2016 00:08:37 -0400 Subject: [PATCH 67/74] no space after ! --- src/statsmodels/contrasts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index b373d3b6aa..242231d90c 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -108,7 +108,7 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::AbstractV "$(eltype(c_levels)) based on contrasts levels.")) end mismatched_levels = symdiff(c_levels, levels) - if ! isempty(mismatched_levels) + if !isempty(mismatched_levels) throw(ArgumentError("contrasts levels not found in data or vice-versa: " * "$mismatched_levels." * "\n Data levels: $levels."* From 0a9c7a833a0d81569f65fb07865312c394203ef6 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 30 Jul 2016 00:28:36 -0400 Subject: [PATCH 68/74] cosmetic tweaks --- src/statsmodels/contrasts.jl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 242231d90c..82e6755e28 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -111,7 +111,7 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::AbstractV if !isempty(mismatched_levels) throw(ArgumentError("contrasts levels not found in data or vice-versa: " * "$mismatched_levels." * - "\n Data levels: $levels."* + "\n Data levels: $levels." * "\n Contrast levels: $c_levels")) end @@ -129,7 +129,8 @@ function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::AbstractV 1 : findfirst(c_levels, get(contrasts.base)) if baseind < 1 - throw(ArgumentError("base level $(get(contrasts.base)) not found in levels $c_levels.")) + throw(ArgumentError("base level $(get(contrasts.base)) not found in levels " * + "$c_levels.")) end tnames = termnames(contrasts, c_levels, baseind) @@ -141,6 +142,7 @@ end # Methods for constructing ContrastsMatrix from data. These are called in # ModelFrame constructor and setcontrasts!. +# TODO: add methods for new categorical types ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = ContrastsMatrix(C, levels(v)) @@ -154,10 +156,10 @@ ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, col::PooledDataArray) = # (instead of in `modelmat_cols`) allows an informative error message. function ContrastsMatrix(c::ContrastsMatrix, col::PooledDataArray) if !isempty(setdiff(levels(col), c.levels)) - throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: "* + throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: " * "$(setdiff(levels(col), c.levels))" * "\n Data levels: $(levels(col))" * - "\n Contrast levels $(c.levels)")) + "\n Contrast levels: $(c.levels)")) end return c end From 94ec01e2e4a492494693b36501d6486748e5ad85 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 30 Jul 2016 00:31:26 -0400 Subject: [PATCH 69/74] contrasts types must be instantiated --- src/statsmodels/contrasts.jl | 2 +- src/statsmodels/formula.jl | 2 +- test/contrasts.jl | 10 ++++++---- test/formula.jl | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 82e6755e28..197d3b0711 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -147,7 +147,7 @@ end ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = ContrastsMatrix(C, levels(v)) ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, col::PooledDataArray) = - ContrastsMatrix(c(), col) + throw(ArgumentError("contrast types must be instantiated (use $c() instead of $c)")) # given an existing ContrastsMatrix, check that all of the levels present in the # data are present in the contrasts. Note that this behavior is different from the # ContrastsMatrix constructor, which requires that the levels be exactly the same. diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 4bb31925df..4a07babe0c 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -295,7 +295,7 @@ function ModelFrame(trms::Terms, d::AbstractDataFrame; is_categorical(col) || continue evaledContrasts[term] = ContrastsMatrix(haskey(contrasts, term) ? contrasts[term] : - DEFAULT_CONTRASTS, + DEFAULT_CONTRASTS(), col) end diff --git a/test/contrasts.jl b/test/contrasts.jl index b8a7599665..1ff2fed934 100644 --- a/test/contrasts.jl +++ b/test/contrasts.jl @@ -18,10 +18,10 @@ mf = ModelFrame(Formula(nothing, :x), d) @test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] mmm = ModelMatrix(mf).m -setcontrasts!(mf, x = DummyCoding) +setcontrasts!(mf, x = DummyCoding()) @test ModelMatrix(mf).m == mmm -setcontrasts!(mf, x = EffectsCoding) +setcontrasts!(mf, x = EffectsCoding()) @test ModelMatrix(mf).m == [1 -1 -1 1 1 0 1 0 1 @@ -62,7 +62,7 @@ setcontrasts!(mf, x = EffectsCoding(levels = [:c, :b, :a], base = :a)) @test coefnames(mf) == ["(Intercept)"; "x: c"; "x: b"] # Helmert coded contrasts -setcontrasts!(mf, x = HelmertCoding) +setcontrasts!(mf, x = HelmertCoding()) @test ModelMatrix(mf).m == [1 -1 -1 1 1 -1 1 0 2 @@ -76,7 +76,7 @@ setcontrasts!(mf, x = HelmertCoding) # Missing data is handled gracefully, dropping columns when a level is lost d[3, :x] = NA -mf_missing = ModelFrame(Formula(nothing, :x), d, contrasts = Dict(:x => EffectsCoding)) +mf_missing = ModelFrame(Formula(nothing, :x), d, contrasts = Dict(:x => EffectsCoding())) @test ModelMatrix(mf_missing).m == [1 -1 1 1 1 -1 @@ -108,5 +108,7 @@ setcontrasts!(mf, x = ContrastsCoding(contrasts)) @test_throws ArgumentError setcontrasts!(mf, x = ContrastsCoding(contrasts[1:2, :])) @test_throws ArgumentError setcontrasts!(mf, x = ContrastsCoding(hcat(contrasts, contrasts))) +# contrasts types must be instaniated +@test_throws ArgumentError setcontrasts!(mf, x = DummyCoding) end diff --git a/test/formula.jl b/test/formula.jl index c2d16762d8..a02be4a016 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -375,7 +375,7 @@ d = DataFrame(x = Compat.repeat([:a, :b], outer = 4), y = Compat.repeat([:c, :d], inner = 2, outer = 2), z = Compat.repeat([:e, :f], inner = 4)) [pool!(d, name) for name in names(d)] -cs = [name => EffectsCoding for name in names(d)] +cs = [name => EffectsCoding() for name in names(d)] d[:n] = 1.:8 From 6dc77552bc825ba2773fa34cc90c4cd56dc80c81 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sat, 30 Jul 2016 10:20:32 -0400 Subject: [PATCH 70/74] move todo --- src/statsmodels/contrasts.jl | 2 -- src/statsmodels/formula.jl | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 197d3b0711..095ea5da0d 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -11,8 +11,6 @@ # # ModelMatrix will check this dict when evaluating terms, falling back to a # default for any categorical data without a specified contrast. -# -# TODO: implement contrast types in Formula/Terms """ diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 4a07babe0c..2e0bf1f9d7 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -24,6 +24,9 @@ macro ~(lhs, rhs) return ex end +# +# TODO: implement contrast types in Formula/Terms +# type Terms terms::Vector eterms::Vector # evaluation terms From 796c87173a5f1a2de147ac5722a0dcf3fc3e3e5c Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 31 Jul 2016 11:55:41 -0400 Subject: [PATCH 71/74] fix doc to use concrete type --- docs/src/man/formulas.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/man/formulas.md b/docs/src/man/formulas.md index 2fb149f586..e9203439e8 100644 --- a/docs/src/man/formulas.md +++ b/docs/src/man/formulas.md @@ -36,14 +36,14 @@ mm = ModelMatrix(ModelFrame(Z ~ X*Y, df)) You can control how categorical variables (e.g., `PooledDataArray` columns) are converted to `ModelMatrix` columns by specifying _contrasts_ when you construct a `ModelFrame`: ```julia -mm = ModelMatrix(ModelFrame(Z ~ X*Y, df, contrasts = Dict(:X => HelmertCoding))) +mm = ModelMatrix(ModelFrame(Z ~ X*Y, df, contrasts = Dict(:X => HelmertCoding()))) ``` Contrasts can also be modified in an existing `ModelFrame`: ```julia mf = ModelFrame(Z ~ X*Y, df) -contrasts!(mf, X = HelmertCoding) +contrasts!(mf, X = HelmertCoding()) ``` The construction of model matrices makes it easy to formulate complex statistical models. These are used to good effect by the [GLM Package.](https://github.com/JuliaStats/GLM.jl) From a20f290e063fbb4a1d4eed779c726445ac002085 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 31 Jul 2016 12:06:05 -0400 Subject: [PATCH 72/74] add tests for specifying contrasts in fit() --- test/statsmodel.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/statsmodel.jl b/test/statsmodel.jl index 0cc2ab5ba3..b133b511c9 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -75,6 +75,17 @@ d3[1, :x1] = 0 d3[:x1p] = PooledDataArray(d3[:x1]) @test_throws ArgumentError predict(m2, d3) +## fit with contrasts specified +d[:x2p] = PooledDataArray(d[:x2]) +f3 = y ~ x1p + x2p +m3 = fit(DummyMod, f3, d) +fit(DummyMod, f3, d, contrasts = Dict(:x1p => EffectsCoding())) +fit(DummyMod, f3, d, contrasts = Dict(:x1p => EffectsCoding(), + :x2p => DummyCoding())) +@test_throws Exception fit(DummyMod, f3, d, contrasts = Dict(:x1p => EffectsCoding(), + :x2p => 1)) + + ## Another dummy model type to test fall-through show method immutable DummyModTwo <: RegressionModel msg::Compat.UTF8String From a2e3ff58e45f982171b250d94c1bdeb3720270ff Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 31 Jul 2016 12:10:06 -0400 Subject: [PATCH 73/74] typealiases for R names --- src/DataFrames.jl | 2 ++ src/statsmodels/contrasts.jl | 15 +++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 1b1d56c722..10d5d293e5 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -45,7 +45,9 @@ export @~, ModelMatrix, SubDataFrame, EffectsCoding, + SumCoding, DummyCoding, + TreatmentCoding, HelmertCoding, ContrastsCoding, diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 095ea5da0d..638b6d3ffd 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -224,6 +224,14 @@ Also known as "treatment coding" (`contr.treatment` in R) or "one-hot encoding". """ DummyCoding +""" + TreatmentCoding([base[, levels]]) + +Alias for `DummyCoding`, which is also known as treatment coding (e.g., +`contr.treatment` in R). +""" +typealias TreatmentCoding DummyCoding + contrasts_matrix(C::DummyCoding, baseind, n) = eye(n)[:, [1:(baseind-1); (baseind+1):n]] @@ -249,6 +257,13 @@ schemes. """ EffectsCoding +""" + SumCoding([base[, levels]]) + +Alias for `EffectsCoding`, which is also known as sum coding (e.g., `contr.sum` in R). +""" +typealias SumCoding EffectsCoding + function contrasts_matrix(C::EffectsCoding, baseind, n) not_base = [1:(baseind-1); (baseind+1):n] mat = eye(n)[:, not_base] From c113bbe07ab1650247309dc84d8368068edaf7e6 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Sun, 31 Jul 2016 14:28:49 -0400 Subject: [PATCH 74/74] Revert "typealiases for R names" This reverts commit a2e3ff58e45f982171b250d94c1bdeb3720270ff. --- src/DataFrames.jl | 2 -- src/statsmodels/contrasts.jl | 15 --------------- 2 files changed, 17 deletions(-) diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 10d5d293e5..1b1d56c722 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -45,9 +45,7 @@ export @~, ModelMatrix, SubDataFrame, EffectsCoding, - SumCoding, DummyCoding, - TreatmentCoding, HelmertCoding, ContrastsCoding, diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl index 638b6d3ffd..095ea5da0d 100644 --- a/src/statsmodels/contrasts.jl +++ b/src/statsmodels/contrasts.jl @@ -224,14 +224,6 @@ Also known as "treatment coding" (`contr.treatment` in R) or "one-hot encoding". """ DummyCoding -""" - TreatmentCoding([base[, levels]]) - -Alias for `DummyCoding`, which is also known as treatment coding (e.g., -`contr.treatment` in R). -""" -typealias TreatmentCoding DummyCoding - contrasts_matrix(C::DummyCoding, baseind, n) = eye(n)[:, [1:(baseind-1); (baseind+1):n]] @@ -257,13 +249,6 @@ schemes. """ EffectsCoding -""" - SumCoding([base[, levels]]) - -Alias for `EffectsCoding`, which is also known as sum coding (e.g., `contr.sum` in R). -""" -typealias SumCoding EffectsCoding - function contrasts_matrix(C::EffectsCoding, baseind, n) not_base = [1:(baseind-1); (baseind+1):n] mat = eye(n)[:, not_base]