diff --git a/docs/src/man/formulas.md b/docs/src/man/formulas.md
index a8221def7e..e9203439e8 100644
--- a/docs/src/man/formulas.md
+++ b/docs/src/man/formulas.md
@@ -33,5 +33,18 @@ If you would like to specify both main effects and an interaction term at once,
 mm = ModelMatrix(ModelFrame(Z ~ X*Y, df))
 ```
 
+You can control how categorical variables (e.g., `PooledDataArray` columns) are converted to `ModelMatrix` columns by specifying _contrasts_ when you construct a `ModelFrame`:
+
+```julia
+mm = ModelMatrix(ModelFrame(Z ~ X*Y, df, contrasts = Dict(:X => HelmertCoding())))
+```
+
+Contrasts can also be modified in an existing `ModelFrame`:
+
+```julia
+mf = ModelFrame(Z ~ X*Y, df)
+setcontrasts!(mf, X = HelmertCoding())
+```
+
 The construction of model matrices makes it easy to formulate complex statistical models. These are used to good effect by the [GLM Package.](https://github.com/JuliaStats/GLM.jl)
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
index ee00817651..1b1d56c722 100644
--- a/src/DataFrames.jl
+++ b/src/DataFrames.jl
@@ -35,6 +35,7 @@ export @~,
        @wsv_str,
        AbstractDataFrame,
+       AbstractContrasts,
        DataFrame,
        DataFrameRow,
        Formula,
@@ -43,6 +44,10 @@ export @~,
        ModelFrame,
        ModelMatrix,
        SubDataFrame,
+       EffectsCoding,
+       DummyCoding,
+       HelmertCoding,
+       ContrastsCoding,
 
        aggregate,
        by,
@@ -51,6 +56,7 @@ export @~,
        combine,
        complete_cases,
        complete_cases!,
+       setcontrasts!,
        deleterows!,
        describe,
        eachcol,
@@ -109,6 +115,7 @@ for (dir, filename) in [
 
         ("abstractdataframe", "sort.jl"),
         ("dataframe", "sort.jl"),
+        ("statsmodels", "contrasts.jl"),
         ("statsmodels", "formula.jl"),
         ("statsmodels", "statsmodel.jl"),
diff --git a/src/statsmodels/contrasts.jl b/src/statsmodels/contrasts.jl
new file mode 100644
index 0000000000..095ea5da0d
--- /dev/null
+++ b/src/statsmodels/contrasts.jl
@@ -0,0 +1,327 @@
+# Specify contrasts for coding categorical data in a model matrix. Contrasts types
+# are subtypes of AbstractContrasts. ContrastsMatrix types hold a contrast
+# matrix, levels, and term names and provide the interface for creating model
+# matrix columns and coefficient names.
+#
+# Contrasts types themselves can be instantiated to provide containers for
+# contrast settings (currently, just the base level).
+#
+# ModelFrame will hold a Dict{Symbol, ContrastsMatrix} that maps column
+# names to contrasts.
+#
+# ModelMatrix will check this dict when evaluating terms, falling back to a
+# default for any categorical data without a specified contrast.
+
+
+"""
+Interface to describe contrast coding schemes for categorical variables.
+
+Concrete subtypes of `AbstractContrasts` describe a particular way of converting a
+categorical data vector into numeric columns in a `ModelMatrix`. Each
+instantiation optionally includes the levels to generate columns for and the base
+level. If not specified, these will be taken from the data when a `ContrastsMatrix` is
+generated (during `ModelFrame` construction).
+
+# Constructors
+
+For `C <: AbstractContrasts`:
+
+```julia
+C() # levels are inferred later
+C(levels = ::Vector{Any}) # levels checked against data later
+C(base = ::Any) # specify base level
+C(levels = ::Vector{Any}, base = ::Any) # specify levels and base
+```
+
+If specified, levels will be checked against data when generating a
+`ContrastsMatrix`. Any mismatch will result in an error, because missing data
+levels would lead to empty columns in the model matrix, and missing contrast
+levels would lead to empty or undefined rows.
+
+You can also specify the base level of the contrasts.
The actual interpretation +of this depends on the particular contrast type, but in general it can be +thought of as a "reference" level. It defaults to the first level. + +# Concrete types + +* `DummyCoding` - Code each non-base level as a 0-1 indicator column. +* `EffectsCoding` - Code each non-base level as 1, and base as -1. +* `HelmertCoding` - Code each non-base level as the difference from the mean of + the lower levels +* `ContrastsCoding` - Manually specify contrasts matrix + +The last coding type, `ContrastsCoding`, provides a way to manually specify a +contrasts matrix. For a variable `x` with k levels, a contrasts matrix `M` is a +k by k-1 matrix, that maps the k levels onto k-1 model matrix columns. +Specifically, let ``X^*`` be the full-rank indicator matrix for `x`, where +``X^*_{i,j} = 1`` if `x[i]` is level `j`, and 0 otherwise. Then the model matrix +columns generated by the contrasts matrix `M` are ``X = X^* M``. + +To implement your own `AbstractContrasts` type, implement a constructor, a +`contrasts_matrix` method for constructing the actual contrasts matrix that maps +from levels to `ModelMatrix` column values, and (optionally) a `termnames` +method: + +```julia +type MyCoding <: AbstractContrasts + ... +end + +contrasts_matrix(C::MyCoding, baseind, n) = ... +termnames(C::MyCoding, levels, baseind) = ... +``` + +""" +abstract AbstractContrasts + +# Contrasts + Levels (usually from data) = ContrastsMatrix +type ContrastsMatrix{C <: AbstractContrasts, T} + matrix::Matrix{Float64} + termnames::Vector{T} + levels::Vector{T} + contrasts::C +end + +""" + ContrastsMatrix{C<:AbstractContrasts}(contrasts::C, levels::AbstractVector) + +Compute contrasts matrix for given data levels. + +If levels are specified in the `AbstractContrasts`, those will be used, and likewise +for the base level (which defaults to the first level). +""" +function ContrastsMatrix{C <: AbstractContrasts}(contrasts::C, levels::AbstractVector) + + # if levels are defined on contrasts, use those, validating that they line up. + # what does that mean? either: + # + # 1. contrasts.levels == levels (best case) + # 2. data levels missing from contrast: would generate empty/undefined rows. + # better to filter data frame first + # 3. contrast levels missing from data: would have empty columns, generate a + # rank-deficient model matrix. + c_levels = get(contrasts.levels, levels) + if eltype(c_levels) != eltype(levels) + throw(ArgumentError("mismatching levels types: got $(eltype(levels)), expected " * + "$(eltype(c_levels)) based on contrasts levels.")) + end + mismatched_levels = symdiff(c_levels, levels) + if !isempty(mismatched_levels) + throw(ArgumentError("contrasts levels not found in data or vice-versa: " * + "$mismatched_levels." * + "\n Data levels: $levels." * + "\n Contrast levels: $c_levels")) + end + + n = length(c_levels) + if n == 0 + throw(ArgumentError("empty set of levels found (need at least two to compute " * + "contrasts).")) + elseif n == 1 + throw(ArgumentError("only one level found: $(c_levels[1]) (need at least two to " * + "compute contrasts).")) + end + + # find index of base level. use contrasts.base, then default (1). + baseind = isnull(contrasts.base) ? 
+ 1 : + findfirst(c_levels, get(contrasts.base)) + if baseind < 1 + throw(ArgumentError("base level $(get(contrasts.base)) not found in levels " * + "$c_levels.")) + end + + tnames = termnames(contrasts, c_levels, baseind) + + mat = contrasts_matrix(contrasts, baseind, n) + + ContrastsMatrix(mat, tnames, c_levels, contrasts) +end + +# Methods for constructing ContrastsMatrix from data. These are called in +# ModelFrame constructor and setcontrasts!. +# TODO: add methods for new categorical types + +ContrastsMatrix(C::AbstractContrasts, v::PooledDataArray) = + ContrastsMatrix(C, levels(v)) +ContrastsMatrix{C <: AbstractContrasts}(c::Type{C}, col::PooledDataArray) = + throw(ArgumentError("contrast types must be instantiated (use $c() instead of $c)")) +# given an existing ContrastsMatrix, check that all of the levels present in the +# data are present in the contrasts. Note that this behavior is different from the +# ContrastsMatrix constructor, which requires that the levels be exactly the same. +# This method exists to support things like `predict` that can operate on new data +# which may contain only a subset of the original data's levels. Checking here +# (instead of in `modelmat_cols`) allows an informative error message. +function ContrastsMatrix(c::ContrastsMatrix, col::PooledDataArray) + if !isempty(setdiff(levels(col), c.levels)) + throw(ArgumentError("there are levels in data that are not in ContrastsMatrix: " * + "$(setdiff(levels(col), c.levels))" * + "\n Data levels: $(levels(col))" * + "\n Contrast levels: $(c.levels)")) + end + return c +end + +function termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer) + not_base = [1:(baseind-1); (baseind+1):length(levels)] + levels[not_base] +end + +nullify(x::Nullable) = x +nullify(x) = Nullable(x) + +# Making a contrast type T only requires that there be a method for +# contrasts_matrix(T, v::PooledDataArray). The rest is boilerplate. +for contrastType in [:DummyCoding, :EffectsCoding, :HelmertCoding] + @eval begin + type $contrastType <: AbstractContrasts + base::Nullable{Any} + levels::Nullable{Vector} + end + ## constructor with optional keyword arguments, defaulting to Nullables + $contrastType(; + base=Nullable{Any}(), + levels=Nullable{Vector}()) = + $contrastType(nullify(base), + nullify(levels)) + end +end + +""" + FullDummyCoding() + +Coding that generates one indicator (1 or 0) column for each level, +__including__ the base level. + +Needed internally when a term is non-redundant with lower-order terms (e.g., in +`~0+x` vs. `~1+x`, or in the interactions terms in `~1+x+x&y` vs. `~1+x+y+x&y`. In the +non-redundant cases, we can expand x into `length(levels(x))` columns +without creating a non-identifiable model matrix (unless the user has done +something foolish in specifying the model, which we can't do much about anyway). +""" +type FullDummyCoding <: AbstractContrasts +# Dummy contrasts have no base level (since all levels produce a column) +end + +ContrastsMatrix{T}(C::FullDummyCoding, lvls::Vector{T}) = + ContrastsMatrix(eye(Float64, length(lvls)), lvls, lvls, C) + +"Promote contrasts matrix to full rank version" +Base.convert(::Type{ContrastsMatrix{FullDummyCoding}}, C::ContrastsMatrix) = + ContrastsMatrix(FullDummyCoding(), C.levels) + +""" + DummyCoding([base[, levels]]) + +Contrast coding that generates one indicator column (1 or 0) for each non-base level. 
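+
+For example (an illustrative sketch, mirroring the `HelmertCoding` example
+below; assumes three levels `:a`, `:b`, `:c` with the default base, the first
+level):
+
+```julia
+julia> ContrastsMatrix(DummyCoding(), [:a, :b, :c]).matrix
+3x2 Array{Float64,2}:
+ 0.0  0.0
+ 1.0  0.0
+ 0.0  1.0
+```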
+ +Columns have non-zero mean and are collinear with an intercept column (and +lower-order columns for interactions) but are orthogonal to each other. In a +regression model, dummy coding leads to an intercept that is the mean of the +dependent variable for base level. + +Also known as "treatment coding" (`contr.treatment` in R) or "one-hot encoding". +""" +DummyCoding + +contrasts_matrix(C::DummyCoding, baseind, n) = eye(n)[:, [1:(baseind-1); (baseind+1):n]] + + +""" + EffectsCoding([base[, levels]]) + +Contrast coding that generates columns that code each non-base level as the +deviation from the base level. For each non-base level `x` of `variable`, a +column is generated with 1 where `variable .== x` and -1 where `col .== base`. + +`EffectsCoding` is like `DummyCoding`, but using -1 for the base level instead +of 0. + +When all levels are equally frequent, effects coding generates model matrix +columns that are mean centered (have mean 0). For more than two levels the +generated columns are not orthogonal. In a regression model with an +effects-coded variable, the intercept corresponds to the grand mean. + +Also known as "sum coding" (`contr.sum` in R) or "simple coding" (SPSS). Note +though that the default in R and SPSS is to use the _last_ level as the base. +Here we use the _first_ level as the base, for consistency with other coding +schemes. +""" +EffectsCoding + +function contrasts_matrix(C::EffectsCoding, baseind, n) + not_base = [1:(baseind-1); (baseind+1):n] + mat = eye(n)[:, not_base] + mat[baseind, :] = -1 + return mat +end + +""" + HelmertCoding([base[, levels]]) + +Contrasts that code each level as the difference from the average of the lower +levels. + +For each non-base level, Helmert coding generates a columns with -1 for each of +n levels below, n for that level, and 0 above. + +# Examples + +```julia +julia> ContrastsMatrix(HelmertCoding(), collect(1:4)).matrix +4x3 Array{Float64,2}: + -1.0 -1.0 -1.0 + 1.0 -1.0 -1.0 + 0.0 2.0 -1.0 + 0.0 0.0 3.0 +``` + +When all levels are equally frequent, Helmert coding generates columns that are +mean-centered (mean 0) and orthogonal. +""" +HelmertCoding + +function contrasts_matrix(C::HelmertCoding, baseind, n) + mat = zeros(n, n-1) + for i in 1:n-1 + mat[1:i, i] = -1 + mat[i+1, i] = i + end + + # re-shuffle the rows such that base is the all -1.0 row (currently first) + mat = mat[[baseind; 1:(baseind-1); (baseind+1):end], :] + return mat +end + +""" + ContrastsCoding(mat::Matrix[, base[, levels]]) + +Coding by manual specification of contrasts matrix. For k levels, the contrasts +must be a k by k-1 Matrix. +""" +type ContrastsCoding <: AbstractContrasts + mat::Matrix + base::Nullable{Any} + levels::Nullable{Vector} + + function ContrastsCoding(mat, base, levels) + if !isnull(levels) + check_contrasts_size(mat, length(get(levels))) + end + new(mat, base, levels) + end +end + +check_contrasts_size(mat::Matrix, n_lev) = + size(mat) == (n_lev, n_lev-1) || + throw(ArgumentError("contrasts matrix wrong size for $n_lev levels. 
" * + "Expected $((n_lev, n_lev-1)), got $(size(mat))")) + +## constructor with optional keyword arguments, defaulting to Nullables +ContrastsCoding(mat::Matrix; base=Nullable{Any}(), levels=Nullable{Vector}()) = + ContrastsCoding(mat, nullify(base), nullify(levels)) + +function contrasts_matrix(C::ContrastsCoding, baseind, n) + check_contrasts_size(C.mat, n) + C.mat +end diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 737082e4f3..2e0bf1f9d7 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -24,10 +24,16 @@ macro ~(lhs, rhs) return ex end +# +# TODO: implement contrast types in Formula/Terms +# type Terms terms::Vector eterms::Vector # evaluation terms factors::Matrix{Bool} # maps terms to evaluation terms + ## An eterms x terms matrix which is true for terms that need to be "promoted" + ## to full rank in constructing a model matrx + is_non_redundant::Matrix{Bool} # order can probably be dropped. It is vec(sum(factors, 1)) order::Vector{Int} # orders of rhs terms response::Bool # indicator of a response, which is eterms[1] if present @@ -38,6 +44,8 @@ type ModelFrame df::AbstractDataFrame terms::Terms msng::BitArray + ## mapping from df keys to contrasts matrices + contrasts::Dict{Symbol, ContrastsMatrix} end type ModelMatrix{T <: @compat(Union{Float32, Float64})} @@ -202,7 +210,8 @@ function Terms(f::Formula) ev = unique(vcat(etrms...)) sets = [Set(x) for x in etrms] facs = Bool[t in s for t in ev, s in sets] - Terms(tt, ev, facs, oo, haslhs, !any(noint)) + non_redundants = fill(false, size(facs)) # initialize to false + Terms(tt, ev, facs, non_redundants, oo, haslhs, !any(noint)) end ## Default NA handler. Others can be added as keyword arguments @@ -225,15 +234,92 @@ function dropunusedlevels!(da::PooledDataArray) end dropunusedlevels!(x) = x -function ModelFrame(trms::Terms, d::AbstractDataFrame) +is_categorical(::PooledDataArray) = true +is_categorical(::Any) = false + +## Check for non-redundancy of columns. For instance, if x is a factor with two +## levels, it should be expanded into two columns in y~0+x but only one column +## in y~1+x. The default is the rank-reduced form (contrasts for n levels only +## produce n-1 columns). In general, an evaluation term x within a term +## x&y&... needs to be "promoted" to full rank if y&... hasn't already been +## included (either explicitly in the Terms or implicitly by promoting another +## term like z in z&y&...). +## +## This modifies the Terms, setting `trms.is_non_redundant = true` for all non- +## redundant evaluation terms. 
+function check_non_redundancy!(trms::Terms, df::AbstractDataFrame) + + (n_eterms, n_terms) = size(trms.factors) + + encountered_columns = Vector{eltype(trms.factors)}[] + + if trms.intercept + push!(encountered_columns, zeros(eltype(trms.factors), n_eterms)) + end + + for i_term in 1:n_terms + for i_eterm in 1:n_eterms + ## only need to check eterms that are included and can be promoted + ## (e.g., categorical variables that expand to multiple mm columns) + if Bool(trms.factors[i_eterm, i_term]) && is_categorical(df[trms.eterms[i_eterm]]) + dropped = trms.factors[:,i_term] + dropped[i_eterm] = 0 + + if findfirst(encountered_columns, dropped) == 0 + trms.is_non_redundant[i_eterm, i_term] = true + push!(encountered_columns, dropped) + end + + end + end + ## once we've checked all the eterms in this term, add it to the list + ## of encountered terms/columns + push!(encountered_columns, Compat.view(trms.factors, :, i_term)) + end + + return trms.is_non_redundant +end + + +const DEFAULT_CONTRASTS = DummyCoding + +function ModelFrame(trms::Terms, d::AbstractDataFrame; + contrasts::Dict = Dict()) df, msng = na_omit(DataFrame(map(x -> d[x], trms.eterms))) names!(df, convert(Vector{Symbol}, map(string, trms.eterms))) for c in eachcol(df) dropunusedlevels!(c[2]) end - ModelFrame(df, trms, msng) + + ## Set up contrasts: + ## Combine actual DF columns and contrast types if necessary to compute the + ## actual contrasts matrices, levels, and term names (using DummyCoding + ## as the default) + evaledContrasts = Dict() + for (term, col) in eachcol(df) + is_categorical(col) || continue + evaledContrasts[term] = ContrastsMatrix(haskey(contrasts, term) ? + contrasts[term] : + DEFAULT_CONTRASTS(), + col) + end + + ## Check for non-redundant terms, modifying terms in place + check_non_redundancy!(trms, df) + + ModelFrame(df, trms, msng, evaledContrasts) end -ModelFrame(f::Formula, d::AbstractDataFrame) = ModelFrame(Terms(f), d) -ModelFrame(ex::Expr, d::AbstractDataFrame) = ModelFrame(Formula(ex), d) +ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...) +ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...) + +## modify contrasts in place +function setcontrasts!(mf::ModelFrame, new_contrasts::Dict) + new_contrasts = [ col => ContrastsMatrix(contr, mf.df[col]) + for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ] + + mf.contrasts = merge(mf.contrasts, new_contrasts) + return mf +end +setcontrasts!(mf::ModelFrame; kwargs...) = setcontrasts!(mf, Dict(kwargs)) asmatrix(a::AbstractMatrix) = a asmatrix(v::AbstractVector) = reshape(v, (length(v), 1)) @@ -251,30 +337,34 @@ function StatsBase.model_response(mf::ModelFrame) end end -""" - contr_treatment(n::Integer, contrasts::Bool, sparse::Bool, base::Integer) -Create a sparse or dense identity of size `n`. Return the identity if `contrasts` -is false. Otherwise drop the `base` column. -""" -function contr_treatment(n::Integer, contrasts::Bool, sparse::Bool, base::Integer) - if n < 2 - throw(ArgumentError("Contrasts require n > 1")) - end - contr = sparse ? speye(n) : eye(n) .== 1. 
- if !contrasts - contr - elseif !(1 <= base <= n) - throw(ArgumentError("base = $base is not allowed for n = $n")) +modelmat_cols(v::DataVector) = asmatrix(convert(Vector{Float64}, v.data)) +modelmat_cols(v::Vector) = asmatrix(convert(Vector{Float64}, v)) +## construct model matrix columns from model frame + name (checks for contrasts) +function modelmat_cols(name::Symbol, mf::ModelFrame; non_redundant::Bool = false) + if haskey(mf.contrasts, name) + modelmat_cols(mf.df[name], + non_redundant ? + ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) : + mf.contrasts[name]) else - contr[:, vcat(1 : (base-1), (base+1) : end)] + modelmat_cols(mf.df[name]) end end -contr_treatment(n::Integer,contrasts::Bool,sparse::Bool) = contr_treatment(n,contrasts,sparse,1) -contr_treatment(n::Integer,contrasts::Bool) = contr_treatment(n,contrasts,false,1) -contr_treatment(n::Integer) = contr_treatment(n,true,false,1) -cols(v::PooledDataVector) = contr_treatment(length(v.pool))[v.refs, :] -cols(v::DataVector) = asmatrix(convert(Vector{Float64}, v.data)) -cols(v::Vector) = asmatrix(convert(Vector{Float64}, v)) + +""" + modelmat_cols(v::PooledDataVector, contrast::ContrastsMatrix) + +Construct `ModelMatrix` columns based on specified contrasts, ensuring that +levels align properly. +""" +function modelmat_cols(v::PooledDataVector, contrast::ContrastsMatrix) + ## make sure the levels of the contrast matrix and the categorical data + ## are the same by constructing a re-indexing vector. Indexing into + ## reindex with v.refs will give the corresponding row number of the + ## contrast matrix + reindex = [findfirst(contrast.levels, l) for l in levels(v)] + return contrast.matrix[reindex[v.refs], :] +end """ expandcols(trm::Vector) @@ -301,7 +391,7 @@ function droprandomeffects(trms::Terms) if !any(retrms) # return trms unchanged trms elseif all(retrms) && !trms.response # return an empty Terms object - Terms(Any[],Any[],Array(Bool, (0,0)), Int[], false, trms.intercept) + Terms(Any[],Any[],Array(Bool, (0,0)),Array(Bool, (0,0)), Int[], false, trms.intercept) else # the rows of `trms.factors` correspond to `eterms`, the columns to `terms` # After dropping random-effects terms we drop any eterms whose rows are all false @@ -309,7 +399,8 @@ function droprandomeffects(trms::Terms) facs = trms.factors[:, ckeep] rkeep = vec(sum(facs, 2) .> 0) Terms(trms.terms[ckeep], trms.eterms[rkeep], facs[rkeep, :], - trms.order[ckeep], trms.response, trms.intercept) + trms.is_non_redundant[rkeep, ckeep], + trms.order[ckeep], trms.response, trms.intercept) end end @@ -320,7 +411,8 @@ of `trms.factors` if `trms.response` is true. """ dropresponse!(trms::Terms) = !trms.response ? trms : Terms(trms.terms, trms.eterms[2 : end], trms.factors[2 : end, 2 : end], - trms.order[2 : end], false, trms.intercept) + trms.is_non_redundant[2 : end, 2 : end], + trms.order[2 : end], false, trms.intercept) """ @@ -342,22 +434,49 @@ creating the model matrix. function ModelMatrix(mf::ModelFrame) dfrm = mf.df terms = droprandomeffects(dropresponse!(mf.terms)) - columns = [cols(dfrm[e]) for e in terms.eterms] + blocks = Matrix{Float64}[] assign = Int[] if terms.intercept push!(blocks, ones(size(dfrm, 1), 1)) # columns of 1's is first block push!(assign, 0) # this block corresponds to term zero end + factors = terms.factors - for j in 1 : size(factors, 2) - bb = expandcols(columns[Compat.view(factors, :, j)]) - push!(blocks, bb) - append!(assign, fill(j, size(bb, 2))) + + ## Map eval. 
term name + redundancy bool to cached model matrix columns + eterm_cols = @compat Dict{Tuple{Symbol,Bool}, Array{Float64}}() + ## Accumulator for each term's vector of eval. term columns. + + ## TODO: this method makes multiple copies of the data in the ModelFrame: + ## first in term_cols (1-2x per evaluation term, depending on redundancy), + ## second in constructing the matrix itself. + + ## turn each term into a vector of mm columns for its eval. terms, using + ## "promoted" full-rank versions of categorical columns for non-redundant + ## eval. terms: + for (i_term, term) in enumerate(terms.terms) + term_cols = Matrix{Float64}[] + ## Pull out the eval terms, and the non-redundancy flags for this term + ff = Compat.view(factors, :, i_term) + eterms = Compat.view(terms.eterms, ff) + non_redundants = Compat.view(terms.is_non_redundant, ff, i_term) + ## Get cols for each eval term (either previously generated, or generating + ## and storing as necessary) + for (et, nr) in zip(eterms, non_redundants) + if ! haskey(eterm_cols, (et, nr)) + eterm_cols[(et, nr)] = modelmat_cols(et, mf, non_redundant=nr) + end + push!(term_cols, eterm_cols[(et, nr)]) + end + push!(blocks, expandcols(term_cols)) + append!(assign, fill(i_term, size(blocks[end], 2))) end + ModelMatrix{Float64}(reduce(hcat, blocks), assign) end + """ termnames(term::Symbol, col) Returns a vector of strings with the names of the coefficients @@ -365,11 +484,32 @@ associated with a term. If the column corresponding to the term is not a `PooledDataArray` a one-element vector is returned. """ termnames(term::Symbol, col) = [string(term)] -function termnames(term::Symbol, col::PooledDataArray) - levs = levels(col) - [string(term, " - ", levs[i]) for i in 2:length(levs)] +function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false) + if haskey(mf.contrasts, term) + termnames(term, mf.df[term], + non_redundant ? + ContrastsMatrix{FullDummyCoding}(mf.contrasts[term]) : + mf.contrasts[term]) + else + termnames(term, mf.df[term]) + end +end + +termnames(term::Symbol, col::Any, contrast::ContrastsMatrix) = + ["$term: $name" for name in contrast.termnames] + + +function expandtermnames(term::Vector) + if length(term) == 1 + return term[1] + else + return foldr((a,b) -> vec([string(lev1, " & ", lev2) for + lev1 in a, + lev2 in b]), + term) + end end -# FIXME: Only handles treatment contrasts and PooledDataArray. + """ coefnames(mf::ModelFrame) @@ -377,33 +517,37 @@ Returns a vector of coefficient names constructed from the Terms member and the types of the evaluation columns. 
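+
+For example (an illustrative sketch; `mf` is assumed to hold a single
+three-level pooled column `x` under the default dummy coding, as in
+test/contrasts.jl):
+
+```julia
+coefnames(mf)  # ["(Intercept)", "x: b", "x: c"]
+```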
""" function coefnames(mf::ModelFrame) - if mf.terms.intercept - vnames = Compat.UTF8String["(Intercept)"] - else - vnames = Compat.UTF8String[] + terms = droprandomeffects(dropresponse!(mf.terms)) + + ## strategy mirrors ModelMatrx constructor: + eterm_names = @compat Dict{Tuple{Symbol,Bool}, Vector{Compat.UTF8String}}() + term_names = Vector{Compat.UTF8String}[] + + if terms.intercept + push!(term_names, Compat.UTF8String["(Intercept)"]) end - # Need to only include active levels - for term in mf.terms.terms - if isa(term, Expr) - if term.head == :call && term.args[1] == :| - continue # skip random-effects terms - elseif term.head == :call && term.args[1] == :& - ## for an interaction term, combine term names pairwise, - ## starting with rightmost terms - append!(vnames, - foldr((a,b) -> - vec([string(lev1, " & ", lev2) for - lev1 in a, - lev2 in b]), - map(x -> termnames(x, mf.df[x]), term.args[2:end]))) - else - error("unrecognized term $term") + + factors = terms.factors + + for (i_term, term) in enumerate(mf.terms.terms) + + ## names for columns for eval terms + names = Vector{Compat.UTF8String}[] + + ff = Compat.view(factors, :, i_term) + eterms = Compat.view(terms.eterms, ff) + non_redundants = Compat.view(terms.is_non_redundant, ff, i_term) + + for (et, nr) in zip(eterms, non_redundants) + if !haskey(eterm_names, (et, nr)) + eterm_names[(et, nr)] = termnames(et, mf, non_redundant=nr) end - else - append!(vnames, termnames(term, mf.df[term])) + push!(names, eterm_names[(et, nr)]) end + push!(term_names, expandtermnames(names)) end - return vnames + + reduce(vcat, term_names) end function Formula(t::Terms) diff --git a/src/statsmodels/statsmodel.jl b/src/statsmodels/statsmodel.jl index 1977f7bb20..3424dc2c97 100644 --- a/src/statsmodels/statsmodel.jl +++ b/src/statsmodels/statsmodel.jl @@ -48,8 +48,8 @@ for (modeltype, dfmodeltype) in ((:StatisticalModel, DataFrameStatisticalModel), (:RegressionModel, DataFrameRegressionModel)) @eval begin function StatsBase.fit{T<:$modeltype}(::Type{T}, f::Formula, df::AbstractDataFrame, - args...; kwargs...) - mf = ModelFrame(f, df) + args...; contrasts::Dict = Dict(), kwargs...) + mf = ModelFrame(f, df, contrasts=contrasts) mm = ModelMatrix(mf) y = model_response(mf) $dfmodeltype(fit(T, mm.m, y, args...; kwargs...), mf, mm) @@ -78,7 +78,7 @@ function StatsBase.predict(mm::DataFrameRegressionModel, df::AbstractDataFrame; # term is not found in the DataFrame and we don't want to remove elements with missing y) newTerms = dropresponse!(mm.mf.terms) # create new model frame/matrix - mf = ModelFrame(newTerms, df) + mf = ModelFrame(newTerms, df; contrasts = mm.mf.contrasts) newX = ModelMatrix(mf).m yp = predict(mm, newX; kwargs...) 
out = DataArray(eltype(yp), size(df, 1)) diff --git a/test/contrasts.jl b/test/contrasts.jl new file mode 100644 index 0000000000..1ff2fed934 --- /dev/null +++ b/test/contrasts.jl @@ -0,0 +1,114 @@ +module TestContrasts + +using Base.Test +using DataFrames + + +d = DataFrame(x = @pdata( [:a, :b, :c, :a, :a, :b] )) + +mf = ModelFrame(Formula(nothing, :x), d) + +# Dummy coded contrasts by default: +@test ModelMatrix(mf).m == [1 0 0 + 1 1 0 + 1 0 1 + 1 0 0 + 1 0 0 + 1 1 0] +@test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] + +mmm = ModelMatrix(mf).m +setcontrasts!(mf, x = DummyCoding()) +@test ModelMatrix(mf).m == mmm + +setcontrasts!(mf, x = EffectsCoding()) +@test ModelMatrix(mf).m == [1 -1 -1 + 1 1 0 + 1 0 1 + 1 -1 -1 + 1 -1 -1 + 1 1 0] +@test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] + +# change base level of contrast +setcontrasts!(mf, x = EffectsCoding(base = :b)) +@test ModelMatrix(mf).m == [1 1 0 + 1 -1 -1 + 1 0 1 + 1 1 0 + 1 1 0 + 1 -1 -1] +@test coefnames(mf) == ["(Intercept)"; "x: a"; "x: c"] + +# change levels of contrast +setcontrasts!(mf, x = EffectsCoding(levels = [:c, :b, :a])) +@test ModelMatrix(mf).m == [1 0 1 + 1 1 0 + 1 -1 -1 + 1 0 1 + 1 0 1 + 1 1 0] +@test coefnames(mf) == ["(Intercept)"; "x: b"; "x: a"] + + +# change levels and base level of contrast +setcontrasts!(mf, x = EffectsCoding(levels = [:c, :b, :a], base = :a)) +@test ModelMatrix(mf).m == [1 -1 -1 + 1 0 1 + 1 1 0 + 1 -1 -1 + 1 -1 -1 + 1 0 1] +@test coefnames(mf) == ["(Intercept)"; "x: c"; "x: b"] + +# Helmert coded contrasts +setcontrasts!(mf, x = HelmertCoding()) +@test ModelMatrix(mf).m == [1 -1 -1 + 1 1 -1 + 1 0 2 + 1 -1 -1 + 1 -1 -1 + 1 1 -1] +@test coefnames(mf) == ["(Intercept)"; "x: b"; "x: c"] + +# Mismatching types of data and contrasts levels throws an error: +@test_throws ArgumentError setcontrasts!(mf, x = EffectsCoding(levels = ["a", "b", "c"])) + +# Missing data is handled gracefully, dropping columns when a level is lost +d[3, :x] = NA +mf_missing = ModelFrame(Formula(nothing, :x), d, contrasts = Dict(:x => EffectsCoding())) +@test ModelMatrix(mf_missing).m == [1 -1 + 1 1 + 1 -1 + 1 -1 + 1 1] +@test coefnames(mf_missing) == ["(Intercept)"; "x: b"] + +# Things that are bad to do: +# Applying contrasts that only have a subset of data levels: +@test_throws ArgumentError setcontrasts!(mf, x = EffectsCoding(levels = [:a, :b])) +# Applying contrasts that expect levels not found in data: +@test_throws ArgumentError setcontrasts!(mf, x = EffectsCoding(levels = [:a, :b, :c, :d])) +# Asking for base level that's not found in data +@test_throws ArgumentError setcontrasts!(mf, x = EffectsCoding(base = :e)) + +# Manually specified contrasts +contrasts = [0 1 + -1 -.5 + 1 -.5] +setcontrasts!(mf, x = ContrastsCoding(contrasts)) +@test ModelMatrix(mf).m == [1 0 1 + 1 -1 -.5 + 1 1 -.5 + 1 0 1 + 1 0 1 + 1 -1 -.5] + +# throw argument error if number of levels mismatches +@test_throws ArgumentError setcontrasts!(mf, x = ContrastsCoding(contrasts[1:2, :])) +@test_throws ArgumentError setcontrasts!(mf, x = ContrastsCoding(hcat(contrasts, contrasts))) + +# contrasts types must be instaniated +@test_throws ArgumentError setcontrasts!(mf, x = DummyCoding) + +end diff --git a/test/formula.jl b/test/formula.jl index d3e343ac2e..a02be4a016 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -1,6 +1,7 @@ module TestFormula using Base.Test using DataFrames + using Compat # TODO: # - grouped variables in formulas with interactions @@ -135,7 +136,7 @@ module TestFormula @test mm.m[:,2] == [0, 1., 0, 0] @test 
mm.m[:,3] == [0, 0, 1., 0] @test mm.m[:,4] == [0, 0, 0, 1.] - @test coefnames(mf)[2:end] == ["x1p - 6", "x1p - 7", "x1p - 8"] + @test coefnames(mf)[2:end] == ["x1p: 6", "x1p: 7", "x1p: 8"] #test_group("create a design matrix from interactions from two DataFrames") ## this was removed in commit dead4562506badd7e84a2367086f5753fa49bb6a @@ -297,7 +298,7 @@ module TestFormula f = y ~ x1 & x2 & x3 mf = ModelFrame(f, df) mm = ModelMatrix(mf) - @test mm.m[:, 2:end] == vcat(zeros(1, 3), diagm(x2[2:end].*x3[2:end])) + @test mm.m[:, 2:end] == diagm(x2.*x3) #test_group("Column groups in formulas") ## set_group was removed in The Great Purge (55e47cd) @@ -314,27 +315,30 @@ module TestFormula ## @test mm.model == [ones(4) x1 x3 x2 x1.*x2 x3.*x2] ## Interactions between three PDA columns - df = DataFrame(y=1:27, - x1 = PooledDataArray(vec([x for x in 1:3, y in 4:6, z in 7:9])), - x2 = PooledDataArray(vec([y for x in 1:3, y in 4:6, z in 7:9])), - x3 = PooledDataArray(vec([z for x in 1:3, y in 4:6, z in 7:9]))) - f = y ~ x1 & x2 & x3 - mf = ModelFrame(f, df) - @test coefnames(mf)[2:end] == - vec([string("x1 - ", x, " & x2 - ", y, " & x3 - ", z) for - x in 2:3, - y in 5:6, - z in 8:9]) + ## + ## FAILS: behavior is wrong when no lower-order terms (1+x1+x2+x1&x2...) + ## + ## df = DataFrame(y=1:27, + ## x1 = PooledDataArray(vec([x for x in 1:3, y in 4:6, z in 7:9])), + ## x2 = PooledDataArray(vec([y for x in 1:3, y in 4:6, z in 7:9])), + ## x3 = PooledDataArray(vec([z for x in 1:3, y in 4:6, z in 7:9]))) + ## f = y ~ x1 & x2 & x3 + ## mf = ModelFrame(f, df) + ## @test coefnames(mf)[2:end] == + ## vec([string("x1: ", x, " & x2: ", y, " & x3: ", z) for + ## x in 2:3, + ## y in 5:6, + ## z in 8:9]) - mm = ModelMatrix(mf) - @test mm.m[:,2] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 5) .* (df[:x3].==8) - @test mm.m[:,3] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 5) .* (df[:x3].==8) - @test mm.m[:,4] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 6) .* (df[:x3].==8) - @test mm.m[:,5] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 6) .* (df[:x3].==8) - @test mm.m[:,6] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 5) .* (df[:x3].==9) - @test mm.m[:,7] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 5) .* (df[:x3].==9) - @test mm.m[:,8] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 6) .* (df[:x3].==9) - @test mm.m[:,9] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 6) .* (df[:x3].==9) + ## mm = ModelMatrix(mf) + ## @test mm.m[:,2] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 5) .* (df[:x3].==8) + ## @test mm.m[:,3] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 5) .* (df[:x3].==8) + ## @test mm.m[:,4] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 6) .* (df[:x3].==8) + ## @test mm.m[:,5] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 6) .* (df[:x3].==8) + ## @test mm.m[:,6] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 5) .* (df[:x3].==9) + ## @test mm.m[:,7] == 0. + (df[:x1] .== 3) .* (df[:x2] .== 5) .* (df[:x3].==9) + ## @test mm.m[:,8] == 0. + (df[:x1] .== 2) .* (df[:x2] .== 6) .* (df[:x3].==9) + ## @test mm.m[:,9] == 0. 
+ (df[:x1] .== 3) .* (df[:x2] .== 6) .* (df[:x3].==9) ## Distributive property of :& over :+ df = deepcopy(d) @@ -365,4 +369,132 @@ module TestFormula mm = ModelMatrix(mf) @test mm.m[:, 2] == d[complete_cases(d), :x1m] +## Promote non-redundant categorical terms to full rank + +d = DataFrame(x = Compat.repeat([:a, :b], outer = 4), + y = Compat.repeat([:c, :d], inner = 2, outer = 2), + z = Compat.repeat([:e, :f], inner = 4)) +[pool!(d, name) for name in names(d)] +cs = [name => EffectsCoding() for name in names(d)] +d[:n] = 1.:8 + + +## No intercept +mf = ModelFrame(n ~ 0 + x, d, contrasts=cs) +@test ModelMatrix(mf).m == [1 0 + 0 1 + 1 0 + 0 1 + 1 0 + 0 1 + 1 0 + 0 1] +@test coefnames(mf) == ["x: a", "x: b"] + +## No first-order term for interaction +mf = ModelFrame(n ~ 1 + x + x&y, d, contrasts=cs) +@test ModelMatrix(mf).m[:, 2:end] == [-1 -1 0 + 1 0 -1 + -1 1 0 + 1 0 1 + -1 -1 0 + 1 0 -1 + -1 1 0 + 1 0 1] +@test coefnames(mf) == ["(Intercept)", "x: b", "x: a & y: d", "x: b & y: d"] + +## When both terms of interaction are non-redundant: +mf = ModelFrame(n ~ 0 + x&y, d, contrasts=cs) +@test ModelMatrix(mf).m == [1 0 0 0 + 0 1 0 0 + 0 0 1 0 + 0 0 0 1 + 1 0 0 0 + 0 1 0 0 + 0 0 1 0 + 0 0 0 1] +@test coefnames(mf) == ["x: a & y: c", "x: b & y: c", + "x: a & y: d", "x: b & y: d"] + +# only a three-way interaction: every term is promoted. +mf = ModelFrame(n ~ 0 + x&y&z, d, contrasts=cs) +@test ModelMatrix(mf).m == eye(8) + +# two two-way interactions, with no lower-order term. both are promoted in +# first (both x and y), but only the old term (x) in the second (because +# dropping x gives z which isn't found elsewhere, but dropping z gives x +# which is found (implicitly) in the promoted interaction x&y). +mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs) +@test ModelMatrix(mf).m == [1 0 0 0 -1 0 + 0 1 0 0 0 -1 + 0 0 1 0 -1 0 + 0 0 0 1 0 -1 + 1 0 0 0 1 0 + 0 1 0 0 0 1 + 0 0 1 0 1 0 + 0 0 0 1 0 1] +@test coefnames(mf) == ["x: a & y: c", "x: b & y: c", + "x: a & y: d", "x: b & y: d", + "x: a & z: f", "x: b & z: f"] + +# ...and adding a three-way interaction, only the shared term (x) is promoted. +# this is because dropping x gives y&z which isn't present, but dropping y or z +# gives x&z or x&z respectively, which are both present. +mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs) +@test ModelMatrix(mf).m == [1 0 0 0 -1 0 1 0 + 0 1 0 0 0 -1 0 1 + 0 0 1 0 -1 0 -1 0 + 0 0 0 1 0 -1 0 -1 + 1 0 0 0 1 0 -1 0 + 0 1 0 0 0 1 0 -1 + 0 0 1 0 1 0 1 0 + 0 0 0 1 0 1 0 1] +@test coefnames(mf) == ["x: a & y: c", "x: b & y: c", + "x: a & y: d", "x: b & y: d", + "x: a & z: f", "x: b & z: f", + "x: a & y: d & z: f", "x: b & y: d & z: f"] + +# two two-way interactions, with common lower-order term. the common term x is +# promoted in both (along with lower-order term), because in every case, when +# x is dropped, the remaining terms (1, y, and z) aren't present elsewhere. +mf = ModelFrame(n ~ 0 + x + x&y + x&z, d, contrasts=cs) +@test ModelMatrix(mf).m == [1 0 -1 0 -1 0 + 0 1 0 -1 0 -1 + 1 0 1 0 -1 0 + 0 1 0 1 0 -1 + 1 0 -1 0 1 0 + 0 1 0 -1 0 1 + 1 0 1 0 1 0 + 0 1 0 1 0 1] +@test coefnames(mf) == ["x: a", "x: b", + "x: a & y: d", "x: b & y: d", + "x: a & z: f", "x: b & z: f"] + + + +## FAILS: When both terms are non-redundant and intercept is PRESENT +## (not fully redundant). Ideally, would drop last column. Might make sense +## to warn about this, and suggest recoding x and y into a single variable. 
+# mf = ModelFrame(n ~ 1 + x&y, d[1:4, :], contrasts=cs) +# @test ModelMatrix(mf).m == [1 1 0 0 +# 1 0 1 0 +# 1 0 0 1 +# 1 0 0 0] +# @test coefnames(mf) == ["x: a & y: c", "x: b & y: c", +# "x: a & y: d", "x: b & y: d"] + +## note that R also does not detect this automatically. it's left to glm et al. +## to detect numerically when the model matrix is rank deficient, which is hard +## to do correctly. +# > d = data.frame(x = factor(c(1, 2, 1, 2)), y = factor(c(3, 3, 4, 4))) +# > model.matrix(~ 1 + x:y, d) +# (Intercept) x1:y3 x2:y3 x1:y4 x2:y4 +# 1 1 1 0 0 0 +# 2 1 0 1 0 0 +# 3 1 0 0 1 0 +# 4 1 0 0 0 1 + + + + end diff --git a/test/runtests.jl b/test/runtests.jl index bfc53ca433..10dccb74e4 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -27,7 +27,8 @@ my_tests = ["utils.jl", "iteration.jl", "duplicates.jl", "show.jl", - "statsmodel.jl"] + "statsmodel.jl", + "contrasts.jl"] println("Running tests:") diff --git a/test/statsmodel.jl b/test/statsmodel.jl index 76b09cc495..ca9dcab6f7 100644 --- a/test/statsmodel.jl +++ b/test/statsmodel.jl @@ -65,11 +65,26 @@ d[:x1p] = PooledDataArray(d[:x1]) f2 = y ~ x1p m2 = fit(DummyMod, f2, d) -@test coeftable(m2).rownms == ["(Intercept)", "x1p - 6", "x1p - 7", "x1p - 8"] +@test coeftable(m2).rownms == ["(Intercept)", "x1p: 6", "x1p: 7", "x1p: 8"] ## predict w/ new data missing levels -## FAILS: mismatch between number of model matrix columns -## @test predict(m2, d[2:4, :]) == predict(m2)[2:4] +@test predict(m2, d[2:4, :]) == predict(m2)[2:4] + +## predict w/ new data with _extra_ levels (throws an error) +d3 = deepcopy(d) +d3[1, :x1] = 0 +d3[:x1p] = PooledDataArray(d3[:x1]) +@test_throws ArgumentError predict(m2, d3) + +## fit with contrasts specified +d[:x2p] = PooledDataArray(d[:x2]) +f3 = y ~ x1p + x2p +m3 = fit(DummyMod, f3, d) +fit(DummyMod, f3, d, contrasts = Dict(:x1p => EffectsCoding())) +fit(DummyMod, f3, d, contrasts = Dict(:x1p => EffectsCoding(), + :x2p => DummyCoding())) +@test_throws Exception fit(DummyMod, f3, d, contrasts = Dict(:x1p => EffectsCoding(), + :x2p => 1)) ## Another dummy model type to test fall-through show method