Sparse ModelMatrix support (JuliaData#1040)
Parametrize ModelMatrix container type.
Gord Stephen authored and maximerischard committed Sep 28, 2016
1 parent 5927f61 commit f0bbd59
Showing 2 changed files with 110 additions and 77 deletions.
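In short, ModelMatrix now carries its container type as a type parameter (any AbstractFloatMatrix, defaulting to Matrix{Float64}), so a sparse model matrix can be requested directly. A minimal usage sketch mirroring the new tests below, assuming the patched DataFrames is loaded; the toy DataFrame is illustrative only and not part of the commit:

using DataFrames

# toy data, same shape as in test/formula.jl
d = DataFrame()
d[:y] = [1:4;]
d[:x1] = [5:8;]
d[:x2] = [9:12;]

mf = ModelFrame(y ~ x1 + x2, d)

mm  = ModelMatrix(mf)                                # dense default, Matrix{Float64}
smm = ModelMatrix{SparseMatrixCSC{Float64,Int}}(mf)  # sparse container via the new type parameter

mm.m == smm.m                                        # same values, different storage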
57 changes: 30 additions & 27 deletions src/statsmodels/formula.jl
@@ -48,8 +48,10 @@ type ModelFrame
contrasts::Dict{Symbol, ContrastsMatrix}
end

type ModelMatrix{T <: @compat(Union{Float32, Float64})}
m::Matrix{T}
typealias AbstractFloatMatrix{T<:AbstractFloat} AbstractMatrix{T}

type ModelMatrix{T <: AbstractFloatMatrix}
m::T
assign::Vector{Int}
end

@@ -321,9 +323,6 @@ function setcontrasts!(mf::ModelFrame, new_contrasts::Dict)
end
setcontrasts!(mf::ModelFrame; kwargs...) = setcontrasts!(mf, Dict(kwargs))

asmatrix(a::AbstractMatrix) = a
asmatrix(v::AbstractVector) = reshape(v, (length(v), 1))

"""
StatsBase.model_response(mf::ModelFrame)
Extract the response column, if present. `DataVector` or
@@ -337,44 +336,46 @@ function StatsBase.model_response(mf::ModelFrame)
end
end

modelmat_cols(v::DataVector) = asmatrix(convert(Vector{Float64}, v.data))
modelmat_cols(v::Vector) = asmatrix(convert(Vector{Float64}, v))
## construct model matrix columns from model frame + name (checks for contrasts)
function modelmat_cols(name::Symbol, mf::ModelFrame; non_redundant::Bool = false)
function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::ModelFrame; non_redundant::Bool = false)
if haskey(mf.contrasts, name)
modelmat_cols(mf.df[name],
modelmat_cols(T, mf.df[name],
non_redundant ?
ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) :
mf.contrasts[name])
else
modelmat_cols(mf.df[name])
modelmat_cols(T, mf.df[name])
end
end

modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::DataVector) = convert(T, reshape(v.data, length(v), 1))
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Vector) = convert(T, reshape(v, length(v), 1))

"""
modelmat_cols(v::PooledDataVector, contrast::ContrastsMatrix)
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)
Construct `ModelMatrix` columns based on specified contrasts, ensuring that
Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that
levels align properly.
"""
function modelmat_cols(v::PooledDataVector, contrast::ContrastsMatrix)
function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)
## make sure the levels of the contrast matrix and the categorical data
## are the same by constructing a re-indexing vector. Indexing into
## reindex with v.refs will give the corresponding row number of the
## contrast matrix
reindex = [findfirst(contrast.levels, l) for l in levels(v)]
return contrast.matrix[reindex[v.refs], :]
contrastmatrix = convert(T, contrast.matrix)
return contrastmatrix[reindex[v.refs], :]
end

"""
expandcols(trm::Vector)
expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
Create pairwise products of columns from a vector of matrices
"""
function expandcols(trm::Vector)
function expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
if length(trm) == 1
asmatrix(convert(Array{Float64}, trm[1]))
trm[1]
else
a = convert(Array{Float64}, trm[1])
a = trm[1]
b = expandcols(trm[2 : end])
reduce(hcat, [broadcast(*, a, Compat.view(b, :, j)) for j in 1 : size(b, 2)])
end
@@ -422,8 +423,9 @@ end


"""
ModelMatrix(mf::ModelFrame)
Create a `ModelMatrix` from the `terms` and `df` members of `mf`
ModelMatrix{T<:AbstractFloatMatrix}(mf::ModelFrame)
Create a `ModelMatrix` of type `T` (default `Matrix{Float64}`) from the
`terms` and `df` members of `mf`.
This is basically a map-reduce where terms are mapped to columns by `cols`
and reduced by `hcat`. During the collection of the columns the `assign`
@@ -437,21 +439,21 @@ If there is an intercept in the model, that column occurs first and its
Mixed-effects models include "random-effects" terms which are ignored when
creating the model matrix.
"""
function ModelMatrix(mf::ModelFrame)
@compat function (::Type{ModelMatrix{T}}){T<:AbstractFloatMatrix}(mf::ModelFrame)
dfrm = mf.df
terms = droprandomeffects(dropresponse!(mf.terms))

blocks = Matrix{Float64}[]
blocks = T[]
assign = Int[]
if terms.intercept
push!(blocks, ones(size(dfrm, 1), 1)) # columns of 1's is first block
push!(assign, 0) # this block corresponds to term zero
push!(assign, 0) # this block corresponds to term zero
end

factors = terms.factors

## Map eval. term name + redundancy bool to cached model matrix columns
eterm_cols = @compat Dict{Tuple{Symbol,Bool}, Array{Float64}}()
eterm_cols = @compat Dict{Tuple{Symbol,Bool}, T}()
## Accumulator for each term's vector of eval. term columns.

## TODO: this method makes multiple copies of the data in the ModelFrame:
@@ -462,7 +464,7 @@ function ModelMatrix(mf::ModelFrame)
## "promoted" full-rank versions of categorical columns for non-redundant
## eval. terms:
for (i_term, term) in enumerate(terms.terms)
term_cols = Matrix{Float64}[]
term_cols = T[]
## Pull out the eval terms, and the non-redundancy flags for this term
ff = Compat.view(factors, :, i_term)
eterms = Compat.view(terms.eterms, ff)
@@ -471,16 +473,17 @@
## and storing as necessary)
for (et, nr) in zip(eterms, non_redundants)
if ! haskey(eterm_cols, (et, nr))
eterm_cols[(et, nr)] = modelmat_cols(et, mf, non_redundant=nr)
eterm_cols[(et, nr)] = modelmat_cols(T, et, mf, non_redundant=nr)
end
push!(term_cols, eterm_cols[(et, nr)])
end
push!(blocks, expandcols(term_cols))
append!(assign, fill(i_term, size(blocks[end], 2)))
end

ModelMatrix{Float64}(reduce(hcat, blocks), assign)
ModelMatrix{T}(reduce(hcat, blocks), assign)
end
ModelMatrix(mf::ModelFrame) = ModelMatrix{Matrix{Float64}}(mf)


"""
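Side note on the refactored expandcols above: it recursively hcats the elementwise products of each column pairing, and after this change it keeps whatever AbstractFloatMatrix type it is handed instead of converting to Array{Float64}. A standalone sketch of that column-product pattern, using plain view (the diff uses Compat.view for Julia 0.4 support); pairwise_cols is a hypothetical helper written here for illustration, not part of the patch:

# Pairwise column products, following the reduce(hcat, ...) pattern in expandcols.
function pairwise_cols(a::AbstractMatrix, b::AbstractMatrix)
    reduce(hcat, [broadcast(*, a, view(b, :, j)) for j in 1:size(b, 2)])
end

a = [1.0 2.0; 3.0 4.0]
b = [10.0 20.0; 30.0 40.0]
pairwise_cols(a, b)  # == [10.0 20.0 20.0 40.0; 90.0 120.0 120.0 160.0]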
130 changes: 80 additions & 50 deletions test/formula.jl
@@ -107,6 +107,8 @@ module TestFormula

## Tests for constructing ModelFrame and ModelMatrix

sparsetype = SparseMatrixCSC{Float64,Int}

d = DataFrame()
d[:y] = [1:4;]
d[:x1] = [5:8;]
@@ -124,8 +126,14 @@ module TestFormula
@test coefnames(mf) == ["(Intercept)","x1","x2"]
## @test model_response(mf) == transpose([1. 2 3 4]) # fails: Int64 vs. Float64
mm = ModelMatrix(mf)
smm = ModelMatrix{sparsetype}(mf)
@test mm.m[:,1] == ones(4)
@test mm.m[:,2:3] == [x1 x2]
@test mm.m == smm.m

@test isa(mm.m, Matrix{Float64})
@test isa(smm.m, sparsetype)
@test isa(ModelMatrix{DataMatrix{Float64}}(mf).m, DataMatrix{Float64})

#test_group("expanding a PooledVec into a design matrix of indicators for each dummy variable")

@@ -137,6 +145,7 @@ module TestFormula
@test mm.m[:,3] == [0, 0, 1., 0]
@test mm.m[:,4] == [0, 0, 0, 1.]
@test coefnames(mf)[2:end] == ["x1p: 6", "x1p: 7", "x1p: 8"]
@test mm.m == ModelMatrix{sparsetype}(mf).m

#test_group("create a design matrix from interactions from two DataFrames")
## this was removed in commit dead4562506badd7e84a2367086f5753fa49bb6a
@@ -199,18 +208,21 @@
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x1.*x2]
@test mm.m == ModelMatrix{sparsetype}(mf).m

f = y ~ x1 * x2
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x1 x2 x1.*x2]
@test mm.m == ModelMatrix{sparsetype}(mf).m

df[:x1] = PooledDataArray(x1)
x1e = [[0, 1, 0, 0] [0, 0, 1, 0] [0, 0, 0, 1]]
f = y ~ x1 * x2
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x1e x2 [0, 10, 0, 0] [0, 0, 11, 0] [0, 0, 0, 12]]
@test mm.m == ModelMatrix{sparsetype}(mf).m

#test_group("Basic transformations")

@@ -261,6 +273,7 @@
mf = ModelFrame(y ~ x2, d)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x2]
@test mm.m == ModelMatrix{sparsetype}(mf).m
## @test model_response(mf) == y'' # fails: Int64 vs. Float64

df = deepcopy(d)
@@ -294,11 +307,13 @@
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x2.*x3.*x4]
@test mm.m == ModelMatrix{sparsetype}(mf).m

f = y ~ x1 & x2 & x3
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m[:, 2:end] == diagm(x2.*x3)
@test mm.m == ModelMatrix{sparsetype}(mf).m

#test_group("Column groups in formulas")
## set_group was removed in The Great Purge (55e47cd)
@@ -346,6 +361,7 @@
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == hcat(ones(4), x1.*x3, x1.*x4, x2.*x3, x2.*x4)
@test mm.m == ModelMatrix{sparsetype}(mf).m

## Condensing nested :+ calls
f = y ~ x1 + (x2 + (x3 + x4))
@@ -368,6 +384,7 @@
mf = ModelFrame(y ~ x1m, d)
mm = ModelMatrix(mf)
@test mm.m[:, 2] == d[complete_cases(d), :x1m]
@test mm.m == ModelMatrix{sparsetype}(mf).m

## Same variable on left and right side
mf = ModelFrame(x1 ~ x1, df)
@@ -386,58 +403,68 @@ d[:n] = 1.:8

## No intercept
mf = ModelFrame(n ~ 0 + x, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0
0 1
1 0
0 1
1 0
0 1
1 0
0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0
0 1
1 0
0 1
1 0
0 1
1 0
0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a", "x: b"]

## No first-order term for interaction
mf = ModelFrame(n ~ 1 + x + x&y, d, contrasts=cs)
@test ModelMatrix(mf).m[:, 2:end] == [-1 -1 0
1 0 -1
-1 1 0
1 0 1
-1 -1 0
1 0 -1
-1 1 0
1 0 1]
mm = ModelMatrix(mf)
@test mm.m[:, 2:end] == [-1 -1 0
1 0 -1
-1 1 0
1 0 1
-1 -1 0
1 0 -1
-1 1 0
1 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["(Intercept)", "x: b", "x: a & y: d", "x: b & y: d"]

## When both terms of interaction are non-redundant:
mf = ModelFrame(n ~ 0 + x&y, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a & y: c", "x: b & y: c",
"x: a & y: d", "x: b & y: d"]

# only a three-way interaction: every term is promoted.
mf = ModelFrame(n ~ 0 + x&y&z, d, contrasts=cs)
@test ModelMatrix(mf).m == eye(8)
mm = ModelMatrix(mf)
@test mm.m == eye(8)
@test mm.m == ModelMatrix{sparsetype}(mf).m

# two two-way interactions, with no lower-order term. both are promoted in
# first (both x and y), but only the old term (x) in the second (because
# dropping x gives z which isn't found elsewhere, but dropping z gives x
# which is found (implicitly) in the promoted interaction x&y).
mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0 0 0 -1 0
0 1 0 0 0 -1
0 0 1 0 -1 0
0 0 0 1 0 -1
1 0 0 0 1 0
0 1 0 0 0 1
0 0 1 0 1 0
0 0 0 1 0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0 0 0 -1 0
0 1 0 0 0 -1
0 0 1 0 -1 0
0 0 0 1 0 -1
1 0 0 0 1 0
0 1 0 0 0 1
0 0 1 0 1 0
0 0 0 1 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a & y: c", "x: b & y: c",
"x: a & y: d", "x: b & y: d",
"x: a & z: f", "x: b & z: f"]
@@ -446,14 +473,16 @@ mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs)
# this is because dropping x gives y&z which isn't present, but dropping y or z
# gives x&z or x&z respectively, which are both present.
mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0 0 0 -1 0 1 0
0 1 0 0 0 -1 0 1
0 0 1 0 -1 0 -1 0
0 0 0 1 0 -1 0 -1
1 0 0 0 1 0 -1 0
0 1 0 0 0 1 0 -1
0 0 1 0 1 0 1 0
0 0 0 1 0 1 0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0 0 0 -1 0 1 0
0 1 0 0 0 -1 0 1
0 0 1 0 -1 0 -1 0
0 0 0 1 0 -1 0 -1
1 0 0 0 1 0 -1 0
0 1 0 0 0 1 0 -1
0 0 1 0 1 0 1 0
0 0 0 1 0 1 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a & y: c", "x: b & y: c",
"x: a & y: d", "x: b & y: d",
"x: a & z: f", "x: b & z: f",
@@ -463,20 +492,21 @@ mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs)
# promoted in both (along with lower-order term), because in every case, when
# x is dropped, the remaining terms (1, y, and z) aren't present elsewhere.
mf = ModelFrame(n ~ 0 + x + x&y + x&z, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0 -1 0 -1 0
0 1 0 -1 0 -1
1 0 1 0 -1 0
0 1 0 1 0 -1
1 0 -1 0 1 0
0 1 0 -1 0 1
1 0 1 0 1 0
0 1 0 1 0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0 -1 0 -1 0
0 1 0 -1 0 -1
1 0 1 0 -1 0
0 1 0 1 0 -1
1 0 -1 0 1 0
0 1 0 -1 0 1
1 0 1 0 1 0
0 1 0 1 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a", "x: b",
"x: a & y: d", "x: b & y: d",
"x: a & z: f", "x: b & z: f"]



## FAILS: When both terms are non-redundant and intercept is PRESENT
## (not fully redundant). Ideally, would drop last column. Might make sense
## to warn about this, and suggest recoding x and y into a single variable.
