Sparse ModelMatrix support (JuliaData#1040)
Parametrize ModelMatrix container type.
Gord Stephen authored and maximerischard committed Sep 28, 2016
1 parent 5927f61 commit f0bbd59
Showing 2 changed files with 110 additions and 77 deletions.
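In short, ModelMatrix now carries its container type as a type parameter (any AbstractFloatMatrix, defaulting to Matrix{Float64}), so a sparse model matrix can be requested directly. A minimal usage sketch mirroring the new tests below, assuming the patched DataFrames is loaded; the toy DataFrame is illustrative only and not part of the commit:

using DataFrames

# toy data, same shape as in test/formula.jl
d = DataFrame()
d[:y] = [1:4;]
d[:x1] = [5:8;]
d[:x2] = [9:12;]

mf = ModelFrame(y ~ x1 + x2, d)

mm  = ModelMatrix(mf)                                # dense default, Matrix{Float64}
smm = ModelMatrix{SparseMatrixCSC{Float64,Int}}(mf)  # sparse container via the new type parameter

mm.m == smm.m                                        # same values, different storage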
57 changes: 30 additions & 27 deletions src/statsmodels/formula.jl
@@ -48,8 +48,10 @@ type ModelFrame
contrasts::Dict{Symbol, ContrastsMatrix}
end

type ModelMatrix{T <: @compat(Union{Float32, Float64})}
m::Matrix{T}
typealias AbstractFloatMatrix{T<:AbstractFloat} AbstractMatrix{T}

type ModelMatrix{T <: AbstractFloatMatrix}
m::T
assign::Vector{Int}
end

@@ -321,9 +323,6 @@ function setcontrasts!(mf::ModelFrame, new_contrasts::Dict)
end
setcontrasts!(mf::ModelFrame; kwargs...) = setcontrasts!(mf, Dict(kwargs))

asmatrix(a::AbstractMatrix) = a
asmatrix(v::AbstractVector) = reshape(v, (length(v), 1))

"""
StatsBase.model_response(mf::ModelFrame)
Extract the response column, if present. `DataVector` or
@@ -337,44 +336,46 @@ function StatsBase.model_response(mf::ModelFrame)
end
end

modelmat_cols(v::DataVector) = asmatrix(convert(Vector{Float64}, v.data))
modelmat_cols(v::Vector) = asmatrix(convert(Vector{Float64}, v))
## construct model matrix columns from model frame + name (checks for contrasts)
function modelmat_cols(name::Symbol, mf::ModelFrame; non_redundant::Bool = false)
function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::ModelFrame; non_redundant::Bool = false)
if haskey(mf.contrasts, name)
modelmat_cols(mf.df[name],
modelmat_cols(T, mf.df[name],
non_redundant ?
ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) :
mf.contrasts[name])
else
modelmat_cols(mf.df[name])
modelmat_cols(T, mf.df[name])
end
end

modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::DataVector) = convert(T, reshape(v.data, length(v), 1))
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Vector) = convert(T, reshape(v, length(v), 1))

"""
modelmat_cols(v::PooledDataVector, contrast::ContrastsMatrix)
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)
Construct `ModelMatrix` columns based on specified contrasts, ensuring that
Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that
levels align properly.
"""
function modelmat_cols(v::PooledDataVector, contrast::ContrastsMatrix)
function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)
## make sure the levels of the contrast matrix and the categorical data
## are the same by constructing a re-indexing vector. Indexing into
## reindex with v.refs will give the corresponding row number of the
## contrast matrix
reindex = [findfirst(contrast.levels, l) for l in levels(v)]
return contrast.matrix[reindex[v.refs], :]
contrastmatrix = convert(T, contrast.matrix)
return contrastmatrix[reindex[v.refs], :]
end

"""
expandcols(trm::Vector)
expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
Create pairwise products of columns from a vector of matrices
"""
function expandcols(trm::Vector)
function expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
if length(trm) == 1
asmatrix(convert(Array{Float64}, trm[1]))
trm[1]
else
a = convert(Array{Float64}, trm[1])
a = trm[1]
b = expandcols(trm[2 : end])
reduce(hcat, [broadcast(*, a, Compat.view(b, :, j)) for j in 1 : size(b, 2)])
end
@@ -422,8 +423,9 @@ end


"""
ModelMatrix(mf::ModelFrame)
Create a `ModelMatrix` from the `terms` and `df` members of `mf`
ModelMatrix{T<:AbstractFloatMatrix}(mf::ModelFrame)
Create a `ModelMatrix` of type `T` (default `Matrix{Float64}`) from the
`terms` and `df` members of `mf`.
This is basically a map-reduce where terms are mapped to columns by `cols`
and reduced by `hcat`. During the collection of the columns the `assign`
@@ -437,21 +439,21 @@ If there is an intercept in the model, that column occurs first and its
Mixed-effects models include "random-effects" terms which are ignored when
creating the model matrix.
"""
function ModelMatrix(mf::ModelFrame)
@compat function (::Type{ModelMatrix{T}}){T<:AbstractFloatMatrix}(mf::ModelFrame)
dfrm = mf.df
terms = droprandomeffects(dropresponse!(mf.terms))

blocks = Matrix{Float64}[]
blocks = T[]
assign = Int[]
if terms.intercept
push!(blocks, ones(size(dfrm, 1), 1)) # columns of 1's is first block
push!(assign, 0) # this block corresponds to term zero
push!(assign, 0) # this block corresponds to term zero
end

factors = terms.factors

## Map eval. term name + redundancy bool to cached model matrix columns
eterm_cols = @compat Dict{Tuple{Symbol,Bool}, Array{Float64}}()
eterm_cols = @compat Dict{Tuple{Symbol,Bool}, T}()
## Accumulator for each term's vector of eval. term columns.

## TODO: this method makes multiple copies of the data in the ModelFrame:
@@ -462,7 +464,7 @@ function ModelMatrix(mf::ModelFrame)
## "promoted" full-rank versions of categorical columns for non-redundant
## eval. terms:
for (i_term, term) in enumerate(terms.terms)
term_cols = Matrix{Float64}[]
term_cols = T[]
## Pull out the eval terms, and the non-redundancy flags for this term
ff = Compat.view(factors, :, i_term)
eterms = Compat.view(terms.eterms, ff)
@@ -471,16 +473,17 @@
## and storing as necessary)
for (et, nr) in zip(eterms, non_redundants)
if ! haskey(eterm_cols, (et, nr))
eterm_cols[(et, nr)] = modelmat_cols(et, mf, non_redundant=nr)
eterm_cols[(et, nr)] = modelmat_cols(T, et, mf, non_redundant=nr)
end
push!(term_cols, eterm_cols[(et, nr)])
end
push!(blocks, expandcols(term_cols))
append!(assign, fill(i_term, size(blocks[end], 2)))
end

ModelMatrix{Float64}(reduce(hcat, blocks), assign)
ModelMatrix{T}(reduce(hcat, blocks), assign)
end
ModelMatrix(mf::ModelFrame) = ModelMatrix{Matrix{Float64}}(mf)


"""
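Side note on the refactored expandcols above: it recursively hcats the elementwise products of each column pairing, and after this change it keeps whatever AbstractFloatMatrix type it is handed instead of converting to Array{Float64}. A standalone sketch of that column-product pattern, using plain view (the diff uses Compat.view for Julia 0.4 support); pairwise_cols is a hypothetical helper written here for illustration, not part of the patch:

# Pairwise column products, following the reduce(hcat, ...) pattern in expandcols.
function pairwise_cols(a::AbstractMatrix, b::AbstractMatrix)
    reduce(hcat, [broadcast(*, a, view(b, :, j)) for j in 1:size(b, 2)])
end

a = [1.0 2.0; 3.0 4.0]
b = [10.0 20.0; 30.0 40.0]
pairwise_cols(a, b)  # == [10.0 20.0 20.0 40.0; 90.0 120.0 120.0 160.0]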
130 changes: 80 additions & 50 deletions test/formula.jl
@@ -107,6 +107,8 @@ module TestFormula

## Tests for constructing ModelFrame and ModelMatrix

sparsetype = SparseMatrixCSC{Float64,Int}

d = DataFrame()
d[:y] = [1:4;]
d[:x1] = [5:8;]
@@ -124,8 +126,14 @@ module TestFormula
@test coefnames(mf) == ["(Intercept)","x1","x2"]
## @test model_response(mf) == transpose([1. 2 3 4]) # fails: Int64 vs. Float64
mm = ModelMatrix(mf)
smm = ModelMatrix{sparsetype}(mf)
@test mm.m[:,1] == ones(4)
@test mm.m[:,2:3] == [x1 x2]
@test mm.m == smm.m

@test isa(mm.m, Matrix{Float64})
@test isa(smm.m, sparsetype)
@test isa(ModelMatrix{DataMatrix{Float64}}(mf).m, DataMatrix{Float64})

#test_group("expanding a PooledVec into a design matrix of indicators for each dummy variable")

@@ -137,6 +145,7 @@ module TestFormula
@test mm.m[:,3] == [0, 0, 1., 0]
@test mm.m[:,4] == [0, 0, 0, 1.]
@test coefnames(mf)[2:end] == ["x1p: 6", "x1p: 7", "x1p: 8"]
@test mm.m == ModelMatrix{sparsetype}(mf).m

#test_group("create a design matrix from interactions from two DataFrames")
## this was removed in commit dead4562506badd7e84a2367086f5753fa49bb6a
@@ -199,18 +208,21 @@
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x1.*x2]
@test mm.m == ModelMatrix{sparsetype}(mf).m

f = y ~ x1 * x2
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x1 x2 x1.*x2]
@test mm.m == ModelMatrix{sparsetype}(mf).m

df[:x1] = PooledDataArray(x1)
x1e = [[0, 1, 0, 0] [0, 0, 1, 0] [0, 0, 0, 1]]
f = y ~ x1 * x2
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x1e x2 [0, 10, 0, 0] [0, 0, 11, 0] [0, 0, 0, 12]]
@test mm.m == ModelMatrix{sparsetype}(mf).m

#test_group("Basic transformations")

@@ -261,6 +273,7 @@
mf = ModelFrame(y ~ x2, d)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x2]
@test mm.m == ModelMatrix{sparsetype}(mf).m
## @test model_response(mf) == y'' # fails: Int64 vs. Float64

df = deepcopy(d)
@@ -294,11 +307,13 @@
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x2.*x3.*x4]
@test mm.m == ModelMatrix{sparsetype}(mf).m

f = y ~ x1 & x2 & x3
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m[:, 2:end] == diagm(x2.*x3)
@test mm.m == ModelMatrix{sparsetype}(mf).m

#test_group("Column groups in formulas")
## set_group was removed in The Great Purge (55e47cd)
@@ -346,6 +361,7 @@
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == hcat(ones(4), x1.*x3, x1.*x4, x2.*x3, x2.*x4)
@test mm.m == ModelMatrix{sparsetype}(mf).m

## Condensing nested :+ calls
f = y ~ x1 + (x2 + (x3 + x4))
@@ -368,6 +384,7 @@
mf = ModelFrame(y ~ x1m, d)
mm = ModelMatrix(mf)
@test mm.m[:, 2] == d[complete_cases(d), :x1m]
@test mm.m == ModelMatrix{sparsetype}(mf).m

## Same variable on left and right side
mf = ModelFrame(x1 ~ x1, df)
@@ -386,58 +403,68 @@ d[:n] = 1.:8

## No intercept
mf = ModelFrame(n ~ 0 + x, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0
0 1
1 0
0 1
1 0
0 1
1 0
0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0
0 1
1 0
0 1
1 0
0 1
1 0
0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a", "x: b"]

## No first-order term for interaction
mf = ModelFrame(n ~ 1 + x + x&y, d, contrasts=cs)
@test ModelMatrix(mf).m[:, 2:end] == [-1 -1 0
1 0 -1
-1 1 0
1 0 1
-1 -1 0
1 0 -1
-1 1 0
1 0 1]
mm = ModelMatrix(mf)
@test mm.m[:, 2:end] == [-1 -1 0
1 0 -1
-1 1 0
1 0 1
-1 -1 0
1 0 -1
-1 1 0
1 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["(Intercept)", "x: b", "x: a & y: d", "x: b & y: d"]

## When both terms of interaction are non-redundant:
mf = ModelFrame(n ~ 0 + x&y, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a & y: c", "x: b & y: c",
"x: a & y: d", "x: b & y: d"]

# only a three-way interaction: every term is promoted.
mf = ModelFrame(n ~ 0 + x&y&z, d, contrasts=cs)
@test ModelMatrix(mf).m == eye(8)
mm = ModelMatrix(mf)
@test mm.m == eye(8)
@test mm.m == ModelMatrix{sparsetype}(mf).m

# two two-way interactions, with no lower-order term. both are promoted in
# first (both x and y), but only the old term (x) in the second (because
# dropping x gives z which isn't found elsewhere, but dropping z gives x
# which is found (implicitly) in the promoted interaction x&y).
mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0 0 0 -1 0
0 1 0 0 0 -1
0 0 1 0 -1 0
0 0 0 1 0 -1
1 0 0 0 1 0
0 1 0 0 0 1
0 0 1 0 1 0
0 0 0 1 0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0 0 0 -1 0
0 1 0 0 0 -1
0 0 1 0 -1 0
0 0 0 1 0 -1
1 0 0 0 1 0
0 1 0 0 0 1
0 0 1 0 1 0
0 0 0 1 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a & y: c", "x: b & y: c",
"x: a & y: d", "x: b & y: d",
"x: a & z: f", "x: b & z: f"]
@@ -446,14 +473,16 @@ mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs)
# this is because dropping x gives y&z which isn't present, but dropping y or z
# gives x&z or x&z respectively, which are both present.
mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0 0 0 -1 0 1 0
0 1 0 0 0 -1 0 1
0 0 1 0 -1 0 -1 0
0 0 0 1 0 -1 0 -1
1 0 0 0 1 0 -1 0
0 1 0 0 0 1 0 -1
0 0 1 0 1 0 1 0
0 0 0 1 0 1 0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0 0 0 -1 0 1 0
0 1 0 0 0 -1 0 1
0 0 1 0 -1 0 -1 0
0 0 0 1 0 -1 0 -1
1 0 0 0 1 0 -1 0
0 1 0 0 0 1 0 -1
0 0 1 0 1 0 1 0
0 0 0 1 0 1 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a & y: c", "x: b & y: c",
"x: a & y: d", "x: b & y: d",
"x: a & z: f", "x: b & z: f",
@@ -463,20 +492,21 @@ mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs)
# promoted in both (along with lower-order term), because in every case, when
# x is dropped, the remaining terms (1, y, and z) aren't present elsewhere.
mf = ModelFrame(n ~ 0 + x + x&y + x&z, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0 -1 0 -1 0
0 1 0 -1 0 -1
1 0 1 0 -1 0
0 1 0 1 0 -1
1 0 -1 0 1 0
0 1 0 -1 0 1
1 0 1 0 1 0
0 1 0 1 0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0 -1 0 -1 0
0 1 0 -1 0 -1
1 0 1 0 -1 0
0 1 0 1 0 -1
1 0 -1 0 1 0
0 1 0 -1 0 1
1 0 1 0 1 0
0 1 0 1 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a", "x: b",
"x: a & y: d", "x: b & y: d",
"x: a & z: f", "x: b & z: f"]



## FAILS: When both terms are non-redundant and intercept is PRESENT
## (not fully redundant). Ideally, would drop last column. Might make sense
## to warn about this, and suggest recoding x and y into a single variable.
