Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

RFC: Sparse ModelMatrix support #1040

Merged
merged 11 commits into from
Aug 26, 2016
57 changes: 30 additions & 27 deletions src/statsmodels/formula.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@ type ModelFrame
contrasts::Dict{Symbol, ContrastsMatrix}
end

type ModelMatrix{T <: @compat(Union{Float32, Float64})}
m::Matrix{T}
## Alias for any AbstractMatrix whose element type is an AbstractFloat.
## Used as the upper bound on the ModelMatrix storage parameter so that
## dense (Matrix{Float64}) and sparse (SparseMatrixCSC) backings both work.
typealias AbstractFloatMatrix{T<:AbstractFloat} AbstractMatrix{T}

## A model matrix parametrized on its storage type `T`.
type ModelMatrix{T <: AbstractFloatMatrix}
m::T  # the model matrix itself, stored as `T`
assign::Vector{Int}  # maps each column of `m` to the index of the term it came from
end

Expand Down Expand Up @@ -321,9 +323,6 @@ function setcontrasts!(mf::ModelFrame, new_contrasts::Dict)
end
setcontrasts!(mf::ModelFrame; kwargs...) = setcontrasts!(mf, Dict(kwargs))

asmatrix(a::AbstractMatrix) = a
asmatrix(v::AbstractVector) = reshape(v, (length(v), 1))

"""
StatsBase.model_response(mf::ModelFrame)
Extract the response column, if present. `DataVector` or
Expand All @@ -337,44 +336,46 @@ function StatsBase.model_response(mf::ModelFrame)
end
end

modelmat_cols(v::DataVector) = asmatrix(convert(Vector{Float64}, v.data))
modelmat_cols(v::Vector) = asmatrix(convert(Vector{Float64}, v))
## construct model matrix columns from model frame + name (checks for contrasts)
function modelmat_cols(name::Symbol, mf::ModelFrame; non_redundant::Bool = false)
function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::ModelFrame; non_redundant::Bool = false)
if haskey(mf.contrasts, name)
modelmat_cols(mf.df[name],
modelmat_cols(T, mf.df[name],
non_redundant ?
ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) :
mf.contrasts[name])
else
modelmat_cols(mf.df[name])
modelmat_cols(T, mf.df[name])
end
end

## Convert a numeric vector into a one-column model-matrix block of the
## requested storage type `T` (reshape to n-by-1, then convert).
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::DataVector) = convert(T, reshape(v.data, length(v), 1))
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Vector) = convert(T, reshape(v, length(v), 1))

"""
modelmat_cols(v::PooledDataVector, contrast::ContrastsMatrix)
modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)

Construct `ModelMatrix` columns based on specified contrasts, ensuring that
Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that
levels align properly.
"""
function modelmat_cols(v::PooledDataVector, contrast::ContrastsMatrix)
function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix)
## make sure the levels of the contrast matrix and the categorical data
## are the same by constructing a re-indexing vector. Indexing into
## reindex with v.refs will give the corresponding row number of the
## contrast matrix
reindex = [findfirst(contrast.levels, l) for l in levels(v)]
return contrast.matrix[reindex[v.refs], :]
contrastmatrix = convert(T, contrast.matrix)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In what cases can T be different from contrast.matrix's type?

Copy link
Author

@GordStephen GordStephen Aug 21, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

contrast.matrix is always a Matrix{Float64}, while T is whatever your desired output type is (e.g. any AbstractFloatMatrix subtype).

return contrastmatrix[reindex[v.refs], :]
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This array creation can be extremely slow for sparse T and large datasets... but I'm not sure if there's a faster way to do it without creating dense columns first?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No idea. Why is it slow? Indexing rows shouldn't be a problem for sparse matrices AFAIK.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not an expert on sparse matrix indexing, but it seems to spend a lot of time sorting... Truncated profile output from a million-row reference vector and 5-column contrast matrix:

 3594 ./event.jl:68; (::Base.REPL.##3#4{Base.REPL.REPLBackend})()
 3594 ./REPL.jl:95; macro expansion
  3594 ./REPL.jl:64; eval_user_input(::Any, ::Base.REPL.REPLBackend)
   3594 ./boot.jl:234; eval(::Module, ::Any)
    3594 ./<missing>:?; anonymous
     3594 ./profile.jl:16; macro expansion;
      3594 ./sparse/sparsematrix.jl:2099; getindex(::SparseMatrixCSC{Float64,Int64}, ::Array{Int64,1}, :...
       4    ./sparse/sparsematrix.jl:2437; getindex(::SparseMatrixCSC{Float64,Int64}, ::Array{Int64,1}, :...
        3 ./reduce.jl:371; extrema(::Array{Int64,1})
        1 ./reduce.jl:372; extrema(::Array{Int64,1})
       3590 ./sparse/sparsematrix.jl:2448; getindex(::SparseMatrixCSC{Float64,Int64}, ::Array{Int64,1}, :...
        3547 ./sparse/sparsematrix.jl:2422; getindex_general(::SparseMatrixCSC{Float64,Int64}, ::Array{In...
         1    ./sort.jl:451; #sortperm#11(::Base.Sort.QuickSortAlg, ::Function, ::Function...
         4    ./sort.jl:452; #sortperm#11(::Base.Sort.QuickSortAlg, ::Function, ::Function...
         3542 ./sort.jl:454; #sortperm#11(::Base.Sort.QuickSortAlg, ::Function, ::Function...
          3542 ./sort.jl:404; sort!(::Array{Int64,1}, ::Base.Sort.QuickSortAlg, ::Base.Ord...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm... You could ask on the mailing list for advice about the best algorithm to do this for sparse matrices. I guess working column by column (for SparseMatrixCSC) would make more sense.

end

"""
expandcols(trm::Vector)
expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
Create pairwise products of columns from a vector of matrices
"""
function expandcols(trm::Vector)
function expandcols{T<:AbstractFloatMatrix}(trm::Vector{T})
if length(trm) == 1
asmatrix(convert(Array{Float64}, trm[1]))
trm[1]
else
Copy link
Author

@GordStephen GordStephen Aug 21, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I can tell, the conversions here (and just above) are redundant since elements of trm are always created by modelmat_cols, which already handles this?

a = convert(Array{Float64}, trm[1])
a = trm[1]
b = expandcols(trm[2 : end])
reduce(hcat, [broadcast(*, a, Compat.view(b, :, j)) for j in 1 : size(b, 2)])
end
Expand Down Expand Up @@ -422,8 +423,9 @@ end


"""
ModelMatrix(mf::ModelFrame)
Create a `ModelMatrix` from the `terms` and `df` members of `mf`
ModelMatrix{T<:AbstractFloatMatrix}(mf::ModelFrame)
Create a `ModelMatrix` of type `T` (default `Matrix{Float64}`) from the
`terms` and `df` members of `mf`.

This is basically a map-reduce where terms are mapped to columns by `cols`
and reduced by `hcat`. During the collection of the columns the `assign`
Expand All @@ -437,21 +439,21 @@ If there is an intercept in the model, that column occurs first and its
Mixed-effects models include "random-effects" terms which are ignored when
creating the model matrix.
"""
function ModelMatrix(mf::ModelFrame)
@compat function (::Type{ModelMatrix{T}}){T<:AbstractFloatMatrix}(mf::ModelFrame)
dfrm = mf.df
terms = droprandomeffects(dropresponse!(mf.terms))

blocks = Matrix{Float64}[]
blocks = T[]
assign = Int[]
if terms.intercept
push!(blocks, ones(size(dfrm, 1), 1)) # columns of 1's is first block
push!(assign, 0) # this block corresponds to term zero
push!(assign, 0) # this block corresponds to term zero
end

factors = terms.factors

## Map eval. term name + redundancy bool to cached model matrix columns
eterm_cols = @compat Dict{Tuple{Symbol,Bool}, Array{Float64}}()
eterm_cols = @compat Dict{Tuple{Symbol,Bool}, T}()
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't find any reason not to restrict the Array dimension here. Did I miss something?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only issue I can think of is the case where a single-column term would give a column vector instead of a one-column matrix. But conversion will probably happen automatically, and tests should catch this. Have you run the tests of GLM.jl on the modified package?

Copy link
Author

@GordStephen GordStephen Aug 20, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I could tell modelmat_cols handles the vector->matrix conversion. The GLM.jl tests pass as well, so hopefully this is OK.

## Accumulator for each term's vector of eval. term columns.

## TODO: this method makes multiple copies of the data in the ModelFrame:
Expand All @@ -462,7 +464,7 @@ function ModelMatrix(mf::ModelFrame)
## "promoted" full-rank versions of categorical columns for non-redundant
## eval. terms:
for (i_term, term) in enumerate(terms.terms)
term_cols = Matrix{Float64}[]
term_cols = T[]
## Pull out the eval terms, and the non-redundancy flags for this term
ff = Compat.view(factors, :, i_term)
eterms = Compat.view(terms.eterms, ff)
Expand All @@ -471,16 +473,17 @@ function ModelMatrix(mf::ModelFrame)
## and storing as necessary)
for (et, nr) in zip(eterms, non_redundants)
if ! haskey(eterm_cols, (et, nr))
eterm_cols[(et, nr)] = modelmat_cols(et, mf, non_redundant=nr)
eterm_cols[(et, nr)] = modelmat_cols(T, et, mf, non_redundant=nr)
end
push!(term_cols, eterm_cols[(et, nr)])
end
push!(blocks, expandcols(term_cols))
append!(assign, fill(i_term, size(blocks[end], 2)))
end

ModelMatrix{Float64}(reduce(hcat, blocks), assign)
ModelMatrix{T}(reduce(hcat, blocks), assign)
end
ModelMatrix(mf::ModelFrame) = ModelMatrix{Matrix{Float64}}(mf)


"""
Expand Down
130 changes: 80 additions & 50 deletions test/formula.jl
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ module TestFormula

## Tests for constructing ModelFrame and ModelMatrix

sparsetype = SparseMatrixCSC{Float64,Int}

d = DataFrame()
d[:y] = [1:4;]
d[:x1] = [5:8;]
Expand All @@ -124,8 +126,14 @@ module TestFormula
@test coefnames(mf) == ["(Intercept)","x1","x2"]
## @test model_response(mf) == transpose([1. 2 3 4]) # fails: Int64 vs. Float64
mm = ModelMatrix(mf)
smm = ModelMatrix{sparsetype}(mf)
@test mm.m[:,1] == ones(4)
@test mm.m[:,2:3] == [x1 x2]
@test mm.m == smm.m

@test isa(mm.m, Matrix{Float64})
@test isa(smm.m, sparsetype)
@test isa(ModelMatrix{DataMatrix{Float64}}(mf).m, DataMatrix{Float64})

#test_group("expanding a PooledVec into a design matrix of indicators for each dummy variable")

Expand All @@ -137,6 +145,7 @@ module TestFormula
@test mm.m[:,3] == [0, 0, 1., 0]
@test mm.m[:,4] == [0, 0, 0, 1.]
@test coefnames(mf)[2:end] == ["x1p: 6", "x1p: 7", "x1p: 8"]
@test mm.m == ModelMatrix{sparsetype}(mf).m

#test_group("create a design matrix from interactions from two DataFrames")
## this was removed in commit dead4562506badd7e84a2367086f5753fa49bb6a
Expand Down Expand Up @@ -199,18 +208,21 @@ module TestFormula
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x1.*x2]
@test mm.m == ModelMatrix{sparsetype}(mf).m

f = y ~ x1 * x2
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x1 x2 x1.*x2]
@test mm.m == ModelMatrix{sparsetype}(mf).m

df[:x1] = PooledDataArray(x1)
x1e = [[0, 1, 0, 0] [0, 0, 1, 0] [0, 0, 0, 1]]
f = y ~ x1 * x2
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x1e x2 [0, 10, 0, 0] [0, 0, 11, 0] [0, 0, 0, 12]]
@test mm.m == ModelMatrix{sparsetype}(mf).m

#test_group("Basic transformations")

Expand Down Expand Up @@ -261,6 +273,7 @@ module TestFormula
mf = ModelFrame(y ~ x2, d)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x2]
@test mm.m == ModelMatrix{sparsetype}(mf).m
## @test model_response(mf) == y'' # fails: Int64 vs. Float64

df = deepcopy(d)
Expand Down Expand Up @@ -294,11 +307,13 @@ module TestFormula
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == [ones(4) x2.*x3.*x4]
@test mm.m == ModelMatrix{sparsetype}(mf).m

f = y ~ x1 & x2 & x3
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m[:, 2:end] == diagm(x2.*x3)
@test mm.m == ModelMatrix{sparsetype}(mf).m

#test_group("Column groups in formulas")
## set_group was removed in The Great Purge (55e47cd)
Expand Down Expand Up @@ -346,6 +361,7 @@ module TestFormula
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
@test mm.m == hcat(ones(4), x1.*x3, x1.*x4, x2.*x3, x2.*x4)
@test mm.m == ModelMatrix{sparsetype}(mf).m

## Condensing nested :+ calls
f = y ~ x1 + (x2 + (x3 + x4))
Expand All @@ -368,6 +384,7 @@ module TestFormula
mf = ModelFrame(y ~ x1m, d)
mm = ModelMatrix(mf)
@test mm.m[:, 2] == d[complete_cases(d), :x1m]
@test mm.m == ModelMatrix{sparsetype}(mf).m

## Same variable on left and right side
mf = ModelFrame(x1 ~ x1, df)
Expand All @@ -386,58 +403,68 @@ d[:n] = 1.:8

## No intercept
mf = ModelFrame(n ~ 0 + x, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0
0 1
1 0
0 1
1 0
0 1
1 0
0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0
0 1
1 0
0 1
1 0
0 1
1 0
0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a", "x: b"]

## No first-order term for interaction
mf = ModelFrame(n ~ 1 + x + x&y, d, contrasts=cs)
@test ModelMatrix(mf).m[:, 2:end] == [-1 -1 0
1 0 -1
-1 1 0
1 0 1
-1 -1 0
1 0 -1
-1 1 0
1 0 1]
mm = ModelMatrix(mf)
@test mm.m[:, 2:end] == [-1 -1 0
1 0 -1
-1 1 0
1 0 1
-1 -1 0
1 0 -1
-1 1 0
1 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["(Intercept)", "x: b", "x: a & y: d", "x: b & y: d"]

## When both terms of interaction are non-redundant:
mf = ModelFrame(n ~ 0 + x&y, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a & y: c", "x: b & y: c",
"x: a & y: d", "x: b & y: d"]

# only a three-way interaction: every term is promoted.
mf = ModelFrame(n ~ 0 + x&y&z, d, contrasts=cs)
@test ModelMatrix(mf).m == eye(8)
mm = ModelMatrix(mf)
@test mm.m == eye(8)
@test mm.m == ModelMatrix{sparsetype}(mf).m

# two two-way interactions, with no lower-order term. both are promoted in
# first (both x and y), but only the old term (x) in the second (because
# dropping x gives z which isn't found elsewhere, but dropping z gives x
# which is found (implicitly) in the promoted interaction x&y).
mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0 0 0 -1 0
0 1 0 0 0 -1
0 0 1 0 -1 0
0 0 0 1 0 -1
1 0 0 0 1 0
0 1 0 0 0 1
0 0 1 0 1 0
0 0 0 1 0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0 0 0 -1 0
0 1 0 0 0 -1
0 0 1 0 -1 0
0 0 0 1 0 -1
1 0 0 0 1 0
0 1 0 0 0 1
0 0 1 0 1 0
0 0 0 1 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a & y: c", "x: b & y: c",
"x: a & y: d", "x: b & y: d",
"x: a & z: f", "x: b & z: f"]
Expand All @@ -446,14 +473,16 @@ mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs)
# this is because dropping x gives y&z which isn't present, but dropping y or z
# gives x&z or x&z respectively, which are both present.
mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0 0 0 -1 0 1 0
0 1 0 0 0 -1 0 1
0 0 1 0 -1 0 -1 0
0 0 0 1 0 -1 0 -1
1 0 0 0 1 0 -1 0
0 1 0 0 0 1 0 -1
0 0 1 0 1 0 1 0
0 0 0 1 0 1 0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0 0 0 -1 0 1 0
0 1 0 0 0 -1 0 1
0 0 1 0 -1 0 -1 0
0 0 0 1 0 -1 0 -1
1 0 0 0 1 0 -1 0
0 1 0 0 0 1 0 -1
0 0 1 0 1 0 1 0
0 0 0 1 0 1 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a & y: c", "x: b & y: c",
"x: a & y: d", "x: b & y: d",
"x: a & z: f", "x: b & z: f",
Expand All @@ -463,20 +492,21 @@ mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs)
# promoted in both (along with lower-order term), because in every case, when
# x is dropped, the remaining terms (1, y, and z) aren't present elsewhere.
mf = ModelFrame(n ~ 0 + x + x&y + x&z, d, contrasts=cs)
@test ModelMatrix(mf).m == [1 0 -1 0 -1 0
0 1 0 -1 0 -1
1 0 1 0 -1 0
0 1 0 1 0 -1
1 0 -1 0 1 0
0 1 0 -1 0 1
1 0 1 0 1 0
0 1 0 1 0 1]
mm = ModelMatrix(mf)
@test mm.m == [1 0 -1 0 -1 0
0 1 0 -1 0 -1
1 0 1 0 -1 0
0 1 0 1 0 -1
1 0 -1 0 1 0
0 1 0 -1 0 1
1 0 1 0 1 0
0 1 0 1 0 1]
@test mm.m == ModelMatrix{sparsetype}(mf).m
@test coefnames(mf) == ["x: a", "x: b",
"x: a & y: d", "x: b & y: d",
"x: a & z: f", "x: b & z: f"]



## FAILS: When both terms are non-redundant and intercept is PRESENT
## (not fully redundant). Ideally, would drop last column. Might make sense
## to warn about this, and suggest recoding x and y into a single variable.
Expand Down