From c61a0ecb985948ba9a104fdf0085fae0f2e48362 Mon Sep 17 00:00:00 2001 From: Gord Stephen Date: Sat, 20 Aug 2016 10:16:52 -0400 Subject: [PATCH 01/11] Parametrize ModelMatrix container type --- src/statsmodels/formula.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 052fea86a4..17e94b40ae 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -48,8 +48,8 @@ type ModelFrame contrasts::Dict{Symbol, ContrastsMatrix} end -type ModelMatrix{T <: @compat(Union{Float32, Float64})} - m::Matrix{T} +type ModelMatrix{T <: @compat(Union{Matrix{Float32}, Matrix{Float64}, SparseMatrixCSC{Float32,Int}, SparseMatrixCSC{Float64,Int}})} + m::T assign::Vector{Int} end @@ -479,7 +479,7 @@ function ModelMatrix(mf::ModelFrame) append!(assign, fill(i_term, size(blocks[end], 2))) end - ModelMatrix{Float64}(reduce(hcat, blocks), assign) + ModelMatrix{Matrix{Float64}}(reduce(hcat, blocks), assign) end From d46fc37c775e4cf67f7ae80a5ffcf93157126653 Mon Sep 17 00:00:00 2001 From: Gord Stephen Date: Sat, 20 Aug 2016 16:53:25 -0400 Subject: [PATCH 02/11] Eliminate hardcoded model matrix container type when constructing from ModelFrame --- src/statsmodels/formula.jl | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 17e94b40ae..072809fd81 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -48,7 +48,11 @@ type ModelFrame contrasts::Dict{Symbol, ContrastsMatrix} end -type ModelMatrix{T <: @compat(Union{Matrix{Float32}, Matrix{Float64}, SparseMatrixCSC{Float32,Int}, SparseMatrixCSC{Float64,Int}})} +modelmatrixcontainertypes = [Matrix{Float32}, Matrix{Float64}, + SparseMatrixCSC{Float32,Int}, + SparseMatrixCSC{Float64,Int}] + +type ModelMatrix{T <: Union{modelmatrixcontainertypes...}} m::T assign::Vector{Int} end @@ -437,21 +441,21 @@ If there is an intercept in the model, that column occurs first and its Mixed-effects models include "random-effects" terms which are ignored when creating the model matrix. """ -function ModelMatrix(mf::ModelFrame) +function ModelMatrix(T::Union{map(t->Type{t}, modelmatrixcontainertypes)...}, mf::ModelFrame) dfrm = mf.df terms = droprandomeffects(dropresponse!(mf.terms)) - blocks = Matrix{Float64}[] + blocks = T[] assign = Int[] if terms.intercept - push!(blocks, ones(size(dfrm, 1), 1)) # columns of 1's is first block - push!(assign, 0) # this block corresponds to term zero + push!(blocks, convert(T, ones(size(dfrm, 1), 1))) # columns of 1's is first block + push!(assign, 0) # this block corresponds to term zero end factors = terms.factors ## Map eval. term name + redundancy bool to cached model matrix columns - eterm_cols = @compat Dict{Tuple{Symbol,Bool}, Array{Float64}}() + eterm_cols = @compat Dict{Tuple{Symbol,Bool}, T}() ## Accumulator for each term's vector of eval. term columns. ## TODO: this method makes multiple copies of the data in the ModelFrame: @@ -462,7 +466,7 @@ function ModelMatrix(mf::ModelFrame) ## "promoted" full-rank versions of categorical columns for non-redundant ## eval. terms: for (i_term, term) in enumerate(terms.terms) - term_cols = Matrix{Float64}[] + term_cols = T[] ## Pull out the eval terms, and the non-redundancy flags for this term ff = Compat.view(factors, :, i_term) eterms = Compat.view(terms.eterms, ff) @@ -479,8 +483,9 @@ function ModelMatrix(mf::ModelFrame) append!(assign, fill(i_term, size(blocks[end], 2))) end - ModelMatrix{Matrix{Float64}}(reduce(hcat, blocks), assign) + ModelMatrix{T}(reduce(hcat, blocks), assign) end +ModelMatrix(mf::ModelFrame) = ModelMatrix(Matrix{Float64}, mf) """ From 288552c65eb6b0f3f6e7dc4bd8fcf31b0b6c15f7 Mon Sep 17 00:00:00 2001 From: Gord Stephen Date: Sun, 21 Aug 2016 09:58:29 -0400 Subject: [PATCH 03/11] More idiomatic model matrix constructor and relaxed container type restrictions --- src/statsmodels/formula.jl | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 072809fd81..05775b4d22 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -48,11 +48,9 @@ type ModelFrame contrasts::Dict{Symbol, ContrastsMatrix} end -modelmatrixcontainertypes = [Matrix{Float32}, Matrix{Float64}, - SparseMatrixCSC{Float32,Int}, - SparseMatrixCSC{Float64,Int}] +typealias ModelMatrixContainer{T<:AbstractFloat} AbstractMatrix{T} -type ModelMatrix{T <: Union{modelmatrixcontainertypes...}} +type ModelMatrix{T <: ModelMatrixContainer} m::T assign::Vector{Int} end @@ -441,15 +439,15 @@ If there is an intercept in the model, that column occurs first and its Mixed-effects models include "random-effects" terms which are ignored when creating the model matrix. """ -function ModelMatrix(T::Union{map(t->Type{t}, modelmatrixcontainertypes)...}, mf::ModelFrame) +function (::Type{ModelMatrix{T}}){T<:ModelMatrixContainer}(mf::ModelFrame) dfrm = mf.df terms = droprandomeffects(dropresponse!(mf.terms)) blocks = T[] assign = Int[] if terms.intercept - push!(blocks, convert(T, ones(size(dfrm, 1), 1))) # columns of 1's is first block - push!(assign, 0) # this block corresponds to term zero + push!(blocks, ones(size(dfrm, 1), 1)) # columns of 1's is first block + push!(assign, 0) # this block corresponds to term zero end factors = terms.factors @@ -485,7 +483,7 @@ function ModelMatrix(T::Union{map(t->Type{t}, modelmatrixcontainertypes)...}, mf ModelMatrix{T}(reduce(hcat, blocks), assign) end -ModelMatrix(mf::ModelFrame) = ModelMatrix(Matrix{Float64}, mf) +ModelMatrix(mf::ModelFrame) = ModelMatrix{Matrix{Float64}}(mf) """ From e1b068ed3588db786360416ed16a00aeea4f49c4 Mon Sep 17 00:00:00 2001 From: Gord Stephen Date: Sun, 21 Aug 2016 11:03:45 -0400 Subject: [PATCH 04/11] Generalize modelmat_cols output typing --- src/statsmodels/formula.jl | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 05775b4d22..a6c0df2ca2 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -323,8 +323,8 @@ function setcontrasts!(mf::ModelFrame, new_contrasts::Dict) end setcontrasts!(mf::ModelFrame; kwargs...) = setcontrasts!(mf, Dict(kwargs)) -asmatrix(a::AbstractMatrix) = a -asmatrix(v::AbstractVector) = reshape(v, (length(v), 1)) +asmatrix(T::Type, a::AbstractMatrix) = convert(T, a) +asmatrix(T::Type, v::AbstractVector) = convert(T, reshape(v, (length(v), 1))) """ StatsBase.model_response(mf::ModelFrame) @@ -339,33 +339,35 @@ function StatsBase.model_response(mf::ModelFrame) end end -modelmat_cols(v::DataVector) = asmatrix(convert(Vector{Float64}, v.data)) -modelmat_cols(v::Vector) = asmatrix(convert(Vector{Float64}, v)) +modelmat_cols{T<:ModelMatrixContainer}(::Type{T}, v::DataVector) = asmatrix(T, convert(Vector{Float64}, v.data)) +modelmat_cols{T<:ModelMatrixContainer}(::Type{T}, v::Vector) = asmatrix(T, convert(Vector{Float64}, v)) + ## construct model matrix columns from model frame + name (checks for contrasts) -function modelmat_cols(name::Symbol, mf::ModelFrame; non_redundant::Bool = false) +function modelmat_cols{T<:ModelMatrixContainer}(::Type{T}, name::Symbol, mf::ModelFrame; non_redundant::Bool = false) if haskey(mf.contrasts, name) - modelmat_cols(mf.df[name], + modelmat_cols(T, mf.df[name], non_redundant ? ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) : mf.contrasts[name]) else - modelmat_cols(mf.df[name]) + modelmat_cols(T, mf.df[name]) end end """ - modelmat_cols(v::PooledDataVector, contrast::ContrastsMatrix) + modelmat_cols(T::Type{ModelMatrixContainer}, v::PooledDataVector, contrast::ContrastsMatrix) -Construct `ModelMatrix` columns based on specified contrasts, ensuring that +Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that levels align properly. """ -function modelmat_cols(v::PooledDataVector, contrast::ContrastsMatrix) +function modelmat_cols{T<:ModelMatrixContainer}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix) ## make sure the levels of the contrast matrix and the categorical data ## are the same by constructing a re-indexing vector. Indexing into ## reindex with v.refs will give the corresponding row number of the ## contrast matrix reindex = [findfirst(contrast.levels, l) for l in levels(v)] - return contrast.matrix[reindex[v.refs], :] + contrastmatrix = convert(T, contrast.matrix) + return contrastmatrix[reindex[v.refs], :] end """ @@ -374,7 +376,7 @@ Create pairwise products of columns from a vector of matrices """ function expandcols(trm::Vector) if length(trm) == 1 - asmatrix(convert(Array{Float64}, trm[1])) + asmatrix(Matrix{Float64}, convert(Array{Float64}, trm[1])) else a = convert(Array{Float64}, trm[1]) b = expandcols(trm[2 : end]) @@ -439,7 +441,8 @@ If there is an intercept in the model, that column occurs first and its Mixed-effects models include "random-effects" terms which are ignored when creating the model matrix. """ -function (::Type{ModelMatrix{T}}){T<:ModelMatrixContainer}(mf::ModelFrame) +@compat function (::Type{ModelMatrix{T}}){T<:ModelMatrixContainer}(mf::ModelFrame) + sparsemm = T <: AbstractSparseMatrix dfrm = mf.df terms = droprandomeffects(dropresponse!(mf.terms)) @@ -473,7 +476,7 @@ function (::Type{ModelMatrix{T}}){T<:ModelMatrixContainer}(mf::ModelFrame) ## and storing as necessary) for (et, nr) in zip(eterms, non_redundants) if ! haskey(eterm_cols, (et, nr)) - eterm_cols[(et, nr)] = modelmat_cols(et, mf, non_redundant=nr) + eterm_cols[(et, nr)] = modelmat_cols(T, et, mf, non_redundant=nr) end push!(term_cols, eterm_cols[(et, nr)]) end From 4a9a65f0c5bded221ae06dac88759fcc1ebb68a3 Mon Sep 17 00:00:00 2001 From: Gord Stephen Date: Sun, 21 Aug 2016 11:25:37 -0400 Subject: [PATCH 05/11] Generalize expandcols output types --- src/statsmodels/formula.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index a6c0df2ca2..628e15017e 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -374,12 +374,11 @@ end expandcols(trm::Vector) Create pairwise products of columns from a vector of matrices """ -function expandcols(trm::Vector) +function expandcols{T<:ModelMatrixContainer}(trm::Vector{T}) if length(trm) == 1 - asmatrix(Matrix{Float64}, convert(Array{Float64}, trm[1])) + trm[1] else - a = convert(Array{Float64}, trm[1]) - b = expandcols(trm[2 : end]) + a, b = trm[1], expandcols(trm[2 : end]) reduce(hcat, [broadcast(*, a, Compat.view(b, :, j)) for j in 1 : size(b, 2)]) end end From fd12a5d272cd8d4ff325592fb68b8bca1a00fde7 Mon Sep 17 00:00:00 2001 From: Gord Stephen Date: Sun, 21 Aug 2016 12:05:46 -0400 Subject: [PATCH 06/11] Added sparse ModelMatrix creation tests --- test/formula.jl | 47 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/test/formula.jl b/test/formula.jl index b09dd715d9..62864100d7 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -107,6 +107,8 @@ module TestFormula ## Tests for constructing ModelFrame and ModelMatrix + sparsetype = SparseMatrixCSC{Float64,Int} + d = DataFrame() d[:y] = [1:4;] d[:x1] = [5:8;] @@ -127,6 +129,10 @@ module TestFormula @test mm.m[:,1] == ones(4) @test mm.m[:,2:3] == [x1 x2] + smm = ModelMatrix{sparsetype}(mf) + @test issparse(smm.m) + @test mm.m == smm.m + #test_group("expanding a PooledVec into a design matrix of indicators for each dummy variable") d[:x1p] = PooledDataArray(d[:x1]) @@ -138,6 +144,10 @@ module TestFormula @test mm.m[:,4] == [0, 0, 0, 1.] @test coefnames(mf)[2:end] == ["x1p: 6", "x1p: 7", "x1p: 8"] + smm = ModelMatrix{sparsetype}(mf) + @test issparse(smm.m) + @test mm.m == smm.m + #test_group("create a design matrix from interactions from two DataFrames") ## this was removed in commit dead4562506badd7e84a2367086f5753fa49bb6a @@ -199,11 +209,13 @@ module TestFormula mf = ModelFrame(f, df) mm = ModelMatrix(mf) @test mm.m == [ones(4) x1.*x2] + @test mm.m == ModelMatrix{sparsetype}(mf).m f = y ~ x1 * x2 mf = ModelFrame(f, df) mm = ModelMatrix(mf) @test mm.m == [ones(4) x1 x2 x1.*x2] + @test mm.m == ModelMatrix{sparsetype}(mf).m df[:x1] = PooledDataArray(x1) x1e = [[0, 1, 0, 0] [0, 0, 1, 0] [0, 0, 0, 1]] @@ -211,6 +223,7 @@ module TestFormula mf = ModelFrame(f, df) mm = ModelMatrix(mf) @test mm.m == [ones(4) x1e x2 [0, 10, 0, 0] [0, 0, 11, 0] [0, 0, 0, 12]] + @test mm.m == ModelMatrix{sparsetype}(mf).m #test_group("Basic transformations") @@ -261,6 +274,7 @@ module TestFormula mf = ModelFrame(y ~ x2, d) mm = ModelMatrix(mf) @test mm.m == [ones(4) x2] + @test mm.m == ModelMatrix{sparsetype}(mf).m ## @test model_response(mf) == y'' # fails: Int64 vs. Float64 df = deepcopy(d) @@ -294,11 +308,13 @@ module TestFormula mf = ModelFrame(f, df) mm = ModelMatrix(mf) @test mm.m == [ones(4) x2.*x3.*x4] + @test mm.m == ModelMatrix{sparsetype}(mf).m f = y ~ x1 & x2 & x3 mf = ModelFrame(f, df) mm = ModelMatrix(mf) @test mm.m[:, 2:end] == diagm(x2.*x3) + @test mm.m == ModelMatrix{sparsetype}(mf).m #test_group("Column groups in formulas") ## set_group was removed in The Great Purge (55e47cd) @@ -346,6 +362,7 @@ module TestFormula mf = ModelFrame(f, df) mm = ModelMatrix(mf) @test mm.m == hcat(ones(4), x1.*x3, x1.*x4, x2.*x3, x2.*x4) + @test mm.m == ModelMatrix{sparsetype}(mf).m ## Condensing nested :+ calls f = y ~ x1 + (x2 + (x3 + x4)) @@ -368,6 +385,7 @@ module TestFormula mf = ModelFrame(y ~ x1m, d) mm = ModelMatrix(mf) @test mm.m[:, 2] == d[complete_cases(d), :x1m] + @test mm.m == ModelMatrix{sparsetype}(mf).m ## Same variable on left and right side mf = ModelFrame(x1 ~ x1, df) @@ -386,7 +404,8 @@ d[:n] = 1.:8 ## No intercept mf = ModelFrame(n ~ 0 + x, d, contrasts=cs) -@test ModelMatrix(mf).m == [1 0 +mm = ModelMatrix(mf) +@test mm.m == [1 0 0 1 1 0 0 1 @@ -394,11 +413,13 @@ mf = ModelFrame(n ~ 0 + x, d, contrasts=cs) 0 1 1 0 0 1] +@test mm.m == ModelMatrix{sparsetype}(mf).m @test coefnames(mf) == ["x: a", "x: b"] ## No first-order term for interaction mf = ModelFrame(n ~ 1 + x + x&y, d, contrasts=cs) -@test ModelMatrix(mf).m[:, 2:end] == [-1 -1 0 +mm = ModelMatrix(mf) +@test mm.m[:, 2:end] == [-1 -1 0 1 0 -1 -1 1 0 1 0 1 @@ -406,11 +427,13 @@ mf = ModelFrame(n ~ 1 + x + x&y, d, contrasts=cs) 1 0 -1 -1 1 0 1 0 1] +@test mm.m == ModelMatrix{sparsetype}(mf).m @test coefnames(mf) == ["(Intercept)", "x: b", "x: a & y: d", "x: b & y: d"] ## When both terms of interaction are non-redundant: mf = ModelFrame(n ~ 0 + x&y, d, contrasts=cs) -@test ModelMatrix(mf).m == [1 0 0 0 +mm = ModelMatrix(mf) +@test mm.m == [1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 @@ -418,19 +441,23 @@ mf = ModelFrame(n ~ 0 + x&y, d, contrasts=cs) 0 1 0 0 0 0 1 0 0 0 0 1] +@test mm.m == ModelMatrix{sparsetype}(mf).m @test coefnames(mf) == ["x: a & y: c", "x: b & y: c", "x: a & y: d", "x: b & y: d"] # only a three-way interaction: every term is promoted. mf = ModelFrame(n ~ 0 + x&y&z, d, contrasts=cs) -@test ModelMatrix(mf).m == eye(8) +mm = ModelMatrix(mf) +@test mm.m == eye(8) +@test mm.m == ModelMatrix{sparsetype}(mf).m # two two-way interactions, with no lower-order term. both are promoted in # first (both x and y), but only the old term (x) in the second (because # dropping x gives z which isn't found elsewhere, but dropping z gives x # which is found (implicitly) in the promoted interaction x&y). mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs) -@test ModelMatrix(mf).m == [1 0 0 0 -1 0 +mm = ModelMatrix(mf) +@test mm.m == [1 0 0 0 -1 0 0 1 0 0 0 -1 0 0 1 0 -1 0 0 0 0 1 0 -1 @@ -438,6 +465,7 @@ mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs) 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 1] +@test mm.m == ModelMatrix{sparsetype}(mf).m @test coefnames(mf) == ["x: a & y: c", "x: b & y: c", "x: a & y: d", "x: b & y: d", "x: a & z: f", "x: b & z: f"] @@ -446,7 +474,8 @@ mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs) # this is because dropping x gives y&z which isn't present, but dropping y or z # gives x&z or x&z respectively, which are both present. mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs) -@test ModelMatrix(mf).m == [1 0 0 0 -1 0 1 0 +mm = ModelMatrix(mf) +@test mm.m == [1 0 0 0 -1 0 1 0 0 1 0 0 0 -1 0 1 0 0 1 0 -1 0 -1 0 0 0 0 1 0 -1 0 -1 @@ -454,6 +483,7 @@ mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs) 0 1 0 0 0 1 0 -1 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 1] +@test mm.m == ModelMatrix{sparsetype}(mf).m @test coefnames(mf) == ["x: a & y: c", "x: b & y: c", "x: a & y: d", "x: b & y: d", "x: a & z: f", "x: b & z: f", @@ -463,7 +493,8 @@ mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs) # promoted in both (along with lower-order term), because in every case, when # x is dropped, the remaining terms (1, y, and z) aren't present elsewhere. mf = ModelFrame(n ~ 0 + x + x&y + x&z, d, contrasts=cs) -@test ModelMatrix(mf).m == [1 0 -1 0 -1 0 +mm = ModelMatrix(mf) +@test mm.m == [1 0 -1 0 -1 0 0 1 0 -1 0 -1 1 0 1 0 -1 0 0 1 0 1 0 -1 @@ -471,12 +502,12 @@ mf = ModelFrame(n ~ 0 + x + x&y + x&z, d, contrasts=cs) 0 1 0 -1 0 1 1 0 1 0 1 0 0 1 0 1 0 1] +@test mm.m == ModelMatrix{sparsetype}(mf).m @test coefnames(mf) == ["x: a", "x: b", "x: a & y: d", "x: b & y: d", "x: a & z: f", "x: b & z: f"] - ## FAILS: When both terms are non-redundant and intercept is PRESENT ## (not fully redundant). Ideally, would drop last column. Might make sense ## to warn about this, and suggest recoding x and y into a single variable. From ff6f7062fa4a3858662ab74566fa9cec3572b3de Mon Sep 17 00:00:00 2001 From: Gord Stephen Date: Sun, 21 Aug 2016 15:21:10 -0400 Subject: [PATCH 07/11] More explicit model matrix constructor type output testing --- test/formula.jl | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test/formula.jl b/test/formula.jl index 62864100d7..401d17b9ee 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -126,13 +126,15 @@ module TestFormula @test coefnames(mf) == ["(Intercept)","x1","x2"] ## @test model_response(mf) == transpose([1. 2 3 4]) # fails: Int64 vs. Float64 mm = ModelMatrix(mf) + smm = ModelMatrix{sparsetype}(mf) @test mm.m[:,1] == ones(4) @test mm.m[:,2:3] == [x1 x2] - - smm = ModelMatrix{sparsetype}(mf) - @test issparse(smm.m) @test mm.m == smm.m + @test isa(mm.m, Matrix{Float64}) + @test isa(smm.m, sparsetype) + @test isa(ModelMatrix{DataMatrix{Float64}}(mf).m, DataMatrix{Float64}) + #test_group("expanding a PooledVec into a design matrix of indicators for each dummy variable") d[:x1p] = PooledDataArray(d[:x1]) @@ -143,10 +145,7 @@ module TestFormula @test mm.m[:,3] == [0, 0, 1., 0] @test mm.m[:,4] == [0, 0, 0, 1.] @test coefnames(mf)[2:end] == ["x1p: 6", "x1p: 7", "x1p: 8"] - - smm = ModelMatrix{sparsetype}(mf) - @test issparse(smm.m) - @test mm.m == smm.m + @test mm.m == ModelMatrix{sparsetype}(mf).m #test_group("create a design matrix from interactions from two DataFrames") ## this was removed in commit dead4562506badd7e84a2367086f5753fa49bb6a From f94dd836d1aa3c18f9b70e486f027cd92ac70033 Mon Sep 17 00:00:00 2001 From: Gord Stephen Date: Sun, 21 Aug 2016 15:48:25 -0400 Subject: [PATCH 08/11] Rename ModelMatrixContainer and remove unneeded variables/methods --- src/statsmodels/formula.jl | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 628e15017e..b6de297079 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -48,9 +48,9 @@ type ModelFrame contrasts::Dict{Symbol, ContrastsMatrix} end -typealias ModelMatrixContainer{T<:AbstractFloat} AbstractMatrix{T} +typealias AbstractFloatMatrix{T<:AbstractFloat} AbstractMatrix{T} -type ModelMatrix{T <: ModelMatrixContainer} +type ModelMatrix{T <: AbstractFloatMatrix} m::T assign::Vector{Int} end @@ -323,9 +323,6 @@ function setcontrasts!(mf::ModelFrame, new_contrasts::Dict) end setcontrasts!(mf::ModelFrame; kwargs...) = setcontrasts!(mf, Dict(kwargs)) -asmatrix(T::Type, a::AbstractMatrix) = convert(T, a) -asmatrix(T::Type, v::AbstractVector) = convert(T, reshape(v, (length(v), 1))) - """ StatsBase.model_response(mf::ModelFrame) Extract the response column, if present. `DataVector` or @@ -339,11 +336,8 @@ function StatsBase.model_response(mf::ModelFrame) end end -modelmat_cols{T<:ModelMatrixContainer}(::Type{T}, v::DataVector) = asmatrix(T, convert(Vector{Float64}, v.data)) -modelmat_cols{T<:ModelMatrixContainer}(::Type{T}, v::Vector) = asmatrix(T, convert(Vector{Float64}, v)) - ## construct model matrix columns from model frame + name (checks for contrasts) -function modelmat_cols{T<:ModelMatrixContainer}(::Type{T}, name::Symbol, mf::ModelFrame; non_redundant::Bool = false) +function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::ModelFrame; non_redundant::Bool = false) if haskey(mf.contrasts, name) modelmat_cols(T, mf.df[name], non_redundant ? @@ -354,13 +348,16 @@ function modelmat_cols{T<:ModelMatrixContainer}(::Type{T}, name::Symbol, mf::Mod end end +modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::DataVector) = convert(T, reshape(v.data, length(v), 1)) +modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Vector) = convert(T, reshape(v, length(v), 1)) + """ - modelmat_cols(T::Type{ModelMatrixContainer}, v::PooledDataVector, contrast::ContrastsMatrix) + modelmat_cols(T::Type{AbstractFloatMatrix}, v::PooledDataVector, contrast::ContrastsMatrix) Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that levels align properly. """ -function modelmat_cols{T<:ModelMatrixContainer}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix) +function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix) ## make sure the levels of the contrast matrix and the categorical data ## are the same by constructing a re-indexing vector. Indexing into ## reindex with v.refs will give the corresponding row number of the @@ -374,7 +371,7 @@ end expandcols(trm::Vector) Create pairwise products of columns from a vector of matrices """ -function expandcols{T<:ModelMatrixContainer}(trm::Vector{T}) +function expandcols{T<:AbstractFloatMatrix}(trm::Vector{T}) if length(trm) == 1 trm[1] else @@ -440,8 +437,7 @@ If there is an intercept in the model, that column occurs first and its Mixed-effects models include "random-effects" terms which are ignored when creating the model matrix. """ -@compat function (::Type{ModelMatrix{T}}){T<:ModelMatrixContainer}(mf::ModelFrame) - sparsemm = T <: AbstractSparseMatrix +@compat function (::Type{ModelMatrix{T}}){T<:AbstractFloatMatrix}(mf::ModelFrame) dfrm = mf.df terms = droprandomeffects(dropresponse!(mf.terms)) From dd0ae91a69dd3e012b8e932811e3be7810849788 Mon Sep 17 00:00:00 2001 From: Gord Stephen Date: Sun, 21 Aug 2016 20:40:31 -0400 Subject: [PATCH 09/11] Split value assignment onto two lines --- src/statsmodels/formula.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index b6de297079..413faaf4c4 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -375,7 +375,8 @@ function expandcols{T<:AbstractFloatMatrix}(trm::Vector{T}) if length(trm) == 1 trm[1] else - a, b = trm[1], expandcols(trm[2 : end]) + a = trm[1] + b = expandcols(trm[2 : end]) reduce(hcat, [broadcast(*, a, Compat.view(b, :, j)) for j in 1 : size(b, 2)]) end end From db5831862e86fc0cf445b3684b669dd276b9b4d0 Mon Sep 17 00:00:00 2001 From: Gord Stephen Date: Mon, 22 Aug 2016 19:45:48 -0400 Subject: [PATCH 10/11] Fix test result spacing and incorrect method signature documentation --- src/statsmodels/formula.jl | 2 +- test/formula.jl | 84 +++++++++++++++++++------------------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 413faaf4c4..2f13d7019c 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -352,7 +352,7 @@ modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::DataVector) = convert(T, res modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Vector) = convert(T, reshape(v, length(v), 1)) """ - modelmat_cols(T::Type{AbstractFloatMatrix}, v::PooledDataVector, contrast::ContrastsMatrix) + modelmat_cols(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix) Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that levels align properly. diff --git a/test/formula.jl b/test/formula.jl index 401d17b9ee..5ae777e632 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -405,13 +405,13 @@ d[:n] = 1.:8 mf = ModelFrame(n ~ 0 + x, d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m == [1 0 - 0 1 - 1 0 - 0 1 - 1 0 - 0 1 - 1 0 - 0 1] + 0 1 + 1 0 + 0 1 + 1 0 + 0 1 + 1 0 + 0 1] @test mm.m == ModelMatrix{sparsetype}(mf).m @test coefnames(mf) == ["x: a", "x: b"] @@ -419,13 +419,13 @@ mm = ModelMatrix(mf) mf = ModelFrame(n ~ 1 + x + x&y, d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m[:, 2:end] == [-1 -1 0 - 1 0 -1 - -1 1 0 - 1 0 1 - -1 -1 0 - 1 0 -1 - -1 1 0 - 1 0 1] + 1 0 -1 + -1 1 0 + 1 0 1 + -1 -1 0 + 1 0 -1 + -1 1 0 + 1 0 1] @test mm.m == ModelMatrix{sparsetype}(mf).m @test coefnames(mf) == ["(Intercept)", "x: b", "x: a & y: d", "x: b & y: d"] @@ -433,13 +433,13 @@ mm = ModelMatrix(mf) mf = ModelFrame(n ~ 0 + x&y, d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m == [1 0 0 0 - 0 1 0 0 - 0 0 1 0 - 0 0 0 1 - 1 0 0 0 - 0 1 0 0 - 0 0 1 0 - 0 0 0 1] + 0 1 0 0 + 0 0 1 0 + 0 0 0 1 + 1 0 0 0 + 0 1 0 0 + 0 0 1 0 + 0 0 0 1] @test mm.m == ModelMatrix{sparsetype}(mf).m @test coefnames(mf) == ["x: a & y: c", "x: b & y: c", "x: a & y: d", "x: b & y: d"] @@ -457,13 +457,13 @@ mm = ModelMatrix(mf) mf = ModelFrame(n ~ 0 + x&y + x&z, d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m == [1 0 0 0 -1 0 - 0 1 0 0 0 -1 - 0 0 1 0 -1 0 - 0 0 0 1 0 -1 - 1 0 0 0 1 0 - 0 1 0 0 0 1 - 0 0 1 0 1 0 - 0 0 0 1 0 1] + 0 1 0 0 0 -1 + 0 0 1 0 -1 0 + 0 0 0 1 0 -1 + 1 0 0 0 1 0 + 0 1 0 0 0 1 + 0 0 1 0 1 0 + 0 0 0 1 0 1] @test mm.m == ModelMatrix{sparsetype}(mf).m @test coefnames(mf) == ["x: a & y: c", "x: b & y: c", "x: a & y: d", "x: b & y: d", @@ -475,13 +475,13 @@ mm = ModelMatrix(mf) mf = ModelFrame(n ~ 0 + x&y + x&z + x&y&z, d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m == [1 0 0 0 -1 0 1 0 - 0 1 0 0 0 -1 0 1 - 0 0 1 0 -1 0 -1 0 - 0 0 0 1 0 -1 0 -1 - 1 0 0 0 1 0 -1 0 - 0 1 0 0 0 1 0 -1 - 0 0 1 0 1 0 1 0 - 0 0 0 1 0 1 0 1] + 0 1 0 0 0 -1 0 1 + 0 0 1 0 -1 0 -1 0 + 0 0 0 1 0 -1 0 -1 + 1 0 0 0 1 0 -1 0 + 0 1 0 0 0 1 0 -1 + 0 0 1 0 1 0 1 0 + 0 0 0 1 0 1 0 1] @test mm.m == ModelMatrix{sparsetype}(mf).m @test coefnames(mf) == ["x: a & y: c", "x: b & y: c", "x: a & y: d", "x: b & y: d", @@ -494,13 +494,13 @@ mm = ModelMatrix(mf) mf = ModelFrame(n ~ 0 + x + x&y + x&z, d, contrasts=cs) mm = ModelMatrix(mf) @test mm.m == [1 0 -1 0 -1 0 - 0 1 0 -1 0 -1 - 1 0 1 0 -1 0 - 0 1 0 1 0 -1 - 1 0 -1 0 1 0 - 0 1 0 -1 0 1 - 1 0 1 0 1 0 - 0 1 0 1 0 1] + 0 1 0 -1 0 -1 + 1 0 1 0 -1 0 + 0 1 0 1 0 -1 + 1 0 -1 0 1 0 + 0 1 0 -1 0 1 + 1 0 1 0 1 0 + 0 1 0 1 0 1] @test mm.m == ModelMatrix{sparsetype}(mf).m @test coefnames(mf) == ["x: a", "x: b", "x: a & y: d", "x: b & y: d", From 25935c05358cfd9f50fdc302712c73b86dd35eec Mon Sep 17 00:00:00 2001 From: Gord Stephen Date: Fri, 26 Aug 2016 17:17:35 -0400 Subject: [PATCH 11/11] Docstring updates --- src/statsmodels/formula.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/statsmodels/formula.jl b/src/statsmodels/formula.jl index 2f13d7019c..165fce802d 100644 --- a/src/statsmodels/formula.jl +++ b/src/statsmodels/formula.jl @@ -352,7 +352,7 @@ modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::DataVector) = convert(T, res modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::Vector) = convert(T, reshape(v, length(v), 1)) """ - modelmat_cols(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix) + modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix) Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that levels align properly. @@ -368,7 +368,7 @@ function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, c end """ - expandcols(trm::Vector) + expandcols{T<:AbstractFloatMatrix}(trm::Vector{T}) Create pairwise products of columns from a vector of matrices """ function expandcols{T<:AbstractFloatMatrix}(trm::Vector{T}) @@ -423,8 +423,9 @@ end """ - ModelMatrix(mf::ModelFrame) -Create a `ModelMatrix` from the `terms` and `df` members of `mf` + ModelMatrix{T<:AbstractFloatMatrix}(mf::ModelFrame) +Create a `ModelMatrix` of type `T` (default `Matrix{Float64}`) from the +`terms` and `df` members of `mf`. This is basically a map-reduce where terms are mapped to columns by `cols` and reduced by `hcat`. During the collection of the columns the `assign`