diff --git a/DESCRIPTION b/DESCRIPTION index 2479f362..6155fe16 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,7 @@ Authors@R: family = "Lenth", role = "ctb", email = "russell-lenth@uiowa.edu")) -Imports: stats, graphics, grDevices, tools, utils, methods, numDeriv, nlme, sandwich, Rcpp(>= 1.0.5), dreamerr(>= 1.4.0), stringmagic(>= 1.1.2) +Imports: stats, graphics, grDevices, tools, utils, methods, numDeriv, nlme, sandwich, Rcpp(>= 1.0.5), dreamerr(>= 1.4.0), stringmagic(>= 1.1.2), Matrix Suggests: knitr, rmarkdown, data.table, plm, MASS, pander, ggplot2, lfe, tinytex, pdftools, emmeans, estimability, AER LinkingTo: Rcpp Depends: R(>= 3.5.0) diff --git a/NAMESPACE b/NAMESPACE index 610057f2..baf1ff04 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -39,6 +39,9 @@ export(as.dict) # setters & getters exportPattern("^(s|g)etFixest") +# sparse_model_matrix +export(sparse_model_matrix) + # coeftable and co export(coeftable, se, pvalue, tstat) S3method(coeftable, default) diff --git a/NEWS.md b/NEWS.md index 95003bcc..ba804165 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,38 @@ # fixest 0.12.1 +## New Features + +- Add the ability to create sparse model matrices from regression objects and formula in a memory-efficient way. + +```R +est = feols(mpg ~ drat | cyl, mtcars) + +sparse_model_matrix(est, type = "lhs") +#> 32 x 1 sparse Matrix of class "dgCMatrix" +#> mpg +#> [1,] 21.0 +#> [2,] 21.0 +#> [3,] 22.8 + +sparse_model_matrix(est, type = c("rhs", "fixef")) +#> 32 x 4 sparse Matrix of class "dgCMatrix" +#> drat cyl::4 cyl::6 cyl::8 +#> [1,] 3.90 . 1 . +#> [2,] 3.90 . 1 . +#> [3,] 3.85 1 . . + +# Or you can pass a (linear) formula +sparse_model_matrix(mpg ~ i(vs) | gear^cyl, data = mtcars, type = c("rhs", "fixef")) +#> 32 x 10 sparse Matrix of class "dgCMatrix" +#> [[ suppressing 10 column names ‘vs::0’, ‘vs::1’, ‘gear^cyl::4_6’ ... ]] +#> +#> [1,] 1 . 1 . . . . . . . +#> [2,] 1 . 1 . . . . . . . +#> [3,] . 1 . 1 . . . . . . +``` + + ## Major bugs affecting R versions <= 4.1.2 - require `stringmagic` version >= 1.1.2 to fix several major bugs affecting R versions <= 4.1.2 diff --git a/R/miscfuns.R b/R/miscfuns.R index 70604c0e..0b92cefe 100644 --- a/R/miscfuns.R +++ b/R/miscfuns.R @@ -1450,7 +1450,7 @@ i = function(factor_var, var, ref, keep, bin, ref2, keep2, bin2, ...){ items_name = items } - if(FROM_FIXEST){ + if(FROM_FIXEST || is_sparse){ # Pour avoir des jolis noms c'est un vrai gloubiboulga, # mais j'ai pas trouve plus simple... if(IS_INTER_FACTOR){ @@ -1487,6 +1487,10 @@ i = function(factor_var, var, ref, keep, bin, ref2, keep2, bin2, ...){ fe_colid = to_integer(fe_num[valid_row], sorted = TRUE) values = if(length(var) == 1) rep(1, length(valid_row)) else var + + # Clean names + col_names = sub("^.*__CLEAN__", "", col_names) + res = list(rowid = which(valid_row), values = values, colid = fe_colid, col_names = col_names) diff --git a/R/sparse_model_matrix.R b/R/sparse_model_matrix.R new file mode 100644 index 00000000..8538998c --- /dev/null +++ b/R/sparse_model_matrix.R @@ -0,0 +1,708 @@ +#' Design matrix of a `fixest` object returned in sparse format +#' +#' This function creates the left-hand-side or the right-hand-side(s) of a [`femlm`], [`feols`] or [`feglm`] estimation. +#' +#' @inheritParams nobs.fixest +#' +#' @param data If missing (default) then the original data is obtained by evaluating the `call`. Otherwise, it should be a `data.frame`. +#' @param type Character vector or one sided formula, default is "rhs". Contains the type of matrix/data.frame to be returned. Possible values are: "lhs", "rhs", "fixef", "iv.rhs1" (1st stage RHS), "iv.rhs2" (2nd stage RHS), "iv.endo" (endogenous vars.), "iv.exo" (exogenous vars), "iv.inst" (instruments). +#' @param na.rm Default is `TRUE`. Should observations with NAs be removed from the matrix? +#' @param collin.rm Logical scalar. Whether to remove variables that were found to be collinear during the estimation. Beware: it does not perform a collinearity check and bases on the `coef(object)`. Default is TRUE if object is a `fixest` object, or FALSE if object is a formula. +#' @param combine Logical scalar, default is `TRUE`. Whether to combine each resulting sparse matrix +#' @param ... Not currently used. +#' +#' @return +#' It returns either a single sparse matrix a list of matrices, depending whether `combine` is `TRUE` or `FALSE`. The sparse matrix is of class `dgCMatrix` from the `Matrix` package. +#' +#' @seealso +#' See also the main estimation functions [`femlm`], [`feols`] or [`feglm`]. [`formula.fixest`], [`update.fixest`], [`summary.fixest`], [`vcov.fixest`]. +#' +#' +#' @author +#' Laurent Berge, Kyle Butts +#' +#' @examples +#' +#' est = feols(wt ~ i(vs) + hp | cyl, mtcars) +#' sparse_model_matrix(est) +#' sparse_model_matrix(wt ~ i(vs) + hp | cyl, mtcars) +#' +#' @export +sparse_model_matrix = function(object, data, type = "rhs", na.rm = TRUE, collin.rm = NULL, combine = TRUE, ...) { + # We evaluate the formula with the past call + # type: lhs, rhs, fixef, iv.endo, iv.inst, iv.rhs1, iv.rhs2 + # if fixef => return a DF + + # Checking the arguments + if (is_user_level_call()) { + validate_dots(suggest_args = c("data", "type")) + } + + # We allow type to be used in the location of data if data is missing + if (!missing(data) && missing(type)) { + sc = sys.call() + if (!"data" %in% names(sc)) { + if (!is.null(data) && (is.character(data) || "formula" %in% class(data))) { + # data is in fact the type + type = data + data = NULL + } + } + } + + type = check_set_types(type, c("lhs", "rhs", "fixef", "iv.endo", "iv.inst", "iv.exo", "iv.rhs1", "iv.rhs2")) + + if (isTRUE(object$is_fit)) { + stop("sparse_model_matrix method not available for fixest estimations obtained from fit methods.") + } + + if (any(grepl("^iv", type)) && !isTRUE(object$iv)) { + stop("The type", enumerate_items(grep("^iv", type, value = TRUE), "s.is"), " only valid for IV estimations.") + } + + check_arg(subset, "logical scalar | character vector no na") + + if (missing(collin.rm)) { + collin.rm = if (inherits(object, "formula")) FALSE else TRUE + } else { + check_arg_plus(collin.rm, "logical scalar") + } + + # Evaluation with the data + original_data = FALSE + if (missnull(data)) { + original_data = TRUE + data = fetch_data(object, "To apply 'sparse_model_matrix', ") + } + + # control of the data + if (is.matrix(data)) { + if (is.null(colnames(data))) { + stop("If argument 'data' is to be a matrix, its columns must be named.") + } + data = as.data.frame(data) + } + + if (!"data.frame" %in% class(data)) { + stop("The argument 'data' must be a data.frame or a matrix.") + } + + + # Allows passage of formula to sparse_model_matrix. A bit inefficient, but it works. + isFormula = FALSE + split_fml = NULL + if (inherits(object, "formula")) { + split_fml = fml_split_internal(object) + if (isTRUE(collin.rm)) { + message("Formula passed to sparse_model_matrix with collin.rm == TRUE. Estimating feols with formula.") + object = feols(object, data = data) + } else if (length(split_fml) == 3) { + message("Formula passed to sparse_model_matrix with an iv formula. Estimating feols with formula.") + object = feols(object, data = data) + } else { + isFormula = TRUE + } + } + + # na.rm = FALSE doesn't work with type = "fixef" (which FE col gets NA?) + if (("fixef" %in% type & !na.rm)) { + # na.rm = TRUE + message("na.rm = FALSE doesn't work with type = 'fixef'. It has been set to TRUE.") + } + + + # Panel setup + if (!isFormula) { + panel__meta__info = set_panel_meta_info(object, data) + } + + # The formulas + if (isFormula) { + fml_linear = split_fml[[1]] + + fml_0 = attr(stats::terms(fml_linear), "intercept") == 0 + fake_intercept = !is.null(split_fml[[2]]) | fml_0 + + } else { + fml_linear = formula(object, type = "linear") + + # we kick out the intercept if there is presence of fixed-effects + fml_0 = attr(stats::terms(fml_linear), "intercept") == 0 + fake_intercept = !is.null(object$fixef_vars) && !(!is.null(object$slope_flag) && all(object$slope_flag < 0)) + fake_intercept = fake_intercept | fml_0 + } + + res = list() + + if ("lhs" %in% type) { + lhs_text = deparse_long(fml_linear[[2]]) + lhs = eval(fml_linear[[2]], data) + lhs = Matrix::Matrix(lhs, sparse = TRUE, ncol = 1) + + colnames(lhs) = lhs_text + + res[["lhs"]] = lhs + } + + if ("rhs" %in% type && !isTRUE(object$onlyFixef)) { + + fml = fml_linear + + if (isFormula && (length(split_fml) == 3)) { + fml_iv = split_fml[[3]] + fml = .xpd(..lhs ~ ..endo + ..rhs, ..lhs = fml[[2]], ..endo = fml_iv[[2]], ..rhs = fml[[3]]) + } else if (isTRUE(object$iv)) { + fml_iv = object$fml_all$iv + fml = .xpd(..lhs ~ ..endo + ..rhs, ..lhs = fml[[2]], ..endo = fml_iv[[2]], ..rhs = fml[[3]]) + } + + vars = attr(stats::terms(fml), "term.labels") + + linear.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "rhs", add_intercept = !fake_intercept) + + res[["rhs"]] = linear.mat + } + + if ("fixef" %in% type) { + + if (isFormula && (length(split_fml) < 2)) { + mat_FE = NULL + } else if (!isFormula & length(object$fixef_id) == 0) { + mat_FE = NULL + } else { + + if (isFormula) { + fixef_fml = .xpd(~ ..fe, ..fe = split_fml[[2]]) + } else { + fixef_fml = object$fml_all$fixef + } + + fixef_terms_full = fixef_terms(fixef_fml) + fe_vars = fixef_terms_full$fe_vars + slope_var_list = fixef_terms_full$slope_vars_list + + fixef_df = prepare_df(fe_vars, data, fastCombine = FALSE) + + # Check for slope vars + if (any(fixef_terms_full$slope_flag > 0)) { + slope_df = prepare_df(unlist(slope_var_list), data) + } + + cols_lengths = c(0) + total_cols = 0 + n_FE = 0 + nrows = nrow(data) + for (i in seq_along(fe_vars)) { + + fe_var = fe_vars[i] + slope_vars = slope_var_list[[i]] + n_slope_vars = if (is.null(slope_vars)) 0 else length(slope_vars) + + fe = fixef_df[[fe_var]] + unique_fe = unique(fe) + n_cols = length(unique_fe[!is.na(unique_fe)]) + + total_cols = total_cols + n_cols * (1 + n_slope_vars) + cols_lengths = c(cols_lengths, rep(n_cols, 1 + n_slope_vars)) + n_FE = n_FE + 1 + n_slope_vars + } + running_cols = cumsum(cols_lengths) + + id_all = names_all = val_all = vector("list", n_FE) + rowid = 1:nrows + j = 1 + for (i in seq_along(fe_vars)) { + + fe_var = fe_vars[i] + xi = fixef_df[[fe_var]] + + keep = which(!is.na(xi)) + if (length(keep) == 0) stop("All values of the fixed-effect variable '", fe_var, "' are NA.") + + xi = xi[keep] + xi_quf = quickUnclassFactor(xi, addItem = TRUE) + + col_id = xi_quf$x + col_levels = as.character(xi_quf$items) + + slope_vars = slope_var_list[[i]] + n_slope_vars = if (is.null(slope_vars)) 0 else length(slope_vars) + + # Be careful with NAs + # First fixed-effect by itself + val_all[[j]] = c(rep(1, length(col_id)), rep(NA, nrows - length(keep))) + id_all[[j]] = cbind( + c( + rowid[keep], + rowid[-keep] + ), + c( + running_cols[j] + col_id, + rep(running_cols[j] + 1, nrows - length(keep)) + ) + ) + names_all[[j]] = paste0(fe_var, "::", col_levels) + j = j + 1 + + for (k in seq_along(slope_vars)) { + slope_var = slope_vars[k] + slope = slope_df[[slope_var]] + + val_all[[j]] = c(as.numeric(slope[keep]), rep(NA, nrows - length(keep))) + id_all[[j]] = cbind( + c( + rowid[keep], + rowid[-keep] + ), + c( + running_cols[j] + col_id, + rep(running_cols[j] + 1, nrows - length(keep)) + ) + ) + names_all[[j]] = paste0(fe_var, "[[", slope_var, "]]", "::", col_levels) + j = j + 1 + } + } + + id_mat = do.call(rbind, id_all) + val_vec = unlist(val_all) + names_vec = unlist(names_all) + + mat_FE = Matrix::sparseMatrix( + i = id_mat[, 1], + j = id_mat[, 2], + x = val_vec, + dimnames = list(NULL, names_vec) + ) + + + # Keep non-zero FEs + if (collin.rm == TRUE) { + fixefs = fixef(object, sorted = TRUE) + + select = lapply( + names(fixefs), + function(var) { + names = names(fixefs[[var]]) + names = names[fixefs[[var]] != 0] + + paste0(var, "::", names) + } + ) + select = unlist(select) + + # When original_data isn't used, some FEs may not be in the new dataset, add them in + missing_cols = setdiff(select, colnames(mat_FE)) + mat_FE = cbind( + mat_FE, + Matrix::Matrix(0, ncol = length(missing_cols), nrow = nrow(mat_FE), sparse = TRUE, dimnames = list(NULL, missing_cols)) + ) + + # Subset fixef and order columns to match fixef(object) + idx = which(colnames(mat_FE) %in% select) + mat_FE = mat_FE[, idx, drop = FALSE] + + # Reorder columns to match fixef(object) + reorder = match(unlist(select), colnames(mat_FE)) + mat_FE = mat_FE[, reorder[!is.na(reorder)], drop = FALSE] + } + + } + + res[["fixef"]] = mat_FE + } + + # + # IV + # + if ("iv.endo" %in% type) { + fml = object$iv_endo_fml + vars = attr(stats::terms(fml), "term.labels") + endo.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm) + + res[["iv.endo"]] = endo.mat + } + + if ("iv.inst" %in% type) { + fml = object$fml_all$iv + vars = attr(stats::terms(fml), "term.labels") + inst.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm) + + res[["iv.inst"]] = inst.mat + } + + if ("iv.exo" %in% type) { + + fml = object$fml_all$linear + vars = attr(stats::terms(fml), "term.labels") + exo.mat = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "iv.exo", add_intercept = !fake_intercept) + + res[["iv.exo"]] = exo.mat + } + + if ("iv.rhs1" %in% type) { + fml = object$fml + if (object$iv_stage == 2) { + fml_iv = object$fml_all$iv + fml = .xpd(..lhs ~ ..inst + ..rhs, ..lhs = fml[[2]], ..inst = fml_iv[[3]], ..rhs = fml[[3]]) + } + vars = attr(stats::terms(fml), "term.labels") + iv_rhs1 = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "iv.rhs1", add_intercept = !fake_intercept) + + res[["iv.rhs1"]] = iv_rhs1 + } + + if ("iv.rhs2" %in% type) { + # Second stage + if (!object$iv_stage == 2) { + stop("In model.matrix, the type 'iv.rhs2' is only valid for second stage IV models. This estimation is the first stage.") + } + + # I) we get the fitted values + stage_1 = object$iv_first_stage + + fit_vars = c() + for (i in seq_along(stage_1)) { + fit_vars[i] = v = paste0("fit_", names(stage_1)[i]) + data[[v]] = predict(stage_1[[i]], newdata = data, sample = "original") + } + + # II) we create the variables + fml = object$fml + fml = .xpd(..lhs ~ ..fit + ..rhs, ..lhs = fml[[2]], ..fit = fit_vars, ..rhs = fml[[3]]) + vars = attr(stats::terms(fml), "term.labels") + iv_rhs2 = vars_to_sparse_mat(vars = vars, data = data, object = object, collin.rm = collin.rm, type = "iv.rhs2", add_intercept = !fake_intercept) + + res[["iv.rhs2"]] = iv_rhs2 + } + + if (na.rm) { + na_rows = lapply(res, function(mat) { + # Get rows with NA + 1 + mat@i[is.na(mat@x)] + }) + + na_rows = unlist(na_rows, use.names = FALSE) + + if (original_data) { + na_rows = c(na_rows, -1 * object$obs_selection$obsRemoved) + } + + na_rows = unique(na_rows) + + if (length(na_rows) > 0) { + res = lapply(res, function(mat) { + mat[-na_rows, , drop = FALSE] + }) + } + } + + # Formatting res + if (length(res) == 0) { + return(NULL) + } else if (length(type) > 1) { + if (combine) { + res = res[type] + res = do.call(cbind, unname(res)) + } + } else { + res = res[[1]] + } + + res +} + + + +# Internal: modifies the calls so that each variable/interaction is evaluated with mult_sparse +mult_wrap = function(x) { + # x: character string of a variable to be evaluated + # ex: "x1" => mult_sparse(x1) + # "x1:factor(x2):x3" => mult_sparse(x3, factor(x2), x1) + # + # We also add the argument sparse to i() + # "x1:i(species, TRUE)" => mult_sparse(x1, i(species, TRUE, sparse = TRUE)) + + x_call = str2lang(x) + + res = (~ mult_sparse())[[2]] + + if (length(x_call) == 1 || x_call[[1]] != ":") { + res[[2]] = x_call + + } else { + res[[2]] = x_call[[3]] + tmp = x_call[[2]] + + while (length(tmp) == 3 && tmp[[1]] == ":") { + res[[length(res) + 1]] = tmp[[3]] + tmp = tmp[[2]] + } + + res[[length(res) + 1]] = tmp + } + + # We also add sparse to i() if found + for (i in 2:length(res)) { + ri = res[[i]] + if (length(ri) > 1 && ri[[1]] == "i") { + ri[["sparse"]] = TRUE + res[[i]] = ri + } + } + + if (length(res) > 2) { + # we restore the original order + res[-1] = rev(res[-1]) + } + + return(res) +} + +# Internal function to evaluate the variables (and interactions) in a sparse way +mult_sparse = function(...) { + # Only sparsifies factor variables + # Takes care of interactions + + dots = list(...) + n = length(dots) + mc = match.call() + var_call = sapply(seq_len(n), function(i) deparse_long(mc[[i+1]])) + + num_var = NULL + factor_list = list() + factor_idx = c() + i_idx = c() + info_i = NULL + is_i = is_factor = FALSE + # You can't have interactions between i and factors, it's either + + for (i in 1:n) { + xi = dots[[i]] + vari = mc[[i+1]] + if (is.numeric(xi)) { + # We stack the product + num_var = if (is.null(num_var)) xi else xi * num_var + } else if (inherits(xi, "i_sparse")) { + is_i = TRUE + info_i = xi + i_idx = c(i_idx, i) + } else { + is_factor = TRUE + factor_list[[length(factor_list) + 1]] = xi + factor_idx = c(factor_idx, i) + } + } + + if (is_factor && is_i) { + stop("Unfortunately, can not interact factor with `i()` in sparse_model_matrix") + } + + # numeric + if (!is_i && !is_factor) { + return(num_var) + } + # factor() + if (is_factor) { + factor_list$add_items = TRUE + factor_list$items.list = TRUE + factor_list$multi.join = " ; " + fact_as_int = do.call(to_integer, factor_list) + + values = if (is.null(num_var)) rep(1, length(fact_as_int$x)) else num_var + rowid = seq_along(values) + + # messy, but need this to get things like `factor(am)1:hp:factor(cyl)6` + items_mat = do.call(rbind, strsplit(fact_as_int$items, " ; ")) + + # intersplice var_call with items + col_names = sapply(seq_len(nrow(items_mat)), function(i) { + pasteable = var_call + pasteable[factor_idx] = paste0(var_call[factor_idx], items_mat[i, ]) + paste(pasteable, collapse = ":") + }, USE.NAMES = FALSE) + + res = list(rowid = rowid, colid = fact_as_int$x, values = values, + col_names = col_names, n_cols = length(fact_as_int$items)) + # i() + } else { + + values = info_i$values + if (!is.null(num_var)) { + num_var = num_var[info_i$rowid] + values = values * num_var + col_names = sapply(info_i$col_names, function(item) { + pasteable = var_call + pasteable[i_idx] = item + paste(pasteable, collapse = ":") + }, USE.NAMES = FALSE) + } else { + col_names = info_i$col_names + } + + res = list(rowid = info_i$rowid, colid = info_i$colid, + values = values[info_i$rowid], + col_names = col_names, + n_cols = length(info_i$col_names)) + } + + class(res) = "sparse_var" + + res +} + +# Takes a vector of strings denoting the variables (including terms like `poly()`, `i()`, `I()`, `lag()`, etc.) and returns a sparse matrix of the variables extracted from `data`. +vars_to_sparse_mat = function(vars, data, collin.rm = FALSE, object = NULL, type = NULL, add_intercept = FALSE) { + + if (length(vars) == 0) { + # Case only FEs + mat = NULL + } else { + + # Since we don't want to evaluate the factors, + # the code is a bit intricate because we have to catch them before + # any interaction takes place + # + # that's why I wrap interactions in a function (mult_sparse()) + # + + # Below, we evaluate all the variables in a "sparse" way + + vars_calls = lapply(vars, mult_wrap) + + n = length(vars) + variables_list = vector("list", n) + for (i in 1:n) { + variables_list[[i]] = eval(vars_calls[[i]], data) + } + + # To create the sparse matrix, we need the column indexes + total_cols = 0 + running_cols = c(0) + for (i in 1:n) { + xi = variables_list[[i]] + if (inherits(xi, "sparse_var")) { + total_cols = total_cols + xi$n_cols + } else { + total_cols = total_cols + NCOL(xi) + } + running_cols[i + 1] = total_cols + } + + # We just create a sparse matrix and fill it + + # 1) creating the indexes + names + + # NOTA: I use lists to avoid creating copies + rowid = 1:nrow(data) + id_all = values_all = names_all = vector("list", n) + for (i in 1:n) { + xi = variables_list[[i]] + + if (inherits(xi, "sparse_var")) { + + id_all[[i]] = cbind(xi$rowid, running_cols[i] + xi$colid) + values_all[[i]] = xi$values + names_all[[i]] = xi$col_names + + } else if (NCOL(xi) == 1) { + + id_all[[i]] = cbind(rowid, running_cols[i] + 1) + values_all[[i]] = xi + names_all[[i]] = vars[[i]] + + } else { + + colid = rep(1:NCOL(xi), each = nrow(data)) + id_all[[i]] = cbind(rep(rowid, NCOL(xi)), running_cols[i] + colid) + values_all[[i]] = as.vector(xi) + if (!is.null(colnames(xi))) { + names_all[[i]] = paste0(vars[[i]], colnames(xi)) + } else { + names_all[[i]] = paste0(vars[[i]], 1:NCOL(xi)) + } + + } + } + + id_mat = do.call(rbind, id_all) + values_vec = unlist(values_all) + names_vec = unlist(names_all) + + # 2) filling the matrix: one shot, no copies + + mat = Matrix::Matrix(0, nrow(data), total_cols, dimnames = list(NULL, names_vec)) + mat[id_mat] = values_vec + + } + + if (collin.rm & is.null(object)) { + stop("You need to provide the 'object' argument to use 'collin.rm = TRUE'.") + } + + if (collin.rm) { + qui = which(colnames(mat) %in% object$collin.var) + if (length(qui) == ncol(mat)) { + mat = NULL + } else if (length(qui) > 0) { + mat = mat[, -qui, drop = FALSE] + } + + coefs = names(object$coefficients) + if (isTRUE(object$iv)) { + fml_iv = object$fml_all$iv + endo = fml_iv[[2]] + + # Trick to get the rhs variables as a character vector + endo = .xpd(~ ..endo, ..endo = endo) + endo = attr(stats::terms(endo), "term.labels") + + exo = lapply(object$iv_first_stage, function(x) names(stats::coef(x))) + exo = unique(unlist(exo, use.names = FALSE)) + } + + # Custom subsetting for na.rm depending on `type` + if (!is.null(type)) { + if (type == "rhs") { + if (isTRUE(object$iv)) { + keep = c(endo, coefs) + } else { + keep = coefs + } + } else if (type == "iv.exo") { + keep = coefs + } else if (type == "iv.exo") { + keep = c(endo, coefs) + } else if (type == "iv.rhs1") { + keep = c(exo, coefs) + } else if (type == "iv.rhs2") { + keep = coefs + } + + keep = keep[!keep %in% object$collin.var] + if (length(keep) == 0) { + mat = NULL + } else { + idx = which(colnames(mat) %in% keep) + mat = mat[, idx, drop = FALSE] + } + } + + if (length(coefs) == ncol(mat) && any(colnames(mat) != names(coefs))) { + # we reorder the matrix + # This can happen in multiple estimations, where we respect the + # order of the user + + if (all(names(coefs) %in% colnames(mat))) { + mat = mat[, names(coefs), drop = FALSE] + } + } + } + + if (add_intercept) { + mat = cbind(1, mat) + colnames(mat)[1] = "(Intercept)" + } + + return(mat) +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 1698743f..35a57f73 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -91,6 +91,7 @@ reference: - formula.fixest - terms.fixest - df.residual.fixest + - sparse_model_matrix - title: Formula tools desc: Tools to manipulate formulas contents: diff --git a/man/sparse_model_matrix.Rd b/man/sparse_model_matrix.Rd new file mode 100644 index 00000000..503dd2cd --- /dev/null +++ b/man/sparse_model_matrix.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sparse_model_matrix.R +\name{sparse_model_matrix} +\alias{sparse_model_matrix} +\title{Design matrix of a \code{fixest} object returned in sparse format} +\usage{ +sparse_model_matrix( + object, + data, + type = "rhs", + na.rm = TRUE, + collin.rm = NULL, + combine = TRUE, + ... +) +} +\arguments{ +\item{object}{A \code{fixest} object. Obtained using the functions \code{\link{femlm}}, \code{\link{feols}} or \code{\link{feglm}}.} + +\item{data}{If missing (default) then the original data is obtained by evaluating the \code{call}. Otherwise, it should be a \code{data.frame}.} + +\item{type}{Character vector or one sided formula, default is "rhs". Contains the type of matrix/data.frame to be returned. Possible values are: "lhs", "rhs", "fixef", "iv.rhs1" (1st stage RHS), "iv.rhs2" (2nd stage RHS), "iv.endo" (endogenous vars.), "iv.exo" (exogenous vars), "iv.inst" (instruments).} + +\item{na.rm}{Default is \code{TRUE}. Should observations with NAs be removed from the matrix?} + +\item{collin.rm}{Logical scalar. Whether to remove variables that were found to be collinear during the estimation. Beware: it does not perform a collinearity check and bases on the \code{coef(object)}. Default is TRUE if object is a \code{fixest} object, or FALSE if object is a formula.} + +\item{combine}{Logical scalar, default is \code{TRUE}. Whether to combine each resulting sparse matrix} + +\item{...}{Not currently used.} +} +\value{ +It returns either a single sparse matrix a list of matrices, depending whether \code{combine} is \code{TRUE} or \code{FALSE}. The sparse matrix is of class \code{dgCMatrix} from the \code{Matrix} package. +} +\description{ +This function creates the left-hand-side or the right-hand-side(s) of a \code{\link{femlm}}, \code{\link{feols}} or \code{\link{feglm}} estimation. +} +\examples{ + +est = feols(wt ~ i(vs) + hp | cyl, mtcars) +sparse_model_matrix(est) +sparse_model_matrix(wt ~ i(vs) + hp | cyl, mtcars) + +} +\seealso{ +See also the main estimation functions \code{\link{femlm}}, \code{\link{feols}} or \code{\link{feglm}}. \code{\link{formula.fixest}}, \code{\link{update.fixest}}, \code{\link{summary.fixest}}, \code{\link{vcov.fixest}}. +} +\author{ +Laurent Berge, Kyle Butts +} diff --git a/tests/fixest_tests.R b/tests/fixest_tests.R index d68bea95..9ac5b62d 100644 --- a/tests/fixest_tests.R +++ b/tests/fixest_tests.R @@ -2203,6 +2203,142 @@ res_mult = feols(y1 ~ x1 | species | x2 ~ x3, base) m_lhs_rhs_fixef = model.matrix(res_mult, type = c("lhs", "iv.rhs2", "fixef"), na.rm = FALSE) test(names(m_lhs_rhs_fixef), c("y1", "fit_x2", "x1", "species")) +#### +#### sparse_model_matrix #### +#### + +base = iris +names(base) = c("y1", "x1", "x2", "x3", "species") +base$y2 = 10 + rnorm(150) + 0.5 * base$x1 +base$x4 = rnorm(150) + 0.5 * base$y1 +base$fe2 = rep(letters[1:15], 10) +base$fe2[50:51] = NA +base$y2[base$fe2 == "a" & !is.na(base$fe2)] = 0 +base$x2[1:5] = NA +base$x3[6] = NA +base$fe3 = rep(letters[1:10], 15) +base$id = rep(1:15, each = 10) +base$time = rep(1:10, 15) + +base_bis = base[1:50, ] +base_bis$id = rep(1:5, each = 10) +base_bis$time = rep(1:10, 5) + + +res = feols(y1 ~ x1 + x2 + x3, base) +sm1 = sparse_model_matrix( + res, type = "lhs", + na.rm = TRUE +) +test(length(sm1), res$nobs) + +sm1_na = sparse_model_matrix( + res, data = base, + type = "lhs", + na.rm = FALSE +) +test(length(sm1_na), res$nobs_origin) +test(max(abs(sm1_na - base$y1), na.rm = TRUE), 0) + + +sm2 = sparse_model_matrix( + res, type = c("lhs", "rhs"), data = base, + combine = FALSE, na.rm = FALSE +) +y = sm2[["lhs"]] +X = sm2[["rhs"]] +obs_rm = res$obs_selection$obsRemoved +res_bis = solve( + crossprod(as.matrix(X)[obs_rm, ]), + crossprod(as.matrix(X)[obs_rm, ], as.matrix(y)[obs_rm, ]) +) +test(as.numeric(res_bis), res$coefficients) + +# No constant +res_nocons = feols(mpg ~ 0 + i(cyl), mtcars) +sm_nocons = sparse_model_matrix(res_nocons, type = "rhs") +test("(Intercept)" %in% colnames(sm_nocons), FALSE) + +# Lag +res_lag = feols(y1 ~ l(x1, 1:2) + x2 + x3, base, panel = ~id + time) +sm_lag = sparse_model_matrix(res_lag, type = "rhs") +test(nrow(sm_lag), nobs(res_lag)) + + +# TODO: Fix poly and newdata +# With poly +# res_poly = feols(y1 ~ poly(x1, 2), base) +# works +# res_poly = feols(y1 ~ x1, base) +# sm_poly_old = sparse_model_matrix(res_poly) +# sm_poly_new = sparse_model_matrix(res_poly, data = base_bis) +# test(sm_poly_old[1:50, 3], sm_poly_new[, 3]) + + +# Interacted fixef +res = feols(y1 ~ x1 + x2 + x3 | species^fe2, base) +sm_ife = sparse_model_matrix(res, data = base, type = "fixef", collin.rm = FALSE) + +# fixef +res = feols(y1 ~ x1 + x2 + x3 | species + fe2, base) +sm_fe = sparse_model_matrix(res, data = base, type = "fixef") +test(ncol(sm_fe), 17) + +sm_fe_no_collin_rm = sparse_model_matrix(res, data = base, type = "fixef", collin.rm = FALSE) +test(ncol(sm_fe_no_collin_rm), 18) + +sm_fe_base_bis = sparse_model_matrix(res, data = base_bis, type = "fixef") + + +# Time-varying slopes +res_slopes = feols(y1 ~ x1 + x2 + x3 | fe2[I(x2+1)], data = base[7:48, ]) +sm_slopes = sparse_model_matrix(res_slopes, type = "fixef") + +# IV +res_iv = feols(y1 ~ x1 | x2 ~ x3, base) + +sm_rhs1 = sparse_model_matrix(res_iv, type = "iv.rhs1") +test(colnames(sm_rhs1)[-1], c("x3", "x1")) + +sm_rhs2 = sparse_model_matrix(res_iv, type = "iv.rhs2") +test(colnames(sm_rhs2)[-1], c("fit_x2", "x1")) + +sm_endo = sparse_model_matrix(res_iv, type = "iv.endo") +test(colnames(sm_endo), "x2") + +sm_exo = sparse_model_matrix(res_iv, type = "iv.exo") +test(colnames(sm_exo)[-1], "x1") + +sm_inst = sparse_model_matrix(res_iv, type = "iv.inst") +test(colnames(sm_inst), "x3") + +# several +res_mult = feols(y1 ~ x1 | species | x2 ~ x3, base) + +sm_lhs_rhs_fixef = sparse_model_matrix(res_mult, type = c("lhs", "iv.rhs2", "fixef")) +test(colnames(sm_lhs_rhs_fixef), c("y1", "fit_x2", "x1", "species::setosa", "species::versicolor", "species::virginica")) + + +# non-linear model +res_pois = fepois(Sepal.Length ~ Sepal.Width + Petal.Length | Species, iris) +sp_pois = sparse_model_matrix(res_pois, data = iris) + + +# make sure names are correct +test_col_names <- function(est) { + m <- fixest::sparse_model_matrix(est, collin.rm = FALSE) + q <- test(all(names(coef(est)) %in% colnames(m)), TRUE) + return(invisible(NULL)) +} +test_col_names(feols(mpg ~ i(am) + disp | vs, mtcars)) +test_col_names(feols(mpg ~ i(am, i.cyl) + disp | vs, mtcars)) +test_col_names(feols(mpg ~ i(am, hp) + disp | vs, mtcars)) +test_col_names(feols(mpg ~ i(am):hp + disp | vs, mtcars)) +test_col_names(feols(mpg ~ factor(am) + disp | vs, mtcars)) +test_col_names(feols(mpg ~ factor(am):factor(cyl) + disp | vs, mtcars, notes = FALSE)) +test_col_names(feols(mpg ~ factor(am):hp + disp | vs, mtcars)) +test_col_names(feols(mpg ~ poly(hp, degree = 2) + disp | vs, mtcars)) + #### #### update ####