diff --git a/DESCRIPTION b/DESCRIPTION index f91e36e73..24c90de69 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: insight Title: Easy Access to Model Information for Various Model Objects -Version: 0.20.3.2 +Version: 0.20.3.3 Authors@R: c(person(given = "Daniel", family = "Lüdecke", diff --git a/NEWS.md b/NEWS.md index 4e7e4acdf..ddd397396 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ # insight 0.20.4 +## Changes + +* `null_model()` and `formula_ok()` now warn when indexed data frames, such as + `df[, 5]`, are used as response variable in the formula, as this can lead to + unexpected results. + ## Bug fixes * Fixed regression from latest fix related to `get_variance()` for *brms* models. diff --git a/R/find_formula.R b/R/find_formula.R index ff3ebf837..e43cd2d9e 100644 --- a/R/find_formula.R +++ b/R/find_formula.R @@ -88,7 +88,11 @@ formula_ok <- function(x, verbose = TRUE, ...) { # all.vars() returns "T" as variable, which is not intended check_2 <- .check_formula_for_T(f, verbose = verbose) - all(check_1 && check_2) + # check if formula contains index data frames as response variable + # this may result in unexpected behaviour, and we should warn users + check_3 <- .check_formula_index_df(f, x, verbose = verbose) + + all(check_1 && check_2 && check_3) } @@ -526,7 +530,7 @@ find_formula.afex_aov <- function(x, verbose = TRUE, ...) { id <- attr(x, "id") within_variables <- names(attr(x, "within")) - within_variables <- paste0(within_variables, collapse = "*") + within_variables <- paste(within_variables, collapse = "*") within_variables <- paste0("(", within_variables, ")") e <- paste0("Error(", id, "/", within_variables, ")") @@ -1596,13 +1600,13 @@ find_formula.model_fit <- function(x, verbose = TRUE, ...) { ) # check if any further pforms exist - if (!all(names(f$pforms) %in% auxiliary_names)) { + if (all(names(f$pforms) %in% auxiliary_names)) { + f_custom <- NULL + } else { custom_names <- setdiff(names(f$pforms), auxiliary_names) if (length(custom_names)) { f_custom <- f$pforms[custom_names] } - } else { - f_custom <- NULL } f_sigmarandom <- NULL @@ -1870,6 +1874,27 @@ find_formula.model_fit <- function(x, verbose = TRUE, ...) { } +# formulas with an index data frame, like "lm(mtcars[, "mpg"] ~ mtcars$hp), may +# cause problems in various functions throughout the easystats packages. We +# warn the user here... + +.check_formula_index_df <- function(f, x, verbose = TRUE) { + if (is_empty_object(f)) { + return(TRUE) + } + resp <- .safe(safe_deparse(f$conditional[[2]])) + if (!is.null(resp) && any(grepl("\\b\\w+\\[.*?,.*?\\]", resp))) { + if (verbose) { + format_warning( + "Using indexed data frames, such as `df[, 5]`, as model response can produce unexpected results. Specify your model using the literal name of the response variable instead." # nolint + ) + } + return(FALSE) + } + return(TRUE) +} + + .formula_clean <- function(f) { fc <- as.character(f) LHS <- fc[2] diff --git a/R/null_model.R b/R/null_model.R index d04512c1e..817f9641b 100644 --- a/R/null_model.R +++ b/R/null_model.R @@ -18,9 +18,13 @@ #' #' @export null_model <- function(model, verbose = TRUE, ...) { + # check if model formula is ok + formula_ok(model) + + model_formula <- find_formula(model, verbose = FALSE) offset_term <- tryCatch( { - f <- safe_deparse(find_formula(model)$conditional) + f <- safe_deparse(model_formula$conditional) if (grepl("offset(", f, fixed = TRUE)) { out <- gsub("(.*)offset\\((.*)\\)(.*)", "\\2", f) } else { @@ -34,7 +38,7 @@ null_model <- function(model, verbose = TRUE, ...) { ) if (is_mixed_model(model)) { - .null_model_mixed(model, offset_term, verbose) + .null_model_mixed(model, offset_term, model_formula, verbose) } else if (inherits(model, "clm2")) { stats::update(model, location = ~1, scale = ~1) } else if (inherits(model, "multinom")) { @@ -60,16 +64,16 @@ null_model <- function(model, verbose = TRUE, ...) { } -.null_model_mixed <- function(model, offset_term = NULL, verbose = TRUE) { +.null_model_mixed <- function(model, offset_term = NULL, model_formula = NULL, verbose = TRUE) { if (inherits(model, "MixMod")) { nullform <- stats::as.formula(paste(find_response(model), "~ 1")) null.model <- suppressWarnings(stats::update(model, fixed = nullform)) # fix fixed effects formula null.model$call$fixed <- nullform } else if (inherits(model, "cpglmm")) { - nullform <- find_formula(model, verbose = FALSE)[["random"]] + nullform <- model_formula[["random"]] null.model <- suppressWarnings(stats::update(model, nullform)) - } else if (inherits(model, "glmmTMB") && !is.null(find_formula(model)$zero_inflated)) { + } else if (inherits(model, "glmmTMB") && !is.null(model_formula$zero_inflated)) { insight::check_if_installed("glmmTMB") # for zero-inflated models, we need to create the NULL model for the # zero-inflation part as well. Since "update()" won't work here, we need diff --git a/tests/testthat/test-find_formula.R b/tests/testthat/test-find_formula.R index 81c809dff..00decc275 100644 --- a/tests/testthat/test-find_formula.R +++ b/tests/testthat/test-find_formula.R @@ -9,3 +9,10 @@ test_that("`find_formula` works with `mgcv::gam()`", { expect_identical(f$conditional, formula("y ~ s(x0) + s(x1) + s(x2)")) expect_identical(f$scale, formula("~s(x3)")) }) + + +test_that("formula warns for badly formulated response", { + data(iris) + model <- lm(iris[, 2] ~ Species, data = iris) + expect_warning(formula_ok(model), regex = "Using indexed") +}) diff --git a/tests/testthat/test-null_model.R b/tests/testthat/test-null_model.R index 1bc7adcab..27181baa2 100644 --- a/tests/testthat/test-null_model.R +++ b/tests/testthat/test-null_model.R @@ -60,6 +60,14 @@ test_that("null_model zero-inflated", { expect_equal(glmmTMB::fixef(out), glmmTMB::fixef(m0), tolerance = 1e-4) }) + +test_that("null_model warns for badly formulated response", { + data(iris) + model <- lm(iris[, 2] ~ Species, data = iris) + expect_warning(null_model(model), regex = "Using indexed") +}) + + # set.seed(123) # N <- 100 # Samples # x <- runif(N, 0, 10) # Predictor