From e30984534296665118b53b088aa29b7996977099 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sun, 27 Oct 2024 11:06:02 +0100
Subject: [PATCH] Avoid "number of items to replace is not a multiple" (#950)

* Avoid "number of items to replace is not a multiple"

* add test

* desc

* cleanup
---
 DESCRIPTION                       |  2 +-
 R/clean_names.R                   | 91 ++++++++++++-------------------
 tests/testthat/test-clean_names.R | 60 ++++++++++++--------
 3 files changed, 72 insertions(+), 81 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2a838c231..57009cabc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: insight
 Title: Easy Access to Model Information for Various Model Objects
-Version: 0.99.0
+Version: 0.99.0.1
 Authors@R:
     c(person(given = "Daniel",
              family = "Lüdecke",
diff --git a/R/clean_names.R b/R/clean_names.R
index 323ca5b8c..1a8da8563 100644
--- a/R/clean_names.R
+++ b/R/clean_names.R
@@ -132,98 +132,75 @@ clean_names.character <- function(x, include_names = FALSE, ...) {
 
   # do we have a "log()" pattern here? if yes, get capture region
   # which matches the "cleaned" variable name
-  cleaned <- unlist(lapply(seq_along(x), function(i) {
+  cleaned <- unlist(lapply(x, function(i) {
     # check if we have special patterns like 100 * log(xy), and remove it
-    if (isFALSE(is_emmeans) && grepl("^([0-9]+)", x[i])) {
-      x[i] <- gsub("^([0-9]+)[^(\\.|[:alnum:])]+(.*)", "\\2", x[i])
+    if (isFALSE(is_emmeans) && grepl("^([0-9]+)", i)) {
+      i <- gsub("^([0-9]+)[^(\\.|[:alnum:])]+(.*)", "\\2", i)
     }
     # for brms multimembership, multiple elements might be returned
     # need extra handling
     multimembership <- NULL
     for (j in seq_along(pattern)) {
       # check if we find pattern at all
-      if (grepl(pattern[j], x[i], fixed = TRUE)) {
+      if (grepl(pattern[j], i, fixed = TRUE)) {
         # remove possible namespace
-        if (grepl("::", x[i], fixed = TRUE)) {
-          x[i] <- sub("(.*)::(.*)", "\\2", x[i])
+        if (grepl("::", i, fixed = TRUE)) {
+          i <- sub("(.*)::(.*)", "\\2", i)
         }
         if (pattern[j] == "offset") { # nolint
-          x[i] <- trim_ws(unique(sub("^offset\\(([^-+ )]*).*", "\\1", x[i])))
+          i <- trim_ws(unique(sub("^offset\\(([^-+ )]*).*", "\\1", i)))
         } else if (pattern[j] == "I") {
-          if (!ignore_asis && grepl("I\\((.*)\\)", x[i])) {
-            # x[i] <- trim_ws(unique(sub("I\\(((\\w|\\.)*).*", "\\1", x[i])))
-            x[i] <- all.vars(stats::as.formula(paste("~", x[i])))
+          if (!ignore_asis && grepl("I\\((.*)\\)", i)) {
+            # i <- trim_ws(unique(sub("I\\(((\\w|\\.)*).*", "\\1", i)))
+            i <- all.vars(stats::as.formula(paste("~", i)))
           }
         } else if (pattern[j] == "asis") {
-          if (!ignore_asis && grepl("asis\\((.*)\\)", x[i])) {
-            # x[i] <- trim_ws(unique(sub("asis\\(((\\w|\\.)*).*", "\\1", x[i])))
-            x[i] <- all.vars(stats::as.formula(paste("~", x[i])))
+          if (!ignore_asis && grepl("asis\\((.*)\\)", i)) {
+            # i <- trim_ws(unique(sub("asis\\(((\\w|\\.)*).*", "\\1", i)))
+            i <- all.vars(stats::as.formula(paste("~", i)))
           }
         } else if (pattern[j] == "log(log") {
-          x[i] <- trim_ws(unique(sub("^log\\(log\\(((\\w|\\.)*).*", "\\1", x[i])))
+          i <- trim_ws(unique(sub("^log\\(log\\(((\\w|\\.)*).*", "\\1", i)))
         } else if (pattern[j] == "relevel(as.factor") {
-          x[i] <- trim_ws(unique(sub("^relevel\\(as.factor\\(((\\w|\\.)*).*", "\\1", x[i])))
+          i <- trim_ws(unique(sub("^relevel\\(as.factor\\(((\\w|\\.)*).*", "\\1", i)))
         } else if (pattern[j] == "scale(log") {
-          x[i] <- trim_ws(unique(sub("^scale\\(log\\(((\\w|\\.)*).*", "\\1", x[i])))
-          x[i] <- trim_ws(unique(sub("^scale\\(log1p\\(((\\w|\\.)*).*", "\\1", x[i])))
-          x[i] <- trim_ws(unique(sub("^scale\\(log2\\(((\\w|\\.)*).*", "\\1", x[i])))
-          x[i] <- trim_ws(unique(sub("^scale\\(log10\\(((\\w|\\.)*).*", "\\1", x[i])))
+          i <- trim_ws(unique(sub("^scale\\(log\\(((\\w|\\.)*).*", "\\1", i)))
+          i <- trim_ws(unique(sub("^scale\\(log1p\\(((\\w|\\.)*).*", "\\1", i)))
+          i <- trim_ws(unique(sub("^scale\\(log2\\(((\\w|\\.)*).*", "\\1", i)))
+          i <- trim_ws(unique(sub("^scale\\(log10\\(((\\w|\\.)*).*", "\\1", i)))
         } else if (pattern[j] == "scale(poly") {
-          x[i] <- trim_ws(unique(sub("^scale\\(poly\\(((\\w|\\.)*).*", "\\1", x[i])))
+          i <- trim_ws(unique(sub("^scale\\(poly\\(((\\w|\\.)*).*", "\\1", i)))
         } else if (pattern[j] %in% c("mmc", "mm")) {
-          # # detect mm-pattern
-          # p <- paste0("^", pattern[j], "\\((.*)\\).*")
-          # # extract terms from mm() / mmc() functions
-          # g <- trim_ws(sub(p, "\\1", x[i]))
-          # # split terms, but not if comma inside parentheses
-          # g <- trim_ws(unlist(strsplit(g, ",(?![^()]*\\))", perl = TRUE), use.names = FALSE))
-          # # we might have additional arguments, like scale or weights. handle these here
-          # g <- g[!startsWith(g, "scale")]
-          # # clean weights
-          # gweights <- g[startsWith(g, "weights")]
-          # if (length(gweights)) {
-          #   g <- g[!startsWith(g, "weights")]
-          #   # this regular pattern finds "weights=" or "weights =", possibly followed
-          #   # by "cbind()", e.g. "weights = cbind(w, w)". We extract the variable names,
-          #   # create a formula, so "all.vars()" will only extract variable names if
-          #   # we really have "cbind()" in the weights argument
-          #   g <- c(g, .safe(all.vars(as.formula(paste0("~", trim_ws(gsub("weights\\s?=(.*)", "\\1", "weights = cbind(w, w)"))))))) # nolint
-          # }
-          # multimembership <- as.vector(trim_ws(g))
-          if (grepl(paste0("^", pattern[j], "\\((.*)\\).*"), x[i])) {
-            multimembership <- all.vars(stats::as.formula(paste("~", x[i])))
+          if (grepl(paste0("^", pattern[j], "\\((.*)\\).*"), i)) {
+            i <- all.vars(stats::as.formula(paste("~", i)))
           }
-        } else if (pattern[j] == "s" && startsWith(x[i], "s(")) {
-          x[i] <- gsub("^s\\(", "", x[i])
-          x[i] <- gsub("\\)$", "", x[i])
-          if (grepl("=|[[:digit:]]", x[i])) {
-            new_x <- trim_ws(unlist(strsplit(x[i], ",", fixed = TRUE), use.names = FALSE))
+        } else if (pattern[j] == "s" && startsWith(i, "s(")) {
+          i <- gsub("^s\\(", "", i)
+          i <- gsub("\\)$", "", i)
+          if (grepl("=|[[:digit:]]", i)) {
+            new_x <- trim_ws(unlist(strsplit(i, ",", fixed = TRUE), use.names = FALSE))
             to_remove <- which(!grepl("\\D", new_x))
             to_remove <- c(to_remove, grep("=", new_x, fixed = TRUE))
             if (length(to_remove) == 0) {
-              x[i] <- toString(new_x)
+              i <- toString(new_x)
             } else {
-              x[i] <- toString(new_x[-to_remove])
+              i <- toString(new_x[-to_remove])
             }
           }
         } else {
           # p <- paste0("^", pattern[j], "\\(([^,/)]*).*")
           # this one should be more generic...
           p <- paste0("^", pattern[j], "\\(((\\w|\\.)*).*")
-          x[i] <- unique(sub(p, "\\1", x[i]))
+          i <- unique(sub(p, "\\1", i))
         }
       }
     }
     # for coxme-models, remove random-effect things...
-    if (grepl("|", x[i], fixed = TRUE)) {
-      x[i] <- sub("^(.*)\\|(.*)", "\\2", x[i])
-    }
-    # either return regular term, or mm term for brms
-    if (is.null(multimembership)) {
-      trim_ws(x[i])
-    } else {
-      multimembership
+    if (grepl("|", i, fixed = TRUE)) {
+      i <- sub("^(.*)\\|(.*)", "\\2", i)
     }
+
+    trim_ws(i)
   }), use.names = FALSE)
 
   # remove for random intercept only models
diff --git a/tests/testthat/test-clean_names.R b/tests/testthat/test-clean_names.R
index 027f77f7a..f963a6177 100644
--- a/tests/testthat/test-clean_names.R
+++ b/tests/testthat/test-clean_names.R
@@ -65,6 +65,7 @@ test_that("clean_names", {
   expect_identical(clean_names("s(x1, x2, x3)"), "x1, x2, x3")
 })
 
+
 test_that("clean_names, model", {
   m_rel1 <- lm(mpg ~ relevel(as.factor(cyl), "8") + gear, data = mtcars)
   expect_identical(insight::clean_names(m_rel1), c("mpg", "cyl", "gear"))
@@ -74,11 +75,35 @@ test_that("clean_names, model", {
   expect_identical(insight::clean_names(m_rel2), c("mpg", "cyl", "gear"))
 })
 
-skip_on_cran()
-skip_if_offline()
-skip_if_not_installed("httr2")
 
 test_that("clean_names, multimembership", {
+  skip_if_not_installed("gamlss")
+  set.seed(123)
+  dat <- data.frame(
+    Y = sample(20:50, 100, replace = TRUE),
+    date = sample(seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day"), 10),
+    cont1 = rchisq(100, df = 2),
+    cont2 = runif(100),
+    cat1 = sample(LETTERS[1:3], 100, replace = TRUE)
+  )
+  junk <- capture.output({
+    mod1 <- suppressWarnings(gamlss::gamlss(
+      Y ~ date + scale(cont1) + scale(cont2) + I(scale(cont2)^2) * cat1,
+      data = dat
+    ))
+  })
+  expect_identical(
+    clean_names(find_terms(mod1)$conditional),
+    c("date", "cont1", "cont2", "cont2", "cat1")
+  )
+})
+
+
+test_that("clean_names, multimembership", {
+  skip_on_cran()
+  skip_if_offline()
+  skip_if_not_installed("httr2")
+
   m1 <- suppressWarnings(insight::download_model("brms_mm_1"))
   skip_if(is.null(m1))
   out <- clean_names(m1)
@@ -106,25 +131,14 @@ test_that("clean_names, multimembership", {
   )
 })
 
-skip_if_not_installed("gamlss")
 
-test_that("clean_names, multimembership", {
-  set.seed(123)
-  dat <- data.frame(
-    Y = sample(20:50, 100, replace = TRUE),
-    date = sample(seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day"), 10),
-    cont1 = rchisq(100, df = 2),
-    cont2 = runif(100),
-    cat1 = sample(LETTERS[1:3], 100, replace = TRUE)
-  )
-  junk <- capture.output({
-    mod1 <- suppressWarnings(gamlss::gamlss(
-      Y ~ date + scale(cont1) + scale(cont2) + I(scale(cont2)^2) * cat1,
-      data = dat
-    ))
-  })
-  expect_identical(
-    clean_names(find_terms(mod1)$conditional),
-    c("date", "cont1", "cont2", "cont2", "cat1")
-  )
+test_that("clean_names, division in I()", {
+  skip_if_not_installed("betareg")
+  data("GasolineYield", package = "betareg")
+  data("FoodExpenditure", package = "betareg")
+
+  m1 <- betareg::betareg(yield ~ batch + temp, data = GasolineYield)
+  m2 <- betareg::betareg(I(food / income) ~ income + persons, data = FoodExpenditure)
+  expect_identical(clean_names(m1), c("yield", "batch", "temp"))
+  expect_identical(clean_names(m1), c("food", "income", "persons"))
 })