add sample option (#996)

* add sample option * fix * fix * fix * add test * fix * fix * fix * fix
easystats · Jan 21, 2025 · 9bfe905 · 9bfe905
1 parent 6a5f930
commit 9bfe905
Show file tree

Hide file tree

Showing 5 changed files with 78 additions and 27 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: insight
 Title: Easy Access to Model Information for Various Model Objects
-Version: 1.0.1.4
+Version: 1.0.1.5
 Authors@R:
     c(person(given = "Daniel",
              family = "Lüdecke",

diff --git a/NEWS.md b/NEWS.md
@@ -5,6 +5,9 @@
 * `get_datagrid()` gives a more informative error message when a variable
   specified in `by` was not found in the data.
 
+* The `by` argument in `get_datagrid()` gets a new token-option, `"[sample <number>]"`,
+  to draw a random sample of values.
+
 ## Bug fixes
 
 * Option `"terciles"` and `"terciles2"` in `get_datagrid()` were swapped, i.e.

diff --git a/R/get_datagrid.R b/R/get_datagrid.R
@@ -42,14 +42,18 @@
 #'   - a "token" that creates pre-defined representative values:
 #'     - for mean and -/+ 1 SD around the mean: `"x = [sd]"`
 #'     - for median and -/+ 1 MAD around the median: `"x = [mad]"`
-#'     - for Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum): `"x = [fivenum]"`
+#'     - for Tukey's five number summary (minimum, lower-hinge, median,
+#'       upper-hinge, maximum): `"x = [fivenum]"`
 #'     - for terciles, including minimum and maximum: `"x = [terciles]"`
 #'     - for terciles, excluding minimum and maximum: `"x = [terciles2]"`
-#'     - for quartiles, including minimum and maximum: `"x = [quartiles]"` (same as `"x = [fivenum]"`)
+#'     - for quartiles, including minimum and maximum: `"x = [quartiles]"` (same
+#'       as `"x = [fivenum]"`)
 #'     - for quartiles, excluding minimum and maximum: `"x = [quartiles2]"`
 #'     - for a pretty value range: `"x = [pretty]"`
 #'     - for minimum and maximum value: `"x = [minmax]"`
 #'     - for 0 and the maximum value: `"x = [zeromax]"`
+#'     - for a random sample from all values: `"x = [sample <number>]"`, where
+#'       `<number>` should be a positive integer, e.g. `"x = [sample 15]"`.
 #'
 #'   For **factor** variables, the value(s) inside the brackets should indicate
 #'   one or more factor levels, like `by = "Species = [setosa, versicolor]"`.
@@ -788,20 +792,26 @@ get_datagrid.comparisons <- get_datagrid.slopes
       if (all(grepl('\\".*\\"', parts))) parts <- gsub('"', "", parts, fixed = TRUE)
 
       # Make expression ----------
-      if (is.factor(x) || is.character(x)) {
+      shortcuts <- c(
+        "meansd", "sd", "mad", "quartiles", "quartiles2", "zeromax",
+        "minmax", "terciles", "terciles2", "fivenum", "pretty"
+      )
+      if ((is.factor(x) && all(parts %in% levels(x))) || (is.character(x) && all(parts %in% x)))   {
         # Factor
         # Add quotes around them
         parts <- paste0("'", parts, "'")
         # Convert to character
         by_expression <- paste0("as.factor(c(", toString(parts), "))")
       } else if (length(parts) == 1) {
-        # Numeric
-        # If one, might be a shortcut
-        shortcuts <- c(
-          "meansd", "sd", "mad", "quartiles", "quartiles2", "zeromax",
-          "minmax", "terciles", "terciles2", "fivenum", "pretty"
-        )
-        if (parts %in% shortcuts) {
+        # If one, might be a shortcut. or a sampling request
+        if (grepl("sample", parts, fixed = TRUE)) {
+          n_to_sample <- .safe(as.numeric(trim_ws(gsub("sample", "", parts, fixed = TRUE))))
+          # do we have a proper definition of the sample size? If not, error
+          if (is.null(n_to_sample) || is.na(n_to_sample) || !length(n_to_sample)) {
+            format_error("The token `sample` must be followed by the number of samples to be drawn, e.g. `[sample 15]`.") # nolint
+          }
+          by_expression <- paste0("c(", paste(sample(x, n_to_sample), collapse = ","), ")")
+        } else if (parts %in% shortcuts) {
           if (parts %in% c("meansd", "sd")) {
             center <- mean(x, na.rm = TRUE)
             spread <- stats::sd(x, na.rm = TRUE)
@@ -813,7 +823,7 @@ get_datagrid.comparisons <- get_datagrid.slopes
           } else if (parts %in% c("fivenum", "quartiles")) {
             by_expression <- paste0("c(", paste(as.vector(stats::fivenum(x, na.rm = TRUE)), collapse = ","), ")")
           } else if (parts == "quartiles2") {
-            by_expression <- paste0("c(", paste(as.vector(stats::quantile(x, na.rm = TRUE))[2:4], collapse = ","), ")")
+            by_expression <- paste0("c(", paste(as.vector(stats::quantile(x, na.rm = TRUE))[2:4], collapse = ","), ")") # nolint
           } else if (parts == "terciles") {
             by_expression <- paste0("c(", paste(as.vector(stats::quantile(x, probs = (0:3) / 3, na.rm = TRUE)), collapse = ","), ")") # nolint
           } else if (parts == "terciles2") {
@@ -828,21 +838,28 @@ get_datagrid.comparisons <- get_datagrid.slopes
         } else if (is.numeric(parts)) {
           by_expression <- parts
         } else {
-          format_error(
-            paste0(
-              "The `by` argument (", by, ") should either indicate the minimum and the maximum, or one of the following options: ", # nolint
-              toString(shortcuts),
-              "."
-            )
-          )
+          by_expression <- NULL
         }
         # If only two, it's probably the range
-      } else if (length(parts) == 2) {
-        by_expression <- paste0("seq(", parts[1], ", ", parts[2], ", length.out = length)")
-        # If more, it's probably the vector
-      } else if (length(parts) > 2L) {
-        parts <- as.numeric(parts)
-        by_expression <- paste0("c(", toString(parts), ")")
+      } else if (is.numeric(x)) {
+        if (length(parts) == 2) {
+          by_expression <- paste0("seq(", parts[1], ", ", parts[2], ", length.out = length)")
+          # If more, it's probably the vector
+        } else if (length(parts) > 2L) {
+          parts <- as.numeric(parts)
+          by_expression <- paste0("c(", toString(parts), ")")
+        }
+      } else {
+        by_expression <- NULL
+      }
+      if (is.null(by_expression)) {
+        format_error(
+          paste0(
+            "The `by` argument (", by, ") should either indicate a valid factor level, the minimum and the maximum value of a vector, or one of the following options: ", # nolint
+            toString(shortcuts),
+            "."
+          )
+        )
       }
       # Else, try to directly eval the content
     } else {

diff --git a/man/get_datagrid.Rd b/man/get_datagrid.Rd
diff --git a/tests/testthat/test-get_datagrid.R b/tests/testthat/test-get_datagrid.R
@@ -49,7 +49,34 @@ test_that("get_datagrid - terciles, quartiles, mean-sd", {
   expect_equal(dg$Petal.Width, unname(quantile(iris$Petal.Width)), tolerance = 1e-4)
   expect_identical(attributes(dg)$adjusted_for, c("Petal.Length", "Species"))
 
+  set.seed(123)
+  dg <- insight::get_datagrid(m, "Petal.Width = [sample 8]")
+  set.seed(123)
+  expect_equal(dg$Petal.Width, sample(iris$Petal.Width, 8), tolerance = 1e-4)
+
+  expect_error(
+    insight::get_datagrid(m, "Petal.Width = [sample a]"),
+    regex = "must be followed"
+  )
+
+  dg <- insight::get_datagrid(m, "Species=[setosa]")
+  expect_identical(dim(dg), c(1L, 3L))
+
+  dg <- insight::get_datagrid(m, "Species=[setosa,versicolor]")
+  expect_identical(dim(dg), c(2L, 3L))
+
+  expect_error(
+    insight::get_datagrid(m, "Species=[setosa,wersicolor]"),
+    regex = "should either indicate"
+  )
+
+  expect_error(
+    insight::get_datagrid(m, "Species=[petosa]"),
+    regex = "should either indicate"
+  )
+
   skip_if_not_installed("ggeffects")
+  skip_if_not_installed("datawizard")
   data(efc, package = "ggeffects")
   efc$c161sex <- datawizard::to_factor(efc$c161sex)
   efc$e16sex <- datawizard::to_factor(efc$e16sex)