diff --git a/DESCRIPTION b/DESCRIPTION index 13aedd421..3ca8334c4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: insight Title: Easy Access to Model Information for Various Model Objects -Version: 1.0.1.4 +Version: 1.0.1.5 Authors@R: c(person(given = "Daniel", family = "Lüdecke", diff --git a/NEWS.md b/NEWS.md index 332d42410..12bc4896d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,9 @@ * `get_datagrid()` gives a more informative error message when a variable specified in `by` was not found in the data. +* The `by` argument in `get_datagrid()` gets a new token-option, `"[sample ]"`, + to draw a random sample of values. + ## Bug fixes * Option `"terciles"` and `"terciles2"` in `get_datagrid()` were swapped, i.e. diff --git a/R/get_datagrid.R b/R/get_datagrid.R index 28265c47f..3f1a6740d 100644 --- a/R/get_datagrid.R +++ b/R/get_datagrid.R @@ -42,14 +42,18 @@ #' - a "token" that creates pre-defined representative values: #' - for mean and -/+ 1 SD around the mean: `"x = [sd]"` #' - for median and -/+ 1 MAD around the median: `"x = [mad]"` -#' - for Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum): `"x = [fivenum]"` +#' - for Tukey's five number summary (minimum, lower-hinge, median, +#' upper-hinge, maximum): `"x = [fivenum]"` #' - for terciles, including minimum and maximum: `"x = [terciles]"` #' - for terciles, excluding minimum and maximum: `"x = [terciles2]"` -#' - for quartiles, including minimum and maximum: `"x = [quartiles]"` (same as `"x = [fivenum]"`) +#' - for quartiles, including minimum and maximum: `"x = [quartiles]"` (same +#' as `"x = [fivenum]"`) #' - for quartiles, excluding minimum and maximum: `"x = [quartiles2]"` #' - for a pretty value range: `"x = [pretty]"` #' - for minimum and maximum value: `"x = [minmax]"` #' - for 0 and the maximum value: `"x = [zeromax]"` +#' - for a random sample from all values: `"x = [sample ]"`, where +#' `` should be a positive integer, e.g. `"x = [sample 15]"`. #' #' For **factor** variables, the value(s) inside the brackets should indicate #' one or more factor levels, like `by = "Species = [setosa, versicolor]"`. @@ -788,20 +792,26 @@ get_datagrid.comparisons <- get_datagrid.slopes if (all(grepl('\\".*\\"', parts))) parts <- gsub('"', "", parts, fixed = TRUE) # Make expression ---------- - if (is.factor(x) || is.character(x)) { + shortcuts <- c( + "meansd", "sd", "mad", "quartiles", "quartiles2", "zeromax", + "minmax", "terciles", "terciles2", "fivenum", "pretty" + ) + if ((is.factor(x) && all(parts %in% levels(x))) || (is.character(x) && all(parts %in% x))) { # Factor # Add quotes around them parts <- paste0("'", parts, "'") # Convert to character by_expression <- paste0("as.factor(c(", toString(parts), "))") } else if (length(parts) == 1) { - # Numeric - # If one, might be a shortcut - shortcuts <- c( - "meansd", "sd", "mad", "quartiles", "quartiles2", "zeromax", - "minmax", "terciles", "terciles2", "fivenum", "pretty" - ) - if (parts %in% shortcuts) { + # If one, might be a shortcut. or a sampling request + if (grepl("sample", parts, fixed = TRUE)) { + n_to_sample <- .safe(as.numeric(trim_ws(gsub("sample", "", parts, fixed = TRUE)))) + # do we have a proper definition of the sample size? If not, error + if (is.null(n_to_sample) || is.na(n_to_sample) || !length(n_to_sample)) { + format_error("The token `sample` must be followed by the number of samples to be drawn, e.g. `[sample 15]`.") # nolint + } + by_expression <- paste0("c(", paste(sample(x, n_to_sample), collapse = ","), ")") + } else if (parts %in% shortcuts) { if (parts %in% c("meansd", "sd")) { center <- mean(x, na.rm = TRUE) spread <- stats::sd(x, na.rm = TRUE) @@ -813,7 +823,7 @@ get_datagrid.comparisons <- get_datagrid.slopes } else if (parts %in% c("fivenum", "quartiles")) { by_expression <- paste0("c(", paste(as.vector(stats::fivenum(x, na.rm = TRUE)), collapse = ","), ")") } else if (parts == "quartiles2") { - by_expression <- paste0("c(", paste(as.vector(stats::quantile(x, na.rm = TRUE))[2:4], collapse = ","), ")") + by_expression <- paste0("c(", paste(as.vector(stats::quantile(x, na.rm = TRUE))[2:4], collapse = ","), ")") # nolint } else if (parts == "terciles") { by_expression <- paste0("c(", paste(as.vector(stats::quantile(x, probs = (0:3) / 3, na.rm = TRUE)), collapse = ","), ")") # nolint } else if (parts == "terciles2") { @@ -828,21 +838,28 @@ get_datagrid.comparisons <- get_datagrid.slopes } else if (is.numeric(parts)) { by_expression <- parts } else { - format_error( - paste0( - "The `by` argument (", by, ") should either indicate the minimum and the maximum, or one of the following options: ", # nolint - toString(shortcuts), - "." - ) - ) + by_expression <- NULL } # If only two, it's probably the range - } else if (length(parts) == 2) { - by_expression <- paste0("seq(", parts[1], ", ", parts[2], ", length.out = length)") - # If more, it's probably the vector - } else if (length(parts) > 2L) { - parts <- as.numeric(parts) - by_expression <- paste0("c(", toString(parts), ")") + } else if (is.numeric(x)) { + if (length(parts) == 2) { + by_expression <- paste0("seq(", parts[1], ", ", parts[2], ", length.out = length)") + # If more, it's probably the vector + } else if (length(parts) > 2L) { + parts <- as.numeric(parts) + by_expression <- paste0("c(", toString(parts), ")") + } + } else { + by_expression <- NULL + } + if (is.null(by_expression)) { + format_error( + paste0( + "The `by` argument (", by, ") should either indicate a valid factor level, the minimum and the maximum value of a vector, or one of the following options: ", # nolint + toString(shortcuts), + "." + ) + ) } # Else, try to directly eval the content } else { diff --git a/man/get_datagrid.Rd b/man/get_datagrid.Rd index f0ed91802..537b77ccc 100644 --- a/man/get_datagrid.Rd +++ b/man/get_datagrid.Rd @@ -80,14 +80,18 @@ case these values are used as representative values. \itemize{ \item for mean and -/+ 1 SD around the mean: \code{"x = [sd]"} \item for median and -/+ 1 MAD around the median: \code{"x = [mad]"} -\item for Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum): \code{"x = [fivenum]"} +\item for Tukey's five number summary (minimum, lower-hinge, median, +upper-hinge, maximum): \code{"x = [fivenum]"} \item for terciles, including minimum and maximum: \code{"x = [terciles]"} \item for terciles, excluding minimum and maximum: \code{"x = [terciles2]"} -\item for quartiles, including minimum and maximum: \code{"x = [quartiles]"} (same as \code{"x = [fivenum]"}) +\item for quartiles, including minimum and maximum: \code{"x = [quartiles]"} (same +as \code{"x = [fivenum]"}) \item for quartiles, excluding minimum and maximum: \code{"x = [quartiles2]"} \item for a pretty value range: \code{"x = [pretty]"} \item for minimum and maximum value: \code{"x = [minmax]"} \item for 0 and the maximum value: \code{"x = [zeromax]"} +\item for a random sample from all values: \code{"x = [sample ]"}, where +\verb{} should be a positive integer, e.g. \code{"x = [sample 15]"}. } } diff --git a/tests/testthat/test-get_datagrid.R b/tests/testthat/test-get_datagrid.R index ce833f2a8..5ec8618e5 100644 --- a/tests/testthat/test-get_datagrid.R +++ b/tests/testthat/test-get_datagrid.R @@ -49,7 +49,34 @@ test_that("get_datagrid - terciles, quartiles, mean-sd", { expect_equal(dg$Petal.Width, unname(quantile(iris$Petal.Width)), tolerance = 1e-4) expect_identical(attributes(dg)$adjusted_for, c("Petal.Length", "Species")) + set.seed(123) + dg <- insight::get_datagrid(m, "Petal.Width = [sample 8]") + set.seed(123) + expect_equal(dg$Petal.Width, sample(iris$Petal.Width, 8), tolerance = 1e-4) + + expect_error( + insight::get_datagrid(m, "Petal.Width = [sample a]"), + regex = "must be followed" + ) + + dg <- insight::get_datagrid(m, "Species=[setosa]") + expect_identical(dim(dg), c(1L, 3L)) + + dg <- insight::get_datagrid(m, "Species=[setosa,versicolor]") + expect_identical(dim(dg), c(2L, 3L)) + + expect_error( + insight::get_datagrid(m, "Species=[setosa,wersicolor]"), + regex = "should either indicate" + ) + + expect_error( + insight::get_datagrid(m, "Species=[petosa]"), + regex = "should either indicate" + ) + skip_if_not_installed("ggeffects") + skip_if_not_installed("datawizard") data(efc, package = "ggeffects") efc$c161sex <- datawizard::to_factor(efc$c161sex) efc$e16sex <- datawizard::to_factor(efc$e16sex)