Skip to content

Commit

Permalink
add sample option (#996)
Browse files Browse the repository at this point in the history
* add sample option

* fix

* fix

* fix

* add test

* fix

* fix

* fix

* fix
  • Loading branch information
strengejacke authored Jan 21, 2025
1 parent 6a5f930 commit 9bfe905
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 27 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: insight
Title: Easy Access to Model Information for Various Model Objects
Version: 1.0.1.4
Version: 1.0.1.5
Authors@R:
c(person(given = "Daniel",
family = "Lüdecke",
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
* `get_datagrid()` gives a more informative error message when a variable
specified in `by` was not found in the data.

* The `by` argument in `get_datagrid()` gets a new token-option, `"[sample <number>]"`,
to draw a random sample of values.

## Bug fixes

* Option `"terciles"` and `"terciles2"` in `get_datagrid()` were swapped, i.e.
Expand Down
65 changes: 41 additions & 24 deletions R/get_datagrid.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,18 @@
#' - a "token" that creates pre-defined representative values:
#' - for mean and -/+ 1 SD around the mean: `"x = [sd]"`
#' - for median and -/+ 1 MAD around the median: `"x = [mad]"`
#' - for Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum): `"x = [fivenum]"`
#' - for Tukey's five number summary (minimum, lower-hinge, median,
#' upper-hinge, maximum): `"x = [fivenum]"`
#' - for terciles, including minimum and maximum: `"x = [terciles]"`
#' - for terciles, excluding minimum and maximum: `"x = [terciles2]"`
#' - for quartiles, including minimum and maximum: `"x = [quartiles]"` (same as `"x = [fivenum]"`)
#' - for quartiles, including minimum and maximum: `"x = [quartiles]"` (same
#' as `"x = [fivenum]"`)
#' - for quartiles, excluding minimum and maximum: `"x = [quartiles2]"`
#' - for a pretty value range: `"x = [pretty]"`
#' - for minimum and maximum value: `"x = [minmax]"`
#' - for 0 and the maximum value: `"x = [zeromax]"`
#' - for a random sample from all values: `"x = [sample <number>]"`, where
#' `<number>` should be a positive integer, e.g. `"x = [sample 15]"`.
#'
#' For **factor** variables, the value(s) inside the brackets should indicate
#' one or more factor levels, like `by = "Species = [setosa, versicolor]"`.
Expand Down Expand Up @@ -788,20 +792,26 @@ get_datagrid.comparisons <- get_datagrid.slopes
if (all(grepl('\\".*\\"', parts))) parts <- gsub('"', "", parts, fixed = TRUE)

# Make expression ----------
if (is.factor(x) || is.character(x)) {
shortcuts <- c(
"meansd", "sd", "mad", "quartiles", "quartiles2", "zeromax",
"minmax", "terciles", "terciles2", "fivenum", "pretty"
)
if ((is.factor(x) && all(parts %in% levels(x))) || (is.character(x) && all(parts %in% x))) {
# Factor
# Add quotes around them
parts <- paste0("'", parts, "'")
# Convert to character
by_expression <- paste0("as.factor(c(", toString(parts), "))")
} else if (length(parts) == 1) {
# Numeric
# If one, might be a shortcut
shortcuts <- c(
"meansd", "sd", "mad", "quartiles", "quartiles2", "zeromax",
"minmax", "terciles", "terciles2", "fivenum", "pretty"
)
if (parts %in% shortcuts) {
# If one, might be a shortcut. or a sampling request
if (grepl("sample", parts, fixed = TRUE)) {
n_to_sample <- .safe(as.numeric(trim_ws(gsub("sample", "", parts, fixed = TRUE))))
# do we have a proper definition of the sample size? If not, error
if (is.null(n_to_sample) || is.na(n_to_sample) || !length(n_to_sample)) {
format_error("The token `sample` must be followed by the number of samples to be drawn, e.g. `[sample 15]`.") # nolint
}
by_expression <- paste0("c(", paste(sample(x, n_to_sample), collapse = ","), ")")
} else if (parts %in% shortcuts) {
if (parts %in% c("meansd", "sd")) {
center <- mean(x, na.rm = TRUE)
spread <- stats::sd(x, na.rm = TRUE)
Expand All @@ -813,7 +823,7 @@ get_datagrid.comparisons <- get_datagrid.slopes
} else if (parts %in% c("fivenum", "quartiles")) {
by_expression <- paste0("c(", paste(as.vector(stats::fivenum(x, na.rm = TRUE)), collapse = ","), ")")
} else if (parts == "quartiles2") {
by_expression <- paste0("c(", paste(as.vector(stats::quantile(x, na.rm = TRUE))[2:4], collapse = ","), ")")
by_expression <- paste0("c(", paste(as.vector(stats::quantile(x, na.rm = TRUE))[2:4], collapse = ","), ")") # nolint
} else if (parts == "terciles") {
by_expression <- paste0("c(", paste(as.vector(stats::quantile(x, probs = (0:3) / 3, na.rm = TRUE)), collapse = ","), ")") # nolint
} else if (parts == "terciles2") {
Expand All @@ -828,21 +838,28 @@ get_datagrid.comparisons <- get_datagrid.slopes
} else if (is.numeric(parts)) {
by_expression <- parts
} else {
format_error(
paste0(
"The `by` argument (", by, ") should either indicate the minimum and the maximum, or one of the following options: ", # nolint
toString(shortcuts),
"."
)
)
by_expression <- NULL
}
# If only two, it's probably the range
} else if (length(parts) == 2) {
by_expression <- paste0("seq(", parts[1], ", ", parts[2], ", length.out = length)")
# If more, it's probably the vector
} else if (length(parts) > 2L) {
parts <- as.numeric(parts)
by_expression <- paste0("c(", toString(parts), ")")
} else if (is.numeric(x)) {
if (length(parts) == 2) {
by_expression <- paste0("seq(", parts[1], ", ", parts[2], ", length.out = length)")
# If more, it's probably the vector
} else if (length(parts) > 2L) {
parts <- as.numeric(parts)
by_expression <- paste0("c(", toString(parts), ")")
}
} else {
by_expression <- NULL
}
if (is.null(by_expression)) {
format_error(
paste0(
"The `by` argument (", by, ") should either indicate a valid factor level, the minimum and the maximum value of a vector, or one of the following options: ", # nolint
toString(shortcuts),
"."
)
)
}
# Else, try to directly eval the content
} else {
Expand Down
8 changes: 6 additions & 2 deletions man/get_datagrid.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions tests/testthat/test-get_datagrid.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,34 @@ test_that("get_datagrid - terciles, quartiles, mean-sd", {
expect_equal(dg$Petal.Width, unname(quantile(iris$Petal.Width)), tolerance = 1e-4)
expect_identical(attributes(dg)$adjusted_for, c("Petal.Length", "Species"))

set.seed(123)
dg <- insight::get_datagrid(m, "Petal.Width = [sample 8]")
set.seed(123)
expect_equal(dg$Petal.Width, sample(iris$Petal.Width, 8), tolerance = 1e-4)

expect_error(
insight::get_datagrid(m, "Petal.Width = [sample a]"),
regex = "must be followed"
)

dg <- insight::get_datagrid(m, "Species=[setosa]")
expect_identical(dim(dg), c(1L, 3L))

dg <- insight::get_datagrid(m, "Species=[setosa,versicolor]")
expect_identical(dim(dg), c(2L, 3L))

expect_error(
insight::get_datagrid(m, "Species=[setosa,wersicolor]"),
regex = "should either indicate"
)

expect_error(
insight::get_datagrid(m, "Species=[petosa]"),
regex = "should either indicate"
)

skip_if_not_installed("ggeffects")
skip_if_not_installed("datawizard")
data(efc, package = "ggeffects")
efc$c161sex <- datawizard::to_factor(efc$c161sex)
efc$e16sex <- datawizard::to_factor(efc$e16sex)
Expand Down

0 comments on commit 9bfe905

Please sign in to comment.