diff --git a/NAMESPACE b/NAMESPACE index 7700f05bd..831413a39 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -602,6 +602,7 @@ export(recipes_extension_check) export(recipes_names_outcomes) export(recipes_names_predictors) export(recipes_pkg_check) +export(recipes_ptype) export(recipes_remove_cols) export(remove_original_cols) export(remove_role) diff --git a/NEWS.md b/NEWS.md index 6e2ee62b6..e101f6bb5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,8 @@ * New `extract_fit_time()` method has been added that returns the time it took to train the recipe. (#1071) +* Developer helper function `recipes_ptype()` has been added, returning expected input data for `prep()` and `bake()` for a given recipe object. (#1329) + * The `prefix` argument of `step_dummy_multi_choice()` is not properly documented. (#1298) * `step_dummy()` now gives an informative error on attempt to generate too many columns to fit in memory. (#828) diff --git a/R/developer.R b/R/developer.R index a9f1b841b..06578c4d8 100644 --- a/R/developer.R +++ b/R/developer.R @@ -139,6 +139,11 @@ #' #' # Interacting with recipe objects #' +#' [recipes_ptype()] returns the ptype, expected variables and types, that a +#' recipe object expects at `prep()` and `bake()` time. Controlled using the +#' `stage` argument. Can be used by functions that interact with recipes to +#' verify data is correct before passing it to `prep()` and `bake()`. +#' #' [detect_step()] returns a logical indicator to determine if a given step or #' check is included in a recipe. #' diff --git a/R/ptype.R b/R/ptype.R new file mode 100644 index 000000000..85e808b99 --- /dev/null +++ b/R/ptype.R @@ -0,0 +1,113 @@ +#' Prototype of recipe object +#' +#' This helper function returns the prototype of the input data set expected by +#' the recipe object. +#' +#' @param x A `recipe` object. +#' @param ... currently not used. +#' @param stage A single character. Must be one of `"prep"` or `"bake"`. See +#' details for more. Defaults to `"prep"`. 
+#' +#' @details +#' The returned ptype is a tibble of the data set that the recipe object is +#' expecting. The specifics of which columns depend on the `stage`. +#' +#' At `prep()` time, when `stage = "prep"`, the ptype is the data passed to +#' `recipe()`. The following code chunk represents a possible recipe scenario. +#' `recipes_ptype(rec_spec, stage = "prep")` and +#' `recipes_ptype(rec_prep, stage = "prep")` both return a ptype tibble +#' corresponding to `data_ptype`. This information is used internally in +#' `prep()` to verify that `data_training` has the right columns with the right +#' types. +#' +#' ```r +#' rec_spec <- recipe(outcome ~ ., data = data_ptype) %>% +#' step_normalize(all_numeric_predictors()) %>% +#' step_dummy(all_nominal_predictors()) +#' +#' rec_prep <- prep(rec_spec, training = data_training) +#' ``` +#' +#' At `bake()` time, when `stage = "bake"`, the ptype represents the data +#' that are required for `bake()` to run. +#' +#' ```r +#' data_bake <- bake(rec_prep, new_data = data_testing) +#' ``` +#' +#' What this means in practice is that unless otherwise specified, everything +#' but outcomes and case weights are required. These requirements can be changed +#' with `update_role_requirements()`, and `recipes_ptype()` respects those +#' changes. +#' +#' Note that the order of the columns isn't guaranteed to align with +#' `data_ptype` as the data internally is ordered according to roles. +#' +#' @return A zero row tibble. 
+#' @keywords internal +#' +#' @seealso [developer_functions] +#' +#' @examples +#' training <- tibble( +#' y = 1:10, +#' id = 1:10, +#' x1 = letters[1:10], +#' x2 = factor(letters[1:10]), +#' cw = hardhat::importance_weights(1:10) +#' ) +#' training +#' +#' rec_spec <- recipe(y ~ ., data = training) +#' +#' # outcomes and case_weights are not required at bake time +#' recipes_ptype(rec_spec, stage = "prep") +#' recipes_ptype(rec_spec, stage = "bake") +#' +#' rec_spec <- recipe(y ~ ., data = training) %>% +#' update_role(x1, new_role = "id") +#' +#' # outcomes and case_weights are not required at bake time +#' # "id" column is assumed to be needed +#' recipes_ptype(rec_spec, stage = "prep") +#' recipes_ptype(rec_spec, stage = "bake") +#' +#' rec_spec <- recipe(y ~ ., data = training) %>% +#' update_role(x1, new_role = "id") %>% +#' update_role_requirements("id", bake = FALSE) +#' +#' # update_role_requirements() is used to specify that "id" isn't needed +#' recipes_ptype(rec_spec, stage = "prep") +#' recipes_ptype(rec_spec, stage = "bake") +#' +#' @export +recipes_ptype <- function(x, ..., stage = "prep") { + check_dots_empty0(...) + + if (is.null(x$ptype)) { + cli::cli_abort( + c( + x = "Doesn't work on recipes created prior to version 1.1.0.", + i = "Please recreate recipe." 
+ ) + ) + } + + ptype <- x$ptype + + stage <- rlang::arg_match(stage, values = c("prep", "bake")) + + if (stage == "bake") { + required_roles <- compute_bake_role_requirements(x) + + var_info <- x$var_info + roles <- var_info$role + roles <- chr_explicit_na(roles) + + required_var <- var_info$variable[required_roles[roles]] + + ptype <- ptype[names(ptype) %in% required_var] + } + + ptype +} diff --git a/R/recipe.R b/R/recipe.R index e61f3b0b7..67ba1a6b4 100644 --- a/R/recipe.R +++ b/R/recipe.R @@ -184,7 +184,8 @@ recipe.data.frame <- template = x, levels = NULL, retained = NA, - requirements = requirements + requirements = requirements, + ptype = vctrs::vec_ptype(x) ) class(out) <- "recipe" out diff --git a/_pkgdown.yml b/_pkgdown.yml index 3f8d9a43b..44742f863 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -165,6 +165,7 @@ reference: - prepper - recipes_eval_select - recipes_extension_check + - recipes_ptype - recipes-role-indicator - update.step - title: Tidy Methods diff --git a/man/developer_functions.Rd b/man/developer_functions.Rd index 024200535..a62163ad5 100644 --- a/man/developer_functions.Rd +++ b/man/developer_functions.Rd @@ -128,6 +128,11 @@ new columns. } \section{Interacting with recipe objects}{ +\code{\link[=recipes_ptype]{recipes_ptype()}} returns the ptype, expected variables and types, that a +recipe object expects at \code{prep()} and \code{bake()} time. Controlled using the +\code{stage} argument. Can be used by functions that interact with recipes to +verify data is correct before passing it to \code{prep()} and \code{bake()}. + \code{\link[=detect_step]{detect_step()}} returns a logical indicator to determine if a given step or check is included in a recipe. 
diff --git a/man/recipes_ptype.Rd b/man/recipes_ptype.Rd new file mode 100644 index 000000000..7f518ae49 --- /dev/null +++ b/man/recipes_ptype.Rd @@ -0,0 +1,93 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ptype.R +\name{recipes_ptype} +\alias{recipes_ptype} +\title{Prototype of recipe object} +\usage{ +recipes_ptype(x, ..., stage = "prep") +} +\arguments{ +\item{x}{A \code{recipe} object.} + +\item{...}{currently not used.} + +\item{stage}{A single character. Must be one of \code{"prep"} or \code{"bake"}. See +details for more. Defaults to \code{"prep"}.} +} +\value{ +A zero row tibble. +} +\description{ +This helper function returns the prototype of the input data set expected by +the recipe object. +} +\details{ +The returned ptype is a tibble of the data set that the recipe object is +expecting. The specifics of which columns depend on the \code{stage}. + +At \code{prep()} time, when \code{stage = "prep"}, the ptype is the data passed to +\code{recipe()}. The following code chunk represents a possible recipe scenario. +\code{recipes_ptype(rec_spec, stage = "prep")} and +\code{recipes_ptype(rec_prep, stage = "prep")} both return a ptype tibble +corresponding to \code{data_ptype}. This information is used internally in +\code{prep()} to verify that \code{data_training} has the right columns with the right +types. +
+\if{html}{\out{<div class="sourceCode r">}}\preformatted{rec_spec <- recipe(outcome ~ ., data = data_ptype) \%>\% + step_normalize(all_numeric_predictors()) \%>\% + step_dummy(all_nominal_predictors()) + +rec_prep <- prep(rec_spec, training = data_training)
+}\if{html}{\out{</div>}} + +At \code{bake()} time, when \code{stage = "bake"}, the ptype represents the data +that are required for \code{bake()} to run. +
+\if{html}{\out{<div class="sourceCode r">}}\preformatted{data_bake <- bake(rec_prep, new_data = data_testing)
+}\if{html}{\out{</div>}} + +What this means in practice is that unless otherwise specified, everything +but outcomes and case weights are required. These requirements can be changed +with \code{update_role_requirements()}, and \code{recipes_ptype()} respects those +changes. + +Note that the order of the columns aren't guaranteed to align with +\code{data_ptype} as the data internally is ordered according to roles. +} +\examples{ +training <- tibble( + y = 1:10, + id = 1:10, + x1 = letters[1:10], + x2 = factor(letters[1:10]), + cw = hardhat::importance_weights(1:10) +) +training + +rec_spec <- recipe(y ~ ., data = training) + +# outcomes and case_weights are not required at bake time +recipes_ptype(rec_spec, stage = "prep") +recipes_ptype(rec_spec, stage = "bake") + +rec_spec <- recipe(y ~ ., data = training) \%>\% + update_role(x1, new_role = "id") + +# outcomes and case_weights are not required at bake time +# "id" column is assumed to be needed +recipes_ptype(rec_spec, stage = "prep") +recipes_ptype(rec_spec, stage = "bake") + +rec_spec <- recipe(y ~ ., data = training) \%>\% + update_role(x1, new_role = "id") \%>\% + update_role_requirements("id", bake = FALSE) + +# update_role_requirements() is used to specify that "id" isn't needed +recipes_ptype(rec_spec, stage = "prep") +recipes_ptype(rec_spec, stage = "bake") + +} +\seealso{ +\link{developer_functions} +} +\keyword{internal} diff --git a/tests/testthat/_snaps/ptype.md b/tests/testthat/_snaps/ptype.md new file mode 100644 index 000000000..a1e63cfb8 --- /dev/null +++ b/tests/testthat/_snaps/ptype.md @@ -0,0 +1,9 @@ +# recipes_ptype errors on old recipes + + Code + recipes_ptype(rec) + Condition + Error in `recipes_ptype()`: + x Doesn't work on recipes created prior to version 1.1.0. + i Please recreate recipe. 
+ diff --git a/tests/testthat/test-ptype.R b/tests/testthat/test-ptype.R new file mode 100644 index 000000000..b5b06c446 --- /dev/null +++ b/tests/testthat/test-ptype.R @@ -0,0 +1,131 @@ +test_that("recipes_ptype() works", { + data_orig <- tibble( + y = 1:10, + id = 1:10, + x1 = letters[1:10], + x2 = factor(letters[1:10]), + cw = hardhat::importance_weights(1:10) + ) + + rec_spec <- recipe(y ~ ., data = data_orig) + + exp_ptype <- vctrs::vec_ptype(data_orig) + + expect_identical( + recipes_ptype(rec_spec, stage = "prep")[names(exp_ptype)], + exp_ptype + ) + expect_identical( + recipes_ptype(rec_spec, stage = "bake"), + exp_ptype[c("id", "x1", "x2")] + ) +}) + +test_that("recipes_ptype() isn't affected by prepping recipe", { + data_orig <- tibble( + y = 1:10, + id = 1:10, + x1 = letters[1:10], + x2 = factor(letters[1:10]), + cw = hardhat::importance_weights(1:10) + ) + + rec_spec <- recipe(y ~ ., data = data_orig) %>% + step_dummy(all_nominal_predictors()) %>% + prep() + + exp_ptype <- vctrs::vec_ptype(data_orig) + + expect_identical( + recipes_ptype(rec_spec, stage = "prep")[names(exp_ptype)], + exp_ptype + ) + expect_identical( + recipes_ptype(rec_spec, stage = "bake"), + exp_ptype[c("id", "x1", "x2")] + ) +}) + +test_that("recipes_ptype() works with update_role()", { + data_orig <- tibble( + y = 1:10, + id = 1:10, + x1 = letters[1:10], + x2 = factor(letters[1:10]), + cw = hardhat::importance_weights(1:10) + ) + + rec_spec <- recipe(y ~ ., data = data_orig) %>% + update_role(id, new_role = "id") + + exp_ptype <- vctrs::vec_ptype(data_orig) + + expect_identical( + recipes_ptype(rec_spec, stage = "prep")[names(exp_ptype)], + exp_ptype + ) + expect_identical( + recipes_ptype(rec_spec, stage = "bake"), + exp_ptype[c("id", "x1", "x2")] + ) +}) + +test_that("recipes_ptype() works with update_role_requirements()", { + data_orig <- tibble( + y = 1:10, + id = 1:10, + x1 = letters[1:10], + x2 = factor(letters[1:10]), + cw = hardhat::importance_weights(1:10) + ) + + rec_spec 
<- recipe(y ~ ., data = data_orig) %>% + update_role(id, new_role = "id") %>% + update_role_requirements("id", bake = FALSE) + + exp_ptype <- vctrs::vec_ptype(data_orig) + + expect_identical( + recipes_ptype(rec_spec, stage = "prep")[names(exp_ptype)], + exp_ptype + ) + expect_identical( + recipes_ptype(rec_spec, stage = "bake"), + exp_ptype[c("x1", "x2")] + ) +}) + +test_that("recipes_ptype() works with NA roles", { + data_orig <- tibble( + y = 1:10, + id = 1:10, + x1 = letters[1:10], + x2 = factor(letters[1:10]), + cw = hardhat::importance_weights(1:10) + ) + + rec_spec <- recipe(data_orig) + + exp_ptype <- vctrs::vec_ptype(data_orig) + + expect_identical( + recipes_ptype(rec_spec, stage = "prep")[names(exp_ptype)], + exp_ptype + ) + expect_identical( + recipes_ptype(rec_spec, stage = "bake"), + exp_ptype[c("y", "id", "x1", "x2")] + ) +}) + +test_that("recipes_ptype errors on old recipes", { + rec <- recipe(mpg ~ ., data = mtcars) + + # simulate pre-1.1.0 recipe + rec$ptype <- NULL + + expect_snapshot( + error = TRUE, + recipes_ptype(rec) + ) +}) diff --git a/tests/testthat/test-selections.R b/tests/testthat/test-selections.R index d131c5b13..baef83d5b 100644 --- a/tests/testthat/test-selections.R +++ b/tests/testthat/test-selections.R @@ -310,6 +310,9 @@ test_that("old recipes from 1.0.1 work with new get_types", { ) expect_false(identical(old_rec_sac, rec_sac)) + # Avoid issue with new ptype field in 1.1.0 + rec_sac$ptype <- NULL + expect_identical( prep(old_rec_sac), prep(rec_sac)