Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement pivot_longer() #204

Merged
merged 14 commits into from
Mar 8, 2021
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

* `fill()` (@markfairbanks, #197)

* `pivot_longer()` (@markfairbanks, #204)

* `replace_na()` (@markfairbanks, #202)

# dtplyr 1.1.0
Expand Down
388 changes: 388 additions & 0 deletions R/step-call-pivot_longer.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,388 @@
#' Pivot data from wide to long
#'
#' @description
#' This is a method for the tidyr `pivot_longer()` generic. It is translated to
#' [data.table::melt()]
#'
#' @param data A [lazy_dt()].
#' @inheritParams tidyr::pivot_longer
#' @examples
#' library(tidyr)
#'
#' # Simplest case where column names are character data
#' relig_income_dt <- lazy_dt(relig_income)
#' relig_income_dt %>%
#' pivot_longer(!religion, names_to = "income", values_to = "count")
#'
#' # Slightly more complex case where columns have common prefix,
#' # and missing missings are structural so should be dropped.
#' billboard_dt <- lazy_dt(billboard)
#' billboard %>%
#' pivot_longer(
#' cols = starts_with("wk"),
#' names_to = "week",
#' names_prefix = "wk",
#' values_to = "rank",
#' values_drop_na = TRUE
#' )
#'
#' # Multiple variables stored in column names
#' lazy_dt(who) %>%
#' pivot_longer(
#' cols = new_sp_m014:newrel_f65,
#' names_to = c("diagnosis", "gender", "age"),
#' names_pattern = "new_?(.*)_(.)(.*)",
#' values_to = "count"
#' )
#'
#' # Multiple observations per row
#' anscombe_dt <- lazy_dt(anscombe)
#' anscombe_dt %>%
#' pivot_longer(
#' everything(),
#' names_to = c(".value", "set"),
#' names_pattern = "(.)(.)"
#' )
# exported onLoad
pivot_longer.dtplyr_step <- function(data,
cols,
names_to = "name",
names_prefix = NULL,
names_sep = NULL,
names_pattern = NULL,
names_ptypes = list(),
names_transform = list(),
names_repair = "check_unique",
values_to = "value",
values_drop_na = FALSE,
values_ptypes = list(),
values_transform = list(),
...) {

sim_data <- simulate_vars(data)
sim_vars <- names(sim_data)
markfairbanks marked this conversation as resolved.
Show resolved Hide resolved
measure_vars <- names(tidyselect::eval_select(enquo(cols), sim_data))
if (length(measure_vars) == 0) {
abort("`cols` must select at least one column.")
}

multiple_names_to <- length(names_to) > 1
uses_dot_value <- ".value" %in% names_to

variable_name <- "variable"

if (uses_dot_value) {
if (!is.null(names_sep)) {
.value <- str_separate(measure_vars, into = names_to, sep = names_sep)$.value
} else if (!is.null(names_pattern)) {
.value <- str_extract(measure_vars, into = names_to, names_pattern)$.value
} else {
abort("If you use '.value' in `names_to` you must also supply
`names_sep' or `names_pattern")
}

v_fct <- factor(.value, levels = unique(.value))
measure_vars <- split(measure_vars, v_fct)
values_to <- names(measure_vars)
names(measure_vars) <- NULL

if (multiple_names_to) {
variable_name <- names_to[!names_to == ".value"]
}
} else if (multiple_names_to) {
if (is.null(names_sep) && is.null(names_pattern)) {
abort("If you supply multiple names in `names_to` you must also
supply `names_sep` or `names_pattern`")
} else if (!is.null(names_sep) && !is.null(names_pattern)) {
abort("only one of names_sep or names_pattern should be provided")
}
} else {
variable_name <- names_to
}

args <- list(
measure.vars = measure_vars,
variable.name = variable_name,
value.name = values_to,
na.rm = values_drop_na,
variable.factor = FALSE
)

# Clean up call args if defaults are used
if (variable_name == "variable") {
args$variable.name <- NULL
}

if (length(values_to) == 1) {
markfairbanks marked this conversation as resolved.
Show resolved Hide resolved
if (values_to == "value") {
args$value.name <- NULL
}
}

if (is_false(values_drop_na)) {
args$na.rm <- NULL
}

id_vars <- sim_vars[!sim_vars %in% unlist(measure_vars)]

out <- step_call(
data,
"melt",
args = args,
vars = c(id_vars, variable_name, values_to)
)

if (!is.null(names_prefix)) {
out <- mutate(out, !!variable_name := gsub(paste0("^", names_prefix), "", !!sym(variable_name)))
}

if (multiple_names_to && !uses_dot_value) {
if (!is.null(names_sep)) {
into_cols <- str_separate(pull(out, !!sym(variable_name)), names_to, sep = names_sep)
} else {
into_cols <- str_extract(pull(out, !!sym(variable_name)), into = names_to, regex = names_pattern)
}
out <- mutate(out, !!!into_cols)

# Need to drop variable_name and move names_to vars to correct position
# Recreates relocate logic so only select is necessary, not relocate + select
out_vars <- out$vars
var_idx <- which(out_vars == variable_name)
before_vars <- out_vars[seq_along(out_vars) < var_idx]
after_vars <- out_vars[seq_along(out_vars) > var_idx]

out <- select(out, !!!syms(before_vars), !!!syms(names_to), !!!syms(after_vars))
hadley marked this conversation as resolved.
Show resolved Hide resolved
} else if (!multiple_names_to && uses_dot_value) {
out <- mutate(out, variable = NULL)
}

out <- step_repair(out, repair = names_repair)

## names_ptype & names_transform
cast_vars <- intersect(names_to, names(names_ptypes))
markfairbanks marked this conversation as resolved.
Show resolved Hide resolved
if (length(cast_vars) > 0) {
cast_calls <- vector("list", length(cast_vars))
names(cast_calls) <- cast_vars
for (i in seq_along(cast_calls)) {
cast_calls[[i]] <- call2("vec_cast", sym(cast_vars[[i]]), names_ptypes[[i]])
}
out <- mutate(out, !!!cast_calls)
}

coerce_vars <- intersect(names_to, names(names_transform))
if (length(coerce_vars) > 0) {
coerce_calls <- vector("list", length(coerce_vars))
names(coerce_calls) <- coerce_vars
for (i in seq_along(coerce_calls)) {
.fn <- as_function(names_transform[[i]])
coerce_calls[[i]] <- call2(.fn, sym(coerce_vars[[i]]))
}
out <- mutate(out, !!!coerce_calls)
}

## values_ptype & values_transform
cast_vars <- intersect(values_to, names(values_ptypes))
if (length(cast_vars) > 0) {
cast_calls <- vector("list", length(cast_vars))
names(cast_calls) <- cast_vars
for (i in seq_along(cast_calls)) {
cast_calls[[i]] <- call2(expr(vec_cast), sym(cast_vars[[i]]), values_ptypes[[i]])
}
out <- mutate(out, !!!cast_calls)
}

coerce_vars <- intersect(values_to, names(values_transform))
if (length(coerce_vars) > 0) {
coerce_calls <- vector("list", length(coerce_vars))
names(coerce_calls) <- coerce_vars
for (i in seq_along(coerce_calls)) {
.fn <- as_function(values_transform[[i]])
coerce_calls[[i]] <- call2(.fn, sym(coerce_vars[[i]]))
}
out <- mutate(out, !!!coerce_calls)
}

out
}

# exported onLoad
pivot_longer.data.table <- function(data,
cols,
names_to = "name",
names_prefix = NULL,
names_sep = NULL,
names_pattern = NULL,
names_ptypes = list(),
names_transform = list(),
names_repair = "check_unique",
values_to = "value",
values_drop_na = FALSE,
values_ptypes = list(),
values_transform = list(),
...) {
data <- lazy_dt(data)
tidyr::pivot_longer(
data = data,
cols = {{ cols }},
names_to = names_to,
names_prefix = names_prefix,
names_sep = names_sep,
names_pattern = names_pattern,
names_ptypes = names_ptypes,
names_transform = names_transform,
names_repair = names_repair,
values_to = values_to,
values_drop_na = values_drop_na,
values_ptypes = values_ptypes,
values_transform = values_transform,
...
)
}

# str_extract() -----------------------------------------------------------------
str_extract <- function(x, into, regex, convert = FALSE) {
stopifnot(
is_string(regex),
is_character(into)
)

out <- str_match_first(x, regex)
if (length(out) != length(into)) {
stop(
"`regex` should define ", length(into), " groups; ", ncol(out), " found.",
call. = FALSE
)
}

# Handle duplicated names
if (anyDuplicated(into)) {
pieces <- split(out, into)
into <- names(pieces)
out <- lapply(pieces, pmap_chr, paste0, sep = "")
}

into <- as_utf8_character(into)

non_na_into <- !is.na(into)
out <- out[non_na_into]
names(out) <- into[non_na_into]

if (convert) {
out[] <- lapply(out, utils::type.convert, as.is = TRUE)
}

out
}

str_match_first <- function(string, regex) {
loc <- regexpr(regex, string, perl = TRUE)
loc <- group_loc(loc)

out <- lapply(
seq_len(loc$matches),
function(i) substr(string, loc$start[, i], loc$end[, i])
)
out[-1]
}

group_loc <- function(x) {
start <- cbind(as.vector(x), attr(x, "capture.start"))
end <- start + cbind(attr(x, "match.length"), attr(x, "capture.length")) - 1L

no_match <- start == -1L
start[no_match] <- NA
end[no_match] <- NA

list(matches = ncol(start), start = start, end = end)
}

# str_separate() -----------------------------------------------------------------

str_separate <- function(x, into, sep, convert = FALSE, extra = "warn", fill = "warn") {
if (!is.character(into)) {
abort("`into` must be a character vector")
}

if (is.numeric(sep)) {
out <- strsep(x, sep)
} else if (is_character(sep)) {
out <- data.table::tstrsplit(x, sep, fixed = TRUE, names = TRUE)
out <- as_tibble(out)
} else {
abort("`sep` must be either numeric or character")
}

names(out) <- as_utf8_character(into)
out <- out[!is.na(names(out))]
if (convert) {
out[] <- lapply(out, utils::type.convert, as.is = TRUE)
}
out
}

strsep <- function(x, sep) {
nchar <- nchar(x)
pos <- lapply(sep, function(i) {
if (i >= 0) return(i)
pmax(0, nchar + i)
})
pos <- c(list(0), pos, list(nchar))

lapply(1:(length(pos) - 1), function(i) {
substr(x, pos[[i]] + 1, pos[[i + 1]])
})
}

str_split_n <- function(x, pattern, n_max = -1) {
if (is.factor(x)) {
x <- as.character(x)
}
m <- gregexpr(pattern, x, perl = TRUE)
if (n_max > 0) {
m <- lapply(m, function(x) slice_match(x, seq_along(x) < n_max))
}
regmatches(x, m, invert = TRUE)
}

slice_match <- function(x, i) {
structure(
x[i],
match.length = attr(x, "match.length")[i],
index.type = attr(x, "index.type"),
useBytes = attr(x, "useBytes")
)
}

list_indices <- function(x, max = 20) {
if (length(x) > max) {
x <- c(x[seq_len(max)], "...")
}

paste(x, collapse = ", ")
}

# pmap()/pmap_chr() -----------------------------------------------------------------

args_recycle <- function(args) {
lengths <- vapply(args, length, integer(1))
n <- max(lengths)

stopifnot(all(lengths == 1L | lengths == n))
to_recycle <- lengths == 1L
args[to_recycle] <- lapply(args[to_recycle], function(x) rep.int(x, n))

args
}

pmap <- function(.l, .f, ...) {
args <- args_recycle(.l)
do.call("mapply", c(
FUN = list(quote(.f)),
args, MoreArgs = quote(list(...)),
SIMPLIFY = FALSE, USE.NAMES = FALSE
))
}

pmap_chr <- function(.l, .f, ...) {
as.character(pmap(.l, .f, ...))
}
Loading