Skip to content

Commit

Permalink
handle mixed atomic/data frame column and better explanations in comm…
Browse files Browse the repository at this point in the history
…ents
  • Loading branch information
wdearden committed Jan 22, 2018
1 parent 7a39088 commit c14c972
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 25 deletions.
56 changes: 31 additions & 25 deletions R/elasticsearch_parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -391,8 +391,7 @@ unpack_nested_data <- function(chomped_df, col_to_unpack) {
futile.logger::flog.fatal(msg)
stop(msg)
}
if (!("character" %in% class(col_to_unpack)) || length(col_to_unpack) !=
1) {
if (!("character" %in% class(col_to_unpack)) || length(col_to_unpack) != 1) {
msg <- "For unpack_nested_data, col_to_unpack must be a character of length 1"
futile.logger::flog.fatal(msg)
stop(msg)
Expand All @@ -416,41 +415,48 @@ unpack_nested_data <- function(chomped_df, col_to_unpack) {
listDT[lengths(listDT) == 0] <- NA

is_df <- purrr::map_lgl(listDT, is.data.frame)
is_atomic <- purrr::map_lgl(listDT, is.atomic)
is_na <- is.na(listDT)
is_atomic <- purrr::map_lgl(listDT, purrr::is_atomic)

# Bind packed column into one data.table
if (all(is_df | is_na)) {
# If the packed column contained data.frames, we need to create a row of
# NAs data.table for each empty row in the packed column
first_nonmissing <- min(which(is_df))
DTNames <- names(listDT[[first_nonmissing]])
if (all(is_atomic)) {
newDT <- data.table::as.data.table(unlist(listDT))
} else if (all(is_df | is_atomic)) {
# If the packed column contains a mixture of data tables, we need to
# to convert the atomic vectors to data.tables

# Find column name to use for NA vectors
first_df <- min(which(is_df))
col_name <- names(listDT[[first_df]])[1]

# Convert non data.frame rows to data.table and assign name to rows
# with no name
prep_row <- function(x) {
if (is.atomic(x)) {
x <- data.table::as.data.table(x)
if (is.na(x)) names(x) <- col_name
else names(x) <- col_to_unpack
}
x
}
newDT <- purrr::map(listDT, prep_row)

newDT <- purrr::map(
listDT,
function(x) {
if(!is.data.frame(x)) {
x <- as.list(setNames(rep(NA, length(DTNames)), DTNames))
x <- data.table::as.data.table(x)
}
x
}
)
newDT <- data.table::rbindlist(newDT, fill = TRUE)
} else if (all(is_atomic)) {
newDT <- data.table::as.data.table(unlist(listDT))
} else {
msg <- "For unpack_nested_data, col_to_unpack must be all atomic vectors or all data frames"
msg <- paste0("Each row in column ", col_to_unpack, " must be a data frame or a vector.")
futile.logger::flog.fatal(msg)
stop(msg)
}

# Create the unpacked data.table by replicating the originally unpacked
# columns by the number of rows in each entry in the original unpacked column
group_vars <- setdiff(names(chomped_df), c(names(newDT), col_to_unpack))
n <- pmax(purrr::map_int(listDT, NROW), 1)
rest <- chomped_df[rep(1:nrow(chomped_df), n), ..group_vars, drop = FALSE]
outDT <- data.table::data.table(newDT, rest)
times_to_replicate <- pmax(purrr::map_int(listDT, NROW), 1)
# Replicate the rows of the data.table by entries of times_to_replicate but drop col_to_unpack
replicatedDT <- chomped_df[rep(1:nrow(chomped_df), times_to_replicate)]
replicatedDT[, col_to_unpack] <- NULL

# Then bind the replicated columns with the unpacked column
outDT <- data.table::data.table(newDT, replicatedDT)
if ("V1" %in% names(outDT)) {
data.table::setnames(outDT, "V1", col_to_unpack)
}
Expand Down
29 changes: 29 additions & 0 deletions tests/testthat/test-elasticsearch_parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -916,6 +916,35 @@ futile.logger::flog.threshold(0)
regexp = "The column given to unpack_nested_data had no data in it")}
)

test_that("unpack_nested_data should break if the column contains a non data frame/vector", {
DT <- data.table::data.table(x = 1:2, y = list(list(2), 3))
expect_error(unpack_nested_data(chomped_df = DT, col_to_unpack = "y")
, regexp = "must be a data frame or a vector")
})

test_that("unpack_nested_data should handle NA and empty rows", {
DT <- data.table::data.table(x = 1:2, y = list(z = NA, data.table(w = 5:6, z = 7:8)))
DT2 <- data.table::data.table(x = 1:2, y = list(z = list(), data.table(w = 5:6, z = 7:8)))
unpackedDT <- data.table::data.table(
w = c(NA, 5, 6)
, z = c(NA, 7, 8)
, x = c(1, 2, 2)
)
expect_equal(unpack_nested_data(DT, col_to_unpack = "y"), unpackedDT)
expect_equal(unpack_nested_data(DT2, col_to_unpack = "y"), unpackedDT)
})

test_that("unpack_nested_data should handle mixed atomic/data frame column", {
DT <- data.table::data.table(x = 1:2, y = list(1, data.table(w = 5:6, z = 7:8)))
unpackedDT <- data.table::data.table(
y = c(1, NA, NA)
, w = c(NA, 5, 6)
, z = c(NA, 7, 8)
, x = c(1, 2, 2)
)
expect_equal(unpack_nested_data(DT, col_to_unpack = "y"), unpackedDT)
})

#---- 5. .ConvertToSec

# .ConvertToSec should work for seconds
Expand Down

0 comments on commit c14c972

Please sign in to comment.