handle mixed atomic/data frame column and better explanations in comm…

…ents
uptake · Jan 22, 2018 · c14c972 · c14c972
1 parent 7a39088
commit c14c972
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 25 deletions.
diff --git a/R/elasticsearch_parsers.R b/R/elasticsearch_parsers.R
@@ -391,8 +391,7 @@ unpack_nested_data <- function(chomped_df, col_to_unpack)  {
         futile.logger::flog.fatal(msg)
         stop(msg)
     }
-    if (!("character" %in% class(col_to_unpack)) || length(col_to_unpack) != 
-        1) {
+    if (!("character" %in% class(col_to_unpack)) || length(col_to_unpack) != 1) {
         msg <- "For unpack_nested_data, col_to_unpack must be a character of length 1"
         futile.logger::flog.fatal(msg)
         stop(msg)
@@ -416,41 +415,48 @@ unpack_nested_data <- function(chomped_df, col_to_unpack)  {
     listDT[lengths(listDT) == 0] <- NA
 
     is_df <- purrr::map_lgl(listDT, is.data.frame)
+    is_atomic <- purrr::map_lgl(listDT, is.atomic)
     is_na <- is.na(listDT)
-    is_atomic <- purrr::map_lgl(listDT, purrr::is_atomic)
 
     # Bind packed column into one data.table
-    if (all(is_df | is_na)) {
-        # If the packed column contained data.frames, we need to create a row of 
-        # NAs data.table for each empty row in the packed column
-        first_nonmissing <- min(which(is_df))
-        DTNames <- names(listDT[[first_nonmissing]])
+    if (all(is_atomic)) {
+        newDT <- data.table::as.data.table(unlist(listDT))
+    } else if (all(is_df | is_atomic)) {
+        # If the packed column contains a mixture of data tables, we need to 
+        # to convert the atomic vectors to data.tables
+
+        # Find column name to use for NA vectors
+        first_df <- min(which(is_df))
+        col_name <- names(listDT[[first_df]])[1]
+
+        # Convert non data.frame rows to data.table and assign name to rows
+        # with no name
+        prep_row <- function(x) {
+            if (is.atomic(x)) {
+                x <- data.table::as.data.table(x)
+                if (is.na(x)) names(x) <- col_name
+                else names(x) <- col_to_unpack
+            } 
+            x
+        }
+        newDT <- purrr::map(listDT, prep_row)
 
-        newDT <- purrr::map(
-            listDT, 
-            function(x) {
-                if(!is.data.frame(x)) {
-                    x <- as.list(setNames(rep(NA, length(DTNames)), DTNames))
-                    x <- data.table::as.data.table(x)
-                }
-                x
-            }
-        )
         newDT <- data.table::rbindlist(newDT, fill = TRUE)
-    } else if (all(is_atomic)) {
-        newDT <- data.table::as.data.table(unlist(listDT))
     } else {
-        msg <- "For unpack_nested_data, col_to_unpack must be all atomic vectors or all data frames"
+        msg <- paste0("Each row in column ", col_to_unpack, " must be a data frame or a vector.")
         futile.logger::flog.fatal(msg)
         stop(msg)
     }
 
     # Create the unpacked data.table by replicating the originally unpacked
     # columns by the number of rows in each entry in the original unpacked column
-    group_vars <- setdiff(names(chomped_df), c(names(newDT), col_to_unpack))
-    n <- pmax(purrr::map_int(listDT, NROW), 1)
-    rest <- chomped_df[rep(1:nrow(chomped_df), n), ..group_vars, drop = FALSE]
-    outDT <- data.table::data.table(newDT, rest)
+    times_to_replicate <- pmax(purrr::map_int(listDT, NROW), 1)
+    # Replicate the rows of the data.table by entries of times_to_replicate but drop col_to_unpack
+    replicatedDT <- chomped_df[rep(1:nrow(chomped_df), times_to_replicate)]
+    replicatedDT[, col_to_unpack] <- NULL
+
+    # Then bind the replicated columns with the unpacked column
+    outDT <- data.table::data.table(newDT, replicatedDT)
     if ("V1" %in% names(outDT)) {
         data.table::setnames(outDT, "V1", col_to_unpack)
     }

diff --git a/tests/testthat/test-elasticsearch_parsers.R b/tests/testthat/test-elasticsearch_parsers.R
@@ -916,6 +916,35 @@ futile.logger::flog.threshold(0)
                             regexp = "The column given to unpack_nested_data had no data in it")}
     )
 
+    test_that("unpack_nested_data should break if the column contains a non data frame/vector", {
+        DT <- data.table::data.table(x = 1:2, y = list(list(2), 3))
+        expect_error(unpack_nested_data(chomped_df = DT, col_to_unpack = "y")
+                     , regexp = "must be a data frame or a vector")
+    })
+
+    test_that("unpack_nested_data should handle NA and empty rows", {
+        DT <- data.table::data.table(x = 1:2, y = list(z = NA, data.table(w = 5:6, z = 7:8)))
+        DT2 <- data.table::data.table(x = 1:2, y = list(z = list(), data.table(w = 5:6, z = 7:8)))
+        unpackedDT <- data.table::data.table(
+            w = c(NA, 5, 6)
+            , z = c(NA, 7, 8)
+            , x = c(1, 2, 2)
+        )
+        expect_equal(unpack_nested_data(DT, col_to_unpack = "y"), unpackedDT)
+        expect_equal(unpack_nested_data(DT2, col_to_unpack = "y"), unpackedDT)
+    })
+
+    test_that("unpack_nested_data should handle mixed atomic/data frame column", {
+        DT <- data.table::data.table(x = 1:2, y = list(1, data.table(w = 5:6, z = 7:8)))
+        unpackedDT <- data.table::data.table(
+            y = c(1, NA, NA)
+            , w = c(NA, 5, 6)
+            , z = c(NA, 7, 8)
+            , x = c(1, 2, 2)
+        )
+        expect_equal(unpack_nested_data(DT, col_to_unpack = "y"), unpackedDT)
+    })
+
 #---- 5. .ConvertToSec
 
     # .ConvertToSec should work for seconds