[R-package] refactor and improvements to lgb.convert() functions (fixes

#2678, #2681) (#3269) * [R-package] improvements to lgb.convert() functions (fixes #2678, #2681) * more stuff * update docs * remove lgb.convert() * put internal functions back * update index
microsoft · Aug 6, 2020 · 083b02a · 083b02a
1 parent c454d5f
commit 083b02a
Show file tree

Hide file tree

Showing 11 changed files with 174 additions and 506 deletions.
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
@@ -14,7 +14,6 @@ export(lgb.Dataset.create.valid)
 export(lgb.Dataset.save)
 export(lgb.Dataset.set.categorical)
 export(lgb.Dataset.set.reference)
-export(lgb.convert)
 export(lgb.convert_with_rules)
 export(lgb.cv)
 export(lgb.dump)

diff --git a/R-package/R/lgb.convert.R b/R-package/R/lgb.convert.R
diff --git a/R-package/R/lgb.convert_with_rules.R b/R-package/R/lgb.convert_with_rules.R
@@ -1,15 +1,70 @@
+# [description] get all column classes of a data.table or data.frame.
+#               This function collapses the result of class() into a single string
+.get_column_classes <- function(df) {
+    return(
+        vapply(
+            X = df
+            , FUN = function(x) {paste0(class(x), collapse = ",")}
+            , FUN.VALUE = character(1L)
+        )
+    )
+}
+
+# [description] check a data frame or data table for columns tthat are any
+#               type other than numeric and integer. This is used by lgb.convert()
+#               and lgb.convert_with_rules() too warn if more action is needed by users
+#               before a dataset can be converted to a lgb.Dataset.
+.warn_for_unconverted_columns <- function(df, function_name) {
+    column_classes <- .get_column_classes(df)
+    unconverted_columns <- column_classes[!(column_classes %in% c("numeric", "integer"))]
+    if (length(unconverted_columns) > 0L) {
+        col_detail_string <- paste0(
+            paste0(
+                names(unconverted_columns)
+                , " ("
+                , unconverted_columns
+                , ")"
+            )
+            , collapse = ", "
+        )
+        msg <- paste0(
+            function_name
+            , ": "
+            , length(unconverted_columns)
+            , " columns are not numeric or integer. These need to be dropped or converted to "
+            , "be used in an lgb.Dataset object. "
+            , col_detail_string
+        )
+        warning(msg)
+    }
+    return(invisible(NULL))
+}
+
+.LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA <- function() {return(-1L)}
+.LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA <- function() {return(0L)}
+
+
 #' @name lgb.convert_with_rules
 #' @title Data preparator for LightGBM datasets with rules (integer)
 #' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
-#'              Factors and characters are converted to integer.
-#'              In addition, keeps rules created so you can convert other datasets using this converter.
-#'              This is useful if you have a specific need for integer dataset instead of numeric dataset.
+#'              Factor, character, and logical columns are converted to integer. Missing values
+#'              in factors and characters will be filled with 0L. Missing values in logicals
+#'              will be filled with -1L.
+#'
+#'              This function returns and optionally takes in "rules" the describe exactly
+#'              how to convert values in columns.
+#'
+#'              Columns that contain only NA values will be converted by this function but will
+#'              not show up in the returned \code{rules}.
 #'
 #'              NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}.
 #' @param data A data.frame or data.table to prepare.
-#' @param rules A set of rules from the data preparator, if already used.
+#' @param rules A set of rules from the data preparator, if already used. This should be an R list,
+#'              where names are column names in \code{data} and values are named character
+#'              vectors whose names are column values and whose values are new values to
+#'              replace them with.
 #' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
-#'         The data must be converted to a matrix format (\code{as.matrix}) for input in
+#'         Note that the data must be converted to a matrix format (\code{as.matrix}) for input in
 #'         \code{lgb.Dataset}.
 #'
 #' @examples
@@ -18,7 +73,7 @@
 #'
 #' str(iris)
 #'
-#' new_iris <- lgb.convert_with_rules(data = iris) # Autoconverter
+#' new_iris <- lgb.convert_with_rules(data = iris)
 #' str(new_iris$data)
 #'
 #' data(iris) # Erase iris dataset
@@ -54,130 +109,98 @@
 #' @export
 lgb.convert_with_rules <- function(data, rules = NULL) {
 
-  # data.table not behaving like data.frame
-  if (inherits(data, "data.table")) {
-
-    # Must use existing rules
-    if (!is.null(rules)) {
-
-      # Loop through rules
-      for (i in names(rules)) {
+    column_classes <- .get_column_classes(data)
 
-        data.table::set(data, j = i, value = unname(rules[[i]][data[[i]]]))
-        data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
+    is_char <- which(column_classes == "character")
+    is_factor <- which(column_classes == "factor")
+    is_logical <- which(column_classes == "logical")
 
-      }
+    is_data_table <- data.table::is.data.table(data)
+    is_data_frame <- is.data.frame(data)
 
-    } else {
-
-      # Get data classes
-      list_classes <- vapply(data, class, character(1L))
-
-      # Map characters/factors
-      is_fix <- which(list_classes %in% c("character", "factor"))
-      rules <- list()
+    if (!(is_data_table || is_data_frame)) {
+        stop(
+            "lgb.convert_with_rules: you provided "
+            , paste(class(data), collapse = " & ")
+            , " but data should have class data.frame or data.table"
+        )
+    }
 
-      # Need to create rules?
-      if (length(is_fix) > 0L) {
+    # if user didn't provide rules, create them
+    if (is.null(rules)) {
+        rules <- list()
+        columns_to_fix <- which(column_classes %in% c("character", "factor", "logical"))
 
-        # Go through all characters/factors
-        for (i in is_fix) {
+        for (i in columns_to_fix) {
 
-          # Store column elsewhere
-          mini_data <- data[[i]]
+          col_values <- data[[i]]
 
           # Get unique values
-          if (is.factor(mini_data)) {
-            mini_unique <- levels(mini_data) # Factor
-            mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
-          } else {
-            mini_unique <- as.factor(unique(mini_data)) # Character
-            mini_numeric <- as.integer(mini_unique) # No respect of ordinality
+          if (is.factor(col_values)) {
+              unique_vals <- levels(col_values)
+              unique_vals <- unique_vals[!is.na(unique_vals)]
+              mini_numeric <- seq_along(unique_vals) # respect ordinal
+          } else if (is.character(col_values)) {
+              unique_vals <- as.factor(unique(col_values))
+              unique_vals <- unique_vals[!is.na(unique_vals)]
+              mini_numeric <- as.integer(unique_vals)  # no respect for ordinal
+          } else if (is.logical(col_values)) {
+              unique_vals <- c(FALSE, TRUE)
+              mini_numeric <- c(0L, 1L)
           }
 
-          # Create rules
-          indexed <- colnames(data)[i] # Index value
-          rules[[indexed]] <- mini_numeric # Numeric content
-          names(rules[[indexed]]) <- mini_unique # Character equivalent
-
-          # Apply to real data column
-          data.table::set(data, j = i, value = unname(rules[[indexed]][mini_data]))
-
+          # don't add rules for all-NA columns
+          if (length(unique_vals) > 0L) {
+              col_name <- names(data)[i]
+              rules[[col_name]] <- mini_numeric
+              names(rules[[col_name]]) <- unique_vals
+          }
         }
-
-      }
-
     }
 
-  } else {
-
-    # Must use existing rules
-    if (!is.null(rules)) {
-
-      # Loop through rules
-      for (i in names(rules)) {
-
-        data[[i]] <- unname(rules[[i]][data[[i]]])
-        data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
-
-      }
-
-    } else {
-
-      # Default routine (data.frame)
-      if (inherits(data, "data.frame")) {
-
-        # Get data classes
-        list_classes <- vapply(data, class, character(1L))
-
-        # Map characters/factors
-        is_fix <- which(list_classes %in% c("character", "factor"))
-        rules <- list()
-
-        # Need to create rules?
-        if (length(is_fix) > 0L) {
-
-          # Go through all characters/factors
-          for (i in is_fix) {
-
-            # Store column elsewhere
-            mini_data <- data[[i]]
-
-            # Get unique values
-            if (is.factor(mini_data)) {
-              mini_unique <- levels(mini_data) # Factor
-              mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
-            } else {
-              mini_unique <- as.factor(unique(mini_data)) # Character
-              mini_numeric <- as.integer(mini_unique) # No respect of ordinality
-            }
-
-            # Create rules
-            indexed <- colnames(data)[i] # Index value
-            rules[[indexed]] <- mini_numeric # Numeric content
-            names(rules[[indexed]]) <- mini_unique # Character equivalent
-
-            # Apply to real data column
-            data[[i]] <- unname(rules[[indexed]][mini_data])
-
-          }
-
+    for (col_name in names(rules)) {
+        if (column_classes[[col_name]] == "logical") {
+            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
+        } else {
+            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
         }
+        if (is_data_table) {
+            data.table::set(
+                data
+                , j = col_name
+                , value = unname(rules[[col_name]][data[[col_name]]])
+            )
+            data[is.na(get(col_name)), (col_name) := default_value_for_na]
+        } else {
+            data[[col_name]] <- unname(rules[[col_name]][data[[col_name]]])
+            data[is.na(data[col_name]), col_name] <- default_value_for_na
+        }
+    }
 
-      } else {
-
-        stop(
-          "lgb.convert_with_rules: you provided "
-          , paste(class(data), collapse = " & ")
-          , " but data should have class data.frame"
+    # if any all-NA columns exist, they won't be in rules. Convert them
+    all_na_cols <- which(
+        sapply(
+            X = data
+            , FUN = function(x) {
+                (is.factor(x) || is.character(x) || is.logical(x)) && all(is.na(unique(x)))
+            }
         )
-
-      }
-
+    )
+    for (col_name in all_na_cols) {
+        if (column_classes[[col_name]] == "logical") {
+            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
+        } else {
+            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
+        }
+        if (is_data_table) {
+            data[, (col_name) := rep(default_value_for_na, .N)]
+        } else {
+            data[[col_name]] <- default_value_for_na
+        }
     }
 
-  }
+    .warn_for_unconverted_columns(df = data, function_name = "lgb.convert_with_rules")
 
-  return(list(data = data, rules = rules))
+    return(list(data = data, rules = rules))
 
 }