Skip to content

Commit

Permalink
[R-package] refactor and improvements to lgb.convert() functions (fixes
Browse files Browse the repository at this point in the history
#2678, #2681) (#3269)

* [R-package] improvements to lgb.convert() functions (fixes #2678, #2681)

* more stuff

* update docs

* remove lgb.convert()

* put internal functions back

* update index
  • Loading branch information
jameslamb authored Aug 6, 2020
1 parent c454d5f commit 083b02a
Show file tree
Hide file tree
Showing 11 changed files with 174 additions and 506 deletions.
1 change: 0 additions & 1 deletion R-package/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ export(lgb.Dataset.create.valid)
export(lgb.Dataset.save)
export(lgb.Dataset.set.categorical)
export(lgb.Dataset.set.reference)
export(lgb.convert)
export(lgb.convert_with_rules)
export(lgb.cv)
export(lgb.dump)
Expand Down
89 changes: 0 additions & 89 deletions R-package/R/lgb.convert.R

This file was deleted.

247 changes: 135 additions & 112 deletions R-package/R/lgb.convert_with_rules.R
Original file line number Diff line number Diff line change
@@ -1,15 +1,70 @@
# [description] get all column classes of a data.table or data.frame.
# This function collapses the result of class() into a single string
.get_column_classes <- function(df) {
return(
vapply(
X = df
, FUN = function(x) {paste0(class(x), collapse = ",")}
, FUN.VALUE = character(1L)
)
)
}

# [description] check a data frame or data table for columns tthat are any
# type other than numeric and integer. This is used by lgb.convert()
# and lgb.convert_with_rules() too warn if more action is needed by users
# before a dataset can be converted to a lgb.Dataset.
.warn_for_unconverted_columns <- function(df, function_name) {
column_classes <- .get_column_classes(df)
unconverted_columns <- column_classes[!(column_classes %in% c("numeric", "integer"))]
if (length(unconverted_columns) > 0L) {
col_detail_string <- paste0(
paste0(
names(unconverted_columns)
, " ("
, unconverted_columns
, ")"
)
, collapse = ", "
)
msg <- paste0(
function_name
, ": "
, length(unconverted_columns)
, " columns are not numeric or integer. These need to be dropped or converted to "
, "be used in an lgb.Dataset object. "
, col_detail_string
)
warning(msg)
}
return(invisible(NULL))
}

.LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA <- function() {return(-1L)}
.LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA <- function() {return(0L)}


#' @name lgb.convert_with_rules
#' @title Data preparator for LightGBM datasets with rules (integer)
#' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to integer.
#' In addition, keeps rules created so you can convert other datasets using this converter.
#' This is useful if you have a specific need for integer dataset instead of numeric dataset.
#' Factor, character, and logical columns are converted to integer. Missing values
#' in factors and characters will be filled with 0L. Missing values in logicals
#' will be filled with -1L.
#'
#' This function returns and optionally takes in "rules" the describe exactly
#' how to convert values in columns.
#'
#' Columns that contain only NA values will be converted by this function but will
#' not show up in the returned \code{rules}.
#'
#' NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}.
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used.
#' @param rules A set of rules from the data preparator, if already used. This should be an R list,
#' where names are column names in \code{data} and values are named character
#' vectors whose names are column values and whose values are new values to
#' replace them with.
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
#' The data must be converted to a matrix format (\code{as.matrix}) for input in
#' Note that the data must be converted to a matrix format (\code{as.matrix}) for input in
#' \code{lgb.Dataset}.
#'
#' @examples
Expand All @@ -18,7 +73,7 @@
#'
#' str(iris)
#'
#' new_iris <- lgb.convert_with_rules(data = iris) # Autoconverter
#' new_iris <- lgb.convert_with_rules(data = iris)
#' str(new_iris$data)
#'
#' data(iris) # Erase iris dataset
Expand Down Expand Up @@ -54,130 +109,98 @@
#' @export
lgb.convert_with_rules <- function(data, rules = NULL) {

# data.table not behaving like data.frame
if (inherits(data, "data.table")) {

# Must use existing rules
if (!is.null(rules)) {

# Loop through rules
for (i in names(rules)) {
column_classes <- .get_column_classes(data)

data.table::set(data, j = i, value = unname(rules[[i]][data[[i]]]))
data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
is_char <- which(column_classes == "character")
is_factor <- which(column_classes == "factor")
is_logical <- which(column_classes == "logical")

}
is_data_table <- data.table::is.data.table(data)
is_data_frame <- is.data.frame(data)

} else {

# Get data classes
list_classes <- vapply(data, class, character(1L))

# Map characters/factors
is_fix <- which(list_classes %in% c("character", "factor"))
rules <- list()
if (!(is_data_table || is_data_frame)) {
stop(
"lgb.convert_with_rules: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame or data.table"
)
}

# Need to create rules?
if (length(is_fix) > 0L) {
# if user didn't provide rules, create them
if (is.null(rules)) {
rules <- list()
columns_to_fix <- which(column_classes %in% c("character", "factor", "logical"))

# Go through all characters/factors
for (i in is_fix) {
for (i in columns_to_fix) {

# Store column elsewhere
mini_data <- data[[i]]
col_values <- data[[i]]

# Get unique values
if (is.factor(mini_data)) {
mini_unique <- levels(mini_data) # Factor
mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
} else {
mini_unique <- as.factor(unique(mini_data)) # Character
mini_numeric <- as.integer(mini_unique) # No respect of ordinality
if (is.factor(col_values)) {
unique_vals <- levels(col_values)
unique_vals <- unique_vals[!is.na(unique_vals)]
mini_numeric <- seq_along(unique_vals) # respect ordinal
} else if (is.character(col_values)) {
unique_vals <- as.factor(unique(col_values))
unique_vals <- unique_vals[!is.na(unique_vals)]
mini_numeric <- as.integer(unique_vals) # no respect for ordinal
} else if (is.logical(col_values)) {
unique_vals <- c(FALSE, TRUE)
mini_numeric <- c(0L, 1L)
}

# Create rules
indexed <- colnames(data)[i] # Index value
rules[[indexed]] <- mini_numeric # Numeric content
names(rules[[indexed]]) <- mini_unique # Character equivalent

# Apply to real data column
data.table::set(data, j = i, value = unname(rules[[indexed]][mini_data]))

# don't add rules for all-NA columns
if (length(unique_vals) > 0L) {
col_name <- names(data)[i]
rules[[col_name]] <- mini_numeric
names(rules[[col_name]]) <- unique_vals
}
}

}

}

} else {

# Must use existing rules
if (!is.null(rules)) {

# Loop through rules
for (i in names(rules)) {

data[[i]] <- unname(rules[[i]][data[[i]]])
data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer

}

} else {

# Default routine (data.frame)
if (inherits(data, "data.frame")) {

# Get data classes
list_classes <- vapply(data, class, character(1L))

# Map characters/factors
is_fix <- which(list_classes %in% c("character", "factor"))
rules <- list()

# Need to create rules?
if (length(is_fix) > 0L) {

# Go through all characters/factors
for (i in is_fix) {

# Store column elsewhere
mini_data <- data[[i]]

# Get unique values
if (is.factor(mini_data)) {
mini_unique <- levels(mini_data) # Factor
mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
} else {
mini_unique <- as.factor(unique(mini_data)) # Character
mini_numeric <- as.integer(mini_unique) # No respect of ordinality
}

# Create rules
indexed <- colnames(data)[i] # Index value
rules[[indexed]] <- mini_numeric # Numeric content
names(rules[[indexed]]) <- mini_unique # Character equivalent

# Apply to real data column
data[[i]] <- unname(rules[[indexed]][mini_data])

}

for (col_name in names(rules)) {
if (column_classes[[col_name]] == "logical") {
default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
} else {
default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
}
if (is_data_table) {
data.table::set(
data
, j = col_name
, value = unname(rules[[col_name]][data[[col_name]]])
)
data[is.na(get(col_name)), (col_name) := default_value_for_na]
} else {
data[[col_name]] <- unname(rules[[col_name]][data[[col_name]]])
data[is.na(data[col_name]), col_name] <- default_value_for_na
}
}

} else {

stop(
"lgb.convert_with_rules: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame"
# if any all-NA columns exist, they won't be in rules. Convert them
all_na_cols <- which(
sapply(
X = data
, FUN = function(x) {
(is.factor(x) || is.character(x) || is.logical(x)) && all(is.na(unique(x)))
}
)

}

)
for (col_name in all_na_cols) {
if (column_classes[[col_name]] == "logical") {
default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
} else {
default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
}
if (is_data_table) {
data[, (col_name) := rep(default_value_for_na, .N)]
} else {
data[[col_name]] <- default_value_for_na
}
}

}
.warn_for_unconverted_columns(df = data, function_name = "lgb.convert_with_rules")

return(list(data = data, rules = rules))
return(list(data = data, rules = rules))

}
Loading

0 comments on commit 083b02a

Please sign in to comment.