This repository has been archived by the owner on Jul 31, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbackend_preprocessing.R
93 lines (83 loc) · 2.88 KB
/
backend_preprocessing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#' @title backend_preprocessing
#'
#' @description The function transforms the values of a data.table object
#' so that it can be passed to an mlr3 task to be used with the
#' \code{mlr3learners.lightgbm} R package. This is necessary because
#' \code{lightgbm} can only handle numeric and integer features. Furthermore,
#' for classification tasks the target variable is recoded to integer codes
#' starting at '0' for the first class.
#' The function is a wrapper around \code{lightgbm::lgb.convert_with_rules}.
#'
#' @param datatable A data.table object, holding the target variable and
#' features.
#' @param target_col A character of length one. The name of the target
#' variable; must be a column of `datatable`.
#' @param task_type A character of length one. The type of learning task to
#' prepare `datatable` for. Can be one of `regression`, `class:binary` or
#' `class:multiclass`.
#' @param positive A character of length one. If `task_type` =
#' `class:binary`, this argument is required to specify the positive class of
#' the binary classification task, which will be replaced by the value `1` in
#' the resulting object (the other class is replaced by `0`). The target
#' must then contain exactly two distinct values; a missing value counts as
#' a distinct value.
#'
#' @return The function returns a data.table whose first column is the
#' transformed target variable (named `target_col`), followed by the
#' converted feature variables. For `regression` the target is returned
#' unchanged (it must be numeric). For the classification task types the
#' target is returned as a factor of the zero-based integer codes. This
#' object can then be used to create an mlr3 task to be used with the
#' \code{mlr3learners.lightgbm} R package.
#'
#' @seealso \code{mlr3}, \code{lightgbm::lgb.convert_with_rules},
#' \code{mlr3learners.lightgbm}
#'
#' @export
#'
backend_preprocessing <- function(
  datatable,
  target_col,
  task_type,
  positive = NULL) {
  stopifnot(
    data.table::is.data.table(datatable),
    is.character(target_col),
    length(target_col) == 1L,
    target_col %in% colnames(datatable),
    is.character(task_type),
    length(task_type) == 1L,
    task_type %in% c("regression", "class:binary", "class:multiclass")
  )

  # Extract the label with `[[` instead of `get()` inside `[.data.table`:
  # there, the symbol `target_col` would be looked up among the columns
  # first, so a column literally named "target_col" would shadow the argument.
  label <- datatable[[target_col]]

  if (task_type == "class:binary") {
    stopifnot(
      is.character(positive),
      length(positive) == 1L,
      !is.na(positive),
      positive %in% label
    )
    negative <- setdiff(unique(label), positive)
    # Exactly one other class must remain. Without this guard a single-class
    # target would be silently recoded to 0 and a target with more than two
    # classes would fail later with an unrelated error message.
    if (length(negative) != 1L) {
      stop(
        "`task_type` = 'class:binary' requires exactly two distinct values ",
        "in column '", target_col, "' (missing values count as a value), ",
        "but found ", length(negative) + 1L, ".",
        call. = FALSE
      )
    }
    message("positive class: ", positive)
    message("negative class: ", negative)
    # positive class -> 1L, negative class -> 0L
    label <- as.integer(as.character(label) == positive)
  } else if (task_type == "class:multiclass") {
    # zero-based integer codes, ordered by the factor levels of the target
    label <- as.integer(factor(label)) - 1L
  } else {
    stopifnot(is.numeric(label))
  }

  # Classification targets are handed back as a factor of the integer codes.
  if (task_type != "regression") {
    label <- factor(label)
  }

  # Convert the feature columns. Subsetting with `with = FALSE` yields a new
  # data.table, so any by-reference modification performed by the conversion
  # would not reach the caller's `datatable`.
  feature_cols <- setdiff(colnames(datatable), target_col)
  # The first element of the returned list is the converted data set.
  features <- lightgbm::lgb.convert_with_rules(
    datatable[, feature_cols, with = FALSE]
  )[[1L]]

  # Assemble the backend with the target as first column; rename by position
  # so that a feature column called "label" cannot be hit by accident.
  backend <- data.table::data.table(label, features)
  data.table::setnames(backend, 1L, target_col)

  backend
}