From e37f874b8f85306e4b2bb2e9ddad6e0388e5fbe7 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 14 Mar 2020 00:48:10 -0500 Subject: [PATCH 01/10] [R-package] started implementing first_metric_only --- R-package/R/callback.R | 12 +++- R-package/R/lgb.cv.R | 1 + R-package/R/lgb.train.R | 40 +++++++++--- R-package/R/utils.R | 4 +- R-package/tests/testthat/test_basic.R | 93 +++++++++++++++++++++++++++ 5 files changed, 136 insertions(+), 14 deletions(-) diff --git a/R-package/R/callback.R b/R-package/R/callback.R index 3c8bb243783b..bb9074f21746 100644 --- a/R-package/R/callback.R +++ b/R-package/R/callback.R @@ -260,7 +260,7 @@ cb.record.evaluation <- function() { } -cb.early.stop <- function(stopping_rounds, verbose = TRUE) { +cb.early.stop <- function(stopping_rounds, first_metric_only = FALSE, verbose = TRUE) { # Initialize variables factor_to_bigger_better <- NULL @@ -317,8 +317,16 @@ cb.early.stop <- function(stopping_rounds, verbose = TRUE) { # Store iteration cur_iter <- env$iteration + # By default, any metric can trigger early stopping. This can be disabled + # with 'first_metric_only = TRUE' + if (isTRUE(first_metric_only)) { + evals_to_check <- 1L + } else { + evals_to_check <- seq_len(eval_len) + } + # Loop through evaluation - for (i in seq_len(eval_len)) { + for (i in evals_to_check) { # Store score score <- env$eval_list[[i]]$value * factor_to_bigger_better[i] diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index 3433aade6594..efe6f0a6cdd9 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -257,6 +257,7 @@ lgb.cv <- function(params = list() callbacks , cb.early.stop( stopping_rounds = early_stopping_rounds + , first_metric_only = isTRUE(params[["first_metric_only"]]) , verbose = verbose ) ) diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index d0dacecc0bd1..b018ce19620a 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -6,7 +6,17 @@ #' @param obj objective function, can be character or custom objective function. Examples include #' \code{regression}, \code{regression_l1}, \code{huber}, #' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass} -#' @param eval evaluation function, can be (a list of) character or custom eval function +#' @param eval evaluation function(s). This can be a function or list of functions. Each provided function +#' should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named +#' list with three elements. +#' \itemize{ +#' \item{\code{name}: A string with the name of the metric, used for printing and storing results.} +#' \item{\code{value}: A single number indicating the value of the metric for the given predictions and true values} +#' \item{ +#' \code{higher_better}: A boolean indicating whether higher values indicate a better fit. +#' For example, this would be \code{FALSE} for metrics like MAE or RMSE. 
+#' } +#' } #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param colnames feature names, if not null, will use this to overwrite the names in dataset #' @param categorical_feature list of str or int @@ -89,7 +99,7 @@ lgb.train <- function(params = list(), params <- lgb.check.obj(params, obj) params <- lgb.check.eval(params, eval) fobj <- NULL - feval <- NULL + eval_functions <- NULL # Check for objective (function or not) if (is.function(params$objective)) { @@ -97,9 +107,14 @@ lgb.train <- function(params = list(), params$objective <- "NONE" } - # Check for loss (function or not) + # If loss is a single function, store it as a 1-element list + # (for backwards compatibility). If it is a list of functions, store + # all of them if (is.function(eval)) { - feval <- eval + eval_functions <- list(eval) + } + if (methods::is(eval, "list") & all(sapply(eval, is.function))){ + eval_functions <- eval } # Init predictor to empty @@ -117,6 +132,7 @@ lgb.train <- function(params = list(), if (!is.null(predictor)) { begin_iteration <- predictor$current_iter() + 1L } + # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one n_trees <- .PARAMETER_ALIASES()[["num_iterations"]] if (any(names(params) %in% n_trees)) { @@ -225,6 +241,7 @@ lgb.train <- function(params = list(), callbacks , cb.early.stop( stopping_rounds = early_stopping_rounds + , first_metric_only = isTRUE(params[["first_metric_only"]]) , verbose = verbose ) ) @@ -269,13 +286,16 @@ lgb.train <- function(params = list(), # Collection: Has validation dataset? if (length(valids) > 0L) { - # Validation has training dataset? - if (valid_contain_train) { - eval_list <- append(eval_list, booster$eval_train(feval = feval)) - } + for (eval_function in eval_functions){ - # Has no validation dataset - eval_list <- append(eval_list, booster$eval_valid(feval = feval)) + # Validation has training dataset? 
+ if (valid_contain_train) { + eval_list <- append(eval_list, booster$eval_train(feval = eval_function)) + } + + # Has no validation dataset + eval_list <- append(eval_list, booster$eval_valid(feval = eval_function)) + } } # Write evaluation result in environment diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 9b036f91db8d..df0ebe911eea 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -257,8 +257,8 @@ lgb.check.eval <- function(params, eval) { params$metric <- list() } - # If 'eval' is a list or character vector, store it in 'metric' - if (is.character(eval) || identical(class(eval), "list")) { + # If 'eval' is a list of strings or character vector, store it in 'metric' + if (is.character(eval) || (is.list(eval) && all(sapply(eval, is.character)))) { params$metric <- append(params$metric, eval) } diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index d35f257ddac9..3d588dafdf9b 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -571,3 +571,96 @@ test_that("lgb.train() works with early stopping for regression", { , early_stopping_rounds + 1L ) }) + + +test_that("lgb.train() only considers the first metric for early stopping if first_metric_only is provided", { + set.seed(708L) + trainDF <- data.frame( + "feat1" = rnorm(100) + , "target" = rnorm(100) + ) + validDF <- data.frame( + "feat1" = rnorm(50) + , "target" = rnorm(50) + ) + + .increasing_metric <- function(preds, dtrain){ + return(list( + name = "increasing_metric" + , value = as.double(Sys.time()) + , higher_better = TRUE + )) + } + + .constant_metric <- function(preds, dtrain){ + return(list( + name = "constant_metric" + , value = 0.2 + , higher_better = FALSE + )) + } + + dtrain <- lgb.Dataset( + data = as.matrix(trainDF[["feat1"]], drop = FALSE) + , label = trainDF[["target"]] + ) + dvalid <- lgb.Dataset( + data = as.matrix(validDF[["feat1"]], drop = FALSE) + , label = validDF[["target"]] + ) + nrounds <- 10L + + ################################ + # train with no early stopping # + ################################ + bst <- lgb.train( + params = list( + objective = "regression" + , metric = "rmse" + , min_data_in_bin = 5L + ) + , data = dtrain + , nrounds = nrounds + , valids = list( + "valid1" = dvalid + ) + , eval = list( + .increasing_metric + , .constant_metric + ) + , verbose = TRUE + ) + + # the best possible model should come from the first iteration, but + # all 10 training iterations should happen + expect_equal(bst$best_score, 55.0) + expect_equal(bst$best_iter, 1L) + expect_equal(length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]), nrounds) + + ############################# + # train with early stopping # + ############################# + early_stopping_rounds <- 5L + bst <- lgb.train( + params = list( + objective = "regression" + , metric = "rmse" + , min_data_in_bin = 5L + , early_stopping_rounds = early_stopping_rounds + ) + , data = dtrain + , nrounds = nrounds + , valids = list( + "valid1" = dvalid + ) + ) + + # the best model should be from the first iteration, and only 6 rounds + # should have happen (1 with improvement, 5 consecutive with no improvement) + expect_equal(bst$best_score, 55.0) + expect_equal(bst$best_iter, 1L) + expect_equal( + length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) + , early_stopping_rounds + 1L + ) +}) From c0954a9063efb02e79763a06d46e6bd75a81786d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 14 Mar 2020 14:53:28 -0500 Subject: [PATCH 02/10] trying stuff 
--- R-package/tests/testthat/test_basic.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 3d588dafdf9b..69fe7c295f03 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -528,8 +528,6 @@ test_that("lgb.train() works with early stopping for regression", { bst <- lgb.train( params = list( objective = "regression" - , metric = "rmse" - , min_data_in_bin = 5L ) , data = dtrain , nrounds = nrounds @@ -625,10 +623,10 @@ test_that("lgb.train() only considers the first metric for early stopping if fir "valid1" = dvalid ) , eval = list( - .increasing_metric - , .constant_metric + .constant_metric + , .increasing_metric ) - , verbose = TRUE + , verbose = 1 ) # the best possible model should come from the first iteration, but From eddb2e27e88d731e65cb19f058d446d86083cd8a Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 14 Mar 2020 21:02:20 -0500 Subject: [PATCH 03/10] more changes --- R-package/R/callback.R | 2 +- R-package/R/lgb.train.R | 65 ++++- R-package/R/utils.R | 28 +- R-package/man/lgb.train.Rd | 45 ++- R-package/man/lgb_shared_params.Rd | 3 +- R-package/man/lightgbm.Rd | 3 +- R-package/tests/testthat/test_basic.R | 376 ++++++++++++++++++++++---- 7 files changed, 441 insertions(+), 81 deletions(-) diff --git a/R-package/R/callback.R b/R-package/R/callback.R index bb9074f21746..3ac62f1765b4 100644 --- a/R-package/R/callback.R +++ b/R-package/R/callback.R @@ -285,7 +285,7 @@ cb.early.stop <- function(stopping_rounds, first_metric_only = FALSE, verbose = cat("Will train until there is no improvement in ", stopping_rounds, " rounds.\n\n", sep = "") } - # Maximization or minimization task + # Internally treat everything as a maximization task factor_to_bigger_better <<- rep.int(1.0, eval_len) best_iter <<- rep.int(-1L, eval_len) best_score <<- rep.int(-Inf, eval_len) diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index b018ce19620a..af1441c1b8d6 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -6,17 +6,33 @@ #' @param obj objective function, can be character or custom objective function. Examples include #' \code{regression}, \code{regression_l1}, \code{huber}, #' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass} -#' @param eval evaluation function(s). This can be a function or list of functions. Each provided function -#' should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named -#' list with three elements. -#' \itemize{ -#' \item{\code{name}: A string with the name of the metric, used for printing and storing results.} -#' \item{\code{value}: A single number indicating the value of the metric for the given predictions and true values} -#' \item{ -#' \code{higher_better}: A boolean indicating whether higher values indicate a better fit. -#' For example, this would be \code{FALSE} for metrics like MAE or RMSE. -#' } -#' } +#' @param eval evaluation function(s). This can be a character vector, function, or list with a mixture of +#' strings and functions. +#' +#' \itemize{ +#' \item{\bold{a. character vector}: +#' If you provide a character vector to this argument, it should contain strings with valid +#' evaluation metrics. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{The "metric" section of the documentation} +#' for a list of valid metrics. +#' } +#' \item{\bold{b. 
function}: +#' You can provide a custom evaluation function. This +#' should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named +#' list with three elements: +#' \itemize{ +#' \item{\code{name}: A string with the name of the metric, used for printing and storing results.} +#' \item{\code{value}: A single number indicating the value of the metric for the given predictions and true values} +#' \item{ +#' \code{higher_better}: A boolean indicating whether higher values indicate a better fit. +#' For example, this would be \code{FALSE} for metrics like MAE or RMSE. +#' } +#' } +#' } +#' \item{\bold{c. list}: +#' If a list is given, it should only contain character vectors and functions. These should follow the +#' requirements from the descriptions above. +#' } +#' } #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param colnames feature names, if not null, will use this to overwrite the names in dataset #' @param categorical_feature list of str or int @@ -36,6 +52,21 @@ #' the number of real CPU cores, not the number of threads (most #' CPU using hyper-threading to generate 2 threads per CPU core).} #' } +#' @section Early Stopping: +#' +#' "early stopping" refers to stopping the training process if the model's performance on a given +#' validation set does not improve for several consecutive iterations. +#' +#' If multiple arguments are given to \code{eval}, their order will be preserved. If you enable +#' early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all +#' metrics will be considered for early stopping. +#' +#' If you want to only consider the first metric for early stopping, pass +#' \code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric} +#' in \code{params}, that metric will be considered the "first" one. If you omit \code{metric}, +#' a default metric will be used based on your choice for the parameter \code{obj} (keyword argument) +#' or \code{objective} (passed into \code{params}). +#' #' @return a trained booster model \code{lgb.Booster}. #' #' @examples @@ -99,7 +130,7 @@ lgb.train <- function(params = list(), params <- lgb.check.obj(params, obj) params <- lgb.check.eval(params, eval) fobj <- NULL - eval_functions <- NULL + eval_functions <- list(NULL) # Check for objective (function or not) if (is.function(params$objective)) { @@ -113,8 +144,13 @@ lgb.train <- function(params = list(), if (is.function(eval)) { eval_functions <- list(eval) } - if (methods::is(eval, "list") & all(sapply(eval, is.function))){ - eval_functions <- eval + if (methods::is(eval, "list")) { + eval_functions <- Filter( + f = function(eval_element){ + is.function(eval_element) + } + , x = eval + ) } # Init predictor to empty @@ -296,6 +332,7 @@ lgb.train <- function(params = list(), # Has no validation dataset eval_list <- append(eval_list, booster$eval_valid(feval = eval_function)) } + } # Write evaluation result in environment diff --git a/R-package/R/utils.R b/R-package/R/utils.R index df0ebe911eea..7d7f05a4727b 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -250,6 +250,10 @@ lgb.check.obj <- function(params, obj) { } +# [description] Take any character values from eval and store them +# in params$metric. 
This has to account for the fact that +# `eval` could be a character vector, a function, a list of functions, +# or a list with a mix of strings and functions lgb.check.eval <- function(params, eval) { # Check if metric is null, if yes put a list instead @@ -257,9 +261,27 @@ lgb.check.eval <- function(params, eval) { params$metric <- list() } - # If 'eval' is a list of strings or character vector, store it in 'metric' - if (is.character(eval) || (is.list(eval) && all(sapply(eval, is.character)))) { - params$metric <- append(params$metric, eval) + # if 'eval' is a character vector or list, find the character + # elements and add them to 'metric' + if (!is.function(eval)) { + for (i in seq_along(eval)) { + element <- eval[[i]] + if (is.character(element)) { + print(paste0("Adding '", element, "' to list of metrics")) + params$metric <- append(params$metric, element) + } + } + } + + # If more than one character metric was given, then "None" should + # not be included + if (length(params$metric) > 1){ + params$metric <- Filter( + f = function(metric){ + metric != "None" + } + , x = params$metric + ) } return(params) diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 98298ab6f954..65707450f5ad 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -38,7 +38,8 @@ may allow you to pass other types of data like \code{matrix} and then separately \code{regression}, \code{regression_l1}, \code{huber}, \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} -\item{eval}{evaluation function, can be (a list of) character or custom eval function} +\item{eval}{evaluation function(s). This can be a character vector, function, or list with a mixture of +strings and functions.} \item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training} @@ -82,6 +83,48 @@ a trained booster model \code{lgb.Booster}. \description{ Logic to train with LightGBM } +\section{Early Stopping}{ + + + \itemize{ + \item{\bold{a. character vector}: + If you provide a character vector to this argument, it should contain strings with valid + evaluation metrics. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{The "metric" section of the documentation} + for a list of valid metrics. + } + \item{\bold{b. function}: + You can provide a custom evaluation function. This + should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named + list with three elements: + + \itemize{ + \item{\code{name}: A string with the name of the metric, used for printing and storing results.} + \item{\code{value}: A single number indicating the value of the metric for the given predictions and true values} + \item{ + \code{higher_better}: A boolean indicating whether higher values indicate a better fit. + For example, this would be \code{FALSE} for metrics like MAE or RMSE. + } + } + } + \item{\bold{c. list}: + If a list is given, it should only contain character vectors and functions. These should follow the + requirements from the descriptions above. + } + \item{\bold{Early stopping behavior}: + + If multiple arguments are given, their order will be preserved. If you enable early stopping by + setting \code{early_stopping_rounds} in \code{params}, by default all metrics will be + considered for early stopping. + + If you want to only consider the first metric for early stopping, pass + \code{first_metric_only = TRUE} in \code{params}. 
Note that if you also specify \code{metric} + in \code{params}, that metric will be considered the "first" one. If you omit \code{metric}, + a default metric will be used based on your choice for the parameter \code{obj} (keyword argument) + or \code{objective} (passed into \code{params}). + } + } +} + \examples{ library(lightgbm) data(agaricus.train, package = "lightgbm") diff --git a/R-package/man/lgb_shared_params.Rd b/R-package/man/lgb_shared_params.Rd index ae2f61a86256..a7143179495e 100644 --- a/R-package/man/lgb_shared_params.Rd +++ b/R-package/man/lgb_shared_params.Rd @@ -4,7 +4,8 @@ \alias{lgb_shared_params} \title{Shared parameter docs} \arguments{ -\item{callbacks}{List of callback functions that are applied at each iteration.} +\item{callbacks}{list of callback functions +List of callback functions that are applied at each iteration.} \item{data}{a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}}, may allow you to pass other types of data like \code{matrix} and then separately supply diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd index 88d98d13525d..256a7dc6e8e9 100644 --- a/R-package/man/lightgbm.Rd +++ b/R-package/man/lightgbm.Rd @@ -45,7 +45,8 @@ If early stopping occurs, the model will have 'best_iter' field.} \item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model} -\item{callbacks}{List of callback functions that are applied at each iteration.} +\item{callbacks}{list of callback functions +List of callback functions that are applied at each iteration.} \item{...}{Additional arguments passed to \code{\link{lgb.train}}. For example \itemize{ diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 69fe7c295f03..f0e98df4163f 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -7,6 +7,57 @@ test <- agaricus.test TOLERANCE <- 1e-6 +# [description] Every time this function is called, it adds 0.1 +# to an accumulator then returns the current value. 
+# This is used to mock the situation where an evaluation +# metric increases every iteration +ACCUMULATOR_NAME <- "INCREASING_METRIC_ACUMULATOR" +assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv) +.increasing_metric <- function(preds, dtrain){ + if (!exists(ACCUMULATOR_NAME, envir = .GlobalEnv)){ + assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv) + } + assign( + x = ACCUMULATOR_NAME + , value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) + 0.1 + , envir = .GlobalEnv + ) + return(list( + name = "increasing_metric" + , value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) + , higher_better = TRUE + )) +} + +# [description] Evaluation function that always returns the +# same value +CONSTANT_METRIC_VALUE <- 0.2 +.constant_metric <- function(preds, dtrain){ + return(list( + name = "constant_metric" + , value = CONSTANT_METRIC_VALUE + , higher_better = FALSE + )) +} + +# sample datasets to test early stopping +DTRAIN_RANDOM_REGRESSION <- lgb.Dataset( + data = as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) + , label = rnorm(100L) +) +DVALID_RANDOM_REGRESSION <- lgb.Dataset( + data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) + , label = rnorm(50L) +) +DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( + data = as.matrix(rnorm(120), ncol = 1L, drop = FALSE) + , label = sample(c(0L, 1L), size = 120, replace = TRUE) +) +DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( + data = as.matrix(rnorm(37), ncol = 1L, drop = FALSE) + , label = sample(c(0L, 1L), size = 37, replace = TRUE) +) + test_that("train and predict binary classification", { nrounds <- 10L bst <- lightgbm( @@ -570,95 +621,300 @@ test_that("lgb.train() works with early stopping for regression", { ) }) - -test_that("lgb.train() only considers the first metric for early stopping if first_metric_only is provided", { +test_that("lgb.train() does not stop early if early_stopping_rounds is not given", { set.seed(708L) - trainDF <- data.frame( - "feat1" = rnorm(100) - , "target" = rnorm(100) + + increasing_metric_starting_value <- get( + ACCUMULATOR_NAME + , envir = .GlobalEnv ) - validDF <- data.frame( - "feat1" = rnorm(50) - , "target" = rnorm(50) + nrounds <- 10L + metrics <- list( + .constant_metric + , .increasing_metric + ) + bst <- lgb.train( + params = list( + objective = "regression" + , metric = "None" + ) + , data = DTRAIN_RANDOM_REGRESSION + , nrounds = nrounds + , valids = list("valid1" = DVALID_RANDOM_REGRESSION) + , eval = metrics ) - .increasing_metric <- function(preds, dtrain){ - return(list( - name = "increasing_metric" - , value = as.double(Sys.time()) - , higher_better = TRUE - )) - } + # Only the two functions provided to "eval" should have been evaluated + expect_equal(length(bst$record_evals[["valid1"]]), 2L) - .constant_metric <- function(preds, dtrain){ - return(list( - name = "constant_metric" - , value = 0.2 - , higher_better = FALSE - )) - } + # all 10 iterations should have happen, and the best_iter should be + # the first one (based on constant_metric) + best_iter <- 1L + expect_equal(bst$best_iter, best_iter) - dtrain <- lgb.Dataset( - data = as.matrix(trainDF[["feat1"]], drop = FALSE) - , label = trainDF[["target"]] + # best_score should be taken from the first metric + expect_equal( + bst$best_score + , bst$record_evals[["valid1"]][["constant_metric"]][["eval"]][[best_iter]] ) - dvalid <- lgb.Dataset( - data = as.matrix(validDF[["feat1"]], drop = FALSE) - , label = validDF[["target"]] + + # early stopping should not have happened. 
Even though constant_metric + # had 9 consecutive iterations with no improvement, it is ignored because of + # first_metric_only = TRUE + expect_equal( + length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) + , nrounds ) - nrounds <- 10L + expect_equal( + length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) + , nrounds + ) +}) - ################################ - # train with no early stopping # - ################################ +test_that("If first_metric_only is not given or is FALSE, lgb.train() decides to stop early based on all metrics", { + set.seed(708L) + + early_stopping_rounds <- 3L + param_variations <- list( + list( + objective = "regression" + , metric = "None" + , early_stopping_rounds = early_stopping_rounds + ) + , list( + objective = "regression" + , metric = "None" + , early_stopping_rounds = early_stopping_rounds + , first_metric_only = FALSE + ) + ) + + for (params in param_variations){ + + nrounds <- 10L + bst <- lgb.train( + params = params + , data = DTRAIN_RANDOM_REGRESSION + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_REGRESSION + ) + , eval = list( + .increasing_metric + , .constant_metric + ) + ) + + # Only the two functions provided to "eval" should have been evaluated + expect_equal(length(bst$record_evals[["valid1"]]), 2L) + + # early stopping should have happened, and should have stopped early_stopping_rounds + 1 rounds in + # because constant_metric never improves + # + # the best iteration should be the last one, because increasing_metric was first + # and gets better every iteration + best_iter <- early_stopping_rounds + 1L + expect_equal(bst$best_iter, best_iter) + + # best_score should be taken from "increasing_metric" because it was first + expect_equal( + bst$best_score + , bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]][[best_iter]] + ) + + # early stopping should not have happened. 
even though increasing_metric kept + # getting better, early stopping should have happened because "constant_metric" + # did not improve + expect_equal( + length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) + , early_stopping_rounds + 1L + ) + expect_equal( + length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) + , early_stopping_rounds + 1L + ) + } + +}) + +test_that("If first_metric_only is TRUE, lgb.train() decides to stop early based on only the first metric", { + set.seed(708L) + nrounds <- 10L + early_stopping_rounds <- 3L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.train( params = list( objective = "regression" - , metric = "rmse" - , min_data_in_bin = 5L + , metric = "None" + , early_stopping_rounds = early_stopping_rounds + , first_metric_only = TRUE ) - , data = dtrain + , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds , valids = list( - "valid1" = dvalid + "valid1" = DVALID_RANDOM_REGRESSION ) , eval = list( - .constant_metric - , .increasing_metric + .increasing_metric + , .constant_metric ) , verbose = 1 ) - # the best possible model should come from the first iteration, but - # all 10 training iterations should happen - expect_equal(bst$best_score, 55.0) - expect_equal(bst$best_iter, 1L) - expect_equal(length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]), nrounds) + # Only the two functions provided to "eval" should have been evaluated + expect_equal(length(bst$record_evals[["valid1"]]), 2L) - ############################# - # train with early stopping # - ############################# - early_stopping_rounds <- 5L - bst <- lgb.train( + # all 10 iterations should happen, and the best_iter should be the final one + expect_equal(bst$best_iter, nrounds) + + # best_score should be taken from "increasing_metric" + expect_equal( + bst$best_score + , increasing_metric_starting_value + 0.1 * nrounds + ) + + # early stopping should not have happened. 
Even though constant_metric + # had 9 consecutive iterations with no improvement, it is ignored because of + # first_metric_only = TRUE + expect_equal( + length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) + , nrounds + ) + expect_equal( + length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) + , nrounds + ) +}) + +test_that("lgb.train() works when a mixture of functions and strings are passed to eval", { + set.seed(708L) + nrounds <- 10L + early_stopping_rounds <- 3L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.train( params = list( objective = "regression" - , metric = "rmse" - , min_data_in_bin = 5L - , early_stopping_rounds = early_stopping_rounds + , metric = "None" ) - , data = dtrain + , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds , valids = list( - "valid1" = dvalid + "valid1" = DVALID_RANDOM_REGRESSION + ) + , eval = list( + .increasing_metric + , "rmse" + , .constant_metric + , "l2" ) ) - # the best model should be from the first iteration, and only 6 rounds - # should have happen (1 with improvement, 5 consecutive with no improvement) - expect_equal(bst$best_score, 55.0) - expect_equal(bst$best_iter, 1L) - expect_equal( - length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) - , early_stopping_rounds + 1L + # all 4 metrics should have been used + expect_named( + bst$record_evals[["valid1"]] + , expected = c("rmse", "l2", "increasing_metric", "constant_metric") + , ignore.order = TRUE + , ignore.case = FALSE + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid1"]] + expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 0.9278173) < TOLERANCE) + expect_true(abs(results[["l2"]][["eval"]][[1L]] - 0.8608449) < TOLERANCE) + expected_increasing_metric <- increasing_metric_starting_value + 0.1 + expect_true( + abs( + results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric + ) < TOLERANCE ) + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + +}) + +test_that("lgb.train() works when a character vector is passed to eval", { + set.seed(708L) + nrounds <- 10L + early_stopping_rounds <- 3L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.train( + params = list( + objective = "binary" + , metric = "None" + ) + , data = DTRAIN_RANDOM_CLASSIFICATION + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_CLASSIFICATION + ) + , eval = c( + "binary_error" + , "binary_logloss" + ) + , verbose = 1 + ) + + # all 4 metrics should have been used + expect_named( + bst$record_evals[["valid1"]] + , expected = c("rmse", "l2", "increasing_metric", "constant_metric") + , ignore.order = TRUE + , ignore.case = FALSE + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid1"]] + expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 0.9278173) < TOLERANCE) + expect_true(abs(results[["l2"]][["eval"]][[1L]] - 0.8608449) < TOLERANCE) + expected_increasing_metric <- increasing_metric_starting_value + 0.1 + expect_true( + abs( + results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric + ) < TOLERANCE + ) + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + +}) + +test_that("lgb.train() works when a list of strings is passed to eval", { + set.seed(708L) + nrounds <- 10L + early_stopping_rounds <- 3L + 
increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.train( + params = list( + objective = "binary" + , metric = "None" + ) + , data = DTRAIN_RANDOM_CLASSIFICATION + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_CLASSIFICATION + ) + , eval = list( + "binary_error" + , "binary_logloss" + ) + , verbose = 1 + ) + + # all 4 metrics should have been used + expect_named( + bst$record_evals[["valid1"]] + , expected = c("rmse", "l2", "increasing_metric", "constant_metric") + , ignore.order = TRUE + , ignore.case = FALSE + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid1"]] + expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 0.9278173) < TOLERANCE) + expect_true(abs(results[["l2"]][["eval"]][[1L]] - 0.8608449) < TOLERANCE) + expected_increasing_metric <- increasing_metric_starting_value + 0.1 + expect_true( + abs( + results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric + ) < TOLERANCE + ) + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + }) From 0800f675abf6d4f3d4736c96872bcf09988bfc48 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 14 Mar 2020 22:29:37 -0500 Subject: [PATCH 04/10] fixed handling of multiple metrics --- R-package/R/lgb.train.R | 35 +++++--- R-package/R/utils.R | 4 +- R-package/man/lgb.train.Rd | 82 ++++++++++-------- R-package/tests/testthat/test_basic.R | 120 ++++++++++++++++++++------ 4 files changed, 164 insertions(+), 77 deletions(-) diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index af1441c1b8d6..0f6f104f2f66 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -12,7 +12,9 @@ #' \itemize{ #' \item{\bold{a. character vector}: #' If you provide a character vector to this argument, it should contain strings with valid -#' evaluation metrics. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{The "metric" section of the documentation} +#' evaluation metrics. +#' See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{ +#' The "metric" section of the documentation} #' for a list of valid metrics. #' } #' \item{\bold{b. function}: @@ -20,8 +22,12 @@ #' should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named #' list with three elements: #' \itemize{ -#' \item{\code{name}: A string with the name of the metric, used for printing and storing results.} -#' \item{\code{value}: A single number indicating the value of the metric for the given predictions and true values} +#' \item{\code{name}: A string with the name of the metric, used for printing +#' and storing results. +#' } +#' \item{\code{value}: A single number indicating the value of the metric for the +#' given predictions and true values +#' } #' \item{ #' \code{higher_better}: A boolean indicating whether higher values indicate a better fit. #' For example, this would be \code{FALSE} for metrics like MAE or RMSE. @@ -29,8 +35,8 @@ #' } #' } #' \item{\bold{c. list}: -#' If a list is given, it should only contain character vectors and functions. These should follow the -#' requirements from the descriptions above. +#' If a list is given, it should only contain character vectors and functions. +#' These should follow the requirements from the descriptions above. 
#' } #' } #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} @@ -146,9 +152,7 @@ lgb.train <- function(params = list(), } if (methods::is(eval, "list")) { eval_functions <- Filter( - f = function(eval_element){ - is.function(eval_element) - } + f = is.function , x = eval ) } @@ -322,14 +326,25 @@ lgb.train <- function(params = list(), # Collection: Has validation dataset? if (length(valids) > 0L) { - for (eval_function in eval_functions){ + # Get evaluation results with passed-in functions + for (eval_function in eval_functions) { # Validation has training dataset? if (valid_contain_train) { eval_list <- append(eval_list, booster$eval_train(feval = eval_function)) } - # Has no validation dataset + eval_list <- append(eval_list, booster$eval_valid(feval = eval_function)) + } + + # Calling booster$eval_valid() will get + # evaluation results with the metrics in params$metric by calling LGBM_BoosterGetEval_R", + # so need to be sure that gets called, which it wouldn't be above if no functions + # were passed in + if (length(eval_functions) == 0L) { + if (valid_contain_train) { + eval_list <- append(eval_list, booster$eval_train(feval = eval_function)) + } eval_list <- append(eval_list, booster$eval_valid(feval = eval_function)) } diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 7d7f05a4727b..0be28f041f59 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -275,9 +275,9 @@ lgb.check.eval <- function(params, eval) { # If more than one character metric was given, then "None" should # not be included - if (length(params$metric) > 1){ + if (length(params$metric) > 1L) { params$metric <- Filter( - f = function(metric){ + f = function(metric) { metric != "None" } , x = params$metric diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 65707450f5ad..59c2e395a189 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -39,7 +39,38 @@ may allow you to pass other types of data like \code{matrix} and then separately \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} \item{eval}{evaluation function(s). This can be a character vector, function, or list with a mixture of -strings and functions.} + strings and functions. + + \itemize{ + \item{\bold{a. character vector}: + If you provide a character vector to this argument, it should contain strings with valid + evaluation metrics. + See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{ + The "metric" section of the documentation} + for a list of valid metrics. + } + \item{\bold{b. function}: + You can provide a custom evaluation function. This + should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named + list with three elements: + \itemize{ + \item{\code{name}: A string with the name of the metric, used for printing + and storing results. + } + \item{\code{value}: A single number indicating the value of the metric for the + given predictions and true values + } + \item{ + \code{higher_better}: A boolean indicating whether higher values indicate a better fit. + For example, this would be \code{FALSE} for metrics like MAE or RMSE. + } + } + } + \item{\bold{c. list}: + If a list is given, it should only contain character vectors and functions. + These should follow the requirements from the descriptions above. 
+ } + }} \item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training} @@ -86,43 +117,18 @@ Logic to train with LightGBM \section{Early Stopping}{ - \itemize{ - \item{\bold{a. character vector}: - If you provide a character vector to this argument, it should contain strings with valid - evaluation metrics. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{The "metric" section of the documentation} - for a list of valid metrics. - } - \item{\bold{b. function}: - You can provide a custom evaluation function. This - should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named - list with three elements: - - \itemize{ - \item{\code{name}: A string with the name of the metric, used for printing and storing results.} - \item{\code{value}: A single number indicating the value of the metric for the given predictions and true values} - \item{ - \code{higher_better}: A boolean indicating whether higher values indicate a better fit. - For example, this would be \code{FALSE} for metrics like MAE or RMSE. - } - } - } - \item{\bold{c. list}: - If a list is given, it should only contain character vectors and functions. These should follow the - requirements from the descriptions above. - } - \item{\bold{Early stopping behavior}: - - If multiple arguments are given, their order will be preserved. If you enable early stopping by - setting \code{early_stopping_rounds} in \code{params}, by default all metrics will be - considered for early stopping. - - If you want to only consider the first metric for early stopping, pass - \code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric} - in \code{params}, that metric will be considered the "first" one. If you omit \code{metric}, - a default metric will be used based on your choice for the parameter \code{obj} (keyword argument) - or \code{objective} (passed into \code{params}). - } - } + "early stopping" refers to stopping the training process if the model's performance on a given + validation set does not improve for several consecutive iterations. + + If multiple arguments are given to \code{eval}, their order will be preserved. If you enable + early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all + metrics will be considered for early stopping. + + If you want to only consider the first metric for early stopping, pass + \code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric} + in \code{params}, that metric will be considered the "first" one. If you omit \code{metric}, + a default metric will be used based on your choice for the parameter \code{obj} (keyword argument) + or \code{objective} (passed into \code{params}). 
} \examples{ diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index f0e98df4163f..4012af324f1f 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -13,8 +13,8 @@ TOLERANCE <- 1e-6 # metric increases every iteration ACCUMULATOR_NAME <- "INCREASING_METRIC_ACUMULATOR" assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv) -.increasing_metric <- function(preds, dtrain){ - if (!exists(ACCUMULATOR_NAME, envir = .GlobalEnv)){ +.increasing_metric <- function(preds, dtrain) { + if (!exists(ACCUMULATOR_NAME, envir = .GlobalEnv)) { assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv) } assign( @@ -32,7 +32,7 @@ assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv) # [description] Evaluation function that always returns the # same value CONSTANT_METRIC_VALUE <- 0.2 -.constant_metric <- function(preds, dtrain){ +.constant_metric <- function(preds, dtrain) { return(list( name = "constant_metric" , value = CONSTANT_METRIC_VALUE @@ -50,12 +50,12 @@ DVALID_RANDOM_REGRESSION <- lgb.Dataset( , label = rnorm(50L) ) DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( - data = as.matrix(rnorm(120), ncol = 1L, drop = FALSE) - , label = sample(c(0L, 1L), size = 120, replace = TRUE) + data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) + , label = sample(c(0L, 1L), size = 120L, replace = TRUE) ) DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( - data = as.matrix(rnorm(37), ncol = 1L, drop = FALSE) - , label = sample(c(0L, 1L), size = 37, replace = TRUE) + data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) + , label = sample(c(0L, 1L), size = 37L, replace = TRUE) ) test_that("train and predict binary classification", { @@ -689,7 +689,7 @@ test_that("If first_metric_only is not given or is FALSE, lgb.train() decides to ) ) - for (params in param_variations){ + for (params in param_variations) { nrounds <- 10L bst <- lgb.train( @@ -758,7 +758,6 @@ test_that("If first_metric_only is TRUE, lgb.train() decides to stop early based .increasing_metric , .constant_metric ) - , verbose = 1 ) # Only the two functions provided to "eval" should have been evaluated @@ -850,7 +849,6 @@ test_that("lgb.train() works when a character vector is passed to eval", { "binary_error" , "binary_logloss" ) - , verbose = 1 ) # all 4 metrics should have been used @@ -875,7 +873,55 @@ test_that("lgb.train() works when a character vector is passed to eval", { }) -test_that("lgb.train() works when a list of strings is passed to eval", { +test_that("lgb.train() works when a list of strings or a character vector is passed to eval", { + + # testing list and character vector, as well as length-1 and length-2 + eval_variations <- list( + c("binary_error", "binary_logloss") + , "binary_logloss" + , list("binary_error", "binary_logloss") + , list("binary_logloss") + ) + + for (eval_variation in eval_variations) { + + set.seed(708L) + nrounds <- 10L + early_stopping_rounds <- 3L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.train( + params = list( + objective = "binary" + , metric = "None" + ) + , data = DTRAIN_RANDOM_CLASSIFICATION + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_CLASSIFICATION + ) + , eval = eval_variation + ) + + # both metrics should have been used + expect_named( + bst$record_evals[["valid1"]] + , expected = unlist(eval_variation) + , ignore.order = TRUE + , ignore.case = FALSE + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid1"]] + 
if ("binary_error" %in% unlist(eval_variation)) { + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5135135) < TOLERANCE) + } + if ("binary_logloss" %in% unlist(eval_variation)) { + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6992222) < TOLERANCE) + } + } +}) + +test_that("lgb.train() works when you specify both 'metric' and 'eval' with strings", { set.seed(708L) nrounds <- 10L early_stopping_rounds <- 3L @@ -883,38 +929,58 @@ test_that("lgb.train() works when a list of strings is passed to eval", { bst <- lgb.train( params = list( objective = "binary" - , metric = "None" + , metric = "binary_error" ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds , valids = list( "valid1" = DVALID_RANDOM_CLASSIFICATION ) - , eval = list( - "binary_error" - , "binary_logloss" - ) - , verbose = 1 + , eval = "binary_logloss" ) - # all 4 metrics should have been used + # both metrics should have been used expect_named( bst$record_evals[["valid1"]] - , expected = c("rmse", "l2", "increasing_metric", "constant_metric") + , expected = c("binary_error", "binary_logloss") , ignore.order = TRUE , ignore.case = FALSE ) # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 0.9278173) < TOLERANCE) - expect_true(abs(results[["l2"]][["eval"]][[1L]] - 0.8608449) < TOLERANCE) - expected_increasing_metric <- increasing_metric_starting_value + 0.1 - expect_true( - abs( - results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric - ) < TOLERANCE + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5135135) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6992222) < TOLERANCE) +}) + +test_that("lgb.train() works when you specify both 'metric' and 'eval' with strings", { + set.seed(708L) + nrounds <- 10L + early_stopping_rounds <- 3L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.train( + params = list( + objective = "binary" + , metric = "binary_error" + ) + , data = DTRAIN_RANDOM_CLASSIFICATION + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_CLASSIFICATION + ) + , eval = "binary_logloss" + ) + + # both metrics should have been used + expect_named( + bst$record_evals[["valid1"]] + , expected = c("binary_error", "binary_logloss") + , ignore.order = TRUE + , ignore.case = FALSE ) - expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid1"]] + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5135135) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6992222) < TOLERANCE) }) From 68e784052f956d7a37d45de41f08d5f03c17a874 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 14 Mar 2020 23:02:10 -0500 Subject: [PATCH 05/10] fixed tests --- R-package/tests/testthat/test_basic.R | 86 ++++++++++----------------- 1 file changed, 31 insertions(+), 55 deletions(-) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 4012af324f1f..70345d519b03 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -6,6 +6,7 @@ train <- agaricus.train test <- agaricus.test TOLERANCE <- 1e-6 +set.seed(708L) # [description] Every time this function is called, it adds 0.1 # to an accumulator then returns the 
current value. @@ -788,7 +789,6 @@ test_that("If first_metric_only is TRUE, lgb.train() decides to stop early based test_that("lgb.train() works when a mixture of functions and strings are passed to eval", { set.seed(708L) nrounds <- 10L - early_stopping_rounds <- 3L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.train( params = list( @@ -818,51 +818,8 @@ test_that("lgb.train() works when a mixture of functions and strings are passed # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 0.9278173) < TOLERANCE) - expect_true(abs(results[["l2"]][["eval"]][[1L]] - 0.8608449) < TOLERANCE) - expected_increasing_metric <- increasing_metric_starting_value + 0.1 - expect_true( - abs( - results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric - ) < TOLERANCE - ) - expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) - -}) - -test_that("lgb.train() works when a character vector is passed to eval", { - set.seed(708L) - nrounds <- 10L - early_stopping_rounds <- 3L - increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) - bst <- lgb.train( - params = list( - objective = "binary" - , metric = "None" - ) - , data = DTRAIN_RANDOM_CLASSIFICATION - , nrounds = nrounds - , valids = list( - "valid1" = DVALID_RANDOM_CLASSIFICATION - ) - , eval = c( - "binary_error" - , "binary_logloss" - ) - ) - - # all 4 metrics should have been used - expect_named( - bst$record_evals[["valid1"]] - , expected = c("rmse", "l2", "increasing_metric", "constant_metric") - , ignore.order = TRUE - , ignore.case = FALSE - ) - - # the difference metrics shouldn't have been mixed up with each other - results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 0.9278173) < TOLERANCE) - expect_true(abs(results[["l2"]][["eval"]][[1L]] - 0.8608449) < TOLERANCE) + expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < TOLERANCE) + expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < TOLERANCE) expected_increasing_metric <- increasing_metric_starting_value + 0.1 expect_true( abs( @@ -887,7 +844,6 @@ test_that("lgb.train() works when a list of strings or a character vector is pas set.seed(708L) nrounds <- 10L - early_stopping_rounds <- 3L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.train( params = list( @@ -913,10 +869,10 @@ test_that("lgb.train() works when a list of strings or a character vector is pas # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] if ("binary_error" %in% unlist(eval_variation)) { - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5135135) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) } if ("binary_logloss" %in% unlist(eval_variation)) { - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6992222) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) } } }) @@ -924,7 +880,6 @@ test_that("lgb.train() works when a list of strings or a character vector is pas test_that("lgb.train() works when you specify both 'metric' and 'eval' with strings", { set.seed(708L) nrounds <- 10L - early_stopping_rounds <- 3L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.train( params = 
list( @@ -949,14 +904,13 @@ test_that("lgb.train() works when you specify both 'metric' and 'eval' with stri # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5135135) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6992222) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) }) test_that("lgb.train() works when you specify both 'metric' and 'eval' with strings", { set.seed(708L) nrounds <- 10L - early_stopping_rounds <- 3L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.train( params = list( @@ -981,6 +935,28 @@ test_that("lgb.train() works when you specify both 'metric' and 'eval' with stri # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5135135) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6992222) < TOLERANCE) + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE) + expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) +}) + +test_that("lgb.train() works when you give a function for eval", { + set.seed(708L) + nrounds <- 10L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.train( + params = list( + objective = "binary" + , metric = "None" + ) + , data = DTRAIN_RANDOM_CLASSIFICATION + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_CLASSIFICATION + ) + , eval = .constant_metric + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid1"]] + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) }) From ac74501e0736d60a698befee420275a30bfd2b0c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 2 Aug 2020 23:35:25 -0500 Subject: [PATCH 06/10] remove duplicate tests --- R-package/tests/testthat/test_basic.R | 31 --------------------------- 1 file changed, 31 deletions(-) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index d73f1b62c308..22fdeda94237 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1086,37 +1086,6 @@ test_that("lgb.train() works when you specify both 'metric' and 'eval' with stri expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) }) -test_that("lgb.train() works when you specify both 'metric' and 'eval' with strings", { - set.seed(708L) - nrounds <- 10L - increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) - bst <- lgb.train( - params = list( - objective = "binary" - , metric = "binary_error" - ) - , data = DTRAIN_RANDOM_CLASSIFICATION - , nrounds = nrounds - , valids = list( - "valid1" = DVALID_RANDOM_CLASSIFICATION - ) - , eval = "binary_logloss" - ) - - # both metrics should have been used - expect_named( - bst$record_evals[["valid1"]] - , expected = c("binary_error", "binary_logloss") - , ignore.order = TRUE - , ignore.case = FALSE - ) - - # the difference metrics shouldn't have been mixed up with each other - results <- bst$record_evals[["valid1"]] - expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 
0.4864865) < TOLERANCE) - expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE) -}) - test_that("lgb.train() works when you give a function for eval", { set.seed(708L) nrounds <- 10L From 5c685e82c8dc63699f29061a6143d7e1ba37072a Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 26 Aug 2020 20:50:14 -0500 Subject: [PATCH 07/10] get training tests --- R-package/R/aliases.R | 14 ++++++++++++++ R-package/R/lgb.train.R | 3 +-- R-package/R/utils.R | 15 +++++++++------ R-package/tests/testthat/test_basic.R | 7 +++++-- R-package/tests/testthat/test_utils.R | 9 +++++++++ 5 files changed, 38 insertions(+), 10 deletions(-) diff --git a/R-package/R/aliases.R b/R-package/R/aliases.R index 09cb86629872..8176125b6f2f 100644 --- a/R-package/R/aliases.R +++ b/R-package/R/aliases.R @@ -108,3 +108,17 @@ ) return(c(learning_params, .DATASET_PARAMETERS())) } + +# [description] +# Per https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst#metric, +# a few different strings can be used to indicate "no metrics". +# [returns] +# A character vector +.NO_METRIC_STRINGS <- function() { + return(c( + "na" + , "None" + , "null" + , "custom" + )) +} diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index 8c0e5e15af6d..10348153d171 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -173,7 +173,6 @@ lgb.train <- function(params = list(), if (!is.null(predictor)) { begin_iteration <- predictor$current_iter() + 1L } - # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one n_trees <- .PARAMETER_ALIASES()[["num_iterations"]] if (any(names(params) %in% n_trees)) { @@ -384,7 +383,7 @@ lgb.train <- function(params = list(), # when using a custom eval function, the metric name is returned from the # function, so figure it out from record_evals - if (!is.null(feval)) { + if (!is.null(eval_functions[1L])) { first_metric <- names(booster$record_evals[[first_valid_name]])[1L] } else { first_metric <- booster$.__enclos_env__$private$eval_names[1L] diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 5472313b4d66..770598810d2e 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -317,10 +317,11 @@ lgb.check.obj <- function(params, obj) { } -# [description] Take any character values from eval and store them -# in params$metric. This has to account for the fact that -# `eval` could be a character vector, a function, a list of functions, -# or a list with a mix of strings and functions +# [description] +# Take any character values from eval and store them in params$metric. 
+# This has to account for the fact that `eval` could be a character vector, +# a function, a list of functions, or a list with a mix of strings and +# functions lgb.check.eval <- function(params, eval) { if (is.null(params$metric)) { @@ -335,7 +336,6 @@ lgb.check.eval <- function(params, eval) { for (i in seq_along(eval)) { element <- eval[[i]] if (is.character(element)) { - print(paste0("Adding '", element, "' to list of metrics")) params$metric <- append(params$metric, element) } } @@ -346,7 +346,7 @@ lgb.check.eval <- function(params, eval) { if (length(params$metric) > 1L) { params$metric <- Filter( f = function(metric) { - metric != "None" + !(metric %in% .NO_METRIC_STRINGS()) } , x = params$metric ) @@ -356,5 +356,8 @@ lgb.check.eval <- function(params, eval) { params$metric <- append(params$metric, unlist(eval)) } + # duplicate metrics should be filtered out + params$metric <- as.list(unique(unlist(params$metric))) + return(params) } diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index c000b0d6df2a..2c82037a7cbb 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -13,7 +13,8 @@ set.seed(708L) # This is used to mock the situation where an evaluation # metric increases every iteration ACCUMULATOR_NAME <- "INCREASING_METRIC_ACUMULATOR" -assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv) +assign(x = "INCREASING_METRIC_ACUMULATOR", value = 0.0, envir = .GlobalEnv) + .increasing_metric <- function(preds, dtrain) { if (!exists(ACCUMULATOR_NAME, envir = .GlobalEnv)) { assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv) @@ -758,6 +759,7 @@ test_that("lgb.train() works with early stopping for regression", { bst <- lgb.train( params = list( objective = "regression" + , metric = "rmse" ) , data = dtrain , nrounds = nrounds @@ -780,7 +782,6 @@ test_that("lgb.train() works with early stopping for regression", { params = list( objective = "regression" , metric = "rmse" - , min_data_in_bin = 5L , early_stopping_rounds = early_stopping_rounds ) , data = dtrain @@ -1168,6 +1169,7 @@ test_that("lgb.train() works with early stopping for regression with a metric th ) }) + test_that("lgb.train() supports non-ASCII feature names", { testthat::skip("UTF-8 feature names are not fully supported in the R package") dtrain <- lgb.Dataset( @@ -1555,3 +1557,4 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai expect_equal(pred1, pred2) }) + diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index af440118f936..ab2c21950d64 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -115,3 +115,12 @@ test_that("lgb.check.eval adds eval to metric in params if a list is provided", expect_named(params, "metric") expect_identical(params[["metric"]], list("auc", "binary_error", "binary_logloss")) }) + +test_that("lgb.check.eval drops duplicate metrics and preserves order", { + params <- lgb.check.eval( + params = list(metric = "l1") + , eval = list("l2", "rmse", "l1", "rmse") + ) + expect_named(params, "metric") + expect_identical(params[["metric"]], list("l1", "l2", "rmse")) +}) From ed44f9ee190944dc2019958a623ed7d1db2b7a4d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 26 Aug 2020 23:41:31 -0500 Subject: [PATCH 08/10] fixes for lgb.cv() --- R-package/R/lgb.cv.R | 38 ++++++++++-------- R-package/R/lgb.train.R | 52 +------------------------ R-package/R/lightgbm.R | 51 +++++++++++++++++++++++++ R-package/R/utils.R | 4 
-- R-package/man/lgb.cv.Rd | 55 +++++++++++++++++++++++++-- R-package/man/lgb_shared_params.Rd | 55 +++++++++++++++++++++++++++ R-package/man/lightgbm.Rd | 17 +++++++++ R-package/tests/testthat/test_basic.R | 1 - 8 files changed, 199 insertions(+), 74 deletions(-) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index e38a2d9a82b7..4734d4bad743 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -24,10 +24,6 @@ CVBooster <- R6::R6Class( #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. #' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}} #' @param weight vector of response values. If not NULL, will set to dataset -#' @param obj objective function, can be character or custom objective function. Examples include -#' \code{regression}, \code{regression_l1}, \code{huber}, -#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass} -#' @param eval evaluation function, can be (list of) character or custom eval function #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param showsd \code{boolean}, whether to show standard deviation of cross validation #' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified @@ -52,7 +48,7 @@ CVBooster <- R6::R6Class( #' the number of real CPU cores, not the number of threads (most #' CPU using hyper-threading to generate 2 threads per CPU core).} #' } -#' +#' @inheritSection lgb_shared_params Early Stopping #' @return a trained model \code{lgb.CVBooster}. #' #' @examples @@ -114,7 +110,7 @@ lgb.cv <- function(params = list() params <- lgb.check.obj(params, obj) params <- lgb.check.eval(params, eval) fobj <- NULL - feval <- NULL + eval_functions <- list(NULL) # Check for objective (function or not) if (is.function(params$objective)) { @@ -122,9 +118,17 @@ lgb.cv <- function(params = list() params$objective <- "NONE" } - # Check for loss (function or not) + # If loss is a single function, store it as a 1-element list + # (for backwards compatibility). 
If it is a list of functions, store + # all of them if (is.function(eval)) { - feval <- eval + eval_functions <- list(eval) + } + if (methods::is(eval, "list")) { + eval_functions <- Filter( + f = is.function + , x = eval + ) } # Init predictor to empty @@ -358,7 +362,11 @@ lgb.cv <- function(params = list() # Update one boosting iteration msg <- lapply(cv_booster$boosters, function(fd) { fd$booster$update(fobj = fobj) - fd$booster$eval_valid(feval = feval) + out <- list() + for (eval_function in eval_functions) { + out <- append(out, fd$booster$eval_valid(feval = eval_function)) + } + return(out) }) # Prepare collection of evaluation results @@ -585,7 +593,6 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) { ret_eval[[j]]$value <- mean(eval_result[[j]]) } - # Preinit evaluation error ret_eval_err <- NULL # Check for standard deviation @@ -604,10 +611,11 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) { } - # Return errors - list( - eval_list = ret_eval - , eval_err_list = ret_eval_err - ) + return({ + list( + eval_list = ret_eval + , eval_err_list = ret_eval_err + ) + }) } diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index 10348153d171..e1637ed3c007 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -3,42 +3,6 @@ #' @description Logic to train with LightGBM #' @inheritParams lgb_shared_params #' @param valids a list of \code{lgb.Dataset} objects, used for validation -#' @param obj objective function, can be character or custom objective function. Examples include -#' \code{regression}, \code{regression_l1}, \code{huber}, -#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass} -#' @param eval evaluation function(s). This can be a character vector, function, or list with a mixture of -#' strings and functions. -#' -#' \itemize{ -#' \item{\bold{a. character vector}: -#' If you provide a character vector to this argument, it should contain strings with valid -#' evaluation metrics. -#' See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{ -#' The "metric" section of the documentation} -#' for a list of valid metrics. -#' } -#' \item{\bold{b. function}: -#' You can provide a custom evaluation function. This -#' should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named -#' list with three elements: -#' \itemize{ -#' \item{\code{name}: A string with the name of the metric, used for printing -#' and storing results. -#' } -#' \item{\code{value}: A single number indicating the value of the metric for the -#' given predictions and true values -#' } -#' \item{ -#' \code{higher_better}: A boolean indicating whether higher values indicate a better fit. -#' For example, this would be \code{FALSE} for metrics like MAE or RMSE. -#' } -#' } -#' } -#' \item{\bold{c. list}: -#' If a list is given, it should only contain character vectors and functions. -#' These should follow the requirements from the descriptions above. 
-#' } -#' } #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param colnames feature names, if not null, will use this to overwrite the names in dataset #' @param categorical_feature list of str or int @@ -58,21 +22,7 @@ #' the number of real CPU cores, not the number of threads (most #' CPU using hyper-threading to generate 2 threads per CPU core).} #' } -#' @section Early Stopping: -#' -#' "early stopping" refers to stopping the training process if the model's performance on a given -#' validation set does not improve for several consecutive iterations. -#' -#' If multiple arguments are given to \code{eval}, their order will be preserved. If you enable -#' early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all -#' metrics will be considered for early stopping. -#' -#' If you want to only consider the first metric for early stopping, pass -#' \code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric} -#' in \code{params}, that metric will be considered the "first" one. If you omit \code{metric}, -#' a default metric will be used based on your choice for the parameter \code{obj} (keyword argument) -#' or \code{objective} (passed into \code{params}). -#' +#' @inheritSection lgb_shared_params Early Stopping #' @return a trained booster model \code{lgb.Booster}. #' #' @examples diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index 84e2503db2f6..2ea789278af6 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -10,11 +10,61 @@ #' and one metric. If there's more than one, will check all of them #' except the training data. Returns the model with (best_iter + early_stopping_rounds). #' If early stopping occurs, the model will have 'best_iter' field. +#' @param eval evaluation function(s). This can be a character vector, function, or list with a mixture of +#' strings and functions. +#' +#' \itemize{ +#' \item{\bold{a. character vector}: +#' If you provide a character vector to this argument, it should contain strings with valid +#' evaluation metrics. +#' See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{ +#' The "metric" section of the documentation} +#' for a list of valid metrics. +#' } +#' \item{\bold{b. function}: +#' You can provide a custom evaluation function. This +#' should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named +#' list with three elements: +#' \itemize{ +#' \item{\code{name}: A string with the name of the metric, used for printing +#' and storing results. +#' } +#' \item{\code{value}: A single number indicating the value of the metric for the +#' given predictions and true values +#' } +#' \item{ +#' \code{higher_better}: A boolean indicating whether higher values indicate a better fit. +#' For example, this would be \code{FALSE} for metrics like MAE or RMSE. +#' } +#' } +#' } +#' \item{\bold{c. list}: +#' If a list is given, it should only contain character vectors and functions. +#' These should follow the requirements from the descriptions above. +#' } +#' } #' @param eval_freq evaluation output frequency, only effect when verbose > 0 #' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model #' @param nrounds number of training rounds +#' @param obj objective function, can be character or custom objective function. 
Examples include +#' \code{regression}, \code{regression_l1}, \code{huber}, +#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass} #' @param params List of parameters #' @param verbose verbosity for output, if <= 0, also will disable the print of evaluation during training +#' @section Early Stopping: +#' +#' "early stopping" refers to stopping the training process if the model's performance on a given +#' validation set does not improve for several consecutive iterations. +#' +#' If multiple arguments are given to \code{eval}, their order will be preserved. If you enable +#' early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all +#' metrics will be considered for early stopping. +#' +#' If you want to only consider the first metric for early stopping, pass +#' \code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric} +#' in \code{params}, that metric will be considered the "first" one. If you omit \code{metric}, +#' a default metric will be used based on your choice for the parameter \code{obj} (keyword argument) +#' or \code{objective} (passed into \code{params}). #' @keywords internal NULL @@ -47,6 +97,7 @@ NULL #' the number of real CPU cores, not the number of threads (most #' CPU using hyper-threading to generate 2 threads per CPU core).} #' } +#' @inheritSection lgb_shared_params Early Stopping #' @export lightgbm <- function(data, label = NULL, diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 770598810d2e..b3816a6debd7 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -352,10 +352,6 @@ lgb.check.eval <- function(params, eval) { ) } - if (identical(class(eval), "list")) { - params$metric <- append(params$metric, unlist(eval)) - } - # duplicate metrics should be filtered out params$metric <- as.list(unique(unlist(params$metric))) diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index d4f62c2b2207..987b04ec1a0a 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -45,9 +45,41 @@ may allow you to pass other types of data like \code{matrix} and then separately \item{obj}{objective function, can be character or custom objective function. Examples include \code{regression}, \code{regression_l1}, \code{huber}, - \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} - -\item{eval}{evaluation function, can be (list of) character or custom eval function} +\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} + +\item{eval}{evaluation function(s). This can be a character vector, function, or list with a mixture of + strings and functions. + + \itemize{ + \item{\bold{a. character vector}: + If you provide a character vector to this argument, it should contain strings with valid + evaluation metrics. + See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{ + The "metric" section of the documentation} + for a list of valid metrics. + } + \item{\bold{b. function}: + You can provide a custom evaluation function. This + should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named + list with three elements: + \itemize{ + \item{\code{name}: A string with the name of the metric, used for printing + and storing results. + } + \item{\code{value}: A single number indicating the value of the metric for the + given predictions and true values + } + \item{ + \code{higher_better}: A boolean indicating whether higher values indicate a better fit. 
+ For example, this would be \code{FALSE} for metrics like MAE or RMSE. + } + } + } + \item{\bold{c. list}: + If a list is given, it should only contain character vectors and functions. + These should follow the requirements from the descriptions above. + } + }} \item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training} @@ -99,6 +131,23 @@ a trained model \code{lgb.CVBooster}. \description{ Cross validation logic used by LightGBM } +\section{Early Stopping}{ + + + "early stopping" refers to stopping the training process if the model's performance on a given + validation set does not improve for several consecutive iterations. + + If multiple arguments are given to \code{eval}, their order will be preserved. If you enable + early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all + metrics will be considered for early stopping. + + If you want to only consider the first metric for early stopping, pass + \code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric} + in \code{params}, that metric will be considered the "first" one. If you omit \code{metric}, + a default metric will be used based on your choice for the parameter \code{obj} (keyword argument) + or \code{objective} (passed into \code{params}). +} + \examples{ \dontrun{ data(agaricus.train, package = "lightgbm") diff --git a/R-package/man/lgb_shared_params.Rd b/R-package/man/lgb_shared_params.Rd index 501997fd2e2d..ed1922bc2225 100644 --- a/R-package/man/lgb_shared_params.Rd +++ b/R-package/man/lgb_shared_params.Rd @@ -16,12 +16,50 @@ and one metric. If there's more than one, will check all of them except the training data. Returns the model with (best_iter + early_stopping_rounds). If early stopping occurs, the model will have 'best_iter' field.} +\item{eval}{evaluation function(s). This can be a character vector, function, or list with a mixture of + strings and functions. + + \itemize{ + \item{\bold{a. character vector}: + If you provide a character vector to this argument, it should contain strings with valid + evaluation metrics. + See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{ + The "metric" section of the documentation} + for a list of valid metrics. + } + \item{\bold{b. function}: + You can provide a custom evaluation function. This + should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named + list with three elements: + \itemize{ + \item{\code{name}: A string with the name of the metric, used for printing + and storing results. + } + \item{\code{value}: A single number indicating the value of the metric for the + given predictions and true values + } + \item{ + \code{higher_better}: A boolean indicating whether higher values indicate a better fit. + For example, this would be \code{FALSE} for metrics like MAE or RMSE. + } + } + } + \item{\bold{c. list}: + If a list is given, it should only contain character vectors and functions. + These should follow the requirements from the descriptions above. + } + }} + \item{eval_freq}{evaluation output frequency, only effect when verbose > 0} \item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model} \item{nrounds}{number of training rounds} +\item{obj}{objective function, can be character or custom objective function. 
Examples include +\code{regression}, \code{regression_l1}, \code{huber}, +\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} + \item{params}{List of parameters} \item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training} @@ -29,4 +67,21 @@ If early stopping occurs, the model will have 'best_iter' field.} \description{ Parameter docs shared by \code{lgb.train}, \code{lgb.cv}, and \code{lightgbm} } +\section{Early Stopping}{ + + + "early stopping" refers to stopping the training process if the model's performance on a given + validation set does not improve for several consecutive iterations. + + If multiple arguments are given to \code{eval}, their order will be preserved. If you enable + early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all + metrics will be considered for early stopping. + + If you want to only consider the first metric for early stopping, pass + \code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric} + in \code{params}, that metric will be considered the "first" one. If you omit \code{metric}, + a default metric will be used based on your choice for the parameter \code{obj} (keyword argument) + or \code{objective} (passed into \code{params}). +} + \keyword{internal} diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd index 256a7dc6e8e9..13806fbc05a7 100644 --- a/R-package/man/lightgbm.Rd +++ b/R-package/man/lightgbm.Rd @@ -74,3 +74,20 @@ List of callback functions that are applied at each iteration.} \description{ Simple interface for training a LightGBM model. } +\section{Early Stopping}{ + + + "early stopping" refers to stopping the training process if the model's performance on a given + validation set does not improve for several consecutive iterations. + + If multiple arguments are given to \code{eval}, their order will be preserved. If you enable + early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all + metrics will be considered for early stopping. + + If you want to only consider the first metric for early stopping, pass + \code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric} + in \code{params}, that metric will be considered the "first" one. If you omit \code{metric}, + a default metric will be used based on your choice for the parameter \code{obj} (keyword argument) + or \code{objective} (passed into \code{params}). 
+} + diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 2c82037a7cbb..69a7fb188beb 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1557,4 +1557,3 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai expect_equal(pred1, pred2) }) - From 62f4e793229e0368bb86f66a0645f5df883eba37 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 31 Aug 2020 01:02:04 -0500 Subject: [PATCH 09/10] fixes for lgb.cv() --- R-package/R/lgb.cv.R | 23 ++-- R-package/tests/testthat/test_basic.R | 166 ++++++++++++++++++++++++++ 2 files changed, 181 insertions(+), 8 deletions(-) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index 4734d4bad743..671c0f10a850 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -393,7 +393,13 @@ lgb.cv <- function(params = list() # When early stopping is not activated, we compute the best iteration / score ourselves # based on the first first metric if (record && is.na(env$best_score)) { - first_metric <- cv_booster$boosters[[1L]][[1L]]$.__enclos_env__$private$eval_names[1L] + # when using a custom eval function, the metric name is returned from the + # function, so figure it out from record_evals + if (!is.null(eval_functions[1L])) { + first_metric <- names(cv_booster$record_evals[["valid"]])[1L] + } else { + first_metric <- cv_booster$.__enclos_env__$private$eval_names[1L] + } .find_best <- which.min if (isTRUE(env$eval_list[[1L]]$higher_better[1L])) { .find_best <- which.max @@ -585,7 +591,8 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) { msg[[i]][[j]]$value })) }) - # Get evaluation + # Get evaluation. Just taking the first element here to + # get structture (name, higher_bettter, data_name) ret_eval <- msg[[1L]] # Go through evaluation length items @@ -593,6 +600,7 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) { ret_eval[[j]]$value <- mean(eval_result[[j]]) } + # Preinit evaluation error ret_eval_err <- NULL # Check for standard deviation @@ -611,11 +619,10 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) { } - return({ - list( - eval_list = ret_eval - , eval_err_list = ret_eval_err - ) - }) + # Return errors + list( + eval_list = ret_eval + , eval_err_list = ret_eval_err + ) } diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 69a7fb188beb..7a7545ce1f60 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1458,6 +1458,172 @@ test_that("using lightgbm() without early stopping, best_iter and best_score com expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) }) +test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings", { + set.seed(708L) + nrounds <- 10L + nfolds <- 4L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.cv( + params = list( + objective = "binary" + , metric = "binary_error" + ) + , data = DTRAIN_RANDOM_CLASSIFICATION + , nrounds = nrounds + , nfold = nfolds + , eval = "binary_logloss" + ) + + # both metrics should have been used + expect_named( + bst$record_evals[["valid"]] + , expected = c("binary_error", "binary_logloss") + , ignore.order = TRUE + , ignore.case = FALSE + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid"]] + expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < TOLERANCE) + 
expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < TOLERANCE) + + # all boosters should have been created + expect_length(bst$boosters, nfolds) +}) + +test_that("lgb.cv() works when you give a function for eval", { + set.seed(708L) + nrounds <- 10L + nfolds <- 3L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.cv( + params = list( + objective = "binary" + , metric = "None" + ) + , data = DTRAIN_RANDOM_CLASSIFICATION + , nfold = nfolds + , nrounds = nrounds + , eval = .constant_metric + ) + + # the difference metrics shouldn't have been mixed up with each other + results <- bst$record_evals[["valid"]] + expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE) + expect_named(results, "constant_metric") +}) + +test_that("If first_metric_only is TRUE, lgb.cv() decides to stop early based on only the first metric", { + set.seed(708L) + nrounds <- 10L + nfolds <- 5L + early_stopping_rounds <- 3L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.cv( + params = list( + objective = "regression" + , metric = "None" + , early_stopping_rounds = early_stopping_rounds + , first_metric_only = TRUE + ) + , data = DTRAIN_RANDOM_REGRESSION + , nfold = nfolds + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_REGRESSION + ) + , eval = list( + .increasing_metric + , .constant_metric + ) + ) + + # Only the two functions provided to "eval" should have been evaluated + expect_named(bst$record_evals[["valid"]], c("increasing_metric", "constant_metric")) + + # all 10 iterations should happen, and the best_iter should be the final one + expect_equal(bst$best_iter, nrounds) + + # best_score should be taken from "increasing_metric" + # + # this expected value looks magical and confusing, but it's because + # evaluation metrics are averaged over all folds. + # + # consider 5-fold CV with a metric that adds 0.1 to a global accumulator + # each time it's called + # + # * iter 1: [0.1, 0.2, 0.3, 0.4, 0.5] (mean = 0.3) + # * iter 2: [0.6, 0.7, 0.8, 0.9, 1.0] (mean = 1.3) + # * iter 3: [1.1, 1.2, 1.3, 1.4, 1.5] (mean = 1.8) + # + cv_value <- increasing_metric_starting_value + mean(seq_len(nfolds) / 10.0) + (nrounds - 1L) * 0.1 * nfolds + expect_equal(bst$best_score, cv_value) + + # early stopping should not have happened. Even though constant_metric + # had 9 consecutive iterations with no improvement, it is ignored because of + # first_metric_only = TRUE + expect_equal( + length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]]) + , nrounds + ) + expect_equal( + length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]]) + , nrounds + ) +}) + +test_that("early stopping works with lgb.cv()", { + set.seed(708L) + nrounds <- 10L + nfolds <- 5L + early_stopping_rounds <- 3L + increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) + bst <- lgb.cv( + params = list( + objective = "regression" + , metric = "None" + , early_stopping_rounds = early_stopping_rounds + , first_metric_only = TRUE + ) + , data = DTRAIN_RANDOM_REGRESSION + , nfold = nfolds + , nrounds = nrounds + , valids = list( + "valid1" = DVALID_RANDOM_REGRESSION + ) + , eval = list( + .constant_metric + , .increasing_metric + ) + ) + + # only the two functions provided to "eval" should have been evaluated + expect_named(bst$record_evals[["valid"]], c("constant_metric", "increasing_metric")) + + # best_iter should be based on the first metric. 
 Since constant_metric
+  # never changes, its first iteration was the best one
+  expect_equal(bst$best_iter, 1L)
+
+  # best_score should be taken from the first metric
+  expect_equal(bst$best_score, 0.2)
+
+  # early stopping should have happened, since constant_metric was the first
+  # one passed to eval and it will not improve over consecutive iterations
+  #
+  # note that this test is identical to the previous one, but with the
+  # order of the eval metrics switched
+  expect_equal(
+    length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]])
+    , early_stopping_rounds + 1
+  )
+  expect_equal(
+    length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]])
+    , early_stopping_rounds + 1
+  )
+})
+
+context("interaction constraints")
+
 test_that("lgb.train() throws an informative error if interaction_constraints is not a list", {
   dtrain <- lgb.Dataset(train$data, label = train$label)
   params <- list(objective = "regression", interaction_constraints = "[1,2],[3]")

From ea88e44a90c97b15451391c44e9642dc038acca0 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Mon, 31 Aug 2020 20:42:01 -0500
Subject: [PATCH 10/10] fix linting

---
 R-package/tests/testthat/test_basic.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index 7a7545ce1f60..638981e60cdf 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -1614,11 +1614,11 @@ test_that("early stopping works with lgb.cv()", {
   # order of the eval metrics switched
   expect_equal(
     length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]])
-    , early_stopping_rounds + 1
+    , early_stopping_rounds + 1L
   )
   expect_equal(
     length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]])
-    , early_stopping_rounds + 1
+    , early_stopping_rounds + 1L
   )
 })
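
For readers wanting to try the behaviour these patches document and test, here is a minimal usage sketch. It is not part of the patch series: the helper name .custom_mae and the metric name "custom_mae" are invented for illustration, the agaricus example data is the dataset shipped with the package, and the getinfo() accessor is assumed to match the package API targeted by these patches. The custom function follows the preds/dtrain signature and the name/value/higher_better return contract described in the parameter docs above, and early stopping is configured through params as described in the "Early Stopping" section.

library(lightgbm)

# custom evaluation function: accepts 'preds' and 'dtrain' and returns a named
# list with 'name', 'value', and 'higher_better'
# (illustrative helper, not part of the patches)
.custom_mae <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  return(list(
    name = "custom_mae"
    , value = mean(abs(labels - preds))
    , higher_better = FALSE
  ))
}

data(agaricus.train, package = "lightgbm")
data(agaricus.test, package = "lightgbm")
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dvalid <- lgb.Dataset.create.valid(dtrain, agaricus.test$data, label = agaricus.test$label)

bst <- lgb.train(
  params = list(
    objective = "regression"
    , metric = "l2"
    , early_stopping_rounds = 5L
    # 'metric' is given, so "l2" is treated as the "first" metric and, with
    # first_metric_only = TRUE, it is the only one that can trigger early stopping
    , first_metric_only = TRUE
  )
  , data = dtrain
  , nrounds = 10L
  , valids = list("valid1" = dvalid)
  # a mixed list of built-in metric names and custom functions
  , eval = list("l1", .custom_mae)
)

# each metric is recorded under its own name
names(bst$record_evals[["valid1"]])

The same eval argument and params entries carry over to lgb.cv(), whose fold-averaged results are recorded under record_evals[["valid"]], as exercised by the new tests above.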