
Minimal Variance Sampling booster #4266

Merged: 32 commits, Mar 12, 2022
Changes from 28 commits
Commits (32)
1af3f3e
Added base for minimal variance sampling booster
kruda Apr 12, 2021
0ad2740
Implemented MVS booster with support for multioutput targets, determi…
kruda May 9, 2021
08462f9
Merge remote-tracking branch 'upstream/master'
kruda May 9, 2021
b067a5b
Updated documentation and fixed some linting errors
kruda May 9, 2021
8229008
fixed python sklearn documentation, tried to fix R CRAN CI
kruda May 9, 2021
0f2620e
Second attempt to fix R pipeline
kruda May 9, 2021
d50769e
Fixed R package build for windows and linting error
kruda May 9, 2021
f531f3a
Revert "Fixed R package build for windows and linting error"
kruda May 9, 2021
ef1a28c
Revert "Revert "Fixed R package build for windows and linting error""
kruda May 9, 2021
c610035
Fixed some documentation
kruda May 9, 2021
4425874
Fixed indentation error in mvs.hpp, fixed some windows build issues, …
kruda May 9, 2021
a5b72f8
Fixed indentation error in mvs.hpp, fixed some windows build issues, …
kruda May 9, 2021
64a99f4
Merge branch 'master' into master
kruda May 9, 2021
fb8ff6e
Update requirements_base.txt
kruda May 9, 2021
d499d15
Update R-package/src/Makevars.in
kruda May 10, 2021
8a01fb8
Update R-package/src/Makevars.win.in
kruda May 10, 2021
4b630a1
Added MVS booster support for dask tests
kruda May 10, 2021
29fc099
Merge remote-tracking branch 'origin/master'
kruda May 10, 2021
49ed4eb
Moved CalculateThresholdSequential to array_args.h and renamed it to …
kruda May 10, 2021
d018ed0
Added cpp tests for ArrayArgs::CalculateThresholdMVS and ArrayArgs::P…
kruda May 10, 2021
d62c98c
Fix linter errors in test_dask.py
kruda May 10, 2021
8cee27e
Fixed UB in ArrayArgs::Partition, when it is called with one element.
kruda May 10, 2021
224ac05
Fixed linter errors
kruda May 10, 2021
5cd4422
Added more cpp tests and fixed linting errors
kruda May 10, 2021
468102b
Fixed linting errors
kruda May 10, 2021
fd3f64a
Updated R-package documentation
kruda May 11, 2021
11df789
Updated MVS Lambda algorithm
kruda May 14, 2021
b489b66
Merge branch 'microsoft:master' into master
kruda May 15, 2021
bab99b3
Merge branch 'master' of https://github.com/microsoft/LightGBM
kruda Jul 6, 2021
ddcab83
Updated documentation, MVS::GetLambda, MVS::GetThreshold, updated MVS…
kruda Jul 6, 2021
31ab4d4
[ci] fix current `master` fails with graphviz-related error (#5068)
StrikerRUS Mar 11, 2022
9ce06ae
sync with LightGBM/master
shiyu1994 Mar 11, 2022
3 changes: 2 additions & 1 deletion R-package/R/lgb.cv.R
@@ -41,7 +41,8 @@ CVBooster <- R6::R6Class(
#' into a predictor model which frees up memory and the original datasets
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#' \itemize{
#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.}
#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"}, \code{"goss"}
#' or \code{"mvs"}.}
#' \item{\code{num_leaves}: Maximum number of leaves in one tree.}
#' \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with
#' overfit when #data is small. Tree still grow by leaf-wise.}
3 changes: 2 additions & 1 deletion R-package/R/lgb.train.R
@@ -15,7 +15,8 @@
#' @param ... other parameters, see \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{
#' the "Parameters" section of the documentation} for more information. A few key parameters:
#' \itemize{
#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.}
#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"},
#' \code{"goss"} or \code{"mvs"}.}
#' \item{\code{num_leaves}: Maximum number of leaves in one tree.}
#' \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with
#' overfitting. Tree still grow by leaf-wise.}
3 changes: 2 additions & 1 deletion R-package/R/lightgbm.R
@@ -90,7 +90,8 @@ NULL
#' say "the first and tenth columns").}
#' \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model
#' into a predictor model which frees up memory and the original datasets}
#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.}
#' \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"},
#' \code{"goss"} or \code{"mvs"}.}
#' \item{\code{num_leaves}: Maximum number of leaves in one tree.}
#' \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with
#' overfit when #data is small. Tree still grow by leaf-wise.}
3 changes: 2 additions & 1 deletion R-package/man/lgb.cv.Rd (generated file; diff not rendered)

3 changes: 2 additions & 1 deletion R-package/man/lgb.train.Rd (generated file; diff not rendered)

3 changes: 2 additions & 1 deletion R-package/man/lightgbm.Rd (generated file; diff not rendered)

1 change: 1 addition & 0 deletions R-package/src/Makevars.in
@@ -26,6 +26,7 @@ OBJECTS = \
boosting/gbdt_model_text.o \
boosting/gbdt_prediction.o \
boosting/prediction_early_stop.o \
boosting/mvs.o \
io/bin.o \
io/config.o \
io/config_auto.o \
1 change: 1 addition & 0 deletions R-package/src/Makevars.win.in
@@ -27,6 +27,7 @@ OBJECTS = \
boosting/gbdt_model_text.o \
boosting/gbdt_prediction.o \
boosting/prediction_early_stop.o \
boosting/mvs.o \
io/bin.o \
io/config.o \
io/config_auto.o \
24 changes: 24 additions & 0 deletions docs/Parameters.rst
@@ -119,6 +119,8 @@ Core Parameters

- **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations

- ``mvs``, `Minimal Variance Sampling <https://arxiv.org/abs/1910.13204>`__

- ``linear_tree`` :raw-html:`<a id="linear_tree" title="Permalink to this parameter" href="#linear_tree">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool, aliases: ``linear_trees``

- fit piecewise linear gradient boosting tree
@@ -336,6 +338,28 @@ Learning Control Parameters

- **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored

- ``mvs_lambda`` :raw-html:`<a id="mvs_lambda" title="Permalink to this parameter" href="#mvs_lambda">&#x1F517;&#xFE0E;</a>`, default = ``1e-4``, type = double, constraints: ``mvs_lambda > 0.0``

- used in ``mvs`` boosting; if ``mvs_adaptive == true``, this value is ignored

- used only in ``mvs``

- ``mvs_adaptive`` :raw-html:`<a id="mvs_adaptive" title="Permalink to this parameter" href="#mvs_adaptive">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

- use the adaptive variant of ``mvs`` boosting

- used only in ``mvs``

- ``mvs_max_sequential_size`` :raw-html:`<a id="mvs_max_sequential_size" title="Permalink to this parameter" href="#mvs_max_sequential_size">&#x1F517;&#xFE0E;</a>`, default = ``256000``, type = int, constraints: ``mvs_max_sequential_size > 0``

- used in ``mvs`` boosting; if the training dataset size is greater than ``mvs_max_sequential_size``, the MVS threshold is computed for each thread independently

- used only in ``mvs``

- **Note**: on small datasets, setting this parameter to a value smaller than the dataset size may produce results that depend on the number of threads

- ``bagging_freq`` :raw-html:`<a id="bagging_freq" title="Permalink to this parameter" href="#bagging_freq">&#x1F517;&#xFE0E;</a>`, default = ``0``, type = int, aliases: ``subsample_freq``

- frequency for bagging
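The ``mvs`` parameters documented in this file (``mvs_lambda``, ``mvs_adaptive``, ``mvs_max_sequential_size``), together with the new ``mvs`` value for ``boosting``, are all that is needed to switch the sampling strategy. The snippet below is a usage sketch, not part of this diff: it needs a LightGBM build that includes this branch, the data is synthetic, and the idea that ``bagging_fraction`` sets the target MVS sampling rate is my assumption, not something stated in the documentation above.

import numpy as np
import lightgbm as lgb

# Synthetic regression data, for illustration only.
rng = np.random.default_rng(42)
X = rng.normal(size=(10_000, 20))
y = X[:, 0] + 0.1 * rng.normal(size=10_000)

params = {
    "objective": "regression",
    "boosting": "mvs",               # new boosting type introduced by this PR
    "bagging_fraction": 0.5,         # assumed here to set the target sampling rate
    "mvs_lambda": 1e-4,              # fixed regularization; ignored when mvs_adaptive is true
    "mvs_adaptive": False,           # set to True to estimate lambda adaptively
    "mvs_max_sequential_size": 256000,
    "verbose": -1,
}

booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=50)
print(booster.predict(X[:3]))        # sanity check that training ran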
20 changes: 20 additions & 0 deletions include/LightGBM/config.h
@@ -147,6 +147,7 @@ struct Config {
// desc = ``dart``, `Dropouts meet Multiple Additive Regression Trees <https://arxiv.org/abs/1505.01866>`__
// desc = ``goss``, Gradient-based One-Side Sampling
// descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations
// desc = ``mvs``, `Minimal Variance Sampling <https://arxiv.org/abs/1910.13204>`__
std::string boosting = "gbdt";

// alias = linear_trees
@@ -318,6 +319,25 @@ struct Config {
// desc = **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored
double neg_bagging_fraction = 1.0;

// default = 1e-4
// check = >0.0
// desc = used in ``mvs`` boosting; if ``mvs_adaptive == true``, this value is ignored
// desc = used only in ``mvs``
double mvs_lambda = 1e-4;

// default = false
// desc = use the adaptive variant of ``mvs`` boosting
// desc = used only in ``mvs``
bool mvs_adaptive = false;

// default = 256000
// check = >0
// desc = used in ``mvs`` boosting; if the training dataset size is greater than ``mvs_max_sequential_size``, the MVS threshold is computed for each thread independently
// desc = used only in ``mvs``
// desc = **Note**: on small datasets, setting this parameter to a value smaller than the dataset size may produce results that depend on the number of threads
int mvs_max_sequential_size = 256000;

// alias = subsample_freq
// desc = frequency for bagging
// desc = ``0`` means disable bagging; ``k`` means perform bagging at every ``k`` iteration. Every ``k``-th iteration, LightGBM will randomly select ``bagging_fraction * 100 %`` of the data to use for the next ``k`` iterations
33 changes: 32 additions & 1 deletion include/LightGBM/utils/array_args.h
@@ -9,6 +9,7 @@
#include <LightGBM/utils/threading.h>

#include <algorithm>
#include <numeric>
#include <utility>
#include <vector>

@@ -103,7 +104,7 @@ class ArrayArgs {
int j = end - 1;
int p = i;
int q = j;
if (start >= end) {
if (start >= end - 1) {
return;
}
std::vector<VAL_T>& ref = *arr;
@@ -183,6 +184,36 @@
}
return true;
}

[Collaborator review comment on CalculateThresholdMVS: Is it more appropriate to move this to mvs.cpp?]

static double CalculateThresholdMVS(std::vector<VAL_T>* gradients, data_size_t begin, data_size_t end,
                                    const double sample_size) {
double current_sum_small = 0.0;
data_size_t big_grad_size = 0;

while (begin != end) {
data_size_t middle_begin = begin - 1, middle_end = end;
ArrayArgs<score_t>::Partition(gradients, begin, end, &middle_begin, &middle_end);
++middle_begin; // for half intervals
const data_size_t n_middle = middle_end - middle_begin;
const data_size_t large_size = middle_begin - begin;

const double sum_small = std::accumulate(gradients->begin() + middle_end, gradients->begin() + end, 0.0);
const double sum_middle = (*gradients)[middle_begin] * n_middle;

const double
current_sampling_rate = (current_sum_small + sum_small) / (*gradients)[middle_begin] + big_grad_size + n_middle + large_size;

if (current_sampling_rate > sample_size) {
current_sum_small += sum_small + sum_middle;
end = middle_begin;
} else {
big_grad_size += n_middle + large_size;
begin = middle_end;
}
}

return current_sum_small / (sample_size - big_grad_size + kEpsilon);
}
};

} // namespace LightGBM
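To make the search above easier to follow: CalculateThresholdMVS looks for the threshold mu at which the expected sample size, sum over i of min(1, g_i / mu), equals ``sample_size``; gradients at or above mu are kept with probability 1, the rest with probability g_i / mu. The sketch below is a sort-based Python rendering of that same fixed point, written for readability only; it is not a translation of the C++ code, which reaches the value with a quickselect-style partition (and, per the new parameters, per-thread blocks) instead of a full sort.

import numpy as np

def mvs_threshold(grad_abs, sample_size):
    # Find mu such that sum(min(1, g / mu)) ~= sample_size (sort-based sketch).
    g = np.sort(np.asarray(grad_abs, dtype=float))[::-1]   # descending
    n = len(g)
    tail = np.append(np.cumsum(g[::-1])[::-1], 0.0)        # tail[k] = sum of g[k:]
    for k in range(n):                                      # k = rows kept with probability 1
        if sample_size <= k:
            break
        mu = tail[k] / (sample_size - k)                    # k + tail[k] / mu == sample_size
        top_ok = (k == 0) or (g[k - 1] >= mu)               # the k largest really reach mu
        rest_ok = g[k] < mu                                 # the remaining rows fall below mu
        if top_ok and rest_ok:
            return mu
    return g[-1]                                            # fallback: keep every row

g = np.abs(np.random.default_rng(0).normal(size=1_000))
mu = mvs_threshold(g, sample_size=300)
print(np.minimum(1.0, g / mu).sum())                        # expected sample size, ~300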
1 change: 1 addition & 0 deletions python-package/lightgbm/sklearn.py
@@ -364,6 +364,7 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
'dart', Dropouts meet Multiple Additive Regression Trees.
'goss', Gradient-based One-Side Sampling.
'rf', Random Forest.
'mvs', Minimal Variance Sampling.
num_leaves : int, optional (default=31)
Maximum tree leaves for base learners.
max_depth : int, optional (default=-1)
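For the scikit-learn interface, the only change in this diff is the new 'mvs' entry in the ``boosting_type`` docstring above. A minimal usage sketch follows; the dataset is synthetic, and passing ``mvs_adaptive`` as an extra keyword argument relies on the wrapper's usual forwarding of unknown kwargs to the native parameters rather than on anything added by this PR.

from lightgbm import LGBMRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=5_000, n_features=20, noise=0.1, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

# boosting_type="mvs" selects the new booster; MVS-specific parameters are
# forwarded to the native library like any other keyword argument.
model = LGBMRegressor(boosting_type="mvs", n_estimators=100, mvs_adaptive=True)
model.fit(X_tr, y_tr, eval_set=[(X_te, y_te)])
print(model.best_score_)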
5 changes: 5 additions & 0 deletions src/boosting/boosting.cpp
@@ -8,6 +8,7 @@
#include "gbdt.h"
#include "goss.hpp"
#include "rf.hpp"
#include "mvs.hpp"

namespace LightGBM {

@@ -42,6 +43,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
return new GOSS();
} else if (type == std::string("rf")) {
return new RF();
} else if (type == std::string("mvs")) {
return new MVS();
} else {
return nullptr;
}
@@ -56,6 +59,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
ret.reset(new GOSS());
} else if (type == std::string("rf")) {
return new RF();
} else if (type == std::string("mvs")) {
return new MVS();
} else {
Log::Fatal("Unknown boosting type %s", type.c_str());
}