Minimal Variance Sampling booster (#4266)
* Added base for minimal variance sampling booster

* Implemented MVS booster with support for multi-output targets and deterministic execution on small datasets

* Updated documentation and fixed some linting errors

* Fixed Python sklearn documentation, tried to fix R CRAN CI

* Second attempt to fix R pipeline

* Fixed R package build for windows and linting error

* Revert "Fixed R package build for windows and linting error"

This reverts commit d50769e

* Revert "Revert "Fixed R package build for windows and linting error""

This reverts commit f531f3a.

* Fixed some documentation

* Fixed indentation error in mvs.hpp, fixed some Windows build issues, added Sphinx version upper bound

* Fixed indentation error in mvs.hpp, fixed some Windows build issues, added Sphinx version upper bound

* Update requirements_base.txt

* Update R-package/src/Makevars.in

Co-authored-by: James Lamb <jaylamb20@gmail.com>

* Update R-package/src/Makevars.win.in

Co-authored-by: James Lamb <jaylamb20@gmail.com>

* Added MVS booster support for dask tests

* Moved CalculateThresholdSequential to array_args.h and renamed it to CalculateThresholdMVS

* Added cpp tests for ArrayArgs::CalculateThresholdMVS and ArrayArgs::Partition.

* Fix linter errors in test_dask.py

* Fixed UB in ArrayArgs::Partition when it is called with one element.

* Fixed linter errors

* Added more cpp tests and fixed linting errors

* Fixed linting errors

* Updated R-package documentation
Updated documentation
Updated test_mvs_threshold_search.cpp
Added parallel computation of regularized absolute value term.
Added new mvs parameter from constant.

* Updated MVS Lambda algorithm

* Updated documentation, MVS::GetLambda, MVS::GetThreshold, updated MVS::ResetConfig

* [ci] fix current `master` fails with graphviz-related error (#5068)

* Update test_windows.ps1

* Update .appveyor.yml

* Update test_windows.ps1

* Update .appveyor.yml

Co-authored-by: James Lamb <jaylamb20@gmail.com>
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
Co-authored-by: Yu Shi <shiyu_k1994@qq.com>
4 people authored Mar 12, 2022
1 parent f6d654b commit 86822d6
Showing 16 changed files with 499 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .ci/test_windows.ps1
@@ -52,7 +52,7 @@ if ($env:TASK -eq "swig") {

conda install -q -y -n $env:CONDA_ENV joblib matplotlib numpy pandas psutil pytest scikit-learn scipy ; Check-Output $?
# python-graphviz has to be installed separately to prevent conda from downgrading to pypy
-conda install -q -y -n $env:CONDA_ENV python-graphviz ; Check-Output $?
+conda install -q -y -n $env:CONDA_ENV libxml2 python-graphviz ; Check-Output $?

if ($env:TASK -eq "regular") {
mkdir $env:BUILD_SOURCESDIRECTORY/build; cd $env:BUILD_SOURCESDIRECTORY/build
1 change: 1 addition & 0 deletions R-package/src/Makevars.in
@@ -26,6 +26,7 @@ OBJECTS = \
boosting/gbdt_model_text.o \
boosting/gbdt_prediction.o \
boosting/prediction_early_stop.o \
boosting/mvs.o \
io/bin.o \
io/config.o \
io/config_auto.o \
1 change: 1 addition & 0 deletions R-package/src/Makevars.win.in
@@ -27,6 +27,7 @@ OBJECTS = \
boosting/gbdt_model_text.o \
boosting/gbdt_prediction.o \
boosting/prediction_early_stop.o \
boosting/mvs.o \
io/bin.o \
io/config.o \
io/config_auto.o \
24 changes: 24 additions & 0 deletions docs/Parameters.rst
@@ -139,6 +139,8 @@ Core Parameters

- **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations

- ``mvs``, `Minimal Variance Sampling <https://arxiv.org/abs/1910.13204>`__

- ``data`` :raw-html:`<a id="data" title="Permalink to this parameter" href="#data">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string, aliases: ``train``, ``train_data``, ``train_data_file``, ``data_filename``

- path of training data, LightGBM will train from this data
@@ -336,6 +338,28 @@ Learning Control Parameters

- **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored

- ``mvs_lambda`` :raw-html:`<a id="mvs_lambda" title="Permalink to this parameter" href="#mvs_lambda">&#x1F517;&#xFE0E;</a>`, default = ``1e-4``, type = double, constraints: ``mvs_lambda > 0.0``

- regularization coefficient for the hessian term in MVS sampling weights; if ``mvs_adaptive == true``, this value is ignored

- used only in ``mvs``

- ``mvs_adaptive`` :raw-html:`<a id="mvs_adaptive" title="Permalink to this parameter" href="#mvs_adaptive">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

- use the adaptive variant of MVS boosting, in which ``mvs_lambda`` is estimated from the data during training

- used only in ``mvs``

- ``mvs_max_sequential_size`` :raw-html:`<a id="mvs_max_sequential_size" title="Permalink to this parameter" href="#mvs_max_sequential_size">&#x1F517;&#xFE0E;</a>`, default = ``256000``, type = int, constraints: ``mvs_max_sequential_size > 0``

- used in MVS boosting. If the dataset size is greater than ``mvs_max_sequential_size``, the MVS threshold is chosen for each thread independently

- used only in ``mvs``

- **Note**: on small datasets, setting this parameter to less than the dataset size may produce results that depend on the number of threads

- ``bagging_freq`` :raw-html:`<a id="bagging_freq" title="Permalink to this parameter" href="#bagging_freq">&#x1F517;&#xFE0E;</a>`, default = ``0``, type = int, aliases: ``subsample_freq``

- frequency for bagging
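
For orientation, a minimal Python sketch of how the parameters documented above fit together (values are illustrative, and this assumes a LightGBM build that includes this commit):

import numpy as np
import lightgbm as lgb

X = np.random.rand(1_000, 10)
y = np.random.rand(1_000)

params = {
    "boosting": "mvs",        # enable Minimal Variance Sampling
    "bagging_fraction": 0.5,  # expected share of rows sampled per iteration
    "bagging_freq": 1,        # MVS requires bagging_freq > 0 (see MVS::ResetMVS below)
    "mvs_adaptive": True,     # estimate mvs_lambda from the data
    # "mvs_lambda": 1e-4,     # fixed lambda, used only when mvs_adaptive=False
}
booster = lgb.train(params, lgb.Dataset(X, y), num_boost_round=10)
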
20 changes: 20 additions & 0 deletions include/LightGBM/config.h
@@ -147,6 +147,7 @@ struct Config {
// desc = ``dart``, `Dropouts meet Multiple Additive Regression Trees <https://arxiv.org/abs/1505.01866>`__
// desc = ``goss``, Gradient-based One-Side Sampling
// descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations
// desc = ``mvs``, `Minimal Variance Sampling <https://arxiv.org/abs/1910.13204>`__
std::string boosting = "gbdt";

// alias = train, train_data, train_data_file, data_filename
@@ -306,6 +307,25 @@ struct Config {
// desc = **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored
double neg_bagging_fraction = 1.0;

// default = 1e-4
// check = >0.0
// desc = regularization coefficient for the hessian term in MVS sampling weights; if ``mvs_adaptive == true``, this value is ignored
// desc = used only in ``mvs``
double mvs_lambda = 1e-4;

// default = false
// desc = use the adaptive variant of MVS boosting, in which ``mvs_lambda`` is estimated from the data during training
// desc = used only in ``mvs``
bool mvs_adaptive = false;

// default = 256000
// check = >0
// desc = used in MVS boosting. If the dataset size is greater than ``mvs_max_sequential_size``, the MVS threshold is chosen for each thread independently
// desc = used only in ``mvs``
// desc = **Note**: on small datasets, setting this parameter to less than the dataset size may produce results that depend on the number of threads
int mvs_max_sequential_size = 256000;

// alias = subsample_freq
// desc = frequency for bagging
// desc = ``0`` means disable bagging; ``k`` means perform bagging at every ``k`` iteration. Every ``k``-th iteration, LightGBM will randomly select ``bagging_fraction * 100 %`` of the data to use for the next ``k`` iterations
31 changes: 31 additions & 0 deletions include/LightGBM/utils/array_args.h
@@ -9,6 +9,7 @@
#include <LightGBM/utils/threading.h>

#include <algorithm>
#include <numeric>
#include <utility>
#include <vector>

@@ -185,6 +186,36 @@ class ArrayArgs {
}
return true;
}

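// Finds the threshold mu such that sum_i min(1, d_i / mu) over the
// derivative values in [begin, end) approximately equals sample_size,
// using an in-place three-way partition (quickselect-style); note that
// it reorders the contents of *gradients.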
static double CalculateThresholdMVS(std::vector<VAL_T>* gradients, data_size_t begin, data_size_t end,
const double sample_size) {
double current_sum_small = 0.0;
data_size_t big_grad_size = 0;

while (begin != end) {
data_size_t middle_begin = begin - 1, middle_end = end;
ArrayArgs<score_t>::Partition(gradients, begin, end, &middle_begin, &middle_end);
++middle_begin; // for half intervals
const data_size_t n_middle = middle_end - middle_begin;
const data_size_t large_size = middle_begin - begin;

const double sum_small = std::accumulate(gradients->begin() + middle_end, gradients->begin() + end, 0.0);
const double sum_middle = (*gradients)[middle_begin] * n_middle;

const double current_sampling_rate = (current_sum_small + sum_small) / (*gradients)[middle_begin] +
    big_grad_size + n_middle + large_size;

if (current_sampling_rate > sample_size) {
current_sum_small += sum_small + sum_middle;
end = middle_begin;
} else {
big_grad_size += n_middle + large_size;
begin = middle_end;
}
}

return current_sum_small / (sample_size - big_grad_size + kEpsilon);
}
};

} // namespace LightGBM
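
To make the threshold search above easier to follow, here is a rough NumPy transcription (a sketch only: `mvs_threshold` is an illustrative name, it sorts the array up front and ignores the multithreaded path, while `CalculateThresholdMVS` works in place via three-way partition):

import numpy as np

def mvs_threshold(derivatives, sample_size, eps=1e-10):
    """Find mu such that sum_i min(1, d_i / mu) is approximately sample_size."""
    d = np.sort(np.asarray(derivatives, dtype=np.float64))[::-1]  # descending
    sum_small = 0.0  # total weight of derivatives known to lie below mu
    big_count = 0    # derivatives known to be kept with probability 1
    lo, hi = 0, len(d)
    while lo != hi:
        pivot = d[(lo + hi) // 2]
        chunk = d[lo:hi]
        n_large = int((chunk > pivot).sum())
        n_small = int((chunk < pivot).sum())
        n_middle = len(chunk) - n_large - n_small
        small_sum = float(chunk[chunk < pivot].sum())
        # Estimated sample size if mu were exactly `pivot`: everything >= pivot
        # is kept outright, everything below contributes d_i / pivot.
        estimate = (sum_small + small_sum) / pivot + big_count + n_middle + n_large
        if estimate > sample_size:
            # mu must exceed pivot: middle and small values fall below it.
            sum_small += small_sum + n_middle * pivot
            hi = lo + n_large
        else:
            # mu is at most pivot: large and middle values are kept outright.
            big_count += n_middle + n_large
            lo = hi - n_small
    return sum_small / (sample_size - big_count + eps)

d = np.abs(np.random.randn(10_000))
mu = mvs_threshold(d, 1_000.0)
print(np.minimum(1.0, d / mu).sum())  # ~1000
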
1 change: 1 addition & 0 deletions python-package/lightgbm/sklearn.py
@@ -375,6 +375,7 @@ def __init__(
'dart', Dropouts meet Multiple Additive Regression Trees.
'goss', Gradient-based One-Side Sampling.
'rf', Random Forest.
'mvs', Minimal Variance Sampling.
num_leaves : int, optional (default=31)
Maximum tree leaves for base learners.
max_depth : int, optional (default=-1)
5 changes: 5 additions & 0 deletions src/boosting/boosting.cpp
@@ -8,6 +8,7 @@
#include "gbdt.h"
#include "goss.hpp"
#include "rf.hpp"
#include "mvs.hpp"

namespace LightGBM {

@@ -42,6 +43,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
return new GOSS();
} else if (type == std::string("rf")) {
return new RF();
} else if (type == std::string("mvs")) {
return new MVS();
} else {
return nullptr;
}
@@ -56,6 +59,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
ret.reset(new GOSS());
} else if (type == std::string("rf")) {
return new RF();
} else if (type == std::string("mvs")) {
return new MVS();
} else {
Log::Fatal("Unknown boosting type %s", type.c_str());
}
169 changes: 169 additions & 0 deletions src/boosting/mvs.cpp
@@ -0,0 +1,169 @@
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/

#include "mvs.hpp"

#include <memory>

namespace LightGBM {

using ConstTreeIterator = std::vector<std::unique_ptr<Tree>>::const_iterator;

MVS::MVS() : GBDT() {}

static double ComputeLeavesMeanSquaredValue(ConstTreeIterator begin,
ConstTreeIterator end,
const data_size_t num_leaves) {
double sum_values = 0.0;
#pragma omp parallel for schedule(static, 2048) reduction(+ : sum_values)
for (data_size_t leaf_idx = 0; leaf_idx < num_leaves; ++leaf_idx) {
double leave_value = 0.0;
for (ConstTreeIterator it = begin; it != end; ++it) {
if (leaf_idx < (**it).num_leaves()) {
const double value = (*it)->LeafOutput(leaf_idx);
leave_value += value * value;
}
}
sum_values += std::sqrt(leave_value);
}
return sum_values / num_leaves;
}

static double ComputeMeanGradValues(score_t *gradients, score_t *hessians,
data_size_t size,
data_size_t num_tree_per_iteration) {
double sum = 0.0;
#pragma omp parallel for schedule(static, 1024) reduction(+ : sum)
for (data_size_t i = 0; i < size; ++i) {
double local_hessians = 0.0, local_gradients = 0.0;
for (data_size_t j = 0; j < num_tree_per_iteration; ++j) {
size_t idx = static_cast<size_t>(size) * j + i;
local_hessians += hessians[idx] * hessians[idx];
local_gradients += gradients[idx] * gradients[idx];
}
sum += std::sqrt(local_gradients / local_hessians);
}
return sum / size;
}

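// GetLambda(): with mvs_adaptive=false this simply returns mvs_lambda. With
// mvs_adaptive=true the lambda is re-estimated during training: after the
// first iteration, from the root-sum-of-squares of leaf outputs across the
// previous iteration's trees, averaged over leaves; on the first iteration,
// from the mean of sqrt(sum g^2 / sum h^2) over the training data.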
double MVS::GetLambda() {
if (!mvs_adaptive_) {
return mvs_lambda_;
}
if (this->iter_ > 0) {
return ComputeLeavesMeanSquaredValue(models_.cend() - num_tree_per_iteration_,
models_.cend(), config_->num_leaves);
}
return ComputeMeanGradValues(gradients_.data(), hessians_.data(), num_data_,
num_tree_per_iteration_);
}

void MVS::Bagging(int iter) {
if (iter % config_->bagging_freq != 0 && !need_re_bagging_) {
return;
}
need_re_bagging_ = false;
bag_data_cnt_ = num_data_;
mvs_lambda_ = GetLambda();

//#pragma omp parallel for schedule(static, 1024)
for (data_size_t i = 0; i < num_data_; ++i) {
tmp_derivatives_[i] = 0.0f;
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + i;
tmp_derivatives_[i] += gradients_[idx] * gradients_[idx] + mvs_lambda_ * hessians_[idx] * hessians_[idx];
}
tmp_derivatives_[i] = std::sqrt(tmp_derivatives_[i]);
}

if (num_data_ <= config_->mvs_max_sequential_size) {
threshold_ = GetThreshold(0, num_data_);
}

auto left_cnt = bagging_runner_.Run<true>(
num_data_,
[=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t *left,
data_size_t *) {
data_size_t left_count = BaggingHelper(cur_start, cur_cnt, left);
return left_count;
},
bag_data_indices_.data());

bag_data_cnt_ = left_cnt;
if (!is_use_subset_) {
tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
} else {
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
bag_data_cnt_, false);
tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
bag_data_cnt_);
}
threshold_ = 0.0;
Log::Debug("MVS Sample size %d %d", left_cnt, static_cast<data_size_t>(config_->bagging_fraction * num_data_));
}

data_size_t MVS::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t *buffer) {
if (cnt <= 0) {
return 0;
}

const double threshold = GetThreshold(start, cnt);

data_size_t left_cnt = 0;
data_size_t right_pos = cnt;
data_size_t big_weight_cnt = 0;
for (data_size_t i = 0; i < cnt; ++i) {
data_size_t position = start + i;

double derivative = 0.0;
for (data_size_t j = 0; j < num_tree_per_iteration_; ++j) {
size_t idx = static_cast<size_t>(j) * num_data_ + position;
derivative += gradients_[idx] * gradients_[idx] + mvs_lambda_ * hessians_[idx] * hessians_[idx];
}
derivative = std::sqrt(derivative);

if (derivative >= threshold) {
buffer[left_cnt++] = position;
++big_weight_cnt;
} else {
const double proba_threshold = derivative / threshold;
const double proba = bagging_rands_[position / bagging_rand_block_].NextFloat();
if (proba < proba_threshold) {
buffer[left_cnt++] = position;
for (data_size_t tree_id = 0; tree_id < num_tree_per_iteration_; ++tree_id) {
size_t idx = static_cast<size_t>(num_data_) * tree_id + position;
gradients_[idx] /= proba_threshold;
hessians_[idx] /= proba_threshold;
}
} else {
buffer[--right_pos] = position;
}
}
}

return left_cnt;
}

double MVS::GetThreshold(data_size_t begin, data_size_t cnt) {
if (num_data_ <= config_->mvs_max_sequential_size && threshold_ != 0.0) {
return threshold_;
}

double threshold = ArrayArgs<score_t>::CalculateThresholdMVS(&tmp_derivatives_, begin, begin + cnt,
cnt * config_->bagging_fraction);
return threshold;
}

void MVS::ResetMVS() {
CHECK(config_->bagging_fraction > 0.0f && config_->bagging_fraction < 1.0f && config_->bagging_freq > 0);
CHECK(config_->mvs_lambda >= 0.0f);
CHECK(!balanced_bagging_);
bag_data_indices_.resize(num_data_);
tmp_derivatives_.resize(num_data_);
Log::Info("Using MVS");
}

} // namespace LightGBM
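
Tying the pieces together, a condensed Python sketch of one MVS sampling round for the single-output case (illustrative: `mvs_sample` and `rng` are made-up names, `mvs_threshold` is the sketch from the `array_args.h` section above, and the C++ code additionally sums squared derivatives across `num_tree_per_iteration` outputs and splits the work across threads):

import numpy as np

rng = np.random.default_rng(42)

def mvs_sample(gradients, hessians, mvs_lambda, sample_size):
    """One sampling round, mirroring MVS::Bagging + MVS::BaggingHelper."""
    # Regularized derivative per example, as computed in MVS::Bagging.
    d = np.sqrt(gradients ** 2 + mvs_lambda * hessians ** 2)
    mu = mvs_threshold(d, sample_size)  # see the earlier sketch
    p = np.minimum(1.0, d / mu)         # per-example inclusion probability
    keep = rng.random(len(d)) < p       # d >= mu  =>  p == 1  =>  always kept
    # Importance reweighting: dividing by the inclusion probability keeps the
    # expected weighted gradient/hessian sums over the sample unbiased.
    return np.flatnonzero(keep), gradients[keep] / p[keep], hessians[keep] / p[keep]

idx, g, h = mvs_sample(np.random.randn(10_000), np.ones(10_000), 1e-4, 1_000.0)
print(len(idx))  # roughly 1000 rows selected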