
Minimal Variance Sampling booster #4266

Merged: 32 commits, merged on Mar 12, 2022

Changes from 14 commits

Commits (32)
1af3f3e
Added base for minimal variance sampling booster
kruda Apr 12, 2021
0ad2740
Implemented MVS booster with support for multioutput targets, determi…
kruda May 9, 2021
08462f9
Merge remote-tracking branch 'upstream/master'
kruda May 9, 2021
b067a5b
Updated documentation and fixed some linting errors
kruda May 9, 2021
8229008
fixed python sklearn documentation, tried to fix R CRAN CI
kruda May 9, 2021
0f2620e
Second attempt to fix R pipeline
kruda May 9, 2021
d50769e
Fixed R package build for windows and linting error
kruda May 9, 2021
f531f3a
Revert "Fixed R package build for windows and linting error"
kruda May 9, 2021
ef1a28c
Revert "Revert "Fixed R package build for windows and linting error""
kruda May 9, 2021
c610035
Fixed some documentation
kruda May 9, 2021
4425874
Fixed indentation error in mvs.hpp, fixed some windows build issues, …
kruda May 9, 2021
a5b72f8
Fixed indentation error in mvs.hpp, fixed some windows build issues, …
kruda May 9, 2021
64a99f4
Merge branch 'master' into master
kruda May 9, 2021
fb8ff6e
Update requirements_base.txt
kruda May 9, 2021
d499d15
Update R-package/src/Makevars.in
kruda May 10, 2021
8a01fb8
Update R-package/src/Makevars.win.in
kruda May 10, 2021
4b630a1
Added MVS booster support for dask tests
kruda May 10, 2021
29fc099
Merge remote-tracking branch 'origin/master'
kruda May 10, 2021
49ed4eb
Moved CalculateThresholdSequential to array_args.h and renamed it to …
kruda May 10, 2021
d018ed0
Added cpp tests for ArrayArgs::CalculateThresholdMVS and ArrayArgs::P…
kruda May 10, 2021
d62c98c
Fix linter errors in test_dask.py
kruda May 10, 2021
8cee27e
Fixed UB in ArrayArgs::Partition, when it is called with one element.
kruda May 10, 2021
224ac05
Fixed linter errors
kruda May 10, 2021
5cd4422
Added more cpp tests and fixed linting errors
kruda May 10, 2021
468102b
Fixed linting errors
kruda May 10, 2021
fd3f64a
Updated R-package documentation
kruda May 11, 2021
11df789
Updated MVS Lambda algorithm
kruda May 14, 2021
b489b66
Merge branch 'microsoft:master' into master
kruda May 15, 2021
bab99b3
Merge branch 'master' of https://github.com/microsoft/LightGBM
kruda Jul 6, 2021
ddcab83
Updated documentation, MVS::GetLambda, MVS::GetThreshold, updated MVS…
kruda Jul 6, 2021
31ab4d4
[ci] fix current `master` fails with graphviz-related error (#5068)
StrikerRUS Mar 11, 2022
9ce06ae
sync with LightGBM/master
shiyu1994 Mar 11, 2022
1 change: 1 addition & 0 deletions R-package/src/Makevars.in
@@ -26,6 +26,7 @@ OBJECTS = \
boosting/gbdt_model_text.o \
boosting/gbdt_prediction.o \
boosting/prediction_early_stop.o \
boosting/mvs.o \
io/bin.o \
io/config.o \
io/config_auto.o \
1 change: 1 addition & 0 deletions R-package/src/Makevars.win.in
@@ -27,6 +27,7 @@ OBJECTS = \
boosting/gbdt_model_text.o \
boosting/gbdt_prediction.o \
boosting/prediction_early_stop.o \
boosting/mvs.o \
io/bin.o \
io/config.o \
io/config_auto.o \
14 changes: 14 additions & 0 deletions docs/Parameters.rst
@@ -119,6 +119,8 @@ Core Parameters

- **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations

- ``mvs``, `Minimal Variance Sampling <https://arxiv.org/abs/1910.13204>`__

- ``linear_tree`` :raw-html:`<a id="linear_tree" title="Permalink to this parameter" href="#linear_tree">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool, aliases: ``linear_trees``

- fit piecewise linear gradient boosting tree
@@ -336,6 +338,18 @@ Learning Control Parameters

- **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored

- ``mvs_lambda`` :raw-html:`<a id="mvs_lambda" title="Permalink to this parameter" href="#mvs_lambda">&#x1F517;&#xFE0E;</a>`, default = ``1e-4``, type = double, constraints: ``mvs_lambda > 0.0``

- used in MVS boosting; if ``mvs_adaptive == true``, this value is ignored

- used only in ``mvs``

- ``mvs_adaptive`` :raw-html:`<a id="mvs_adaptive" title="Permalink to this parameter" href="#mvs_adaptive">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

- use the adaptive variant of MVS boosting

- used only in ``mvs``

- ``bagging_freq`` :raw-html:`<a id="bagging_freq" title="Permalink to this parameter" href="#bagging_freq">&#x1F517;&#xFE0E;</a>`, default = ``0``, type = int, aliases: ``subsample_freq``

- frequency for bagging
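For orientation, here is a minimal sketch of how these new parameters would be set through the Python package on this branch; the synthetic dataset and the specific values are illustrative, not part of the PR:

```python
import numpy as np
import lightgbm as lgb

# Illustrative regression data.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 10))
y = X[:, 0] + rng.normal(scale=0.1, size=1000)

params = {
    "objective": "regression",
    "boosting": "mvs",        # booster added by this PR
    "bagging_fraction": 0.5,  # ResetMVS() checks 0 < bagging_fraction < 1
    "bagging_freq": 1,        # ... and bagging_freq > 0
    "mvs_lambda": 1e-4,       # ignored when mvs_adaptive is true
    "mvs_adaptive": False,
}
booster = lgb.train(params, lgb.Dataset(X, y), num_boost_round=50)
```

Note that MVS reuses the bagging machinery, so `bagging_fraction` and `bagging_freq` must be set to sampling-enabled values (see the `CHECK` in `ResetMVS()` below).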
12 changes: 12 additions & 0 deletions include/LightGBM/config.h
@@ -147,6 +147,7 @@ struct Config {
// desc = ``dart``, `Dropouts meet Multiple Additive Regression Trees <https://arxiv.org/abs/1505.01866>`__
// desc = ``goss``, Gradient-based One-Side Sampling
// descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations
// desc = ``mvs``, `Minimal Variance Sampling <https://arxiv.org/abs/1910.13204>`__
std::string boosting = "gbdt";

// alias = linear_trees
@@ -318,6 +319,17 @@ struct Config {
// desc = **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored
double neg_bagging_fraction = 1.0;

// default = 1e-4
// check = >0.0
// desc = used in MVS boosting; if ``mvs_adaptive == true``, this value is ignored
// desc = used only in ``mvs``
double mvs_lambda = 1e-4;

// default = false
// desc = use the adaptive variant of MVS boosting
// desc = used only in ``mvs``
bool mvs_adaptive = false;

// alias = subsample_freq
// desc = frequency for bagging
// desc = ``0`` means disable bagging; ``k`` means perform bagging at every ``k`` iteration. Every ``k``-th iteration, LightGBM will randomly select ``bagging_fraction * 100 %`` of the data to use for the next ``k`` iterations
1 change: 1 addition & 0 deletions python-package/lightgbm/sklearn.py
@@ -364,6 +364,7 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
'dart', Dropouts meet Multiple Additive Regression Trees.
'goss', Gradient-based One-Side Sampling.
'rf', Random Forest.
'mvs', Minimal Variance Sampling.
num_leaves : int, optional (default=31)
Maximum tree leaves for base learners.
max_depth : int, optional (default=-1)
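And a corresponding sketch for the scikit-learn interface shown above; it assumes `mvs_lambda` and `mvs_adaptive` are forwarded to the core library through `**kwargs`, as other non-signature parameters are:

```python
from lightgbm import LGBMRegressor

model = LGBMRegressor(
    boosting_type="mvs",  # per the docstring addition above
    subsample=0.5,        # alias of bagging_fraction; MVS needs a value in (0, 1)
    subsample_freq=1,     # alias of bagging_freq; must be > 0
    mvs_adaptive=True,    # adaptive lambda; mvs_lambda is then ignored
)
# model.fit(X, y) proceeds as with any other boosting type.
```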
5 changes: 5 additions & 0 deletions src/boosting/boosting.cpp
@@ -8,6 +8,7 @@
#include "gbdt.h"
#include "goss.hpp"
#include "rf.hpp"
#include "mvs.hpp"

namespace LightGBM {

@@ -42,6 +43,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
return new GOSS();
} else if (type == std::string("rf")) {
return new RF();
} else if (type == std::string("mvs")) {
return new MVS();
} else {
return nullptr;
}
@@ -56,6 +59,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
ret.reset(new GOSS());
} else if (type == std::string("rf")) {
return new RF();
} else if (type == std::string("mvs")) {
return new MVS();
} else {
Log::Fatal("Unknown boosting type %s", type.c_str());
}
206 changes: 206 additions & 0 deletions src/boosting/mvs.cpp
@@ -0,0 +1,206 @@
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/

#include "mvs.hpp"

#include <memory>
#include <numeric>


namespace LightGBM {

using ConstTreeIterator = std::vector<std::unique_ptr<Tree>>::const_iterator;

MVS::MVS() : GBDT() {}

static double CalculateThresholdSequential(std::vector<score_t>* gradients, data_size_t begin, data_size_t end,
const double sample_size) {
double current_sum_small = 0.0;
data_size_t big_grad_size = 0;

while (begin != end) {
data_size_t middle_begin = 0, middle_end = 0;
ArrayArgs<score_t>::Partition(gradients, begin, end, &middle_begin, &middle_end);
++middle_begin; // for half intervals
const data_size_t n_middle = middle_end - middle_begin;
const data_size_t large_size = middle_begin - begin;

const double sum_small = std::accumulate(gradients->begin() + middle_end, gradients->begin() + end, 0.0);
const double sum_middle = (*gradients)[middle_begin] * n_middle;

const double
current_sampling_rate = (current_sum_small + sum_small) / (*gradients)[middle_begin] + big_grad_size + n_middle + large_size;

if (current_sampling_rate > sample_size) {
current_sum_small += sum_small + sum_middle;
end = middle_begin;
} else {
big_grad_size += n_middle + large_size;
begin = middle_end;
}
}

return current_sum_small / (sample_size - big_grad_size + kEpsilon);
}
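For reference, the search above follows the threshold condition from the MVS paper (https://arxiv.org/abs/1910.13204). With regularized gradient norms $\hat{g}_i = \sqrt{g_i^2 + \lambda h_i^2}$, it looks for the threshold $\mu$ at which the expected sample size equals the requested one:

$$\sum_{i:\,\hat{g}_i < \mu} \frac{\hat{g}_i}{\mu} + \left|\{i : \hat{g}_i \ge \mu\}\right| = n \cdot \text{bagging\_fraction}$$

Once the partition settles, the closed form $\mu = \sum_{\text{small}} \hat{g}_i \,/\, (n \cdot \text{bagging\_fraction} - n_{\text{big}})$ is exactly the `current_sum_small / (sample_size - big_grad_size + kEpsilon)` returned above.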

static double ComputeLeavesMeanSquaredValue(ConstTreeIterator begin, ConstTreeIterator end) {
double sum_values = 0.0;
data_size_t num_leaves = (*begin)->num_leaves();
Collaborator review comment:
Shall we change num_leaves to be config_->num_leaves instead of the number of leaves of the first tree? If there's a tree with more leaves than the first one, then the extra leaves will be ignored.

#pragma omp parallel for schedule(static, 2048) reduction(+:sum_values)
for (data_size_t leaf_idx = 0; leaf_idx < num_leaves; ++leaf_idx) {
double leave_value = 0.0;
for (ConstTreeIterator it = begin; it != end; ++it) {
if (leaf_idx < (**it).num_leaves()) {
const double value = (*it)->LeafOutput(leaf_idx);
leave_value += value * value;
}
}
sum_values += std::sqrt(leave_value);
}
return sum_values / num_leaves;
}

static double ComputeMeanGradValues(score_t *gradients,
score_t *hessians,
data_size_t size,
data_size_t num_tree_per_iteration) {
double sum = 0.0;
#pragma omp parallel for schedule(static, 1024) reduction(+:sum)
for (data_size_t i = 0; i < size; ++i) {
double local_hessians = 0.0, local_gradients = 0.0;
for (data_size_t j = 0; j < num_tree_per_iteration; ++j) {
size_t idx = static_cast<size_t>(size) * j + i;
local_hessians += hessians[idx] * hessians[idx];
local_gradients += gradients[idx] * gradients[idx];
}
sum += std::sqrt(local_gradients / local_hessians);
}
return sum / size;
}

double MVS::GetLambda() {
if (!mvs_adaptive_) {
return mvs_lambda_;
}
double lambda =
(this->iter_ > 0) ? ComputeLeavesMeanSquaredValue(models_.cend() - num_tree_per_iteration_, models_.cend())
/ config_->learning_rate
: ComputeMeanGradValues(gradients_.data(),
hessians_.data(),
num_data_,
num_tree_per_iteration_);

return lambda;
}
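Reading `GetLambda` together with the two helpers above: in adaptive mode, after the first iteration $\lambda$ is the mean root-sum-of-squares of leaf outputs across the $K$ trees of the previous iteration, scaled by the learning rate $\eta$; on the first iteration it falls back to the mean gradient-to-hessian magnitude:

$$\lambda_{\text{adaptive}} = \begin{cases} \dfrac{1}{\eta L} \sum_{l=1}^{L} \sqrt{\sum_{k=1}^{K} v_{k,l}^2} & \text{if iter} > 0 \\[1ex] \dfrac{1}{n} \sum_{i=1}^{n} \sqrt{\dfrac{\sum_{k} g_{k,i}^2}{\sum_{k} h_{k,i}^2}} & \text{if iter} = 0 \end{cases}$$

where $v_{k,l}$ is the output of leaf $l$ in tree $k$ and $L$ is taken from the first tree of the iteration (the point the review comment above raises).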

void MVS::Bagging(int iter) {
if (iter % config_->bagging_freq != 0 && !need_re_bagging_) {
Collaborator review comment:
need_re_bagging_ should be set to false if the Bagging is actually carried out, to make it consistent with the behavior in the default random sampling. Please check the code snippet below.

void GBDT::Bagging(int iter) {
Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer);
// if need bagging
if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) ||
need_re_bagging_) {
need_re_bagging_ = false;

return;
}

bag_data_cnt_ = num_data_;
mvs_lambda_ = GetLambda();

if (num_data_ <= kMaxSequentialSize) {
threshold_ = GetThreshold(0, num_data_);
}

auto left_cnt = bagging_runner_.Run<true>(
num_data_,
[=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t *left,
data_size_t *) {
data_size_t left_count = BaggingHelper(cur_start, cur_cnt, left);
return left_count;
},
bag_data_indices_.data());

bag_data_cnt_ = left_cnt;
if (!is_use_subset_) {
tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
} else {
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
bag_data_cnt_, false);
tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
bag_data_cnt_);
}
threshold_ = 0.0;
Log::Debug("MVS Sample size %d %d", left_cnt, static_cast<data_size_t>(config_->bagging_fraction * num_data_));
}

data_size_t MVS::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t *buffer) {
if (cnt <= 0) {
return 0;
}

const double threshold = GetThreshold(start, cnt);

data_size_t left_cnt = 0;
data_size_t right_pos = cnt;
data_size_t big_weight_cnt = 0;
for (data_size_t i = 0; i < cnt; ++i) {
data_size_t position = start + i;

double derivative = 0.0;
for (data_size_t j = 0; j < num_tree_per_iteration_; ++j) {
size_t idx = static_cast<size_t>(j) * num_data_ + position;
derivative += gradients_[idx] * gradients_[idx] + mvs_lambda_ * hessians_[idx] * hessians_[idx];
}
derivative = std::sqrt(derivative);

if (derivative >= threshold) {
buffer[left_cnt++] = position;
++big_weight_cnt;
} else {
const double proba_threshold = derivative / threshold;
const double proba = bagging_rands_[position / bagging_rand_block_].NextFloat();
if (proba < proba_threshold) {
buffer[left_cnt++] = position;
for (data_size_t tree_id = 0; tree_id < num_tree_per_iteration_; ++tree_id) {
size_t idx = static_cast<size_t>(num_data_) * tree_id + position;
gradients_[idx] /= proba_threshold;
hessians_[idx] /= proba_threshold;
}
} else {
buffer[--right_pos] = position;
}
}
}

return left_cnt;
}
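Summarizing the loop above as the MVS keep rule: every example with regularized gradient norm $\hat{g}_i \ge \mu$ is kept deterministically, while the rest are kept with probability $p_i = \hat{g}_i / \mu$ and, if kept, reweighted so the sampled gradient and hessian sums remain unbiased:

$$p_i = \min\!\left(1, \frac{\hat{g}_i}{\mu}\right), \qquad g_i \leftarrow \frac{g_i}{p_i}, \quad h_i \leftarrow \frac{h_i}{p_i} \quad \text{for sampled } i \text{ with } \hat{g}_i < \mu$$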

double MVS::GetThreshold(data_size_t begin, data_size_t cnt) {
data_size_t n_blocks, block_size;
Threading::BlockInfoForceSize<data_size_t>(num_data_, bagging_rand_block_, &n_blocks, &block_size);
if (num_data_ < kMaxSequentialSize && block_size > 1 && threshold_ != 0.0) {
return threshold_;
}

for (data_size_t i = begin; i < begin + cnt; ++i) {
tmp_derivatives_[i] = 0.0f;
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + i;
tmp_derivatives_[i] += gradients_[idx] * gradients_[idx] + mvs_lambda_ * hessians_[idx] * hessians_[idx];
}
tmp_derivatives_[i] = std::sqrt(tmp_derivatives_[i]);
}

double threshold = CalculateThresholdSequential(&tmp_derivatives_, begin, begin + cnt,
cnt * config_->bagging_fraction);
return threshold;
}

void MVS::ResetMVS() {
CHECK(config_->bagging_fraction > 0.0f && config_->bagging_fraction < 1.0f && config_->bagging_freq > 0);
CHECK(config_->mvs_lambda >= 0.0f);
CHECK(!balanced_bagging_);

bag_data_indices_.resize(num_data_);
tmp_derivatives_.resize(num_data_);
Log::Info("Using MVS");
}

} // namespace LightGBM