code refactoring: cost effective gradient boosting #2407

Merged: 11 commits, Sep 26, 2019
112 changes: 112 additions & 0 deletions src/treelearner/cost_effective_gradient_boosting.hpp
@@ -0,0 +1,112 @@
/*!
* Copyright (c) 2016 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_
#define LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_

#include <LightGBM/config.h>
#include <LightGBM/dataset.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>

#include <vector>

#include "data_partition.hpp"
#include "serial_tree_learner.h"
#include "split_info.hpp"

namespace LightGBM {

class CostEfficientGradientBoosting {
public:
explicit CostEfficientGradientBoosting(const SerialTreeLearner* tree_learner)
: tree_learner_(tree_learner) {}
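// CEGB is off only in the all-default case: tradeoff at or above 1.0, no
// per-split penalty, and no coupled or lazy per-feature penalties.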
static bool IsEnable(const Config* config) {
if (config->cegb_tradeoff >= 1.0f && config->cegb_penalty_split <= 0.0f
&& config->cegb_penalty_feature_coupled.empty() && config->cegb_penalty_feature_lazy.empty()) {
return false;
} else {
return true;
}
}
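// Sizes the per-(leaf, feature) candidate-split cache, checks that any
// penalty vectors cover all features of the dataset, and, for lazy
// penalties, allocates a bitset with one bit per (feature, data row) pair
// recording which feature values have already been acquired.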
void Init() {
auto train_data = tree_learner_->train_data_;
splits_per_leaf_.resize(static_cast<size_t>(tree_learner_->config_->num_leaves) * train_data->num_features());
is_feature_used_in_split_.clear();
is_feature_used_in_split_.resize(train_data->num_features());

if (!tree_learner_->config_->cegb_penalty_feature_coupled.empty()) {
CHECK(tree_learner_->config_->cegb_penalty_feature_coupled.size() == static_cast<size_t>(train_data->num_total_features()));
}
if (!tree_learner_->config_->cegb_penalty_feature_lazy.empty()) {
CHECK(tree_learner_->config_->cegb_penalty_feature_lazy.size() == static_cast<size_t>(train_data->num_total_features()));
feature_used_in_data_ = Common::EmptyBitset(train_data->num_features() * tree_learner_->num_data_);
}
}
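// Total CEGB penalty to subtract from this candidate split's gain:
//   tradeoff * penalty_split * num_data_in_leaf
//   + tradeoff * penalty_feature_coupled[real_fidx] (first split on this feature)
//   + tradeoff * lazy cost over leaf rows whose value of this feature has not
//     been acquired yet (see CalculateOndemandCosts below).
// The penalized candidate is also cached in splits_per_leaf_ so that
// UpdateLeafBestSplits() can revisit it when the coupled penalty is lifted.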
double DeltaGain(int feature_index, int real_fidx, int leaf_index, int num_data_in_leaf, SplitInfo split_info) {
auto config = tree_learner_->config_;
double delta = config->cegb_tradeoff * config->cegb_penalty_split * num_data_in_leaf;
if (!config->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
delta += config->cegb_tradeoff * config->cegb_penalty_feature_coupled[real_fidx];
}
if (!config->cegb_penalty_feature_lazy.empty()) {
delta += config->cegb_tradeoff * CalculateOndemandCosts(feature_index, real_fidx, leaf_index);
}
split_info.gain -= delta;
splits_per_leaf_[static_cast<size_t>(leaf_index) * tree_learner_->train_data_->num_features() + feature_index] = split_info;
return delta;
}
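// Called once a split is committed. The first split on a feature lifts its
// coupled penalty, so the cached candidates for that feature on all other
// leaves get the penalty added back to their gain and may overtake the
// current best split of their leaf. With lazy penalties, every data row in
// the split leaf is marked in the bitset as having acquired this feature.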
void UpdateLeafBestSplits(Tree* tree, int best_leaf, const SplitInfo* best_split_info, std::vector<SplitInfo>* best_split_per_leaf) {
auto config = tree_learner_->config_;
auto train_data = tree_learner_->train_data_;
const int inner_feature_index = train_data->InnerFeatureIndex(best_split_info->feature);
if (!config->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[inner_feature_index]) {
is_feature_used_in_split_[inner_feature_index] = true;
for (int i = 0; i < tree->num_leaves(); ++i) {
if (i == best_leaf) continue;
auto split = &splits_per_leaf_[static_cast<size_t>(i) * train_data->num_features() + inner_feature_index];
split->gain += config->cegb_tradeoff * config->cegb_penalty_feature_coupled[best_split_info->feature];
if (*split > best_split_per_leaf->at(i))
best_split_per_leaf->at(i) = *split;
}
}
if (!config->cegb_penalty_feature_lazy.empty()) {
data_size_t cnt_leaf_data = 0;
auto tmp_idx = tree_learner_->data_partition_->GetIndexOnLeaf(best_leaf, &cnt_leaf_data);
for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
int real_idx = tmp_idx[i_input];
Common::InsertBitset(feature_used_in_data_, train_data->num_data() * inner_feature_index + real_idx);
}
}
}

private:
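// Lazy (per-row) cost of splitting this leaf on the given feature: the
// per-feature penalty summed over the leaf's rows whose bit in
// feature_used_in_data_ is not yet set, i.e. rows for which this feature's
// value still has to be computed.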
double CalculateOndemandCosts(int feature_index, int real_fidx, int leaf_index) const {
if (tree_learner_->config_->cegb_penalty_feature_lazy.empty()) {
return 0.0f;
}
auto train_data = tree_learner_->train_data_;
double penalty = tree_learner_->config_->cegb_penalty_feature_lazy[real_fidx];

double total = 0.0f;
data_size_t cnt_leaf_data = 0;
auto tmp_idx = tree_learner_->data_partition_->GetIndexOnLeaf(leaf_index, &cnt_leaf_data);

for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
int real_idx = tmp_idx[i_input];
if (Common::FindInBitset(feature_used_in_data_.data(), train_data->num_data() * train_data->num_features(), train_data->num_data() * feature_index + real_idx)) {
continue;
}
total += penalty;
}
return total;
}

const SerialTreeLearner* tree_learner_;
std::vector<SplitInfo> splits_per_leaf_;
std::vector<bool> is_feature_used_in_split_;
std::vector<uint32_t> feature_used_in_data_;
};

} // namespace LightGBM
#endif // LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_
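
As a reference for the gate above, here is a minimal, illustrative sketch (not part of this PR) showing when IsEnable() reports CEGB as active. It assumes the LightGBM headers are on the include path and that Config's cegb_* fields carry their defaults (tradeoff 1.0, penalties unset):

#include <cassert>

#include <LightGBM/config.h>

#include "cost_effective_gradient_boosting.hpp"

int main() {
  LightGBM::Config config;
  // All defaults: tradeoff >= 1.0, zero split penalty, empty penalty vectors,
  // so the helper would never be constructed.
  assert(!LightGBM::CostEfficientGradientBoosting::IsEnable(&config));
  // Any positive per-split penalty (or a non-empty per-feature penalty
  // vector, or a tradeoff below 1.0) turns CEGB on.
  config.cegb_penalty_split = 0.5;
  assert(LightGBM::CostEfficientGradientBoosting::IsEnable(&config));
  return 0;
}

SerialTreeLearner::Init() and ResetConfig() in the next file perform exactly this check before constructing the helper.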
85 changes: 18 additions & 67 deletions src/treelearner/serial_tree_learner.cpp
@@ -14,6 +14,8 @@
#include <unordered_map>
#include <utility>

#include "cost_effective_gradient_boosting.hpp"

namespace LightGBM {

#ifdef TIMETAG
@@ -69,8 +71,6 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves);
// push split information for all leaves
best_split_per_leaf_.resize(config_->num_leaves);
splits_per_leaf_.resize(config_->num_leaves*train_data_->num_features());

// get ordered bin
train_data_->CreateOrderedBins(&ordered_bins_);

@@ -104,15 +104,9 @@
}
}
Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_);
is_feature_used_in_split_.clear();
is_feature_used_in_split_.resize(train_data->num_features());

if (!config_->cegb_penalty_feature_coupled.empty()) {
CHECK(config_->cegb_penalty_feature_coupled.size() == static_cast<size_t>(train_data_->num_total_features()));
}
if (!config_->cegb_penalty_feature_lazy.empty()) {
CHECK(config_->cegb_penalty_feature_lazy.size() == static_cast<size_t>(train_data_->num_total_features()));
feature_used_in_data = Common::EmptyBitset(train_data->num_features() * num_data_);
if (CostEfficientGradientBoosting::IsEnable(config_)) {
cegb_.reset(new CostEfficientGradientBoosting(this));
cegb_->Init();
}
}

@@ -139,6 +133,9 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
is_data_in_leaf_.resize(num_data_);
std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), static_cast<char>(0));
}
if (cegb_ != nullptr) {
cegb_->Init();
}
}

void SerialTreeLearner::ResetConfig(const Config* config) {
@@ -166,8 +163,11 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
} else {
config_ = config;
}

histogram_pool_.ResetConfig(config_);
if (CostEfficientGradientBoosting::IsEnable(config_)) {
cegb_.reset(new CostEfficientGradientBoosting(this));
cegb_->Init();
}
}

Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, Json& forced_split_json) {
@@ -520,28 +520,6 @@ void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_featur
#endif
}

double SerialTreeLearner::CalculateOndemandCosts(int feature_index, int leaf_index) {
if (config_->cegb_penalty_feature_lazy.empty())
return 0.0f;

double penalty = config_->cegb_penalty_feature_lazy[feature_index];

const int inner_fidx = train_data_->InnerFeatureIndex(feature_index);

double total = 0.0f;
data_size_t cnt_leaf_data = 0;
auto tmp_idx = data_partition_->GetIndexOnLeaf(leaf_index, &cnt_leaf_data);

for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
int real_idx = tmp_idx[i_input];
if (Common::FindInBitset(feature_used_in_data.data(), train_data_->num_data()*train_data_->num_features(), train_data_->num_data() * inner_fidx + real_idx))
continue;
total += penalty;
}

return total;
}

void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
@@ -575,14 +553,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
smaller_leaf_splits_->max_constraint(),
&smaller_split);
smaller_split.feature = real_fidx;
smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * smaller_leaf_splits_->num_data_in_leaf();
if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx];
if (cegb_ != nullptr) {
smaller_split.gain -= cegb_->DeltaGain(feature_index, real_fidx, smaller_leaf_splits_->LeafIndex(), smaller_leaf_splits_->num_data_in_leaf(), smaller_split);
}
if (!config_->cegb_penalty_feature_lazy.empty()) {
smaller_split.gain -= config_->cegb_tradeoff * CalculateOndemandCosts(real_fidx, smaller_leaf_splits_->LeafIndex());
}
splits_per_leaf_[smaller_leaf_splits_->LeafIndex()*train_data_->num_features() + feature_index] = smaller_split;
if (smaller_split > smaller_best[tid] && smaller_node_used_features[feature_index]) {
smaller_best[tid] = smaller_split;
}
@@ -606,14 +579,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
larger_leaf_splits_->max_constraint(),
&larger_split);
larger_split.feature = real_fidx;
larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * larger_leaf_splits_->num_data_in_leaf();
if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx];
}
if (!config_->cegb_penalty_feature_lazy.empty()) {
larger_split.gain -= config_->cegb_tradeoff*CalculateOndemandCosts(real_fidx, larger_leaf_splits_->LeafIndex());
if (cegb_ != nullptr) {
larger_split.gain -= cegb_->DeltaGain(feature_index, real_fidx, larger_leaf_splits_->LeafIndex(), larger_leaf_splits_->num_data_in_leaf(), larger_split);
}
splits_per_leaf_[larger_leaf_splits_->LeafIndex()*train_data_->num_features() + feature_index] = larger_split;
if (larger_split > larger_best[tid] && larger_node_used_features[feature_index]) {
larger_best[tid] = larger_split;
}
@@ -802,26 +770,9 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int*
void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[inner_feature_index]) {
is_feature_used_in_split_[inner_feature_index] = true;
for (int i = 0; i < tree->num_leaves(); ++i) {
if (i == best_leaf) continue;
auto split = &splits_per_leaf_[i*train_data_->num_features() + inner_feature_index];
split->gain += config_->cegb_tradeoff*config_->cegb_penalty_feature_coupled[best_split_info.feature];
if (*split > best_split_per_leaf_[i])
best_split_per_leaf_[i] = *split;
}
}

if (!config_->cegb_penalty_feature_lazy.empty()) {
data_size_t cnt_leaf_data = 0;
auto tmp_idx = data_partition_->GetIndexOnLeaf(best_leaf, &cnt_leaf_data);
for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
int real_idx = tmp_idx[i_input];
Common::InsertBitset(feature_used_in_data, train_data_->num_data() * inner_feature_index + real_idx);
}
if (cegb_ != nullptr) {
cegb_->UpdateLeafBestSplits(tree, best_leaf, &best_split_info, &best_split_per_leaf_);
}

// left = parent
*left_leaf = best_leaf;
bool is_numerical_split = train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin;
10 changes: 4 additions & 6 deletions src/treelearner/serial_tree_learner.h
@@ -32,12 +32,14 @@
using namespace json11;

namespace LightGBM {

/*! \brief forward declaration */
class CostEfficientGradientBoosting;
/*!
* \brief Used for learning a tree by single machine
*/
class SerialTreeLearner: public TreeLearner {
public:
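/*! \brief the CEGB helper reads the learner's private state (config_, train_data_, data_partition_, num_data_), hence the friendship */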
friend CostEfficientGradientBoosting;
explicit SerialTreeLearner(const Config* config);

~SerialTreeLearner();
@@ -117,8 +119,6 @@ class SerialTreeLearner: public TreeLearner {
*/
inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;

double CalculateOndemandCosts(int feature_index, int leaf_index);

/*! \brief number of data */
data_size_t num_data_;
/*! \brief number of features */
@@ -180,9 +180,7 @@ class SerialTreeLearner: public TreeLearner {
int num_threads_;
std::vector<int> ordered_bin_indices_;
bool is_constant_hessian_;

std::vector<bool> is_feature_used_in_split_;
std::vector<uint32_t> feature_used_in_data;
std::unique_ptr<CostEfficientGradientBoosting> cegb_;
};

inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const {