code refactoring: cost effective gradient boosting #2407

Merged · 11 commits · Sep 26, 2019
116 changes: 116 additions & 0 deletions src/treelearner/cost_effective_gradient_boosting.hpp
@@ -0,0 +1,116 @@
/*!
* Copyright (c) 2019 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_
#define LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_

#include <LightGBM/config.h>
#include <LightGBM/dataset.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>

#include <vector>

#include "data_partition.hpp"
#include "serial_tree_learner.h"
#include "split_info.hpp"

namespace LightGBM {

class CostEfficientGradientBoosting {
public:
explicit CostEfficientGradientBoosting(const SerialTreeLearner* tree_learner):tree_learner_(tree_learner) {
}
static bool IsEnable(const Config* config) {
if (config->cegb_tradeoff >= 1.0f && config->cegb_penalty_split <= 0.0f
&& config->cegb_penalty_feature_coupled.empty() && config->cegb_penalty_feature_lazy.empty()) {
return false;
} else {
return true;
}
}
void Init() {
auto train_data = tree_learner_->train_data_;
splits_per_leaf_.resize(static_cast<size_t>(tree_learner_->config_->num_leaves) * train_data->num_features());
is_feature_used_in_split_.clear();
is_feature_used_in_split_.resize(train_data->num_features());

if (!tree_learner_->config_->cegb_penalty_feature_coupled.empty()
&& tree_learner_->config_->cegb_penalty_feature_coupled.size() != static_cast<size_t>(train_data->num_total_features())) {
Log::Fatal("cegb_penalty_feature_coupled should be the same size as feature number.");
}
if (!tree_learner_->config_->cegb_penalty_feature_lazy.empty()) {
if (tree_learner_->config_->cegb_penalty_feature_lazy.size() != static_cast<size_t>(train_data->num_total_features())) {
Log::Fatal("cegb_penalty_feature_lazy should be the same size as feature number.");
}
feature_used_in_data_ = Common::EmptyBitset(train_data->num_features() * tree_learner_->num_data_);
}
}
double DetlaGain(int feature_index, int real_fidx, int leaf_index, int num_data_in_leaf, SplitInfo split_info) {
auto config = tree_learner_->config_;
double delta = config->cegb_tradeoff * config->cegb_penalty_split * num_data_in_leaf;
if (!config->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
delta += config->cegb_tradeoff * config->cegb_penalty_feature_coupled[real_fidx];
}
if (!config->cegb_penalty_feature_lazy.empty()) {
delta += config->cegb_tradeoff * CalculateOndemandCosts(feature_index, real_fidx, leaf_index);
}
splits_per_leaf_[static_cast<size_t>(leaf_index) * tree_learner_->train_data_->num_features() + feature_index] = split_info;
return delta;
}
void UpdateLeafBestSplits(Tree* tree, int best_leaf, const SplitInfo* best_split_info, std::vector<SplitInfo>* best_split_per_leaf) {
auto config = tree_learner_->config_;
auto train_data = tree_learner_->train_data_;
const int inner_feature_index = train_data->InnerFeatureIndex(best_split_info->feature);
if (!config->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[inner_feature_index]) {
is_feature_used_in_split_[inner_feature_index] = true;
for (int i = 0; i < tree->num_leaves(); ++i) {
if (i == best_leaf) continue;
auto split = &splits_per_leaf_[static_cast<size_t>(i) * train_data->num_features() + inner_feature_index];
split->gain += config->cegb_tradeoff * config->cegb_penalty_feature_coupled[best_split_info->feature];
if (*split > best_split_per_leaf->at(i))
best_split_per_leaf->at(i) = *split;
}
}
if (!config->cegb_penalty_feature_lazy.empty()) {
data_size_t cnt_leaf_data = 0;
auto tmp_idx = tree_learner_->data_partition_->GetIndexOnLeaf(best_leaf, &cnt_leaf_data);
for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
int real_idx = tmp_idx[i_input];
Common::InsertBitset(&feature_used_in_data_, train_data->num_data() * inner_feature_index + real_idx);
}
}
}

private:
double CalculateOndemandCosts(int feature_index, int real_fidx, int leaf_index) const {
if (tree_learner_->config_->cegb_penalty_feature_lazy.empty()) {
return 0.0f;
}
auto train_data = tree_learner_->train_data_;
double penalty = tree_learner_->config_->cegb_penalty_feature_lazy[real_fidx];

double total = 0.0f;
data_size_t cnt_leaf_data = 0;
auto tmp_idx = tree_learner_->data_partition_->GetIndexOnLeaf(leaf_index, &cnt_leaf_data);

for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
int real_idx = tmp_idx[i_input];
if (Common::FindInBitset(feature_used_in_data_.data(), train_data->num_data() * train_data->num_features(), train_data->num_data() * feature_index + real_idx)) {
continue;
}
total += penalty;
}
return total;
}

const SerialTreeLearner* tree_learner_;
std::vector<SplitInfo> splits_per_leaf_;
std::vector<bool> is_feature_used_in_split_;
std::vector<uint32_t> feature_used_in_data_;
};

} // namespace LightGBM

#endif // LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_
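For orientation, DetlaGain() above folds three costs into one penalty that FindBestSplitsFromHistograms() later subtracts from each candidate split's gain: a per-row split penalty, a one-time coupled penalty the first time a feature is used anywhere in the model, and a lazy penalty summed over rows whose value for the feature has not been fetched yet. A minimal standalone sketch of that arithmetic (the free function and parameter names are illustrative, not part of the class):

// Illustrative sketch of the CEGB penalty accounting in DetlaGain(); the
// real method reads these values from Config and the data partition.
double CegbPenaltySketch(double cegb_tradeoff, double cegb_penalty_split,
                         int num_data_in_leaf, bool feature_used_before,
                         double coupled_penalty, double lazy_penalty_sum) {
  // Every split pays a per-row cost for routing data through the new node.
  double delta = cegb_tradeoff * cegb_penalty_split * num_data_in_leaf;
  // The coupled penalty is charged only the first time the feature is used;
  // UpdateLeafBestSplits() credits it back to other leaves once it is paid.
  if (!feature_used_before) {
    delta += cegb_tradeoff * coupled_penalty;
  }
  // The lazy penalty covers on-demand feature acquisition per data point.
  delta += cegb_tradeoff * lazy_penalty_sum;
  return delta;  // the caller subtracts this from split_info.gain
}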
85 changes: 18 additions & 67 deletions src/treelearner/serial_tree_learner.cpp
@@ -14,6 +14,8 @@
#include <unordered_map>
#include <utility>

#include "cost_effective_gradient_boosting.hpp"

namespace LightGBM {

#ifdef TIMETAG
@@ -69,8 +71,6 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves);
// push split information for all leaves
best_split_per_leaf_.resize(config_->num_leaves);
splits_per_leaf_.resize(config_->num_leaves*train_data_->num_features());

// get ordered bin
train_data_->CreateOrderedBins(&ordered_bins_);

@@ -104,15 +104,9 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
}
}
Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_);
is_feature_used_in_split_.clear();
is_feature_used_in_split_.resize(train_data->num_features());

if (!config_->cegb_penalty_feature_coupled.empty()) {
CHECK(config_->cegb_penalty_feature_coupled.size() == static_cast<size_t>(train_data_->num_total_features()));
}
if (!config_->cegb_penalty_feature_lazy.empty()) {
CHECK(config_->cegb_penalty_feature_lazy.size() == static_cast<size_t>(train_data_->num_total_features()));
feature_used_in_data = Common::EmptyBitset(train_data->num_features() * num_data_);
if (CostEfficientGradientBoosting::IsEnable(config_)) {
cegb_.reset(new CostEfficientGradientBoosting(this));
cegb_->Init();
}
}

@@ -139,6 +133,9 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
is_data_in_leaf_.resize(num_data_);
std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), static_cast<char>(0));
}
if (cegb_ != nullptr) {
cegb_->Init();
}
}

void SerialTreeLearner::ResetConfig(const Config* config) {
@@ -166,8 +163,11 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
} else {
config_ = config;
}

histogram_pool_.ResetConfig(config_);
if (CostEfficientGradientBoosting::IsEnable(config_)) {
cegb_.reset(new CostEfficientGradientBoosting(this));
cegb_->Init();
}
}

Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) {
@@ -521,28 +521,6 @@ void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_featur
#endif
}

double SerialTreeLearner::CalculateOndemandCosts(int feature_index, int leaf_index) {
if (config_->cegb_penalty_feature_lazy.empty())
return 0.0f;

double penalty = config_->cegb_penalty_feature_lazy[feature_index];

const int inner_fidx = train_data_->InnerFeatureIndex(feature_index);

double total = 0.0f;
data_size_t cnt_leaf_data = 0;
auto tmp_idx = data_partition_->GetIndexOnLeaf(leaf_index, &cnt_leaf_data);

for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
int real_idx = tmp_idx[i_input];
if (Common::FindInBitset(feature_used_in_data.data(), train_data_->num_data()*train_data_->num_features(), train_data_->num_data() * inner_fidx + real_idx))
continue;
total += penalty;
}

return total;
}

void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
@@ -576,14 +554,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
smaller_leaf_splits_->max_constraint(),
&smaller_split);
smaller_split.feature = real_fidx;
smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * smaller_leaf_splits_->num_data_in_leaf();
if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx];
if (cegb_ != nullptr) {
smaller_split.gain -= cegb_->DetlaGain(feature_index, real_fidx, smaller_leaf_splits_->LeafIndex(), smaller_leaf_splits_->num_data_in_leaf(), smaller_split);
}
if (!config_->cegb_penalty_feature_lazy.empty()) {
smaller_split.gain -= config_->cegb_tradeoff * CalculateOndemandCosts(real_fidx, smaller_leaf_splits_->LeafIndex());
}
splits_per_leaf_[smaller_leaf_splits_->LeafIndex()*train_data_->num_features() + feature_index] = smaller_split;
if (smaller_split > smaller_best[tid] && smaller_node_used_features[feature_index]) {
smaller_best[tid] = smaller_split;
}
@@ -607,14 +580,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
larger_leaf_splits_->max_constraint(),
&larger_split);
larger_split.feature = real_fidx;
larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * larger_leaf_splits_->num_data_in_leaf();
if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx];
}
if (!config_->cegb_penalty_feature_lazy.empty()) {
larger_split.gain -= config_->cegb_tradeoff*CalculateOndemandCosts(real_fidx, larger_leaf_splits_->LeafIndex());
if (cegb_ != nullptr) {
larger_split.gain -= cegb_->DetlaGain(feature_index, real_fidx, larger_leaf_splits_->LeafIndex(), larger_leaf_splits_->num_data_in_leaf(), larger_split);
}
splits_per_leaf_[larger_leaf_splits_->LeafIndex()*train_data_->num_features() + feature_index] = larger_split;
if (larger_split > larger_best[tid] && larger_node_used_features[feature_index]) {
larger_best[tid] = larger_split;
}
@@ -803,26 +771,9 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[inner_feature_index]) {
is_feature_used_in_split_[inner_feature_index] = true;
for (int i = 0; i < tree->num_leaves(); ++i) {
if (i == best_leaf) continue;
auto split = &splits_per_leaf_[i*train_data_->num_features() + inner_feature_index];
split->gain += config_->cegb_tradeoff*config_->cegb_penalty_feature_coupled[best_split_info.feature];
if (*split > best_split_per_leaf_[i])
best_split_per_leaf_[i] = *split;
}
}

if (!config_->cegb_penalty_feature_lazy.empty()) {
data_size_t cnt_leaf_data = 0;
auto tmp_idx = data_partition_->GetIndexOnLeaf(best_leaf, &cnt_leaf_data);
for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
int real_idx = tmp_idx[i_input];
Common::InsertBitset(&feature_used_in_data, train_data_->num_data() * inner_feature_index + real_idx);
}
if (cegb_ != nullptr) {
cegb_->UpdateLeafBestSplits(tree, best_leaf, &best_split_info, &best_split_per_leaf_);
}

// left = parent
*left_leaf = best_leaf;
bool is_numerical_split = train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin;
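The lazy-penalty bookkeeping that moved into the helper relies on one flat bitset holding num_features() * num_data() bits, addressed as num_data * feature_index + row_index. The following are simplified stand-ins for the Common::EmptyBitset / InsertBitset / FindInBitset helpers, written here only to illustrate their assumed semantics, not LightGBM's actual utilities:

#include <cstdint>
#include <vector>

// Assumed semantics of the bitset helpers used by the CEGB code above.
std::vector<uint32_t> EmptyBitsetSketch(int n_bits) {
  return std::vector<uint32_t>(n_bits / 32 + 1, 0u);
}

void InsertBitsetSketch(std::vector<uint32_t>* bits, int pos) {
  (*bits)[pos / 32] |= (1u << (pos % 32));
}

bool FindInBitsetSketch(const uint32_t* bits, int n_bits, int pos) {
  if (pos >= n_bits) return false;
  return ((bits[pos / 32] >> (pos % 32)) & 1u) != 0;
}

// One bit per (feature, row): position = num_data * feature_index + row_index.
// After a split, UpdateLeafBestSplits() sets the bit for every row in the leaf,
// so CalculateOndemandCosts() stops charging rows whose value is already known.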
10 changes: 4 additions & 6 deletions src/treelearner/serial_tree_learner.h
@@ -32,12 +32,14 @@
using namespace json11;

namespace LightGBM {

/*! \brief forward declaration */
class CostEfficientGradientBoosting;
/*!
* \brief Used for learning a tree by single machine
*/
class SerialTreeLearner: public TreeLearner {
public:
friend CostEfficientGradientBoosting;
explicit SerialTreeLearner(const Config* config);

~SerialTreeLearner();
@@ -116,8 +118,6 @@ class SerialTreeLearner: public TreeLearner {
*/
inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;

double CalculateOndemandCosts(int feature_index, int leaf_index);

/*! \brief number of data */
data_size_t num_data_;
/*! \brief number of features */
@@ -179,9 +179,7 @@ class SerialTreeLearner: public TreeLearner {
int num_threads_;
std::vector<int> ordered_bin_indices_;
bool is_constant_hessian_;

std::vector<bool> is_feature_used_in_split_;
std::vector<uint32_t> feature_used_in_data;
std::unique_ptr<CostEfficientGradientBoosting> cegb_;
};

inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const {
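One subtlety in this header: cegb_ is a std::unique_ptr to the forward-declared CostEfficientGradientBoosting, which works only because ~SerialTreeLearner() is declared here and defined in the .cpp, where the complete type is visible. A reduced sketch of the pattern, with illustrative names:

#include <memory>

class Impl;  // forward declaration, like CostEfficientGradientBoosting above

class Holder {
 public:
  Holder();
  ~Holder();  // must be defined where Impl is complete; an implicit inline
              // destructor would fail to compile because unique_ptr's deleter
              // needs the full definition of Impl to call delete on it
 private:
  std::unique_ptr<Impl> impl_;
};

// In the .cpp, after the full definition of Impl, the special members compile:
class Impl {};
Holder::Holder() : impl_(new Impl()) {}
Holder::~Holder() = default;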
3 changes: 2 additions & 1 deletion windows/LightGBM.vcxproj
@@ -240,6 +240,7 @@
<ClInclude Include="..\src\objective\regression_objective.hpp" />
<ClInclude Include="..\src\objective\multiclass_objective.hpp" />
<ClInclude Include="..\src\objective\xentropy_objective.hpp" />
<ClInclude Include="..\src\treelearner\cost_effective_gradient_boosting.hpp" />
<ClInclude Include="..\src\treelearner\data_partition.hpp" />
<ClInclude Include="..\src\treelearner\feature_histogram.hpp" />
<ClInclude Include="..\src\treelearner\leaf_splits.hpp" />
@@ -283,4 +284,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>
5 changes: 4 additions & 1 deletion windows/LightGBM.vcxproj.filters
@@ -207,6 +207,9 @@
<ClInclude Include="..\include\LightGBM\json11.hpp">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\cost_effective_gradient_boosting.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp">
@@ -303,4 +306,4 @@
<Filter>src\io</Filter>
</ClCompile>
</ItemGroup>
</Project>
</Project>
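Finally, a hedged usage sketch of the new gate: the learner only allocates the CEGB helper when at least one of the four config knobs leaves its resting value. The field names and IsEnable() come from the diff; the include paths and the idea that a fresh Config starts at those resting values are assumptions:

#include <LightGBM/config.h>
#include "cost_effective_gradient_boosting.hpp"

int main() {
  LightGBM::Config config;
  // With cegb_tradeoff >= 1, cegb_penalty_split <= 0, and both feature-penalty
  // vectors empty, IsEnable() returns false and no helper is constructed.
  config.cegb_tradeoff = 1.0;
  config.cegb_penalty_split = 0.0;
  bool enabled = LightGBM::CostEfficientGradientBoosting::IsEnable(&config);  // false
  // Charging every split a cost proportional to the rows it touches flips it on:
  config.cegb_penalty_split = 0.1;
  enabled = LightGBM::CostEfficientGradientBoosting::IsEnable(&config);       // true
  return enabled ? 0 : 1;
}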