diff --git a/src/treelearner/cost_effective_gradient_boosting.hpp b/src/treelearner/cost_effective_gradient_boosting.hpp
new file mode 100644
index 000000000000..ccba1e32dd6e
--- /dev/null
+++ b/src/treelearner/cost_effective_gradient_boosting.hpp
@@ -0,0 +1,116 @@
+/*!
+ * Copyright (c) 2019 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ */
+#ifndef LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_
+#define LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_
+
+#include <LightGBM/config.h>
+#include <LightGBM/dataset.h>
+#include <LightGBM/utils/common.h>
+#include <LightGBM/utils/log.h>
+
+#include <vector>
+
+#include "data_partition.hpp"
+#include "serial_tree_learner.h"
+#include "split_info.hpp"
+
+namespace LightGBM {
+
+class CostEfficientGradientBoosting {
+ public:
+  explicit CostEfficientGradientBoosting(const SerialTreeLearner* tree_learner) : tree_learner_(tree_learner) {
+  }
+
+  static bool IsEnable(const Config* config) {
+    if (config->cegb_tradeoff >= 1.0f && config->cegb_penalty_split <= 0.0f
+        && config->cegb_penalty_feature_coupled.empty() && config->cegb_penalty_feature_lazy.empty()) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+
+  void Init() {
+    auto train_data = tree_learner_->train_data_;
+    splits_per_leaf_.resize(static_cast<size_t>(tree_learner_->config_->num_leaves) * train_data->num_features());
+    is_feature_used_in_split_.clear();
+    is_feature_used_in_split_.resize(train_data->num_features());
+
+    if (!tree_learner_->config_->cegb_penalty_feature_coupled.empty()
+        && tree_learner_->config_->cegb_penalty_feature_coupled.size() != static_cast<size_t>(train_data->num_total_features())) {
+      Log::Fatal("cegb_penalty_feature_coupled should be the same size as feature number.");
+    }
+    if (!tree_learner_->config_->cegb_penalty_feature_lazy.empty()) {
+      if (tree_learner_->config_->cegb_penalty_feature_lazy.size() != static_cast<size_t>(train_data->num_total_features())) {
+        Log::Fatal("cegb_penalty_feature_lazy should be the same size as feature number.");
+      }
+      feature_used_in_data_ = Common::EmptyBitset(train_data->num_features() * tree_learner_->num_data_);
+    }
+  }
+
+  double DeltaGain(int feature_index, int real_fidx, int leaf_index, int num_data_in_leaf, SplitInfo split_info) {
+    auto config = tree_learner_->config_;
+    double delta = config->cegb_tradeoff * config->cegb_penalty_split * num_data_in_leaf;
+    if (!config->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
+      delta += config->cegb_tradeoff * config->cegb_penalty_feature_coupled[real_fidx];
+    }
+    if (!config->cegb_penalty_feature_lazy.empty()) {
+      delta += config->cegb_tradeoff * CalculateOndemandCosts(feature_index, real_fidx, leaf_index);
+    }
+    splits_per_leaf_[static_cast<size_t>(leaf_index) * tree_learner_->train_data_->num_features() + feature_index] = split_info;
+    return delta;
+  }
+
+  void UpdateLeafBestSplits(Tree* tree, int best_leaf, const SplitInfo* best_split_info, std::vector<SplitInfo>* best_split_per_leaf) {
+    auto config = tree_learner_->config_;
+    auto train_data = tree_learner_->train_data_;
+    const int inner_feature_index = train_data->InnerFeatureIndex(best_split_info->feature);
+    if (!config->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[inner_feature_index]) {
+      is_feature_used_in_split_[inner_feature_index] = true;
+      for (int i = 0; i < tree->num_leaves(); ++i) {
+        if (i == best_leaf) continue;
+        auto split = &splits_per_leaf_[static_cast<size_t>(i) * train_data->num_features() + inner_feature_index];
+        split->gain += config->cegb_tradeoff * config->cegb_penalty_feature_coupled[best_split_info->feature];
+        if (*split > best_split_per_leaf->at(i))
+          best_split_per_leaf->at(i) = *split;
+      }
+    }
+    if (!config->cegb_penalty_feature_lazy.empty()) {
+      data_size_t cnt_leaf_data = 0;
+      auto tmp_idx = tree_learner_->data_partition_->GetIndexOnLeaf(best_leaf, &cnt_leaf_data);
+      for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
+        int real_idx = tmp_idx[i_input];
+        Common::InsertBitset(&feature_used_in_data_, train_data->num_data() * inner_feature_index + real_idx);
+      }
+    }
+  }
+
+ private:
+  double CalculateOndemandCosts(int feature_index, int real_fidx, int leaf_index) const {
+    if (tree_learner_->config_->cegb_penalty_feature_lazy.empty()) {
+      return 0.0f;
+    }
+    auto train_data = tree_learner_->train_data_;
+    double penalty = tree_learner_->config_->cegb_penalty_feature_lazy[real_fidx];
+
+    double total = 0.0f;
+    data_size_t cnt_leaf_data = 0;
+    auto tmp_idx = tree_learner_->data_partition_->GetIndexOnLeaf(leaf_index, &cnt_leaf_data);
+
+    for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
+      int real_idx = tmp_idx[i_input];
+      if (Common::FindInBitset(feature_used_in_data_.data(), train_data->num_data() * train_data->num_features(), train_data->num_data() * feature_index + real_idx)) {
+        continue;
+      }
+      total += penalty;
+    }
+    return total;
+  }
+
+  const SerialTreeLearner* tree_learner_;
+  std::vector<SplitInfo> splits_per_leaf_;
+  std::vector<bool> is_feature_used_in_split_;
+  std::vector<uint32_t> feature_used_in_data_;
+};
+
+}  // namespace LightGBM
+
+#endif  // LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_
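Reviewer note on the new class: DeltaGain() folds the three CEGB cost terms (cf. Peter et al., "Cost Efficient Gradient Boosting", NIPS 2017) into a single penalty that the tree learner subtracts from a candidate split's gain. A minimal self-contained sketch of that arithmetic follows; every name in it is hypothetical and only the three-term structure mirrors the class above (the real method additionally caches the candidate in splits_per_leaf_).

// Hypothetical sketch of the penalty DeltaGain() computes; illustrative names only.
double CegbPenaltySketch(double tradeoff, double penalty_split, int num_data_in_leaf,
                         bool feature_used_before,  // mirrors is_feature_used_in_split_[f]
                         double coupled_penalty,    // mirrors cegb_penalty_feature_coupled[f]
                         double lazy_cost) {        // mirrors CalculateOndemandCosts(...)
  // Per-split cost: proportional to the number of rows routed through the new split.
  double delta = tradeoff * penalty_split * num_data_in_leaf;
  // One-time "coupled" acquisition cost, paid by the first split that uses the feature.
  if (!feature_used_before) {
    delta += tradeoff * coupled_penalty;
  }
  // "Lazy" per-row cost for rows in the leaf that have never evaluated the feature.
  delta += tradeoff * lazy_cost;
  return delta;  // the caller subtracts this from the split's gain
}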
diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp
index df2f6ba3ca4b..e3531039ec9d 100644
--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -14,6 +14,8 @@
 #include <utility>
 #include <vector>
 
+#include "cost_effective_gradient_boosting.hpp"
+
 namespace LightGBM {
 
 #ifdef TIMETAG
@@ -69,8 +71,6 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
   histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves);
   // push split information for all leaves
   best_split_per_leaf_.resize(config_->num_leaves);
-  splits_per_leaf_.resize(config_->num_leaves*train_data_->num_features());
-
   // get ordered bin
   train_data_->CreateOrderedBins(&ordered_bins_);
@@ -104,15 +104,9 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
     }
   }
   Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_);
-  is_feature_used_in_split_.clear();
-  is_feature_used_in_split_.resize(train_data->num_features());
-
-  if (!config_->cegb_penalty_feature_coupled.empty()) {
-    CHECK(config_->cegb_penalty_feature_coupled.size() == static_cast<size_t>(train_data_->num_total_features()));
-  }
-  if (!config_->cegb_penalty_feature_lazy.empty()) {
-    CHECK(config_->cegb_penalty_feature_lazy.size() == static_cast<size_t>(train_data_->num_total_features()));
-    feature_used_in_data = Common::EmptyBitset(train_data->num_features() * num_data_);
+  if (CostEfficientGradientBoosting::IsEnable(config_)) {
+    cegb_.reset(new CostEfficientGradientBoosting(this));
+    cegb_->Init();
   }
 }
@@ -139,6 +133,9 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
     is_data_in_leaf_.resize(num_data_);
     std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), static_cast<char>(0));
   }
+  if (cegb_ != nullptr) {
+    cegb_->Init();
+  }
 }
 
 void SerialTreeLearner::ResetConfig(const Config* config) {
@@ -166,8 +163,11 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
   } else {
     config_ = config;
   }
-
   histogram_pool_.ResetConfig(config_);
+  if (CostEfficientGradientBoosting::IsEnable(config_)) {
+    cegb_.reset(new CostEfficientGradientBoosting(this));
+    cegb_->Init();
+  }
 }
 
 Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) {
@@ -521,28 +521,6 @@ void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_featur
 #endif
 }
 
-double SerialTreeLearner::CalculateOndemandCosts(int feature_index, int leaf_index) {
-  if (config_->cegb_penalty_feature_lazy.empty())
-    return 0.0f;
-
-  double penalty = config_->cegb_penalty_feature_lazy[feature_index];
-
-  const int inner_fidx = train_data_->InnerFeatureIndex(feature_index);
-
-  double total = 0.0f;
-  data_size_t cnt_leaf_data = 0;
-  auto tmp_idx = data_partition_->GetIndexOnLeaf(leaf_index, &cnt_leaf_data);
-
-  for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
-    int real_idx = tmp_idx[i_input];
-    if (Common::FindInBitset(feature_used_in_data.data(), train_data_->num_data()*train_data_->num_features(), train_data_->num_data() * inner_fidx + real_idx))
-      continue;
-    total += penalty;
-  }
-
-  return total;
-}
-
 void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
 #ifdef TIMETAG
   auto start_time = std::chrono::steady_clock::now();
@@ -576,14 +554,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
                                                      smaller_leaf_splits_->max_constraint(), &smaller_split);
       smaller_split.feature = real_fidx;
-      smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * smaller_leaf_splits_->num_data_in_leaf();
-      if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
-        smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx];
+      if (cegb_ != nullptr) {
+        smaller_split.gain -= cegb_->DeltaGain(feature_index, real_fidx, smaller_leaf_splits_->LeafIndex(), smaller_leaf_splits_->num_data_in_leaf(), smaller_split);
       }
-      if (!config_->cegb_penalty_feature_lazy.empty()) {
-        smaller_split.gain -= config_->cegb_tradeoff * CalculateOndemandCosts(real_fidx, smaller_leaf_splits_->LeafIndex());
-      }
-      splits_per_leaf_[smaller_leaf_splits_->LeafIndex()*train_data_->num_features() + feature_index] = smaller_split;
       if (smaller_split > smaller_best[tid] && smaller_node_used_features[feature_index]) {
         smaller_best[tid] = smaller_split;
       }
@@ -607,14 +580,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
                                                      larger_leaf_splits_->max_constraint(), &larger_split);
       larger_split.feature = real_fidx;
-      larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * larger_leaf_splits_->num_data_in_leaf();
-      if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
-        larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx];
-      }
-      if (!config_->cegb_penalty_feature_lazy.empty()) {
-        larger_split.gain -= config_->cegb_tradeoff*CalculateOndemandCosts(real_fidx, larger_leaf_splits_->LeafIndex());
+      if (cegb_ != nullptr) {
+        larger_split.gain -= cegb_->DeltaGain(feature_index, real_fidx, larger_leaf_splits_->LeafIndex(), larger_leaf_splits_->num_data_in_leaf(), larger_split);
       }
-      splits_per_leaf_[larger_leaf_splits_->LeafIndex()*train_data_->num_features() + feature_index] = larger_split;
       if (larger_split > larger_best[tid] && larger_node_used_features[feature_index]) {
         larger_best[tid] = larger_split;
       }
@@ -803,26 +771,9 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json
 void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) {
   const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
   const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
-  if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[inner_feature_index]) {
-    is_feature_used_in_split_[inner_feature_index] = true;
-    for (int i = 0; i < tree->num_leaves(); ++i) {
-      if (i == best_leaf) continue;
-      auto split = &splits_per_leaf_[i*train_data_->num_features() + inner_feature_index];
-      split->gain += config_->cegb_tradeoff*config_->cegb_penalty_feature_coupled[best_split_info.feature];
-      if (*split > best_split_per_leaf_[i])
-        best_split_per_leaf_[i] = *split;
-    }
-  }
-
-  if (!config_->cegb_penalty_feature_lazy.empty()) {
-    data_size_t cnt_leaf_data = 0;
-    auto tmp_idx = data_partition_->GetIndexOnLeaf(best_leaf, &cnt_leaf_data);
-    for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
-      int real_idx = tmp_idx[i_input];
-      Common::InsertBitset(&feature_used_in_data, train_data_->num_data() * inner_feature_index + real_idx);
-    }
+  if (cegb_ != nullptr) {
+    cegb_->UpdateLeafBestSplits(tree, best_leaf, &best_split_info, &best_split_per_leaf_);
   }
-
   // left = parent
   *left_leaf = best_leaf;
   bool is_numerical_split = train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin;
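Reviewer note on the bitset bookkeeping: CalculateOndemandCosts() and UpdateLeafBestSplits() both address feature_used_in_data_ as a flat bitset holding one bit per (inner feature, row) pair, at bit position num_data * feature_index + row. The helpers below are a hypothetical re-implementation for illustration only; LightGBM's actual Common::EmptyBitset/InsertBitset/FindInBitset have slightly different signatures but the same layout.

#include <cstdint>
#include <vector>

// One bit per (feature, row) pair, packed into 32-bit words.
std::vector<uint32_t> EmptyBitsetSketch(int64_t num_bits) {
  return std::vector<uint32_t>((num_bits + 31) / 32, 0u);  // zero-initialized: nothing fetched yet
}

void InsertBitsetSketch(std::vector<uint32_t>* bits, int64_t pos) {
  (*bits)[pos / 32] |= (1u << (pos % 32));  // mark the feature as already fetched for this row
}

bool FindInBitsetSketch(const uint32_t* bits, int64_t pos) {
  return ((bits[pos / 32] >> (pos % 32)) & 1u) != 0;  // true: the lazy cost was already paid
}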
diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h
index d1b9785bbcef..31743933a780 100644
--- a/src/treelearner/serial_tree_learner.h
+++ b/src/treelearner/serial_tree_learner.h
@@ -32,12 +32,14 @@ using namespace json11;
 
 namespace LightGBM {
-
+/*! \brief forward declaration */
+class CostEfficientGradientBoosting;
 /*!
  * \brief Used for learning a tree by single machine
  */
 class SerialTreeLearner: public TreeLearner {
  public:
+  friend CostEfficientGradientBoosting;
   explicit SerialTreeLearner(const Config* config);
 
   ~SerialTreeLearner();
@@ -116,8 +118,6 @@ class SerialTreeLearner: public TreeLearner {
   */
   inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;
 
-  double CalculateOndemandCosts(int feature_index, int leaf_index);
-
   /*! \brief number of data */
   data_size_t num_data_;
   /*! \brief number of features */
@@ -179,9 +179,7 @@ class SerialTreeLearner: public TreeLearner {
   int num_threads_;
   std::vector<int> ordered_bin_indices_;
   bool is_constant_hessian_;
-
-  std::vector<bool> is_feature_used_in_split_;
-  std::vector<uint32_t> feature_used_in_data;
+  std::unique_ptr<CostEfficientGradientBoosting> cegb_;
 };
 
 inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const {
diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj
index 7a0ddde938f2..b260cbb38a9e 100644
--- a/windows/LightGBM.vcxproj
+++ b/windows/LightGBM.vcxproj
@@ -240,6 +240,7 @@
+    <ClInclude Include="..\src\treelearner\cost_effective_gradient_boosting.hpp" />
@@ -283,4 +284,4 @@
-</Project>
+</Project>
\ No newline at end of file
diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters
index 3b770e40dabc..f4b7c1c6b3c2 100644
--- a/windows/LightGBM.vcxproj.filters
+++ b/windows/LightGBM.vcxproj.filters
@@ -207,6 +207,9 @@
       <Filter>include\LightGBM\utils</Filter>
     </ClInclude>
+    <ClInclude Include="..\src\treelearner\cost_effective_gradient_boosting.hpp">
+      <Filter>src\treelearner</Filter>
+    </ClInclude>
@@ -303,4 +306,4 @@
       <Filter>src\io</Filter>
     </ClInclude>
-</Project>
+</Project>
\ No newline at end of file
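Reviewer note on the gating: after this refactoring all CEGB state lives behind the cegb_ pointer, which is only allocated when IsEnable() reports that the configuration can have any effect. Below is a hypothetical smoke test of that gate; it assumes the internal header is reachable on the include path, and uses Config::Str2Map and Config::Set, which are existing LightGBM helpers.

#include <iostream>

#include "treelearner/cost_effective_gradient_boosting.hpp"

int main() {
  LightGBM::Config defaults;  // cegb_tradeoff=1.0, cegb_penalty_split=0.0, no per-feature penalties
  LightGBM::Config cegb;
  cegb.Set(LightGBM::Config::Str2Map("cegb_tradeoff=0.5 cegb_penalty_split=0.01"));
  std::cout << std::boolalpha
            << LightGBM::CostEfficientGradientBoosting::IsEnable(&defaults) << "\n"  // false: learner skips the helper
            << LightGBM::CostEfficientGradientBoosting::IsEnable(&cegb) << "\n";     // true: learner constructs it
  return 0;
}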