From a1fbe1a0716a2d7bacbd63ef8c79a6f4b2b0f802 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Fri, 14 Aug 2020 17:54:08 +0800 Subject: [PATCH 01/10] fix zero bin --- include/LightGBM/bin.h | 4 ++-- src/io/bin.cpp | 33 +++++++-------------------- src/treelearner/feature_histogram.hpp | 17 +++++++------- 3 files changed, 18 insertions(+), 36 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index f817dfabaa8c..cc5a1752cbe6 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -482,12 +482,12 @@ inline uint32_t BinMapper::ValueToBin(double value) const { int int_value = static_cast(value); // convert negative value to NaN bin if (int_value < 0) { - return num_bin_ - 1; + return 0; } if (categorical_2_bin_.count(int_value)) { return categorical_2_bin_.at(int_value); } else { - return num_bin_ - 1; + return 0; } } } diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 367edaa3f7b9..3e88acd13b39 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -449,15 +449,6 @@ namespace LightGBM { } // sort by counts Common::SortForPair(&counts_int, &distinct_values_int, 0, true); - // avoid first bin is zero - if (distinct_values_int[0] == 0) { - if (counts_int.size() == 1) { - counts_int.push_back(0); - distinct_values_int.push_back(distinct_values_int[0] + 1); - } - std::swap(counts_int[0], counts_int[1]); - std::swap(distinct_values_int[0], distinct_values_int[1]); - } // will ignore the categorical of small counts int cut_cnt = static_cast((total_sample_cnt - na_cnt) * 0.99f); size_t cur_cat = 0; @@ -466,6 +457,12 @@ namespace LightGBM { int used_cnt = 0; max_bin = std::min(static_cast(distinct_values_int.size()), max_bin); cnt_in_bin.clear(); + + // Push the dummy bin for NaN + bin_2_categorical_.push_back(-1); + categorical_2_bin_[-1] = num_bin_; + cnt_in_bin.push_back(0); + ++num_bin_; while (cur_cat < distinct_values_int.size() && (used_cnt < cut_cnt || num_bin_ < max_bin)) { if (counts_int[cur_cat] < min_data_in_bin && cur_cat > 1) { @@ -478,21 +475,14 @@ namespace LightGBM { ++num_bin_; ++cur_cat; } - // need an additional bin for NaN - if (cur_cat == distinct_values_int.size() && na_cnt > 0) { - // use -1 to represent NaN - bin_2_categorical_.push_back(-1); - categorical_2_bin_[-1] = num_bin_; - cnt_in_bin.push_back(0); - ++num_bin_; - } // Use MissingType::None to represent this bin contains all categoricals if (cur_cat == distinct_values_int.size() && na_cnt == 0) { missing_type_ = MissingType::None; } else { missing_type_ = MissingType::NaN; } - cnt_in_bin.back() += static_cast(total_sample_cnt - used_cnt); + // fix count of NaN bin + cnt_in_bin[0] = static_cast(total_sample_cnt - used_cnt); } } @@ -511,13 +501,6 @@ namespace LightGBM { default_bin_ = ValueToBin(0); most_freq_bin_ = static_cast(ArrayArgs::ArgMax(cnt_in_bin)); - if (bin_type_ == BinType::CategoricalBin) { - if (most_freq_bin_ == 0) { - CHECK_GT(num_bin_, 1); - // FIXME: how to enable `most_freq_bin_ = 0` for categorical features - most_freq_bin_ = 1; - } - } const double max_sparse_rate = static_cast(cnt_in_bin[most_freq_bin_]) / total_sample_cnt; // When most_freq_bin_ != default_bin_, there are some additional data loading costs. diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 8916ee48fd40..e56fffba8a43 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -300,8 +300,9 @@ class FeatureHistogram { } double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; - bool is_full_categorical = meta_->missing_type == MissingType::None; - int used_bin = meta_->num_bin - 1 + is_full_categorical; + const int bin_start = 1; + const int bin_end = meta_->num_bin; + int used_bin = -1; std::vector sorted_idx; double l2 = meta_->config->lambda_l2; @@ -312,11 +313,11 @@ class FeatureHistogram { int rand_threshold = 0; if (use_onehot) { if (USE_RAND) { - if (used_bin > 0) { - rand_threshold = meta_->rand.NextInt(0, used_bin); + if (bin_end - bin_start > 0) { + rand_threshold = meta_->rand.NextInt(bin_start, bin_end); } } - for (int t = 0; t < used_bin; ++t) { + for (int t = bin_start; t < bin_end; ++t) { const auto grad = GET_GRAD(data_, t); const auto hess = GET_HESS(data_, t); data_size_t cnt = @@ -366,7 +367,7 @@ class FeatureHistogram { } } } else { - for (int i = 0; i < used_bin; ++i) { + for (int i = bin_start; i < bin_end; ++i) { if (Common::RoundInt(GET_HESS(data_, i) * cnt_factor) >= meta_->config->cat_smooth) { sorted_idx.push_back(i); @@ -649,9 +650,7 @@ class FeatureHistogram { double gain_shift = GetLeafGainGivenOutput( sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; - bool is_full_categorical = meta_->missing_type == MissingType::None; - int used_bin = meta_->num_bin - 1 + is_full_categorical; - if (threshold >= static_cast(used_bin)) { + if (threshold >= static_cast(meta_->num_bin) || threshold < 0) { output->gain = kMinScore; Log::Warning("Invalid categorical threshold split"); return; From ba1376fa32056035ff1da3c65132f79b34e899e1 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Fri, 14 Aug 2020 18:56:05 +0800 Subject: [PATCH 02/10] some fix --- src/io/bin.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 3e88acd13b39..4be390404383 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -439,7 +439,6 @@ namespace LightGBM { } } } - num_bin_ = 0; int rest_cnt = static_cast(total_sample_cnt - na_cnt); if (rest_cnt > 0) { const int SPARSE_RATIO = 100; @@ -450,19 +449,24 @@ namespace LightGBM { // sort by counts Common::SortForPair(&counts_int, &distinct_values_int, 0, true); // will ignore the categorical of small counts - int cut_cnt = static_cast((total_sample_cnt - na_cnt) * 0.99f); + int cut_cnt = static_cast( + Common::RoundInt((total_sample_cnt - na_cnt) * 0.99f)); size_t cur_cat = 0; categorical_2_bin_.clear(); bin_2_categorical_.clear(); int used_cnt = 0; - max_bin = std::min(static_cast(distinct_values_int.size()), max_bin); + int distinct_cnt = static_cast(distinct_values_int.size()); + if (na_cnt > 0) { + ++distinct_cnt; + } + max_bin = std::min(distinct_cnt, max_bin); cnt_in_bin.clear(); // Push the dummy bin for NaN bin_2_categorical_.push_back(-1); - categorical_2_bin_[-1] = num_bin_; + categorical_2_bin_[-1] = 0; cnt_in_bin.push_back(0); - ++num_bin_; + num_bin_ = 1; while (cur_cat < distinct_values_int.size() && (used_cnt < cut_cnt || num_bin_ < max_bin)) { if (counts_int[cur_cat] < min_data_in_bin && cur_cat > 1) { From ec865ca297a1f79edc6519038237c3828de7a8d4 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Fri, 14 Aug 2020 19:07:46 +0800 Subject: [PATCH 03/10] fix bin mapping --- include/LightGBM/bin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index cc5a1752cbe6..c5d7a79b321d 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -457,7 +457,7 @@ class MultiValBin { inline uint32_t BinMapper::ValueToBin(double value) const { if (std::isnan(value)) { if (missing_type_ == MissingType::NaN) { - return num_bin_ - 1; + return 0; } else { value = 0.0f; } From dbb983f62fe6bf7f777d870f18411fae72ac2b9a Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Fri, 14 Aug 2020 20:09:30 +0800 Subject: [PATCH 04/10] fix --- include/LightGBM/bin.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index c5d7a79b321d..4f320698c831 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -456,8 +456,10 @@ class MultiValBin { inline uint32_t BinMapper::ValueToBin(double value) const { if (std::isnan(value)) { - if (missing_type_ == MissingType::NaN) { + if (bin_type_ == BinType::CategoricalBin) { return 0; + } else if (missing_type_ == MissingType::NaN) { + return num_bin_ - 1; } else { value = 0.0f; } From 6c2a729a70b4f33cfcb3d6707760707fc056e1d4 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Fri, 14 Aug 2020 22:25:06 +0800 Subject: [PATCH 05/10] fix bug --- src/io/dense_bin.hpp | 6 ++++-- src/io/sparse_bin.hpp | 5 +++-- src/treelearner/feature_histogram.hpp | 12 +++++++----- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index d61f7e6489e8..90afa02255fe 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -318,7 +318,9 @@ class DenseBin : public Bin { data_size_t gt_count = 0; data_size_t* default_indices = gt_indices; data_size_t* default_count = >_count; - if (Common::FindInBitset(threshold, num_threahold, most_freq_bin)) { + int8_t offset = most_freq_bin == 0 ? 1 : 0; + if (most_freq_bin > 0 && + Common ::FindInBitset(threshold, num_threahold, most_freq_bin)) { default_indices = lte_indices; default_count = <e_count; } @@ -330,7 +332,7 @@ class DenseBin : public Bin { } else if (!USE_MIN_BIN && bin == 0) { default_indices[(*default_count)++] = idx; } else if (Common::FindInBitset(threshold, num_threahold, - bin - min_bin)) { + bin - min_bin + offset)) { lte_indices[lte_count++] = idx; } else { gt_indices[gt_count++] = idx; diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index aa3ed929713a..431f6cc3cda9 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -364,7 +364,8 @@ class SparseBin : public Bin { data_size_t* default_indices = gt_indices; data_size_t* default_count = >_count; SparseBinIterator iterator(this, data_indices[0]); - if (Common::FindInBitset(threshold, num_threahold, most_freq_bin)) { + int8_t offset = most_freq_bin == 0 ? 1 : 0; + if (most_freq_bin > 0 && Common::FindInBitset(threshold, num_threahold, most_freq_bin)) { default_indices = lte_indices; default_count = <e_count; } @@ -376,7 +377,7 @@ class SparseBin : public Bin { } else if (!USE_MIN_BIN && bin == 0) { default_indices[(*default_count)++] = idx; } else if (Common::FindInBitset(threshold, num_threahold, - bin - min_bin)) { + bin - min_bin + offset)) { lte_indices[lte_count++] = idx; } else { gt_indices[gt_count++] = idx; diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index e56fffba8a43..288e8f0a3e5a 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -300,8 +300,9 @@ class FeatureHistogram { } double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; - const int bin_start = 1; - const int bin_end = meta_->num_bin; + const int8_t offset = meta_->offset; + const int bin_start = 1 - offset; + const int bin_end = meta_->num_bin - offset; int used_bin = -1; std::vector sorted_idx; @@ -490,19 +491,19 @@ class FeatureHistogram { if (use_onehot) { output->num_cat_threshold = 1; output->cat_threshold = - std::vector(1, static_cast(best_threshold)); + std::vector(1, static_cast(best_threshold + offset)); } else { output->num_cat_threshold = best_threshold + 1; output->cat_threshold = std::vector(output->num_cat_threshold); if (best_dir == 1) { for (int i = 0; i < output->num_cat_threshold; ++i) { - auto t = sorted_idx[i]; + auto t = sorted_idx[i] + offset; output->cat_threshold[i] = t; } } else { for (int i = 0; i < output->num_cat_threshold; ++i) { - auto t = sorted_idx[used_bin - 1 - i]; + auto t = sorted_idx[used_bin - 1 - i] + offset; output->cat_threshold[i] = t; } } @@ -1171,6 +1172,7 @@ class HistogramPool { } } } + Log::Warning("o0 %d, bin %d", offsets->at(0), num_total_bin); return num_total_bin; } From 985d51805c903c9a5947cc421b5f95e9cf844e97 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Fri, 14 Aug 2020 22:37:00 +0800 Subject: [PATCH 06/10] use stable sort --- src/treelearner/feature_histogram.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 288e8f0a3e5a..7f13f79c71f2 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -381,11 +381,11 @@ class FeatureHistogram { auto ctr_fun = [this](double sum_grad, double sum_hess) { return (sum_grad) / (sum_hess + meta_->config->cat_smooth); }; - std::sort(sorted_idx.begin(), sorted_idx.end(), - [this, &ctr_fun](int i, int j) { - return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) < - ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j)); - }); + std::stable_sort( + sorted_idx.begin(), sorted_idx.end(), [this, &ctr_fun](int i, int j) { + return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) < + ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j)); + }); std::vector find_direction(1, 1); std::vector start_position(1, 0); From 802f0bb5c0a63a88eb6952868125ced1a4a76778 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Fri, 14 Aug 2020 22:54:04 +0800 Subject: [PATCH 07/10] fix cat forced split --- src/treelearner/feature_histogram.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 7f13f79c71f2..ac4db4b24b51 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -657,8 +657,8 @@ class FeatureHistogram { return; } const double cnt_factor = num_data / sum_hessian; - const auto grad = GET_GRAD(data_, threshold); - const auto hess = GET_HESS(data_, threshold); + const auto grad = GET_GRAD(data_, threshold - meta_->offset); + const auto hess = GET_HESS(data_, threshold - meta_->offset); data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); From 7df86d545ce5233a96cd4dc9ef03b2fe90ec3369 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Fri, 14 Aug 2020 23:06:07 +0800 Subject: [PATCH 08/10] Apply suggestions from code review --- src/io/dense_bin.hpp | 2 +- src/treelearner/feature_histogram.hpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 90afa02255fe..122b4214019e 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -320,7 +320,7 @@ class DenseBin : public Bin { data_size_t* default_count = >_count; int8_t offset = most_freq_bin == 0 ? 1 : 0; if (most_freq_bin > 0 && - Common ::FindInBitset(threshold, num_threahold, most_freq_bin)) { + Common::FindInBitset(threshold, num_threahold, most_freq_bin)) { default_indices = lte_indices; default_count = <e_count; } diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index ac4db4b24b51..755a9b243f9f 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -1172,7 +1172,6 @@ class HistogramPool { } } } - Log::Warning("o0 %d, bin %d", offsets->at(0), num_total_bin); return num_total_bin; } From f67d0eb7826df46c2e4fd60be6d7349b3ca6810b Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Sat, 15 Aug 2020 11:51:39 +0800 Subject: [PATCH 09/10] Apply suggestions from code review --- src/treelearner/feature_histogram.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 755a9b243f9f..35cab03c8e5e 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -651,7 +651,7 @@ class FeatureHistogram { double gain_shift = GetLeafGainGivenOutput( sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; - if (threshold >= static_cast(meta_->num_bin) || threshold < 0) { + if (threshold >= static_cast(meta_->num_bin) || threshold < meta_->offset) { output->gain = kMinScore; Log::Warning("Invalid categorical threshold split"); return; From 51a462a9f437c1a0a687584b57cd0e4679a8fbe1 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Sat, 15 Aug 2020 11:52:12 +0800 Subject: [PATCH 10/10] Apply suggestions from code review --- src/treelearner/feature_histogram.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 35cab03c8e5e..7fa341c67acb 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -651,7 +651,7 @@ class FeatureHistogram { double gain_shift = GetLeafGainGivenOutput( sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; - if (threshold >= static_cast(meta_->num_bin) || threshold < meta_->offset) { + if (threshold >= static_cast(meta_->num_bin) || threshold == 0) { output->gain = kMinScore; Log::Warning("Invalid categorical threshold split"); return;