diff --git a/.travis.yml b/.travis.yml
index bbd72bb9b207..aff0084711d2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -44,11 +44,11 @@ before_install:
   - export BUILD_DIRECTORY="$TRAVIS_BUILD_DIR"
   - if [[ $TRAVIS_OS_NAME == "osx" ]]; then
       export OS_NAME="macos";
-      export COMPILER="gcc";
+      export COMPILER="clang";
       export R_MAC_VERSION=3.6.1;
     else
       export OS_NAME="linux";
-      export COMPILER="clang";
+      export COMPILER="gcc";
       export R_TRAVIS_LINUX_VERSION=3.6.1-3bionic;
     fi
   - export CONDA="$HOME/miniconda"
diff --git a/.vsts-ci.yml b/.vsts-ci.yml
index 98a4f591e6ef..0ae390e07ab4 100644
--- a/.vsts-ci.yml
+++ b/.vsts-ci.yml
@@ -17,7 +17,7 @@ jobs:
 - job: Linux
 ###########################################
   variables:
-    COMPILER: gcc
+    COMPILER: clang
   pool:
     vmImage: 'ubuntu-16.04'
   container: ubuntu1404
@@ -72,7 +72,7 @@ jobs:
 - job: MacOS
 ###########################################
   variables:
-    COMPILER: clang
+    COMPILER: gcc
   pool:
     vmImage: 'macOS-10.13'
   strategy:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd1d57e46553..53efb3bc6fed 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,6 +68,10 @@ if(USE_R35)
     ADD_DEFINITIONS(-DR_VER_ABOVE_35)
 endif(USE_R35)
 
+if(USE_TIMETAG)
+    ADD_DEFINITIONS(-DTIMETAG)
+endif(USE_TIMETAG)
+
 if(USE_MPI)
     find_package(MPI REQUIRED)
     ADD_DEFINITIONS(-DUSE_MPI)
@@ -130,6 +134,21 @@ if(${MM_PREFETCH})
     ADD_DEFINITIONS(-DMM_PREFETCH)
 endif()
 
+include(CheckCXXSourceCompiles)
+check_cxx_source_compiles("
+#include <mm_malloc.h>
+int main() {
+  char *a = (char*)_mm_malloc(8, 16);
+  _mm_free(a);
+  return 0;
+}
+" MM_MALLOC)
+
+if(${MM_MALLOC})
+    message(STATUS "Use _mm_malloc")
+    ADD_DEFINITIONS(-DMM_MALLOC)
+endif()
+
 if(UNIX OR MINGW OR CYGWIN)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -O3 -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type")
     if(USE_SWIG)
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index d74739150aa4..7d43706cf45d 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -252,3 +252,46 @@ test_that("lgb.train() throws an informative error if 'valids' contains lgb.Data
     )
   }, regexp = "each element of valids must have a name")
 })
+
+test_that("lgb.train() works with force_col_wise and force_row_wise", {
+  set.seed(1234L)
+  nrounds <- 10L
+  dtrain <- lgb.Dataset(
+    train$data
+    , label = train$label
+  )
+  params <- list(
+    objective = "binary"
+    , metric = "binary_error"
+    , force_col_wise = TRUE
+  )
+  bst_col_wise <- lgb.train(
+    params = params
+    , data = dtrain
+    , nrounds = nrounds
+  )
+
+  params <- list(
+    objective = "binary"
+    , metric = "binary_error"
+    , force_row_wise = TRUE
+  )
+  bst_row_wise <- lgb.train(
+    params = params
+    , data = dtrain
+    , nrounds = nrounds
+  )
+
+  expected_error <- 0.003070782
+  expect_equal(bst_col_wise$eval_train()[[1L]][["value"]], expected_error)
+  expect_equal(bst_row_wise$eval_train()[[1L]][["value"]], expected_error)
+
+  # check some basic details of the boosters just to be sure force_col_wise
+  # and force_row_wise are not causing any weird side effects
+  for (bst in list(bst_row_wise, bst_col_wise)) {
+    expect_equal(bst$current_iter(), nrounds)
+    parsed_model <- jsonlite::fromJSON(bst$dump_model())
+    expect_equal(parsed_model$objective, "binary sigmoid:1")
+    expect_false(parsed_model$average_output)
+  }
+})
diff --git a/R-package/tests/testthat/test_learning_to_rank.R b/R-package/tests/testthat/test_learning_to_rank.R
index 049ba53c78f6..65768a9ae178 100644
--- a/R-package/tests/testthat/test_learning_to_rank.R
+++ b/R-package/tests/testthat/test_learning_to_rank.R
@@ -47,8 +47,8 @@ test_that("learning-to-rank with lgb.train() works as expected", {
   }
   expect_identical(sapply(eval_results, function(x) {x$name}), eval_names)
   expect_equal(eval_results[[1L]][["value"]], 0.825)
-  expect_true(abs(eval_results[[2L]][["value"]] - 0.795986) < TOLERANCE)
-  expect_true(abs(eval_results[[3L]][["value"]] - 0.7734639) < TOLERANCE)
+  expect_true(abs(eval_results[[2L]][["value"]] - 0.7766434) < TOLERANCE)
+  expect_true(abs(eval_results[[3L]][["value"]] - 0.7527939) < TOLERANCE)
 })
 
 test_that("learning-to-rank with lgb.cv() works as expected", {
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 7ce7fdad2306..4bc708c222bf 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -190,6 +190,38 @@ Core Parameters
 Learning Control Parameters
 ---------------------------
 
+- ``force_col_wise`` :raw-html:`🔗︎`, default = ``false``, type = bool
+
+   - set ``force_col_wise=true`` to force LightGBM to use the col-wise histogram build
+
+   - ``force_col_wise=true`` is recommended when:
+
+      - the number of columns is large, or the total number of bins is large
+
+      - ``num_threads`` is large, e.g. ``> 20``
+
+      - you want to use a small ``feature_fraction``, e.g. ``0.5``, to speed up training
+
+      - you want to reduce memory cost
+
+   - when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them and then use the faster one
+
+- ``force_row_wise`` :raw-html:`🔗︎`, default = ``false``, type = bool
+
+   - set ``force_row_wise=true`` to force LightGBM to use the row-wise histogram build
+
+   - ``force_row_wise=true`` is recommended when:
+
+      - the number of data points is large, and the total number of bins is relatively small
+
+      - you want to use a small ``bagging_fraction`` or ``goss`` to speed up training
+
+      - ``num_threads`` is relatively small, e.g. ``<= 16``
+
+   - setting ``force_row_wise=true`` will double the memory cost of the Dataset object; if you do not have enough memory, use ``force_col_wise=true`` instead
+
+   - when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them and then use the faster one
+
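Since these two new parameters are plain key=value config strings, they can be exercised end-to-end through the existing C API. A minimal sketch (the file name, parameter strings, and iteration count are illustrative assumptions, not part of this patch):

```cpp
#include <LightGBM/c_api.h>
#include <cstdio>

int main() {
  DatasetHandle train_data;
  // "train.svm" is a placeholder file name
  if (LGBM_DatasetCreateFromFile("train.svm", "max_bin=255", nullptr, &train_data) != 0) {
    std::fprintf(stderr, "failed to load dataset\n");
    return 1;
  }
  BoosterHandle booster;
  // force the col-wise histogram build; with both force_* flags false,
  // LightGBM would time both strategies and keep the faster one
  LGBM_BoosterCreate(train_data, "objective=binary force_col_wise=true", &booster);
  int is_finished = 0;
  for (int i = 0; i < 10 && !is_finished; ++i) {
    LGBM_BoosterUpdateOneIter(booster, &is_finished);
  }
  LGBM_BoosterFree(booster);
  LGBM_DatasetFree(train_data);
  return 0;
}
```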
 - ``max_depth`` :raw-html:`🔗︎`, default = ``-1``, type = int
 
    - limit the max depth for tree model. This is used to deal with over-fitting when ``#data`` is small. Tree still grows leaf-wise
@@ -559,22 +591,6 @@ IO Parameters
 
    - **Note**: disabling this may cause the slow training speed for sparse datasets
 
-- ``max_conflict_rate`` :raw-html:`🔗︎`, default = ``0.0``, type = double, constraints: ``0.0 <= max_conflict_rate < 1.0``
-
-   - max conflict rate for bundles in EFB
-
-   - set this to ``0.0`` to disallow the conflict and provide more accurate results
-
-   - set this to a larger value to achieve faster speed
-
-- ``is_enable_sparse`` :raw-html:`🔗︎`, default = ``true``, type = bool, aliases: ``is_sparse``, ``enable_sparse``, ``sparse``
-
-   - used to enable/disable sparse optimization
-
-- ``sparse_threshold`` :raw-html:`🔗︎`, default = ``0.8``, type = double, constraints: ``0.0 < sparse_threshold <= 1.0``
-
-   - the threshold of zero elements percentage for treating a feature as a sparse one
-
 - ``use_missing`` :raw-html:`🔗︎`, default = ``true``, type = bool
 
    - set this to ``false`` to disable the special handle of missing value
diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h
index d263f516c7cb..4c7e79787cb6 100644
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -29,36 +29,29 @@ enum MissingType {
   NaN
 };
 
-/*! \brief Store data for one histogram bin */
-struct HistogramBinEntry {
- public:
-  /*! \brief Sum of gradients on this bin */
-  double sum_gradients = 0.0f;
-  /*! \brief Sum of hessians on this bin */
-  double sum_hessians = 0.0f;
-  /*! \brief Number of data on this bin */
-  data_size_t cnt = 0;
-  /*!
-   * \brief Sum up (reducers) functions for histogram bin
-   */
-  inline static void SumReducer(const char *src, char *dst, int type_size, comm_size_t len) {
-    comm_size_t used_size = 0;
-    const HistogramBinEntry* p1;
-    HistogramBinEntry* p2;
-    while (used_size < len) {
-      // convert
-      p1 = reinterpret_cast<const HistogramBinEntry*>(src);
-      p2 = reinterpret_cast<HistogramBinEntry*>(dst);
-      // add
-      p2->cnt += p1->cnt;
-      p2->sum_gradients += p1->sum_gradients;
-      p2->sum_hessians += p1->sum_hessians;
-      src += type_size;
-      dst += type_size;
-      used_size += type_size;
-    }
-  }
-};
+typedef double hist_t;
+
+const size_t KHistEntrySize = 2 * sizeof(hist_t);
+const int KHistOffset = 2;
+const double kSparseThreshold = 0.7;
+
+#define GET_GRAD(hist, i) hist[(i) << 1]
+#define GET_HESS(hist, i) hist[((i) << 1) + 1]
+
+inline static void HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
+  comm_size_t used_size = 0;
+  const hist_t* p1;
+  hist_t* p2;
+  while (used_size < len) {
+    // convert
+    p1 = reinterpret_cast<const hist_t*>(src);
+    p2 = reinterpret_cast<hist_t*>(dst);
+    *p2 += *p1;
+    src += type_size;
+    dst += type_size;
+    used_size += type_size;
+  }
+}
 
 /*! \brief This class used to convert feature values into bin,
  *         and store some meta information for bin*/
@@ -252,7 +245,7 @@ class OrderedBin {
    * \param out Output Result
    */
   virtual void ConstructHistogram(int leaf, const score_t* gradients,
-    const score_t* hessians, HistogramBinEntry* out) const = 0;
+    const score_t* hessians, hist_t* out) const = 0;
 
   /*!
    * \brief Construct histogram by using this bin
@@ -262,7 +255,7 @@ class OrderedBin {
    * \param gradients Gradients, Note:non-ordered by leaf
    * \param out Output Result
    */
-  virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
+  virtual void ConstructHistogram(int leaf, const score_t* gradients, hist_t* out) const = 0;
 
   /*!
    * \brief Split current bin, and perform re-order by leaf
@@ -360,11 +353,11 @@ class Bin {
   virtual void ConstructHistogram(
     const data_size_t* data_indices, data_size_t start, data_size_t end,
     const score_t* ordered_gradients, const score_t* ordered_hessians,
-    HistogramBinEntry* out) const = 0;
+    hist_t* out) const = 0;
 
   virtual void ConstructHistogram(data_size_t start, data_size_t end,
     const score_t* ordered_gradients, const score_t* ordered_hessians,
-    HistogramBinEntry* out) const = 0;
+    hist_t* out) const = 0;
 
   /*!
    * \brief Construct histogram of this feature,
@@ -380,10 +373,10 @@ class Bin {
    * \param out Output Result
    */
   virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
-    const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
+    const score_t* ordered_gradients, hist_t* out) const = 0;
 
   virtual void ConstructHistogram(data_size_t start, data_size_t end,
-    const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
+    const score_t* ordered_gradients, hist_t* out) const = 0;
 
   /*!
    * \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
@@ -423,30 +416,11 @@ class Bin {
     data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
-  /*!
-   * \brief Create the ordered bin for this bin
-   * \return Pointer to ordered bin
-   */
-  virtual OrderedBin* CreateOrderedBin() const = 0;
-
   /*!
    * \brief After pushed all feature data, call this could have better refactor for bin data
    */
   virtual void FinishLoad() = 0;
 
-  /*!
-   * \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
-   * \param num_data Total number of data
-   * \param num_bin Number of bin
-   * \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
-   * \param is_enable_sparse True if enable sparse feature
-   * \param sparse_threshold Threshold for treating a feature as a sparse feature
-   * \param is_sparse Will set to true if this bin is sparse
-   * \return The bin data object
-   */
-  static Bin* CreateBin(data_size_t num_data, int num_bin,
-    double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse);
-
   /*!
    * \brief Create object for bin data of one feature, used for dense feature
    * \param num_data Total number of data
@@ -469,6 +443,46 @@ class Bin {
   virtual Bin* Clone() = 0;
 };
 
+
+class MultiValBin {
+ public:
+  virtual ~MultiValBin() {}
+
+  virtual data_size_t num_data() const = 0;
+
+  virtual int32_t num_bin() const = 0;
+
+  virtual void ReSize(data_size_t num_data) = 0;
+
+  virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
+
+  virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
+
+  virtual void ConstructHistogram(
+    const data_size_t* data_indices, data_size_t start, data_size_t end,
+    const score_t* gradients, const score_t* hessians,
+    hist_t* out) const = 0;
+
+  virtual void ConstructHistogram(data_size_t start, data_size_t end,
+    const score_t* gradients, const score_t* hessians,
+    hist_t* out) const = 0;
+
+  virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
+    const score_t* ordered_gradients, hist_t* out) const = 0;
+
+  virtual void ConstructHistogram(data_size_t start, data_size_t end,
+    const score_t* ordered_gradients, hist_t* out) const = 0;
+
+  virtual void FinishLoad() = 0;
+
+  virtual bool IsSparse() = 0;
+
+  static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate);
+
+  virtual MultiValBin* Clone() = 0;
+};
+
 inline uint32_t BinMapper::ValueToBin(double value) const {
   if (std::isnan(value)) {
     if (missing_type_ == MissingType::NaN) {
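The bin.h changes above replace the `HistogramBinEntry` struct with a flat `hist_t` array in which the gradient and hessian sums for bin `i` sit at positions `2*i` and `2*i + 1` (hence `KHistEntrySize = 2 * sizeof(hist_t)`), accessed through the `GET_GRAD`/`GET_HESS` macros. A self-contained sketch of that layout (the toy data and names are illustrative):

```cpp
#include <cstdint>
#include <vector>

typedef double hist_t;

// same macros as in the patched bin.h
#define GET_GRAD(hist, i) hist[(i) << 1]
#define GET_HESS(hist, i) hist[((i) << 1) + 1]

int main() {
  const int num_bin = 4;
  // one histogram = 2 * num_bin doubles: [grad0, hess0, grad1, hess1, ...]
  std::vector<hist_t> hist(2 * num_bin, 0.0);

  // toy per-row data: bin index, gradient, hessian
  const uint32_t bins[] = {0, 2, 2, 3};
  const double grads[] = {0.5, -0.25, 0.75, 1.0};
  const double hesss[] = {1.0, 1.0, 1.0, 1.0};

  for (int i = 0; i < 4; ++i) {
    GET_GRAD(hist.data(), bins[i]) += grads[i];
    GET_HESS(hist.data(), bins[i]) += hesss[i];
  }
  // hist now holds the per-bin sums; e.g. GET_GRAD(hist.data(), 2) == 0.5
  return 0;
}
```

The interleaved layout is what lets `HistogramSumReducer` treat a whole histogram as one run of doubles to add element-wise.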
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index da10e23c8b5c..ff1646c67210 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -214,6 +214,24 @@ struct Config {
 
   #pragma region Learning Control Parameters
 
+  // desc = set ``force_col_wise=true`` to force LightGBM to use the col-wise histogram build
+  // desc = ``force_col_wise=true`` is recommended when:
+  // descl2 = the number of columns is large, or the total number of bins is large
+  // descl2 = ``num_threads`` is large, e.g. ``> 20``
+  // descl2 = you want to use a small ``feature_fraction``, e.g. ``0.5``, to speed up training
+  // descl2 = you want to reduce memory cost
+  // desc = when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them and then use the faster one
+  bool force_col_wise = false;
+
+  // desc = set ``force_row_wise=true`` to force LightGBM to use the row-wise histogram build
+  // desc = ``force_row_wise=true`` is recommended when:
+  // descl2 = the number of data points is large, and the total number of bins is relatively small
+  // descl2 = you want to use a small ``bagging_fraction`` or ``goss`` to speed up training
+  // descl2 = ``num_threads`` is relatively small, e.g. ``<= 16``
+  // desc = setting ``force_row_wise=true`` will double the memory cost of the Dataset object; if you do not have enough memory, use ``force_col_wise=true`` instead
+  // desc = when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them and then use the faster one
+  bool force_row_wise = false;
+
   // desc = limit the max depth for tree model. This is used to deal with over-fitting when ``#data`` is small. Tree still grows leaf-wise
   // desc = ``<= 0`` means no limit
   int max_depth = -1;
@@ -534,22 +552,6 @@ struct Config {
   // desc = **Note**: disabling this may cause the slow training speed for sparse datasets
   bool enable_bundle = true;
 
-  // check = >=0.0
-  // check = <1.0
-  // desc = max conflict rate for bundles in EFB
-  // desc = set this to ``0.0`` to disallow the conflict and provide more accurate results
-  // desc = set this to a larger value to achieve faster speed
-  double max_conflict_rate = 0.0;
-
-  // alias = is_sparse, enable_sparse, sparse
-  // desc = used to enable/disable sparse optimization
-  bool is_enable_sparse = true;
-
-  // check = >0.0
-  // check = <=1.0
-  // desc = the threshold of zero elements percentage for treating a feature as a sparse one
-  double sparse_threshold = 0.8;
-
   // desc = set this to ``false`` to disable the special handle of missing value
   bool use_missing = true;
 
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index 2c8dc97e2823..2c6f9deebfca 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -8,6 +8,7 @@
 #include <LightGBM/config.h>
 #include <LightGBM/feature_group.h>
 #include <LightGBM/meta.h>
+#include <LightGBM/utils/common.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/random.h>
 #include <LightGBM/utils/text_reader.h>
@@ -381,6 +382,7 @@ class Dataset {
   inline uint64_t NumTotalBin() const {
     return group_bin_boundaries_.back();
   }
+
   inline std::vector<int> ValidFeatureIndices() const {
     std::vector<int> ret;
     for (int i = 0; i < num_total_features_; ++i) {
@@ -394,6 +396,13 @@ class Dataset {
   void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
 
+  MultiValBin* GetMultiBinFromSparseFeatures() const;
+
+  MultiValBin* GetMultiBinFromAllFeatures() const;
+
+  MultiValBin* TestMultiThreadingMethod(score_t* gradients, score_t* hessians, const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
+                                        bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const;
+
   LIGHTGBM_EXPORT void FinishLoad();
 
   LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
@@ -423,15 +432,18 @@ class Dataset {
 
   void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
                            const data_size_t* data_indices, data_size_t num_data,
-                           int leaf_idx,
-                           std::vector<std::unique_ptr<OrderedBin>>* ordered_bins,
                            const score_t* gradients, const score_t* hessians,
                            score_t* ordered_gradients, score_t* ordered_hessians,
                            bool is_constant_hessian,
-                           HistogramBinEntry* histogram_data) const;
+                           const MultiValBin* multi_val_bin, bool is_colwise,
+                           hist_t* histogram_data) const;
+
+  void ConstructHistogramsMultiVal(const MultiValBin* multi_val_bin, const data_size_t* data_indices, data_size_t num_data,
+                                   const score_t* gradients, const score_t* hessians,
+                                   bool is_constant_hessian,
+                                   hist_t* histogram_data) const;
 
-  void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
-    HistogramBinEntry* data) const;
+  void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
 
   inline data_size_t Split(int feature,
                            const uint32_t* threshold,
                            int num_threshold, bool default_left,
@@ -496,19 +508,10 @@ class Dataset {
     return feature_groups_[group]->bin_mappers_[sub_feature].get();
   }
 
-  inline const Bin* FeatureBin(int i) const {
-    const int group = feature2group_[i];
-    return feature_groups_[group]->bin_data_.get();
-  }
-
   inline const Bin* FeatureGroupBin(int group) const {
     return feature_groups_[group]->bin_data_.get();
   }
 
-  inline bool FeatureGroupIsSparse(int group) const {
-    return feature_groups_[group]->is_sparse_;
-  }
-
   inline BinIterator* FeatureIterator(int i) const {
     const int group = feature2group_[i];
     const int sub_feature = feature2subfeature_[i];
@@ -519,6 +522,10 @@ class Dataset {
     return feature_groups_[group]->FeatureGroupIterator();
   }
 
+  inline bool IsMultiGroup(int i) const {
+    return feature_groups_[i]->is_multi_val_;
+  }
+
   inline double RealThreshold(int i, uint32_t threshold) const {
     const int group = feature2group_[i];
     const int sub_feature = feature2subfeature_[i];
@@ -532,18 +539,6 @@ class Dataset {
     return feature_groups_[group]->bin_mappers_[sub_feature]->ValueToBin(threshold_double);
   }
 
-  inline void CreateOrderedBins(std::vector<std::unique_ptr<OrderedBin>>* ordered_bins) const {
-    ordered_bins->resize(num_groups_);
-    OMP_INIT_EX();
-    #pragma omp parallel for schedule(guided)
-    for (int i = 0; i < num_groups_; ++i) {
-      OMP_LOOP_EX_BEGIN();
-      ordered_bins->at(i).reset(feature_groups_[i]->bin_data_->CreateOrderedBin());
-      OMP_LOOP_EX_END();
-    }
-    OMP_THROW_EX();
-  }
-
   /*!
    * \brief Get meta data pointer
    * \return Pointer of meta data
@@ -620,7 +615,7 @@ class Dataset {
   /*! \brief Disable copy */
   Dataset(const Dataset&) = delete;
 
-  void addFeaturesFrom(Dataset* other);
+  void AddFeaturesFrom(Dataset* other);
 
  private:
   std::string data_filename_;
@@ -638,8 +633,6 @@ class Dataset {
   Metadata metadata_;
   /*! \brief index of label column */
   int label_idx_ = 0;
-  /*! \brief Threshold for treating a feature as a sparse feature */
-  double sparse_threshold_;
   /*! \brief store feature names */
   std::vector<std::string> feature_names_;
   /*!
 \brief store feature names */
@@ -662,6 +655,8 @@ class Dataset {
   bool use_missing_;
   bool zero_as_missing_;
   std::vector<int> feature_need_push_zeros_;
+  mutable std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>> hist_buf_;
+
 };
 
 }  // namespace LightGBM
diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h
index 439a8aea74e4..a4d066462a0b 100644
--- a/include/LightGBM/feature_group.h
+++ b/include/LightGBM/feature_group.h
@@ -30,14 +30,13 @@ class FeatureGroup {
    * \param is_enable_sparse True if enable sparse feature
    * \param sparse_threshold Threshold for treating a feature as a sparse feature
    */
-  FeatureGroup(int num_feature,
+  FeatureGroup(int num_feature, bool is_multi_val,
     std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
-    data_size_t num_data, double sparse_threshold, bool is_enable_sparse) : num_feature_(num_feature) {
+    data_size_t num_data) : num_feature_(num_feature), is_multi_val_(is_multi_val), is_sparse_(false) {
     CHECK(static_cast<int>(bin_mappers->size()) == num_feature);
     // use bin at zero to store most_freq_bin
     num_total_bin_ = 1;
     bin_offsets_.emplace_back(num_total_bin_);
-    int cnt_non_zero = 0;
     for (int i = 0; i < num_feature_; ++i) {
       bin_mappers_.emplace_back(bin_mappers->at(i).release());
       auto num_bin = bin_mappers_[i]->num_bin();
@@ -46,18 +45,26 @@ class FeatureGroup {
       }
       num_total_bin_ += num_bin;
       bin_offsets_.emplace_back(num_total_bin_);
-      cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers_[i]->sparse_rate()));
     }
-    double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
-    bin_data_.reset(Bin::CreateBin(num_data, num_total_bin_,
-      sparse_rate, is_enable_sparse, sparse_threshold, &is_sparse_));
+    if (is_multi_val_) {
+      multi_bin_data_.clear();
+      for (int i = 0; i < num_feature_; ++i) {
+        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
+        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
+          multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        } else {
+          multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        }
+      }
+    } else {
+      bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
+    }
   }
 
-  FeatureGroup(int num_feature,
-    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
-    data_size_t num_data, bool is_sparse) : num_feature_(num_feature) {
-    CHECK(static_cast<int>(bin_mappers->size()) == num_feature);
-    // use bin at zero to store most_freq_bin
+  FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
+    data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
+    CHECK(static_cast<int>(bin_mappers->size()) == 1);
+    // use bin at zero to store default_bin
     num_total_bin_ = 1;
     bin_offsets_.emplace_back(num_total_bin_);
     for (int i = 0; i < num_feature_; ++i) {
@@ -69,13 +76,15 @@ class FeatureGroup {
       num_total_bin_ += num_bin;
       bin_offsets_.emplace_back(num_total_bin_);
     }
-    is_sparse_ = is_sparse;
-    if (is_sparse_) {
+    if (bin_mappers_[0]->sparse_rate() >= kSparseThreshold) {
+      is_sparse_ = true;
       bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
     } else {
+      is_sparse_ = false;
       bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
     }
   }
+
   /*!
   * \brief Constructor from memory
   * \param memory Pointer of memory
@@ -86,6 +95,8 @@ class FeatureGroup {
               const std::vector<data_size_t>& local_used_indices) {
     const char* memory_ptr = reinterpret_cast<const char*>(memory);
     // get is_sparse
+    is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
+    memory_ptr += sizeof(is_multi_val_);
     is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += sizeof(is_sparse_);
     num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
@@ -110,13 +121,26 @@ class FeatureGroup {
     if (!local_used_indices.empty()) {
       num_data = static_cast<data_size_t>(local_used_indices.size());
     }
-    if (is_sparse_) {
-      bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
+    if (is_multi_val_) {
+      for (int i = 0; i < num_feature_; ++i) {
+        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
+        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
+          multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        } else {
+          multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        }
+        multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
+        memory_ptr += multi_bin_data_.back()->SizesInByte();
+      }
     } else {
-      bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
+      if (is_sparse_) {
+        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
+      } else {
+        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
+      }
+      // get bin data
+      bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
     }
-    // get bin data
-    bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
   }
 
   /*! \brief Destructor */
   ~FeatureGroup() {
@@ -131,22 +155,54 @@ class FeatureGroup {
   inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
     uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
     if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) { return; }
-    bin += bin_offsets_[sub_feature_idx];
     if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
       bin -= 1;
     }
-    bin_data_->Push(tid, line_idx, bin);
+    if (is_multi_val_) {
+      multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
+    } else {
+      bin += bin_offsets_[sub_feature_idx];
+      bin_data_->Push(tid, line_idx, bin);
+    }
   }
 
   inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
-    bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
+    if (!is_multi_val_) {
+      bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
+    } else {
+      for (int i = 0; i < num_feature_; ++i) {
+        multi_bin_data_[i]->CopySubset(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
+      }
+    }
   }
 
   inline BinIterator* SubFeatureIterator(int sub_feature) {
-    uint32_t min_bin = bin_offsets_[sub_feature];
-    uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
     uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
-    return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
+    if (!is_multi_val_) {
+      uint32_t min_bin = bin_offsets_[sub_feature];
+      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
+      return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
+    } else {
+      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
+      uint32_t min_bin = 1;
+      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
+      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, most_freq_bin);
+    }
+  }
+
+  inline void FinishLoad() {
+    if (is_multi_val_) {
+      OMP_INIT_EX();
+      #pragma omp parallel for schedule(guided)
+      for (int i = 0; i < num_feature_; ++i) {
+        OMP_LOOP_EX_BEGIN();
+        multi_bin_data_[i]->FinishLoad();
+        OMP_LOOP_EX_END();
+      }
+      OMP_THROW_EX();
+    } else {
+      bin_data_->FinishLoad();
+    }
   }
 
   /*!
@@ -155,6 +211,9 @@ class FeatureGroup {
    * \return A pointer to the BinIterator object
    */
   inline BinIterator* FeatureGroupIterator() {
+    if (is_multi_val_) {
+      return nullptr;
+    }
     uint32_t min_bin = bin_offsets_[0];
     uint32_t max_bin = bin_offsets_.back() - 1;
     uint32_t most_freq_bin = 0;
@@ -168,17 +227,29 @@ class FeatureGroup {
                            bool default_left, data_size_t* data_indices, data_size_t num_data,
                            data_size_t* lte_indices, data_size_t* gt_indices) const {
-
-    uint32_t min_bin = bin_offsets_[sub_feature];
-    uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
     uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
     uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
-    if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
-      auto missing_type = bin_mappers_[sub_feature]->missing_type();
-      return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
-                              *threshold, data_indices, num_data, lte_indices, gt_indices);
+    if (!is_multi_val_) {
+      uint32_t min_bin = bin_offsets_[sub_feature];
+      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
+      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
+        auto missing_type = bin_mappers_[sub_feature]->missing_type();
+        return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
+                                *threshold, data_indices, num_data, lte_indices, gt_indices);
+      } else {
+        return bin_data_->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
+      }
     } else {
-      return bin_data_->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
+      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
+      uint32_t min_bin = 1;
+      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
+      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
+        auto missing_type = bin_mappers_[sub_feature]->missing_type();
+        return multi_bin_data_[sub_feature]->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
+                                                   *threshold, data_indices, num_data, lte_indices, gt_indices);
+      } else {
+        return multi_bin_data_[sub_feature]->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
+      }
     }
   }
   /*!
@@ -195,22 +266,35 @@ class FeatureGroup {
    * \param file File want to write
    */
   void SaveBinaryToFile(const VirtualFileWriter* writer) const {
+    writer->Write(&is_multi_val_, sizeof(is_multi_val_));
     writer->Write(&is_sparse_, sizeof(is_sparse_));
     writer->Write(&num_feature_, sizeof(num_feature_));
     for (int i = 0; i < num_feature_; ++i) {
       bin_mappers_[i]->SaveBinaryToFile(writer);
     }
-    bin_data_->SaveBinaryToFile(writer);
+    if (is_multi_val_) {
+      for (int i = 0; i < num_feature_; ++i) {
+        multi_bin_data_[i]->SaveBinaryToFile(writer);
+      }
+    } else {
+      bin_data_->SaveBinaryToFile(writer);
+    }
   }
   /*!
   * \brief Get sizes in byte of this object
   */
   size_t SizesInByte() const {
-    size_t ret = sizeof(is_sparse_) + sizeof(num_feature_);
+    size_t ret = sizeof(is_multi_val_) + sizeof(is_sparse_) + sizeof(num_feature_);
     for (int i = 0; i < num_feature_; ++i) {
       ret += bin_mappers_[i]->SizesInByte();
     }
-    ret += bin_data_->SizesInByte();
+    if (!is_multi_val_) {
+      ret += bin_data_->SizesInByte();
+    } else {
+      for (int i = 0; i < num_feature_; ++i) {
+        ret += multi_bin_data_[i]->SizesInByte();
+      }
+    }
     return ret;
   }
   /*! \brief Disable copy */
@@ -218,6 +302,7 @@ class FeatureGroup {
   /*! \brief Deep copy */
   FeatureGroup(const FeatureGroup& other) {
     num_feature_ = other.num_feature_;
+    is_multi_val_ = other.is_multi_val_;
     is_sparse_ = other.is_sparse_;
     num_total_bin_ = other.num_total_bin_;
     bin_offsets_ = other.bin_offsets_;
@@ -226,8 +311,14 @@ class FeatureGroup {
     for (auto& bin_mapper : other.bin_mappers_) {
       bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
     }
-
-    bin_data_.reset(other.bin_data_->Clone());
+    if (!is_multi_val_) {
+      bin_data_.reset(other.bin_data_->Clone());
+    } else {
+      multi_bin_data_.clear();
+      for (int i = 0; i < num_feature_; ++i) {
+        multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
+      }
+    }
   }
 
  private:
@@ -239,7 +330,9 @@ class FeatureGroup {
   std::vector<uint32_t> bin_offsets_;
   /*! \brief Bin data of this feature */
   std::unique_ptr<Bin> bin_data_;
+  std::vector<std::unique_ptr<Bin>> multi_bin_data_;
   /*! \brief True if this feature is sparse */
+  bool is_multi_val_;
   bool is_sparse_;
   int num_total_bin_;
 };
diff --git a/include/LightGBM/meta.h b/include/LightGBM/meta.h
index 9b5f2ea313db..ea8315be1b65 100644
--- a/include/LightGBM/meta.h
+++ b/include/LightGBM/meta.h
@@ -71,8 +71,9 @@ typedef void(*AllgatherFunction)(char* input, comm_size_t input_size, const comm
 
 #define NO_SPECIFIC (-1)
 
-// Prefetch size is usually 64 bytes
-const int kCacheLineSize = 64;
+const int kAlignedSize = 32;
+
+#define SIZE_ALIGNED(t) ((t) + kAlignedSize - 1) / kAlignedSize * kAlignedSize
 
 }  // namespace LightGBM
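`SIZE_ALIGNED` rounds a count up to the next multiple of `kAlignedSize`, which the new bagging code uses so that each thread's partition starts on an aligned boundary for the `_mm_malloc`-backed buffers. A worked example with assumed values:

```cpp
#include <iostream>

const int kAlignedSize = 32;
#define SIZE_ALIGNED(t) ((t) + kAlignedSize - 1) / kAlignedSize * kAlignedSize

int main() {
  // e.g. GBDT::Bagging with num_data = 100000 split across n_block = 16:
  // ceil(100000 / 16) = 6250, rounded up to a multiple of 32 -> 6272
  std::cout << SIZE_ALIGNED((100000 + 16 - 1) / 16) << "\n";  // prints 6272
  // already-aligned sizes are unchanged
  std::cout << SIZE_ALIGNED(64) << "\n";  // prints 64
  return 0;
}
```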
diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h
index 9bc069b45a85..5a855f84c7a1 100644
--- a/include/LightGBM/tree.h
+++ b/include/LightGBM/tree.h
@@ -213,6 +213,7 @@ class Tree {
 
   void RecomputeMaxDepth();
 
+  int NextLeafId() const { return num_leaves_; }
 
  private:
   std::string NumericalDecisionIfElse(int node) const;
diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h
index 66efe3d20a03..dca104e7531e 100644
--- a/include/LightGBM/tree_learner.h
+++ b/include/LightGBM/tree_learner.h
@@ -71,6 +71,8 @@ class TreeLearner {
 
   virtual void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) = 0;
 
+  virtual bool IsHistColWise() const = 0;
+
   /*!
   * \brief Using last trained tree to predict score then adding to out_score;
   * \param out_score output score
diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h
index 3bacc9372329..bbcfe6328832 100644
--- a/include/LightGBM/utils/common.h
+++ b/include/LightGBM/utils/common.h
@@ -11,22 +11,36 @@
 #include <limits>
 #include <string>
 #include <algorithm>
+#include <chrono>
 #include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <cstring>
 #include <functional>
 #include <iomanip>
+#include <map>
 #include <memory>
 #include <sstream>
 #include <type_traits>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
-#ifdef _MSC_VER
-#include "intrin.h"
+#if defined(_MSC_VER)
+#include <malloc.h>
+#elif MM_MALLOC
+#include <mm_malloc.h>
+#elif defined(__GNUC__)
+#include <malloc.h>
+#define _mm_malloc(a, b) memalign(b, a)
+#define _mm_free(a) free(a)
+#else
+#include <stdlib.h>
+#define _mm_malloc(a, b) malloc(a)
+#define _mm_free(a) free(a)
 #endif
 
+
 namespace LightGBM {
 
 namespace Common {
@@ -946,8 +960,133 @@ inline bool CheckAllowedJSON(const std::string& s) {
   return true;
 }
 
+inline int RoundInt(double x) {
+  return static_cast<int>(x + 0.5f);
+}
+
+template <typename T, std::size_t N = 16>
+class AlignmentAllocator {
+ public:
+  typedef T value_type;
+  typedef std::size_t size_type;
+  typedef std::ptrdiff_t difference_type;
+
+  typedef T* pointer;
+  typedef const T* const_pointer;
+
+  typedef T& reference;
+  typedef const T& const_reference;
+
+ public:
+  inline AlignmentAllocator() throw () {}
+
+  template <typename T2>
+  inline AlignmentAllocator(const AlignmentAllocator<T2, N>&) throw () {}
+
+  inline ~AlignmentAllocator() throw () {}
+
+  inline pointer adress(reference r) {
+    return &r;
+  }
+
+  inline const_pointer adress(const_reference r) const {
+    return &r;
+  }
+
+  inline pointer allocate(size_type n) {
+    return (pointer)_mm_malloc(n * sizeof(value_type), N);
+  }
+
+  inline void deallocate(pointer p, size_type) {
+    _mm_free(p);
+  }
+
+  inline void construct(pointer p, const value_type& wert) {
+    new (p) value_type(wert);
+  }
+
+  inline void destroy(pointer p) {
+    p->~value_type();
+  }
+
+  inline size_type max_size() const throw () {
+    return size_type(-1) / sizeof(value_type);
+  }
+
+  template <typename T2>
+  struct rebind {
+    typedef AlignmentAllocator<T2, N> other;
+  };
+
+  bool operator!=(const AlignmentAllocator<T, N>& other) const {
+    return !(*this == other);
+  }
+
+  // Returns true if and only if storage allocated from *this
+  // can be deallocated from other, and vice versa.
+  // Always returns true for stateless allocators.
+  bool operator==(const AlignmentAllocator<T, N>&) const {
+    return true;
+  }
+};
+
+// Note: this class is not thread-safe, don't use it inside omp blocks
+class Timer {
+ public:
+  Timer() {}
+  ~Timer() {
+    Print();
+  }
+  #ifdef TIMETAG
+  void Start(const std::string& name) {
+    auto cur_time = std::chrono::steady_clock::now();
+    start_time_[name] = cur_time;
+  }
+  void Stop(const std::string& name) {
+    if (stats_.find(name) == stats_.end()) {
+      stats_[name] = std::chrono::duration<double, std::milli>(0);
+    }
+    stats_[name] += std::chrono::steady_clock::now() - start_time_[name];
+  }
+  #else
+  void Start(const std::string&) { }
+  void Stop(const std::string&) { }
+  #endif  // TIMETAG
+
+  void Print() const {
+    #ifdef TIMETAG
+    std::map<std::string, std::chrono::duration<double, std::milli>> ordered(stats_.begin(), stats_.end());
+    for (auto it = ordered.begin(); it != ordered.end(); ++it) {
+      Log::Info("%s costs:\t %f ", it->first.c_str(), it->second * 1e-3);
+    }
+    #endif
+  }
+  std::unordered_map<std::string, std::chrono::steady_clock::time_point> start_time_;
+  std::unordered_map<std::string, std::chrono::duration<double, std::milli>> stats_;
+};
+
+// Note: this class is not thread-safe, don't use it inside omp blocks
+class FunctionTimer {
+ public:
+  FunctionTimer(const std::string& name, Timer& timer) : timer_(timer) {
+    timer.Start(name);
+    #ifdef TIMETAG
+    name_ = name;
+    #endif  // TIMETAG
+  }
+  ~FunctionTimer() {
+    timer_.Stop(name_);
+  }
+
+ private:
+  std::string name_;
+  Timer& timer_;
+};
+
 }  // namespace Common
 
+extern Common::Timer global_timer;
+
 }  // namespace LightGBM
 
 #endif   // LightGBM_UTILS_COMMON_FUN_H_
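Putting the new common.h utilities together, a minimal usage sketch (the buffer name and timing label are illustrative; `FunctionTimer` compiles to a no-op unless the library is built with `-DTIMETAG`, and `global_timer` is the instance defined in application.cpp below):

```cpp
#include <vector>
// assumes the LightGBM headers patched above
#include <LightGBM/meta.h>
#include <LightGBM/utils/common.h>

using namespace LightGBM;

void Example() {
  // a 32-byte-aligned buffer, the same pattern gbdt.h now uses
  // for gradients_/hessians_
  std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>> my_buffer(1024, 0.0);

  // scoped timing: accumulates elapsed time into global_timer under the label,
  // printed from Timer::Print() at program exit when TIMETAG is defined
  Common::FunctionTimer fun_timer("Example::fill", global_timer);
  for (size_t i = 0; i < my_buffer.size(); ++i) {
    my_buffer[i] = static_cast<double>(i);
  }
}
```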
Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || need_re_bagging_) { need_re_bagging_ = false; - const data_size_t min_inner_size = 1000; - data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_; - if (inner_size < min_inner_size) { inner_size = min_inner_size; } + const data_size_t min_inner_size = 1024; + const int n_block = std::min( + num_threads_, (num_data_ + min_inner_size - 1) / min_inner_size); + data_size_t inner_size = SIZE_ALIGNED((num_data_ + n_block - 1) / n_block); OMP_INIT_EX(); #pragma omp parallel for schedule(static, 1) - for (int i = 0; i < num_threads_; ++i) { + for (int i = 0; i < n_block; ++i) { OMP_LOOP_EX_BEGIN(); - left_cnts_buf_[i] = 0; - right_cnts_buf_[i] = 0; data_size_t cur_start = i * inner_size; - if (cur_start > num_data_) { continue; } - data_size_t cur_cnt = inner_size; - if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; } + data_size_t cur_cnt = std::min(inner_size, num_data_ - cur_start); + if (cur_cnt <= 0) { + left_cnts_buf_[i] = 0; + right_cnts_buf_[i] = 0; + continue; + } Random cur_rand(config_->bagging_seed + iter * num_threads_ + i); data_size_t cur_left_count = 0; if (balanced_bagging_) { @@ -241,15 +245,14 @@ void GBDT::Bagging(int iter) { data_size_t left_cnt = 0; left_write_pos_buf_[0] = 0; right_write_pos_buf_[0] = 0; - for (int i = 1; i < num_threads_; ++i) { + for (int i = 1; i < n_block; ++i) { left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1]; right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1]; } - left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1]; + left_cnt = left_write_pos_buf_[n_block - 1] + left_cnts_buf_[n_block - 1]; #pragma omp parallel for schedule(static, 1) - for (int i = 0; i < num_threads_; ++i) { - OMP_LOOP_EX_BEGIN(); + for (int i = 0; i < n_block; ++i) { if (left_cnts_buf_[i] > 0) { std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i], tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t)); @@ -258,9 +261,7 @@ void GBDT::Bagging(int iter) { std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i], tmp_indices_.data() + offsets_buf_[i] + left_cnts_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t)); } - OMP_LOOP_EX_END(); } - OMP_THROW_EX(); bag_data_cnt_ = left_cnt; Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); // set bagging data to tree learner @@ -276,6 +277,7 @@ void GBDT::Bagging(int iter) { } void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { + Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; auto start_time = std::chrono::steady_clock::now(); for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) { @@ -342,6 +344,7 @@ double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj, int class_id) } double GBDT::BoostFromAverage(int class_id, bool update_scorer) { + Common::FunctionTimer fun_timer("GBDT::BoostFromAverage", global_timer); // boosting from average label; or customized "average" if implemented for the current objective if (models_.empty() && !train_score_updater_->has_init_score() && objective_function_ != nullptr) { if (config_->boost_from_average || (train_data_ != nullptr && train_data_->num_features() == 0)) { @@ -366,6 +369,7 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { } bool 
 bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
+  Common::FunctionTimer fun_timer("GBDT::TrainOneIter", global_timer);
   std::vector<double> init_scores(num_tree_per_iteration_, 0.0);
   // boosting first
   if (gradients == nullptr || hessians == nullptr) {
@@ -486,6 +490,7 @@ bool GBDT::EvalAndCheckEarlyStopping() {
 }
 
 void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
+  Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer);
   // update training score
   if (!is_use_subset_) {
     train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id);
@@ -755,17 +760,10 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
     right_write_pos_buf_.resize(num_threads_);
 
     double average_bag_rate = (bag_data_cnt_ / num_data_) / config->bagging_freq;
-    int sparse_group = 0;
-    for (int i = 0; i < train_data_->num_feature_groups(); ++i) {
-      if (train_data_->FeatureGroupIsSparse(i)) {
-        ++sparse_group;
-      }
-    }
     is_use_subset_ = false;
     const int group_threshold_usesubset = 100;
-    const int sparse_group_threshold_usesubset = train_data_->num_feature_groups() / 4;
-    if (average_bag_rate <= 0.5
-        && (train_data_->num_feature_groups() < group_threshold_usesubset || sparse_group < sparse_group_threshold_usesubset)) {
+    if (tree_learner_->IsHistColWise() && average_bag_rate <= 0.5
+        && (train_data_->num_feature_groups() < group_threshold_usesubset)) {
       if (tmp_subset_ == nullptr || is_change_dataset) {
         tmp_subset_.reset(new Dataset(bag_data_cnt_));
         tmp_subset_->CopyFeatureMapperFrom(train_data_);
diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h
index 5103a93d540e..e891457dfada 100644
--- a/src/boosting/gbdt.h
+++ b/src/boosting/gbdt.h
@@ -457,11 +457,11 @@ class GBDT : public GBDTBase {
   /*! \brief Max feature index of training data*/
   int max_feature_idx_;
   /*! \brief First order derivative of training data */
-  std::vector<score_t> gradients_;
+  std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> gradients_;
   /*! \brief Second order derivative of training data */
-  std::vector<score_t> hessians_;
+  std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> hessians_;
   /*! \brief Store the indices of in-bag data */
-  std::vector<data_size_t> bag_data_indices_;
+  std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> bag_data_indices_;
   /*! \brief Number of in-bag data */
   data_size_t bag_data_cnt_;
   /*! \brief Store the indices of in-bag data */
diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp
index d9aa0fd64a5c..d12cb1f3a09a 100644
--- a/src/boosting/goss.hpp
+++ b/src/boosting/goss.hpp
@@ -22,10 +22,6 @@
 
 namespace LightGBM {
 
-#ifdef TIMETAG
-std::chrono::duration<double, std::milli> subset_time;
-std::chrono::duration<double, std::milli> re_init_tree_time;
-#endif
 
 class GOSS: public GBDT {
  public:
@@ -36,10 +32,7 @@ class GOSS: public GBDT {
   }
 
   ~GOSS() {
-    #ifdef TIMETAG
-    Log::Info("GOSS::subset costs %f", subset_time * 1e-3);
-    Log::Info("GOSS::re_init_tree costs %f", re_init_tree_time * 1e-3);
-    #endif
+
   }
 
   void Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
@@ -143,19 +136,21 @@ class GOSS: public GBDT {
     // not subsample for first iterations
     if (iter < static_cast<int>(1.0f / config_->learning_rate)) { return; }
-    const data_size_t min_inner_size = 100;
-    data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
-    if (inner_size < min_inner_size) { inner_size = min_inner_size; }
+    const data_size_t min_inner_size = 128;
+    const int n_block = std::min(
+        num_threads_, (num_data_ + min_inner_size - 1) / min_inner_size);
+    data_size_t inner_size = SIZE_ALIGNED((num_data_ + n_block - 1) / n_block);
     OMP_INIT_EX();
     #pragma omp parallel for schedule(static, 1)
-    for (int i = 0; i < num_threads_; ++i) {
+    for (int i = 0; i < n_block; ++i) {
       OMP_LOOP_EX_BEGIN();
-      left_cnts_buf_[i] = 0;
-      right_cnts_buf_[i] = 0;
       data_size_t cur_start = i * inner_size;
-      if (cur_start > num_data_) { continue; }
-      data_size_t cur_cnt = inner_size;
-      if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
+      data_size_t cur_cnt = std::min(inner_size, num_data_ - cur_start);
+      if (cur_cnt <= 0) {
+        left_cnts_buf_[i] = 0;
+        right_cnts_buf_[i] = 0;
+        continue;
+      }
       Random cur_rand(config_->bagging_seed + iter * num_threads_ + i);
       data_size_t cur_left_count = BaggingHelper(&cur_rand, cur_start, cur_cnt,
                                                  tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start);
@@ -168,14 +163,14 @@ class GOSS: public GBDT {
     data_size_t left_cnt = 0;
     left_write_pos_buf_[0] = 0;
     right_write_pos_buf_[0] = 0;
-    for (int i = 1; i < num_threads_; ++i) {
+    for (int i = 1; i < n_block; ++i) {
       left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
       right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
     }
-    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
+    left_cnt = left_write_pos_buf_[n_block - 1] + left_cnts_buf_[n_block - 1];
 
     #pragma omp parallel for schedule(static, 1)
-    for (int i = 0; i < num_threads_; ++i) {
+    for (int i = 0; i < n_block; ++i) {
       OMP_LOOP_EX_BEGIN();
       if (left_cnts_buf_[i] > 0) {
         std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
@@ -193,22 +188,10 @@ class GOSS: public GBDT {
     if (!is_use_subset_) {
       tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
     } else {
-      // get subset
-      #ifdef TIMETAG
-      auto start_time = std::chrono::steady_clock::now();
-      #endif
       tmp_subset_->ReSize(bag_data_cnt_);
       tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
-      #ifdef TIMETAG
-      subset_time += std::chrono::steady_clock::now() - start_time;
-      #endif
-      #ifdef TIMETAG
-      start_time = std::chrono::steady_clock::now();
-      #endif
       tree_learner_->ResetTrainingData(tmp_subset_.get());
-      #ifdef TIMETAG
-      re_init_tree_time += std::chrono::steady_clock::now() - start_time;
-      #endif
+
     }
   }
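A small illustration of the warm-up guard at the top of `GOSS::Bagging` above: no subsampling happens until `iter` reaches `1 / learning_rate`. With an assumed `learning_rate` of 0.1:

```cpp
#include <cstdio>

int main() {
  const double learning_rate = 0.1;  // illustrative value
  for (int iter = 0; iter < 15; ++iter) {
    // same guard as in GOSS::Bagging: skip subsampling for early iterations
    bool subsample = !(iter < static_cast<int>(1.0f / learning_rate));
    std::printf("iter %2d: %s\n", iter, subsample ? "subsample" : "use all data");
  }
  return 0;  // iterations 0-9 use all data, 10 and later subsample
}
```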
diff --git a/src/boosting/score_updater.hpp b/src/boosting/score_updater.hpp
index 0b2e208f3afe..d3f67a64819f 100644
--- a/src/boosting/score_updater.hpp
+++ b/src/boosting/score_updater.hpp
@@ -55,6 +55,7 @@ class ScoreUpdater {
   inline bool has_init_score() const { return has_init_score_; }
 
   inline void AddScore(double val, int cur_tree_id) {
+    Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
     const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
     #pragma omp parallel for schedule(static)
     for (int i = 0; i < num_data_; ++i) {
@@ -76,6 +77,7 @@ class ScoreUpdater {
    * \param cur_tree_id Current tree for multiclass training
    */
   inline void AddScore(const Tree* tree, int cur_tree_id) {
+    Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
     const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
     tree->AddPredictionToScore(data_, num_data_, score_.data() + offset);
   }
@@ -87,6 +89,7 @@ class ScoreUpdater {
    * \param cur_tree_id Current tree for multiclass training
    */
   inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) {
+    Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
     const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
     tree_learner->AddPredictionToScore(tree, score_.data() + offset);
   }
@@ -100,6 +103,7 @@ class ScoreUpdater {
    */
   inline void AddScore(const Tree* tree, const data_size_t* data_indices,
                        data_size_t data_cnt, int cur_tree_id) {
+    Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
     const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
     tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + offset);
   }
@@ -119,7 +123,7 @@ class ScoreUpdater {
   /*! \brief Pointer of data set */
   const Dataset* data_;
   /*! \brief Scores for data set */
-  std::vector<double> score_;
+  std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>> score_;
   bool has_init_score_;
 };
diff --git a/src/c_api.cpp b/src/c_api.cpp
index a06faa2ded68..1e060037de76 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -1065,7 +1065,7 @@ int LGBM_DatasetAddFeaturesFrom(DatasetHandle target,
   API_BEGIN();
   auto target_d = reinterpret_cast<Dataset*>(target);
   auto source_d = reinterpret_cast<Dataset*>(source);
-  target_d->addFeaturesFrom(source_d);
+  target_d->AddFeaturesFrom(source_d);
   API_END();
 }
 
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index db3b6e453954..f9ade8a91226 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -15,7 +15,8 @@
 
 #include "dense_bin.hpp"
 #include "dense_nbits_bin.hpp"
-#include "ordered_sparse_bin.hpp"
+#include "multi_val_dense_bin.hpp"
+#include "multi_val_sparse_bin.hpp"
 #include "sparse_bin.hpp"
 
 namespace LightGBM {
@@ -636,21 +637,10 @@ namespace LightGBM {
   template class SparseBin<uint16_t>;
   template class SparseBin<uint32_t>;
 
-  template class OrderedSparseBin<uint8_t>;
-  template class OrderedSparseBin<uint16_t>;
-  template class OrderedSparseBin<uint32_t>;
+  template class MultiValDenseBin<uint8_t>;
+  template class MultiValDenseBin<uint16_t>;
+  template class MultiValDenseBin<uint32_t>;
 
-  Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
-                      bool is_enable_sparse, double sparse_threshold, bool* is_sparse) {
-    // sparse threshold
-    if (sparse_rate >= sparse_threshold && is_enable_sparse) {
-      *is_sparse = true;
-      return CreateSparseBin(num_data, num_bin);
-    } else {
-      *is_sparse = false;
-      return CreateDenseBin(num_data, num_bin);
-    }
-  }
 
   Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
     if (num_bin <= 16) {
@@ -674,4 +664,25 @@ namespace LightGBM {
     }
   }
 
+  MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate) {
+    const double multi_val_bin_sparse_threshold = 0.25f;
+    if (sparse_rate >= multi_val_bin_sparse_threshold) {
+      if (num_bin <= 256) {
+        return new MultiValSparseBin<uint8_t>(num_data, num_bin);
+      } else if (num_bin <= 65536) {
+        return new MultiValSparseBin<uint16_t>(num_data, num_bin);
+      } else {
+        return new MultiValSparseBin<uint32_t>(num_data, num_bin);
+      }
+    } else {
+      if (num_bin <= 256) {
+        return new MultiValDenseBin<uint8_t>(num_data, num_bin, num_feature);
+      } else if (num_bin <= 65536) {
+        return new MultiValDenseBin<uint16_t>(num_data, num_bin, num_feature);
+      } else {
+        return new MultiValDenseBin<uint32_t>(num_data, num_bin, num_feature);
+      }
+    }
+  }
+
 }  // namespace LightGBM
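The factory picks a sparse or dense multi-value representation by `sparse_rate` and sizes the per-entry storage by the total bin count. A hypothetical call (the argument values are made up, not from the diff):

```cpp
#include <LightGBM/bin.h>

using namespace LightGBM;

void Example(data_size_t num_data) {
  // sparse_rate 0.5 >= 0.25 and 300 bins <= 65536,
  // so this returns a MultiValSparseBin<uint16_t>
  MultiValBin* sparse_bin = MultiValBin::CreateMultiValBin(num_data, 300, 10, 0.5);

  // below the sparsity threshold a dense layout is chosen;
  // 200 bins <= 256 fits one byte per value -> MultiValDenseBin<uint8_t>
  MultiValBin* dense_bin = MultiValBin::CreateMultiValBin(num_data, 200, 10, 0.1);

  delete sparse_bin;
  delete dense_bin;
}
```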
"]\n"; str_buf << "[two_round: " << two_round << "]\n"; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 6dde7e23b211..8f5912016789 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -36,6 +36,7 @@ Dataset::Dataset(data_size_t num_data) { } Dataset::~Dataset() { + } std::vector> NoGroup( @@ -48,19 +49,20 @@ std::vector> NoGroup( return features_in_group; } -int GetConfilctCount(const std::vector& mark, const int* indices, int num_indices, int max_cnt) { +int GetConfilctCount(const std::vector& mark, const int* indices, int num_indices, data_size_t max_cnt) { int ret = 0; for (int i = 0; i < num_indices; ++i) { if (mark[indices[i]]) { ++ret; - if (ret > max_cnt) { - return -1; - } + } + if (ret > max_cnt) { + return -1; } } return ret; } -void MarkUsed(std::vector* mark, const int* indices, int num_indices) { + +void MarkUsed(std::vector* mark, const int* indices, data_size_t num_indices) { auto& ref_mark = *mark; for (int i = 0; i < num_indices; ++i) { ref_mark[indices[i]] = true; @@ -93,29 +95,31 @@ std::vector> FindGroups(const std::vector* multi_val_group) { const int max_search_group = 100; - const int gpu_max_bin_per_group = 256; + const int max_bin_per_group = 256; + const data_size_t single_val_max_conflict_cnt = static_cast(total_sample_cnt / 10000); + multi_val_group->clear(); + Random rand(num_data); std::vector> features_in_group; std::vector> conflict_marks; - std::vector group_conflict_cnt; - std::vector group_non_zero_cnt; + std::vector group_used_row_cnt; + std::vector group_total_data_cnt; std::vector group_num_bin; + // first round: fill the single val group for (auto fidx : find_order) { bool is_filtered_feature = fidx >= num_sample_col; - const size_t cur_non_zero_cnt = is_filtered_feature ? 0: num_per_col[fidx]; - bool need_new_group = true; + const data_size_t cur_non_zero_cnt = is_filtered_feature ? 0 : num_per_col[fidx]; std::vector available_groups; for (int gid = 0; gid < static_cast(features_in_group.size()); ++gid) { - if (group_non_zero_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + max_error_cnt) { - if (!is_use_gpu || group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0) - <= gpu_max_bin_per_group) { + auto cur_num_bin = group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0); + if (group_total_data_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + single_val_max_conflict_cnt) { + if (!is_use_gpu || cur_num_bin <= max_bin_per_group) { available_groups.push_back(gid); } } @@ -124,44 +128,82 @@ std::vector> FindGroups(const std::vector(available_groups.size()) - 1; auto indices = rand.Sample(last, std::min(last, max_search_group - 1)); + // always push the last group search_groups.push_back(available_groups.back()); for (auto idx : indices) { search_groups.push_back(available_groups[idx]); } } + int best_gid = -1; + int best_conflict_cnt = -1; for (auto gid : search_groups) { - const int rest_max_cnt = max_error_cnt - group_conflict_cnt[gid]; - const int cnt = is_filtered_feature ? 
0 : GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt); - if (cnt >= 0 && cnt <= rest_max_cnt) { - data_size_t rest_non_zero_data = static_cast( - static_cast(cur_non_zero_cnt - cnt) * num_data / total_sample_cnt); - if (rest_non_zero_data < filter_cnt) { continue; } - need_new_group = false; - features_in_group[gid].push_back(fidx); - group_conflict_cnt[gid] += cnt; - group_non_zero_cnt[gid] += cur_non_zero_cnt - cnt; - if (!is_filtered_feature) { - MarkUsed(&conflict_marks[gid], sample_indices[fidx], num_per_col[fidx]); - } - if (is_use_gpu) { - group_num_bin[gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0); - } + const data_size_t rest_max_cnt = single_val_max_conflict_cnt - group_total_data_cnt[gid] + group_used_row_cnt[gid]; + const data_size_t cnt = is_filtered_feature ? 0 : GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt); + if (cnt >= 0 && cnt <= rest_max_cnt && cnt <= cur_non_zero_cnt / 2) { + best_gid = gid; + best_conflict_cnt = cnt; break; } } - if (need_new_group) { + if (best_gid >= 0) { + features_in_group[best_gid].push_back(fidx); + group_total_data_cnt[best_gid] += cur_non_zero_cnt; + group_used_row_cnt[best_gid] += cur_non_zero_cnt - best_conflict_cnt; + if (!is_filtered_feature) { + MarkUsed(&conflict_marks[best_gid], sample_indices[fidx], num_per_col[fidx]); + } + group_num_bin[best_gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0); + } else { features_in_group.emplace_back(); features_in_group.back().push_back(fidx); - group_conflict_cnt.push_back(0); conflict_marks.emplace_back(total_sample_cnt, false); if (!is_filtered_feature) { MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]); } - group_non_zero_cnt.emplace_back(cur_non_zero_cnt); - if (is_use_gpu) { - group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0)); + group_total_data_cnt.emplace_back(cur_non_zero_cnt); + group_used_row_cnt.emplace_back(cur_non_zero_cnt); + group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0)); + } + } + std::vector second_round_features; + std::vector> features_in_group2; + std::vector> conflict_marks2; + + const double dense_threshold = 0.4; + for (int gid = 0; gid < static_cast(features_in_group.size()); ++gid) { + const double dense_rate = static_cast(group_used_row_cnt[gid]) / total_sample_cnt; + if (dense_rate >= dense_threshold) { + features_in_group2.push_back(std::move(features_in_group[gid])); + conflict_marks2.push_back(std::move(conflict_marks[gid])); + } else { + for (auto fidx : features_in_group[gid]) { + second_round_features.push_back(fidx); + } + } + } + + features_in_group = features_in_group2; + conflict_marks = conflict_marks2; + multi_val_group->resize(features_in_group.size(), false); + if (!second_round_features.empty()) { + features_in_group.emplace_back(); + conflict_marks.emplace_back(total_sample_cnt, false); + bool is_multi_val = is_use_gpu ? 
true : false; + int conflict_cnt = 0; + for (auto fidx : second_round_features) { + features_in_group.back().push_back(fidx); + if (!is_multi_val) { + const int rest_max_cnt = single_val_max_conflict_cnt - conflict_cnt; + const auto cnt = GetConfilctCount(conflict_marks.back(), sample_indices[fidx], num_per_col[fidx], rest_max_cnt); + conflict_cnt += cnt; + if (cnt < 0 || conflict_cnt > single_val_max_conflict_cnt) { + is_multi_val = true; + continue; + } + MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]); } } + multi_val_group->push_back(is_multi_val); } return features_in_group; } @@ -171,17 +213,12 @@ std::vector> FastFeatureBundling(const std::vector& used_features, - double max_conflict_rate, data_size_t num_data, - data_size_t min_data, - double sparse_threshold, - bool is_enable_sparse, - bool is_use_gpu) { - // filter is based on sampling data, so decrease its range - const data_size_t filter_cnt = static_cast(static_cast(0.95 * min_data) / num_data * total_sample_cnt); - const data_size_t max_error_cnt = static_cast(total_sample_cnt * max_conflict_rate); + bool is_use_gpu, + std::vector* multi_val_group) { + Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer); std::vector feature_non_zero_cnt; feature_non_zero_cnt.reserve(used_features.size()); // put dense feature first @@ -209,6 +246,7 @@ std::vector> FastFeatureBundling(const std::vector> tmp_indices; std::vector tmp_num_per_col(num_sample_col, 0); for (auto fidx : used_features) { @@ -224,42 +262,25 @@ std::vector> FastFeatureBundling(const std::vector group_is_multi_val, group_is_multi_val2; + auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val); + auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val2); + if (features_in_group.size() > group2.size()) { features_in_group = group2; - } - std::vector> ret; - for (size_t i = 0; i < features_in_group.size(); ++i) { - if (features_in_group[i].size() <= 1 || features_in_group[i].size() >= 5) { - ret.push_back(features_in_group[i]); - } else { - int cnt_non_zero = 0; - for (size_t j = 0; j < features_in_group[i].size(); ++j) { - const int fidx = features_in_group[i][j]; - cnt_non_zero += static_cast(num_data * (1.0f - bin_mappers[fidx]->sparse_rate())); - } - double sparse_rate = 1.0f - static_cast(cnt_non_zero) / (num_data); - // take apart small sparse group, due it will not gain on speed - if (sparse_rate >= sparse_threshold && is_enable_sparse) { - for (size_t j = 0; j < features_in_group[i].size(); ++j) { - const int fidx = features_in_group[i][j]; - ret.emplace_back(); - ret.back().push_back(fidx); - } - } else { - ret.push_back(features_in_group[i]); - } - } + group_is_multi_val = group_is_multi_val2; } // shuffle groups - int num_group = static_cast(ret.size()); - Random tmp_rand(12); + int num_group = static_cast(features_in_group.size()); + Random tmp_rand(num_data); for (int i = 0; i < num_group - 1; ++i) { int j = tmp_rand.NextShort(i + 1, num_group); - std::swap(ret[i], ret[j]); + std::swap(features_in_group[i], features_in_group[j]); + // Note: using std::swap on std::vector<bool> elements would give the wrong result.
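
The note above concerns std::vector<bool>: that specialization packs bits, and its operator[] returns a proxy object rather than a bool&, so swapping two elements through the generic std::swap either fails to compile (libstdc++) or relies on implementation-specific overloads. Keeping the flag vector on a plain byte-sized element type sidesteps this; a minimal standalone sketch of the pitfall (the int8_t element type is an assumption, not taken from the patch):

    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
      // std::vector<bool> b = {true, false};
      // std::swap(b[0], b[1]);  // b[i] is a proxy prvalue; the generic
      //                         // std::swap(T&, T&) cannot bind it
      std::vector<std::int8_t> flags = {1, 0};  // byte flags swap normally
      std::swap(flags[0], flags[1]);
      std::cout << int(flags[0]) << ' ' << int(flags[1]) << '\n';  // prints "0 1"
      return 0;
    }
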
+ std::swap(group_is_multi_val[i], group_is_multi_val[j]); } - return ret; + *multi_val_group = group_is_multi_val; + return features_in_group; } void Dataset::Construct( @@ -274,7 +295,6 @@ void Dataset::Construct( const Config& io_config) { num_total_features_ = num_total_features; CHECK(num_total_features_ == static_cast(bin_mappers->size())); - sparse_threshold_ = io_config.sparse_threshold; // get num_features std::vector used_features; auto& ref_bin_mappers = *bin_mappers; @@ -287,13 +307,11 @@ void Dataset::Construct( Log::Warning("There are no meaningful features, as all feature values are constant."); } auto features_in_group = NoGroup(used_features); - + std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { features_in_group = FastFeatureBundling(*bin_mappers, - sample_non_zero_indices, sample_values, num_per_col, num_sample_col, total_sample_cnt, - used_features, io_config.max_conflict_rate, - num_data_, io_config.min_data_in_leaf, - sparse_threshold_, io_config.is_enable_sparse, io_config.device_type == std::string("gpu")); + sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast(total_sample_cnt), + used_features, num_data_, io_config.device_type == std::string("gpu"), &group_is_multi_val); } num_features_ = 0; @@ -306,10 +324,14 @@ void Dataset::Construct( real_feature_idx_.resize(num_features_); feature2group_.resize(num_features_); feature2subfeature_.resize(num_features_); + int num_multi_val_group = 0; feature_need_push_zeros_.clear(); for (int i = 0; i < num_groups_; ++i) { auto cur_features = features_in_group[i]; int cur_cnt_features = static_cast(cur_features.size()); + if (group_is_multi_val[i]) { + ++num_multi_val_group; + } // get bin_mappers std::vector> cur_bin_mappers; for (int j = 0; j < cur_cnt_features; ++j) { @@ -325,8 +347,7 @@ void Dataset::Construct( ++cur_fidx; } feature_groups_.emplace_back(std::unique_ptr( - new FeatureGroup(cur_cnt_features, &cur_bin_mappers, num_data_, sparse_threshold_, - io_config.is_enable_sparse))); + new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_))); } feature_groups_.shrink_to_fit(); group_bin_boundaries_.clear(); @@ -414,9 +435,6 @@ void Dataset::ResetConfig(const char* parameters) { if (param.count("zero_as_missing") && io_config.zero_as_missing != zero_as_missing_) { Log::Warning("Cannot change zero_as_missing after constructed Dataset handle."); } - if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) { - Log::Warning("Cannot change sparse_threshold after constructed Dataset handle."); - } if (param.count("forcedbins_filename")) { Log::Warning("Cannot change forced bins after constructed Dataset handle."); } @@ -452,23 +470,229 @@ void Dataset::ResetConfig(const char* parameters) { void Dataset::FinishLoad() { if (is_finish_load_) { return; } if (num_groups_ > 0) { - OMP_INIT_EX(); -#pragma omp parallel for schedule(guided) for (int i = 0; i < num_groups_; ++i) { - OMP_LOOP_EX_BEGIN(); - feature_groups_[i]->bin_data_->FinishLoad(); - OMP_LOOP_EX_END(); + feature_groups_[i]->FinishLoad(); } - OMP_THROW_EX(); } is_finish_load_ = true; } +void PushDataToMultiValBin(int num_threads, data_size_t num_data, const std::vector most_freq_bins, + const std::vector offsets, std::vector>>& iters, MultiValBin* ret) { + Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin", global_timer); + const data_size_t min_block_size = 4096; + const int n_block = 
std::min(num_threads, (num_data + min_block_size - 1) / min_block_size); + const data_size_t block_size = (num_data + n_block - 1) / n_block; + if (ret->IsSparse()) { + #pragma omp parallel for schedule(static) + for (int tid = 0; tid < n_block; ++tid) { + std::vector cur_data; + data_size_t start = tid * block_size; + data_size_t end = std::min(num_data, start + block_size); + for (size_t j = 0; j < most_freq_bins.size(); ++j) { + iters[tid][j]->Reset(start); + } + for (data_size_t i = start; i < end; ++i) { + cur_data.clear(); + for (size_t j = 0; j < most_freq_bins.size(); ++j) { + auto cur_bin = iters[tid][j]->Get(i); + if (cur_bin == most_freq_bins[j]) { + continue; + } + cur_bin += offsets[j]; + if (most_freq_bins[j] == 0) { + cur_bin -= 1; + } + cur_data.push_back(cur_bin); + } + ret->PushOneRow(tid, i, cur_data); + } + } + } else { + #pragma omp parallel for schedule(static) + for (int tid = 0; tid < n_block; ++tid) { + std::vector cur_data; + data_size_t start = tid * block_size; + data_size_t end = std::min(num_data, start + block_size); + for (size_t j = 0; j < most_freq_bins.size(); ++j) { + iters[tid][j]->Reset(start); + } + for (data_size_t i = start; i < end; ++i) { + cur_data.clear(); + for (size_t j = 0; j < most_freq_bins.size(); ++j) { + auto cur_bin = iters[tid][j]->Get(i); + if (cur_bin == most_freq_bins[j]) { + cur_bin = 0; + } else { + cur_bin += offsets[j]; + if (most_freq_bins[j] == 0) { + cur_bin -= 1; + } + } + cur_data.push_back(cur_bin); + } + ret->PushOneRow(tid, i, cur_data); + } + } + } +} + +MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const { + Common::FunctionTimer fun_time("Dataset::GetMultiBinFromSparseFeatures", global_timer); + int multi_group_id = -1; + for (int i = 0; i < num_groups_; ++i) { + if (feature_groups_[i]->is_multi_val_) { + if (multi_group_id < 0) { + multi_group_id = i; + } else { + Log::Fatal("Bug. 
There should be only one multi-val group."); + } + } + } + if (multi_group_id < 0) { + return nullptr; + } + const auto& offsets = feature_groups_[multi_group_id]->bin_offsets_; + const int num_feature = feature_groups_[multi_group_id]->num_feature_; + int num_threads = 1; + #pragma omp parallel + #pragma omp master + { + num_threads = omp_get_num_threads(); + } + + std::vector>> iters(num_threads); + std::vector most_freq_bins; + double sum_sparse_rate = 0; + for (int i = 0; i < num_feature; ++i) { + for (int tid = 0; tid < num_threads; ++tid) { + iters[tid].emplace_back(feature_groups_[multi_group_id]->SubFeatureIterator(i)); + } + most_freq_bins.push_back(feature_groups_[multi_group_id]->bin_mappers_[i]->GetMostFreqBin()); + sum_sparse_rate += feature_groups_[multi_group_id]->bin_mappers_[i]->sparse_rate(); + } + sum_sparse_rate /= num_feature; + Log::Debug("GetMultiBinFromSparseFeatures:: sparse rate %f", sum_sparse_rate); + std::unique_ptr ret; + ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back(), num_feature, sum_sparse_rate)); + PushDataToMultiValBin(num_threads, num_data_, most_freq_bins, offsets, iters, ret.get()); + ret->FinishLoad(); + return ret.release(); +} + +MultiValBin* Dataset::GetMultiBinFromAllFeatures() const { + Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures", global_timer); + int num_threads = 1; + #pragma omp parallel + #pragma omp master + { + num_threads = omp_get_num_threads(); + } + double sum_dense_ratio = 0; + + std::unique_ptr ret; + std::vector>> iters(num_threads); + std::vector most_freq_bins; + std::vector offsets; + int num_total_bin = 1; + offsets.push_back(num_total_bin); + for (int gid = 0; gid < num_groups_; ++gid) { + if (feature_groups_[gid]->is_multi_val_) { + for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) { + const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid]; + sum_dense_ratio += 1.0f - bin_mapper->sparse_rate(); + most_freq_bins.push_back(bin_mapper->GetMostFreqBin()); + num_total_bin += bin_mapper->num_bin(); + if (most_freq_bins.back() == 0) { + num_total_bin -= 1; + } + offsets.push_back(num_total_bin); + for (int tid = 0; tid < num_threads; ++tid) { + iters[tid].emplace_back(feature_groups_[gid]->SubFeatureIterator(fid)); + } + } + } else { + most_freq_bins.push_back(0); + num_total_bin += feature_groups_[gid]->bin_offsets_.back() - 1; + for (int tid = 0; tid < num_threads; ++tid) { + iters[tid].emplace_back(feature_groups_[gid]->FeatureGroupIterator()); + } + offsets.push_back(num_total_bin); + for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) { + const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid]; + sum_dense_ratio += 1.0f - bin_mapper->sparse_rate(); + } + } + } + sum_dense_ratio /= static_cast(most_freq_bins.size()); + Log::Debug("GetMultiBinFromAllFeatures:: sparse rate %f", 1.0 - sum_dense_ratio); + ret.reset(MultiValBin::CreateMultiValBin(num_data_, num_total_bin, static_cast(most_freq_bins.size()), 1.0 - sum_dense_ratio)); + PushDataToMultiValBin(num_threads, num_data_, most_freq_bins, offsets, iters, ret.get()); + ret->FinishLoad(); + return ret.release(); +} + +MultiValBin* Dataset::TestMultiThreadingMethod(score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, + bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const { + int num_threads = 1; +#pragma omp parallel +#pragma omp master + { num_threads = omp_get_num_threads(); } + Common::FunctionTimer 
fun_timer("Dataset::TestMultiThreadingMethod", global_timer); + if (force_colwise && force_rowwise) { + Log::Fatal("cannot set both `force_col_wise` and `force_row_wise` to `true`."); + } + if (num_groups_ <= 0) { + return nullptr; + } + if (force_colwise) { + *is_hist_col_wise = true; + return GetMultiBinFromSparseFeatures(); + } else if (force_rowwise) { + *is_hist_col_wise = false; + auto ret = GetMultiBinFromAllFeatures(); + const int num_bin_aligned = + (ret->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize; + hist_buf_.resize(static_cast(num_bin_aligned) * 2 * num_threads); + return ret; + } else { + std::unique_ptr sparse_bin; + std::unique_ptr all_bin; + sparse_bin.reset(GetMultiBinFromSparseFeatures()); + all_bin.reset(GetMultiBinFromAllFeatures()); + std::vector> hist_data(NumTotalBin() * 2); + const int num_bin_aligned = + (all_bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize; + hist_buf_.resize(static_cast(num_bin_aligned) * 2 * num_threads); + std::chrono::duration col_wise_time, row_wise_time; + auto start_time = std::chrono::steady_clock::now(); + ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, hessians, gradients, hessians, is_constant_hessian, sparse_bin.get(), true, hist_data.data()); + col_wise_time = std::chrono::steady_clock::now() - start_time; + start_time = std::chrono::steady_clock::now(); + ConstructHistogramsMultiVal(all_bin.get(), nullptr, num_data_, gradients, hessians, is_constant_hessian, hist_data.data()); + row_wise_time = std::chrono::steady_clock::now() - start_time; + Log::Debug("colwise cost %f seconds, rowwise cost %f seconds", col_wise_time * 1e-3, row_wise_time * 1e-3); + if (col_wise_time < row_wise_time) { + *is_hist_col_wise = true; + hist_buf_.clear(); + return sparse_bin.release(); + } else { + *is_hist_col_wise = false; + Log::Info("Use row-wise multi-threading, may increase memory usage. 
If memory is not enough, you can set `force_col_wise=true`."); + if (all_bin->IsSparse()) { + Log::Debug("Use Sparse Multi-Val Bin"); + } else { + Log::Debug("Use Dense Multi-Val Bin"); + } + return all_bin.release(); + } + } +} + void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { feature_groups_.clear(); num_features_ = dataset->num_features_; num_groups_ = dataset->num_groups_; - sparse_threshold_ = dataset->sparse_threshold_; // copy feature bin mapper data for (int i = 0; i < num_groups_; ++i) { std::vector> bin_mappers; @@ -477,9 +701,9 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { } feature_groups_.emplace_back(new FeatureGroup( dataset->feature_groups_[i]->num_feature_, + dataset->feature_groups_[i]->is_multi_val_, &bin_mappers, - num_data_, - dataset->feature_groups_[i]->is_sparse_)); + num_data_)); } feature_groups_.shrink_to_fit(); used_feature_map_ = dataset->used_feature_map_; @@ -502,8 +726,6 @@ void Dataset::CreateValid(const Dataset* dataset) { feature_groups_.clear(); num_features_ = dataset->num_features_; num_groups_ = num_features_; - sparse_threshold_ = dataset->sparse_threshold_; - bool is_enable_sparse = true; feature2group_.clear(); feature2subfeature_.clear(); // copy feature bin mapper data @@ -514,12 +736,8 @@ void Dataset::CreateValid(const Dataset* dataset) { if (bin_mappers.back()->GetDefaultBin() != bin_mappers.back()->GetMostFreqBin()) { feature_need_push_zeros_.push_back(i); } - feature_groups_.emplace_back(new FeatureGroup( - 1, - &bin_mappers, - num_data_, - sparse_threshold_, - is_enable_sparse)); + feature_groups_.emplace_back(new FeatureGroup(&bin_mappers, + num_data_)); feature2group_.push_back(i); feature2subfeature_.push_back(0); } @@ -721,7 +939,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { writer->Write(binary_file_token, size_of_token); // get size of header size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_) - + sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_) + sizeof(sparse_threshold_) + + sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_) + 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ + sizeof(int8_t) * num_features_ + sizeof(double) * num_features_ + sizeof(int32_t) * num_total_features_ + sizeof(int) * 3 + sizeof(bool) * 2; // size of feature names @@ -743,7 +961,6 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { writer->Write(&min_data_in_bin_, sizeof(min_data_in_bin_)); writer->Write(&use_missing_, sizeof(use_missing_)); writer->Write(&zero_as_missing_, sizeof(zero_as_missing_)); - writer->Write(&sparse_threshold_, sizeof(sparse_threshold_)); writer->Write(used_feature_map_.data(), sizeof(int) * num_total_features_); writer->Write(&num_groups_, sizeof(num_groups_)); writer->Write(real_feature_idx_.data(), sizeof(int) * num_features_); @@ -866,20 +1083,110 @@ void Dataset::DumpTextFile(const char* text_filename) { fclose(file); } +void Dataset::ConstructHistogramsMultiVal(const MultiValBin* multi_val_bin, const data_size_t* data_indices, data_size_t num_data, + const score_t* gradients, const score_t* hessians, + bool is_constant_hessian, + hist_t* hist_data) const { + Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal", global_timer); + if (multi_val_bin == nullptr) { return; } + int num_threads = 1; + #pragma omp parallel + #pragma omp master + { + num_threads = omp_get_num_threads(); + } + + 
global_timer.Start("Dataset::sparse_bin_histogram"); + const int num_bin = multi_val_bin->num_bin(); + const int num_bin_aligned = (num_bin + kAlignedSize - 1) / kAlignedSize * kAlignedSize; + const int min_data_block_size = 1024; + const int n_data_block = std::min(num_threads, (num_data + min_data_block_size - 1) / min_data_block_size); + const int data_block_size = (num_data + n_data_block - 1) / n_data_block; + + const size_t buf_size = static_cast(n_data_block - 1)* num_bin_aligned * 2; + if (hist_buf_.size() < buf_size) { + hist_buf_.resize(buf_size); + } + + #pragma omp parallel for schedule(static) + for (int tid = 0; tid < n_data_block; ++tid) { + data_size_t start = tid * data_block_size; + data_size_t end = std::min(start + data_block_size, num_data); + auto data_ptr = hist_data; + if (tid > 0) { + data_ptr = hist_buf_.data() + static_cast(num_bin_aligned) * 2 * (tid - 1); + } + std::memset(reinterpret_cast(data_ptr), 0, num_bin* KHistEntrySize); + if (data_indices != nullptr && num_data < num_data_) { + if (!is_constant_hessian) { + multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, hessians, data_ptr); + } else { + multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, data_ptr); + } + } else { + if (!is_constant_hessian) { + multi_val_bin->ConstructHistogram(start, end, gradients, hessians, data_ptr); + } else { + multi_val_bin->ConstructHistogram(start, end, gradients, data_ptr); + } + } + } + global_timer.Stop("Dataset::sparse_bin_histogram"); + + global_timer.Start("Dataset::sparse_bin_histogram_merge"); + + const int min_bin_block_size = 512; + const int n_bin_block = std::min(num_threads, (num_bin + min_bin_block_size - 1) / min_bin_block_size); + const int bin_block_size = (num_bin + n_bin_block - 1) / n_bin_block; + if (!is_constant_hessian) { + #pragma omp parallel for schedule(static) + for (int t = 0; t < n_bin_block; ++t) { + const int start = t * bin_block_size; + const int end = std::min(start + bin_block_size, num_bin); + for (int tid = 1; tid < n_data_block; ++tid) { + auto src_ptr = hist_buf_.data() + static_cast(num_bin_aligned) * 2 * (tid - 1); + for (int i = start * 2; i < end * 2; ++i) { + hist_data[i] += src_ptr[i]; + } + } + } + } else { + #pragma omp parallel for schedule(static) + for (int t = 0; t < n_bin_block; ++t) { + const int start = t * bin_block_size; + const int end = std::min(start + bin_block_size, num_bin); + for (int tid = 1; tid < n_data_block; ++tid) { + auto src_ptr = hist_buf_.data() + static_cast(num_bin_aligned) * 2 * (tid - 1); + for (int i = start * 2; i < end * 2; ++i) { + hist_data[i] += src_ptr[i]; + } + } + for (int i = start; i < end; i++) { + GET_HESS(hist_data, i) = GET_HESS(hist_data, i) * hessians[0]; + } + } + } + global_timer.Stop("Dataset::sparse_bin_histogram_merge"); +} + void Dataset::ConstructHistograms(const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data, - int leaf_idx, - std::vector>* ordered_bins, const score_t* gradients, const score_t* hessians, score_t* ordered_gradients, score_t* ordered_hessians, bool is_constant_hessian, - HistogramBinEntry* hist_data) const { - if (leaf_idx < 0 || num_data < 0 || hist_data == nullptr) { + const MultiValBin* multi_val_bin, bool is_colwise, + hist_t* hist_data) const { + Common::FunctionTimer fun_timer("Dataset::ConstructHistograms", global_timer); + if (num_data < 0 || hist_data == nullptr) { return; } - - std::vector used_group; - used_group.reserve(num_groups_); + if (!is_colwise) { + return 
ConstructHistogramsMultiVal(multi_val_bin, data_indices, num_data, gradients, hessians, is_constant_hessian, hist_data); + } + global_timer.Start("Dataset::Get used group"); + std::vector used_dense_group; + int multi_val_groud_id = -1; + used_dense_group.reserve(num_groups_); for (int group = 0; group < num_groups_; ++group) { const int f_cnt = group_feature_cnt_[group]; bool is_group_used = false; @@ -891,172 +1198,137 @@ void Dataset::ConstructHistograms(const std::vector& is_feature_used, } } if (is_group_used) { - used_group.push_back(group); - } - } - int num_used_group = static_cast(used_group.size()); - auto ptr_ordered_grad = gradients; - auto ptr_ordered_hess = hessians; - auto& ref_ordered_bins = *ordered_bins; - if (data_indices != nullptr && num_data < num_data_) { - if (!is_constant_hessian) { - #pragma omp parallel for schedule(static) - for (data_size_t i = 0; i < num_data; ++i) { - ordered_gradients[i] = gradients[data_indices[i]]; - ordered_hessians[i] = hessians[data_indices[i]]; - } - } else { - #pragma omp parallel for schedule(static) - for (data_size_t i = 0; i < num_data; ++i) { - ordered_gradients[i] = gradients[data_indices[i]]; + if (feature_groups_[group]->is_multi_val_) { + multi_val_groud_id = group; + } else { + used_dense_group.push_back(group); } } - ptr_ordered_grad = ordered_gradients; - ptr_ordered_hess = ordered_hessians; - if (!is_constant_hessian) { - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (int gi = 0; gi < num_used_group; ++gi) { - OMP_LOOP_EX_BEGIN(); - int group = used_group[gi]; - // feature is not used - auto data_ptr = hist_data + group_bin_boundaries_[group]; - const int num_bin = feature_groups_[group]->num_total_bin_; - std::memset(reinterpret_cast(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry)); - // construct histograms for smaller leaf - if (ref_ordered_bins[group] == nullptr) { - // if not use ordered bin - feature_groups_[group]->bin_data_->ConstructHistogram( - data_indices, - 0, - num_data, - ptr_ordered_grad, - ptr_ordered_hess, - data_ptr); - } else { - // used ordered bin - ref_ordered_bins[group]->ConstructHistogram(leaf_idx, - gradients, - hessians, - data_ptr); + } + int num_used_dense_group = static_cast(used_dense_group.size()); + global_timer.Stop("Dataset::Get used group"); + global_timer.Start("Dataset::dense_bin_histogram"); + if (num_used_dense_group > 0) { + auto ptr_ordered_grad = gradients; + auto ptr_ordered_hess = hessians; + if (data_indices != nullptr && num_data < num_data_) { + if (!is_constant_hessian) { +#pragma omp parallel for schedule(static) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; + ordered_hessians[i] = hessians[data_indices[i]]; + } + } else { +#pragma omp parallel for schedule(static) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; } - OMP_LOOP_EX_END(); } - OMP_THROW_EX(); - } else { - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (int gi = 0; gi < num_used_group; ++gi) { - OMP_LOOP_EX_BEGIN(); - int group = used_group[gi]; - // feature is not used - auto data_ptr = hist_data + group_bin_boundaries_[group]; - const int num_bin = feature_groups_[group]->num_total_bin_; - std::memset(reinterpret_cast(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry)); - // construct histograms for smaller leaf - if (ref_ordered_bins[group] == nullptr) { - // if not use ordered bin + ptr_ordered_grad = ordered_gradients; + ptr_ordered_hess = 
ordered_hessians; + if (!is_constant_hessian) { + OMP_INIT_EX(); +#pragma omp parallel for schedule(static) + for (int gi = 0; gi < num_used_dense_group; ++gi) { + OMP_LOOP_EX_BEGIN(); + int group = used_dense_group[gi]; + // feature is not used + auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; + const int num_bin = feature_groups_[group]->num_total_bin_; + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * KHistEntrySize); + // construct histograms for smaller leaf feature_groups_[group]->bin_data_->ConstructHistogram( - data_indices, - 0, - num_data, - ptr_ordered_grad, - data_ptr); - } else { - // used ordered bin - ref_ordered_bins[group]->ConstructHistogram(leaf_idx, - gradients, - data_ptr); - } - // fixed hessian. - for (int i = 0; i < num_bin; ++i) { - data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0]; + data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, + data_ptr); + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - } - } else { - if (!is_constant_hessian) { - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (int gi = 0; gi < num_used_group; ++gi) { - OMP_LOOP_EX_BEGIN(); - int group = used_group[gi]; - // feature is not used - auto data_ptr = hist_data + group_bin_boundaries_[group]; - const int num_bin = feature_groups_[group]->num_total_bin_; - std::memset(reinterpret_cast(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry)); - // construct histograms for smaller leaf - if (ref_ordered_bins[group] == nullptr) { - // if not use ordered bin + OMP_THROW_EX(); + + } else { + OMP_INIT_EX(); +#pragma omp parallel for schedule(static) + for (int gi = 0; gi < num_used_dense_group; ++gi) { + OMP_LOOP_EX_BEGIN(); + int group = used_dense_group[gi]; + // feature is not used + auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; + const int num_bin = feature_groups_[group]->num_total_bin_; + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * KHistEntrySize); + // construct histograms for smaller leaf feature_groups_[group]->bin_data_->ConstructHistogram( - 0, - num_data, - ptr_ordered_grad, - ptr_ordered_hess, - data_ptr); - } else { - // used ordered bin - ref_ordered_bins[group]->ConstructHistogram(leaf_idx, - gradients, - hessians, - data_ptr); + data_indices, 0, num_data, ptr_ordered_grad, data_ptr); + // fixed hessian. 
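
The `* 2` offsets and `num_bin * KHistEntrySize` memsets in this function reflect the new histogram layout: the old HistogramBinEntry struct (sum_gradients, sum_hessians, cnt) is replaced by a flat hist_t buffer with two entries per bin, gradient at 2*i and hessian at 2*i+1, and no stored count, so KHistEntrySize is presumably 2 * sizeof(hist_t). A sketch of the accessors and of the "fixed hessian" pass under that reading (hist_t = double is an assumption):

    #include <vector>

    using hist_t = double;  // assumed histogram entry type
    using score_t = float;

    // assumed equivalents of the GET_GRAD / GET_HESS macros in the patch
    #define GET_GRAD(hist, i) (hist)[(i) << 1]
    #define GET_HESS(hist, i) (hist)[((i) << 1) + 1]

    // With a constant hessian, the gradient-only kernel accumulates 1.0 into
    // each bin's hessian slot (an implicit count); one cheap pass afterwards
    // rescales those counts into true hessian sums.
    void FixConstantHessian(hist_t* hist, int num_bin, score_t const_hessian) {
      for (int i = 0; i < num_bin; ++i) {
        GET_HESS(hist, i) = GET_HESS(hist, i) * const_hessian;
      }
    }

Dropping the per-bin count shrinks every histogram by a third and keeps a bin's gradient and hessian on the same cache line.
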
+ for (int i = 0; i < num_bin; ++i) { + GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0]; + } + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); + OMP_THROW_EX(); } - OMP_THROW_EX(); } else { - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (int gi = 0; gi < num_used_group; ++gi) { - OMP_LOOP_EX_BEGIN(); - int group = used_group[gi]; - // feature is not used - auto data_ptr = hist_data + group_bin_boundaries_[group]; - const int num_bin = feature_groups_[group]->num_total_bin_; - std::memset(reinterpret_cast(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry)); - // construct histograms for smaller leaf - if (ref_ordered_bins[group] == nullptr) { - // if not use ordered bin + if (!is_constant_hessian) { + OMP_INIT_EX(); +#pragma omp parallel for schedule(static) + for (int gi = 0; gi < num_used_dense_group; ++gi) { + OMP_LOOP_EX_BEGIN(); + int group = used_dense_group[gi]; + // feature is not used + auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; + const int num_bin = feature_groups_[group]->num_total_bin_; + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * KHistEntrySize); + // construct histograms for smaller leaf feature_groups_[group]->bin_data_->ConstructHistogram( - 0, - num_data, - ptr_ordered_grad, - data_ptr); - } else { - // used ordered bin - ref_ordered_bins[group]->ConstructHistogram(leaf_idx, - gradients, - data_ptr); + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + OMP_LOOP_EX_END(); } - // fixed hessian. - for (int i = 0; i < num_bin; ++i) { - data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0]; + OMP_THROW_EX(); + } else { + OMP_INIT_EX(); +#pragma omp parallel for schedule(static) + for (int gi = 0; gi < num_used_dense_group; ++gi) { + OMP_LOOP_EX_BEGIN(); + int group = used_dense_group[gi]; + // feature is not used + auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; + const int num_bin = feature_groups_[group]->num_total_bin_; + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * KHistEntrySize); + // construct histograms for smaller leaf + feature_groups_[group]->bin_data_->ConstructHistogram( + 0, num_data, ptr_ordered_grad, data_ptr); + // fixed hessian. 
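
Zooming out from these kernels: when the user forces neither layout, TestMultiThreadingMethod above settles the col-wise vs. row-wise question empirically. It builds one histogram each way, times both with std::chrono, and keeps the faster strategy, discarding the loser (and warning that the row-wise path enlarges hist_buf_). A reduced sketch of that try-both pattern; the callable parameters are illustrative, not the patch's API:

    #include <chrono>
    #include <functional>

    // Time one build per strategy and report whether col-wise won.
    bool ChooseColWise(const std::function<void()>& build_col_wise,
                       const std::function<void()>& build_row_wise) {
      using clock = std::chrono::steady_clock;
      const auto t0 = clock::now();
      build_col_wise();
      const auto t1 = clock::now();
      build_row_wise();
      const auto t2 = clock::now();
      return (t1 - t0) < (t2 - t1);  // keep the faster of the two
    }
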
+ for (int i = 0; i < num_bin; ++i) { + GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0]; + } + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); + OMP_THROW_EX(); } - OMP_THROW_EX(); } } + global_timer.Stop("Dataset::dense_bin_histogram"); + if (multi_val_groud_id >= 0) { + ConstructHistogramsMultiVal(multi_val_bin, data_indices, num_data, gradients, hessians, is_constant_hessian, + hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); + } } -void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data, - HistogramBinEntry* data) const { +void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const { const int group = feature2group_[feature_idx]; const int sub_feature = feature2subfeature_[feature_idx]; const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get(); const int most_freq_bin = bin_mapper->GetMostFreqBin(); if (most_freq_bin > 0) { const int num_bin = bin_mapper->num_bin(); - data[most_freq_bin].sum_gradients = sum_gradient; - data[most_freq_bin].sum_hessians = sum_hessian; - data[most_freq_bin].cnt = num_data; + GET_GRAD(data, most_freq_bin) = sum_gradient; + GET_HESS(data, most_freq_bin) = sum_hessian; for (int i = 0; i < num_bin; ++i) { if (i != most_freq_bin) { - data[most_freq_bin].sum_gradients -= data[i].sum_gradients; - data[most_freq_bin].sum_hessians -= data[i].sum_hessians; - data[most_freq_bin].cnt -= data[i].cnt; + GET_GRAD(data, most_freq_bin) -= GET_GRAD(data, i); + GET_HESS(data, most_freq_bin) -= GET_HESS(data, i); } } } @@ -1094,7 +1366,7 @@ void PushClearIfEmpty(std::vector* dest, const size_t dest_len, const std::ve } } -void Dataset::addFeaturesFrom(Dataset* other) { +void Dataset::AddFeaturesFrom(Dataset* other) { if (other->num_data_ != num_data_) { throw std::runtime_error("Cannot add features from other Dataset with a different number of rows"); } diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 31288aea6675..f1e8b749f799 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -335,8 +335,6 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b mem_ptr += sizeof(dataset->use_missing_); dataset->zero_as_missing_ = *(reinterpret_cast(mem_ptr)); mem_ptr += sizeof(dataset->zero_as_missing_); - dataset->sparse_threshold_ = *(reinterpret_cast(mem_ptr)); - mem_ptr += sizeof(dataset->sparse_threshold_); const int* tmp_feature_map = reinterpret_cast(mem_ptr); dataset->used_feature_map_.clear(); for (int i = 0; i < dataset->num_total_features_; ++i) { diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index fab014c28675..082065ce40bf 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -31,9 +31,9 @@ class DenseBinIterator: public BinIterator { } inline uint32_t RawGet(data_size_t idx) override; inline uint32_t Get(data_size_t idx) override; - inline void Reset(data_size_t) override { } + inline void Reset(data_size_t) override {} - private: +private: const DenseBin* bin_data_; VAL_T min_bin_; VAL_T max_bin_; @@ -46,7 +46,7 @@ class DenseBinIterator: public BinIterator { */ template class DenseBin: public Bin { - public: +public: friend DenseBinIterator; explicit DenseBin(data_size_t num_data) : num_data_(num_data), data_(num_data_, static_cast(0)) { @@ -68,84 +68,65 @@ class DenseBin: public Bin { BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; - void ConstructHistogram(const data_size_t* data_indices, 
data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64 / sizeof(VAL_T); - const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T); + #define ACC_GH(hist, i, g, h) \ + const auto ti = static_cast(i) << 1; \ + hist[ti] += g; \ + hist[ti + 1] += h; \ + + template + void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const { data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + data_indices[i + pf_offset]); - const VAL_T bin = data_[data_indices[i]]; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; + + if (use_prefetch) { + const data_size_t pf_offset = 64 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + for (; i < pf_end; ++i) { + const auto idx = use_indices ? data_indices[i] : i; + const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset; + PREFETCH_T0(data_.data() + pf_idx); + const VAL_T bin = data_[idx]; + if (use_hessians) { + ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); + } else { + ACC_GH(out, bin, ordered_gradients[i], 1.0f); + } + } } - for (; i < end; i++) { - const VAL_T bin = data_[data_indices[i]]; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; + for (; i < end; ++i) { + const auto idx = use_indices ? data_indices[i] : i; + const VAL_T bin = data_[idx]; + if (use_hessians) { + ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); + } else { + ACC_GH(out, bin, ordered_gradients[i], 1.0f); + } } } + #undef ACC_GH + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, ordered_gradients, ordered_hessians, out); + } void ConstructHistogram(data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64 / sizeof(VAL_T); - const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T); - data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + i + pf_offset); - const VAL_T bin = data_[i]; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; - } - for (; i < end; i++) { - const VAL_T bin = data_[i]; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; - } + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, ordered_gradients, ordered_hessians, out); } void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64 / sizeof(VAL_T); - const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T); - data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + data_indices[i + pf_offset]); - const VAL_T bin = data_[data_indices[i]]; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } - for (; i < end; i++) { - const VAL_T 
bin = data_[data_indices[i]]; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, ordered_gradients, nullptr, out); } void ConstructHistogram(data_size_t start, data_size_t end, const score_t* ordered_gradients, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64 / sizeof(VAL_T); - const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T); - data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + i + pf_offset); - const VAL_T bin = data_[i]; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } - for (; i < end; i++) { - const VAL_T bin = data_[i]; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, ordered_gradients, nullptr, out); } data_size_t Split( @@ -257,9 +238,6 @@ class DenseBin: public Bin { data_size_t num_data() const override { return num_data_; } - /*! \brief not ordered bin for dense feature */ - OrderedBin* CreateOrderedBin() const override { return nullptr; } - void FinishLoad() override {} void LoadFromMemory(const void* memory, const std::vector& local_used_indices) override { @@ -287,17 +265,18 @@ class DenseBin: public Bin { } size_t SizesInByte() const override { - return sizeof(VAL_T) * num_data_; + return sizeof(VAL_T)* num_data_; } DenseBin* Clone() override; - private: +private: data_size_t num_data_; - std::vector data_; + std::vector> data_; DenseBin(const DenseBin& other) - : num_data_(other.num_data_), data_(other.data_){} + : num_data_(other.num_data_), data_(other.data_) { + } }; template diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp index 8c3c18cdd707..c65540d9733b 100644 --- a/src/io/dense_nbits_bin.hpp +++ b/src/io/dense_nbits_bin.hpp @@ -16,7 +16,7 @@ namespace LightGBM { class Dense4bitsBin; class Dense4bitsBinIterator : public BinIterator { - public: +public: explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) : bin_data_(bin_data), min_bin_(static_cast(min_bin)), max_bin_(static_cast(max_bin)), @@ -31,7 +31,7 @@ class Dense4bitsBinIterator : public BinIterator { inline uint32_t Get(data_size_t idx) override; inline void Reset(data_size_t) override {} - private: +private: const Dense4bitsBin* bin_data_; uint8_t min_bin_; uint8_t max_bin_; @@ -40,12 +40,12 @@ class Dense4bitsBinIterator : public BinIterator { }; class Dense4bitsBin : public Bin { - public: +public: friend Dense4bitsBinIterator; explicit Dense4bitsBin(data_size_t num_data) : num_data_(num_data) { int len = (num_data_ + 1) / 2; - data_ = std::vector(len, static_cast(0)); + data_.resize(len, static_cast(0)); buf_ = std::vector(len, static_cast(0)); } @@ -73,88 +73,65 @@ class Dense4bitsBin : public Bin { inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; - void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64; - const data_size_t pf_end = end - pf_offset - kCacheLineSize; + #define ACC_GH(hist, i, g, h) \ + const auto ti = (i) << 1; \ + hist[ti] += g; \ + hist[ti + 1] += h; \ + + template + void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, 
data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const { data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1)); - const data_size_t idx = data_indices[i]; - const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; + + if (use_prefetch) { + const data_size_t pf_offset = 64; + const data_size_t pf_end = end - pf_offset; + for (; i < pf_end; ++i) { + const auto idx = use_indices ? data_indices[i] : i; + const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset; + PREFETCH_T0(data_.data() + (pf_idx >> 1)); + const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; + if (use_hessians) { + ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); + } else { + ACC_GH(out, bin, ordered_gradients[i], 1.0f); + } + } } - for (; i < end; i++) { - const data_size_t idx = data_indices[i]; + for (; i < end; ++i) { + const auto idx = use_indices ? data_indices[i] : i; const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; + if (use_hessians) { + ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); + } else { + ACC_GH(out, bin, ordered_gradients[i], 1.0f); + } } } + #undef ACC_GH + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, ordered_gradients, ordered_hessians, out); + } void ConstructHistogram(data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64; - const data_size_t pf_end = end - pf_offset - kCacheLineSize; - data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + ((i + pf_offset) >> 1)); - const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; - } - for (; i < end; i++) { - const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; - } + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, ordered_gradients, ordered_hessians, out); } void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64; - const data_size_t pf_end = end - pf_offset - kCacheLineSize; - data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1)); - const data_size_t idx = data_indices[i]; - const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } - for (; i < end; i++) { - const data_size_t idx = data_indices[i]; - const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, ordered_gradients, nullptr, out); } void 
ConstructHistogram(data_size_t start, data_size_t end, const score_t* ordered_gradients, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64; - const data_size_t pf_end = end - pf_offset - kCacheLineSize; - data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + ((i + pf_offset) >> 1)); - const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } - for (; i < end; i++) { - const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, ordered_gradients, nullptr, out); } data_size_t Split( @@ -266,8 +243,6 @@ class Dense4bitsBin : public Bin { data_size_t num_data() const override { return num_data_; } - /*! \brief not ordered bin for dense feature */ - OrderedBin* CreateOrderedBin() const override { return nullptr; } void FinishLoad() override { if (buf_.empty()) { return; } @@ -325,19 +300,20 @@ class Dense4bitsBin : public Bin { } size_t SizesInByte() const override { - return sizeof(uint8_t) * data_.size(); + return sizeof(uint8_t)* data_.size(); } Dense4bitsBin* Clone() override { return new Dense4bitsBin(*this); } - protected: +protected: Dense4bitsBin(const Dense4bitsBin& other) - : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {} + : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) { + } data_size_t num_data_; - std::vector data_; + std::vector> data_; std::vector buf_; }; diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp new file mode 100644 index 000000000000..e9589ce192b1 --- /dev/null +++ b/src/io/multi_val_dense_bin.hpp @@ -0,0 +1,168 @@ +/*! + * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
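
All four public ConstructHistogram overloads in dense_bin.hpp now forward to a single ConstructHistogramInner. Its template parameters (the angle brackets are lost in this rendering, but the body reads them as use_indices, use_prefetch, and use_hessians) are compile-time booleans, so every `if (use_...)` is folded away at instantiation and the hot loop carries no per-row branching. A reduced model of the pattern, with use_prefetch omitted for brevity and illustrative stand-in names:

    #include <cstdint>

    using data_size_t = int;
    using score_t = float;
    using hist_t = double;

    template <bool use_indices, bool use_hessians>
    void HistInner(const uint8_t* bins, const data_size_t* indices,
                   data_size_t start, data_size_t end,
                   const score_t* grad, const score_t* hess, hist_t* out) {
      for (data_size_t i = start; i < end; ++i) {
        const data_size_t idx = use_indices ? indices[i] : i;  // folded at compile time
        const int ti = static_cast<int>(bins[idx]) << 1;
        out[ti] += grad[i];
        out[ti + 1] += use_hessians ? hess[i] : 1.0f;  // 1.0f keeps an implicit count
      }
    }

    // Each public overload just picks an instantiation, e.g.:
    void Hist(const uint8_t* bins, data_size_t start, data_size_t end,
              const score_t* grad, hist_t* out) {
      HistInner<false, false>(bins, nullptr, start, end, grad, nullptr, out);
    }

The use_prefetch parameter additionally guards a prologue loop that issues PREFETCH_T0 hints a fixed distance ahead, leaving a plain tail loop for the last pf_offset rows.
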
+ */ +#ifndef LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ +#define LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ + + +#include +#include + +#include +#include +#include + +namespace LightGBM { + + +template +class MultiValDenseBin : public MultiValBin { +public: + + explicit MultiValDenseBin(data_size_t num_data, int num_bin, int num_feature) + : num_data_(num_data), num_bin_(num_bin), num_feature_(num_feature) { + data_.resize(static_cast(num_data_) * num_feature_, static_cast(0)); + } + + ~MultiValDenseBin() { + } + + data_size_t num_data() const override { + return num_data_; + } + + int num_bin() const override { + return num_bin_; + } + + + void PushOneRow(int , data_size_t idx, const std::vector& values) override { + auto start = RowPtr(idx); + CHECK(num_feature_ == static_cast(values.size())); + for (auto i = 0; i < num_feature_; ++i) { + data_[start + i] = static_cast(values[i]); + } + } + + void FinishLoad() override { + + } + + bool IsSparse() override{ + return false; + } + + void ReSize(data_size_t num_data) override { + if (num_data_ != num_data) { + num_data_ = num_data; + } + } + + #define ACC_GH(hist, i, g, h) \ + const auto ti = static_cast(i) << 1; \ + hist[ti] += g; \ + hist[ti + 1] += h; \ + + template + void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, hist_t* out) const { + data_size_t i = start; + if (use_prefetch) { + const data_size_t pf_offset = 32 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + + for (; i < pf_end; ++i) { + const auto idx = use_indices ? data_indices[i] : i; + const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset; + PREFETCH_T0(gradients + pf_idx); + if (use_hessians) { + PREFETCH_T0(hessians + pf_idx); + } + PREFETCH_T0(data_.data() + RowPtr(pf_idx)); + const auto j_start = RowPtr(idx); + for (auto j = j_start; j < j_start + num_feature_; ++j) { + const VAL_T bin = data_[j]; + if (use_hessians) { + ACC_GH(out, bin, gradients[idx], hessians[idx]); + } else { + ACC_GH(out, bin, gradients[idx], 1.0f); + } + } + } + } + for (; i < end; ++i) { + const auto idx = use_indices ? 
data_indices[i] : i; + const auto j_start = RowPtr(idx); + for (auto j = j_start; j < j_start + num_feature_; ++j) { + const VAL_T bin = data_[j]; + if (use_hessians) { + ACC_GH(out, bin, gradients[idx], hessians[idx]); + } else { + ACC_GH(out, bin, gradients[idx], 1.0f); + } + } + } + } + #undef ACC_GH + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, gradients, hessians, out); + } + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, gradients, hessians, out); + } + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, gradients, nullptr, out); + } + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* gradients, + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, gradients, nullptr, out); + } + + void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { + auto other_bin = dynamic_cast*>(full_bin); + data_.clear(); + for (data_size_t i = 0; i < num_used_indices; ++i) { + for (int64_t j = other_bin->RowPtr(used_indices[i]); j < other_bin->RowPtr(used_indices[i] + 1); ++j) { + data_.push_back(other_bin->data_[j]); + } + } + } + + inline int64_t RowPtr(data_size_t idx) const { + return static_cast(idx) * num_feature_; + } + + MultiValDenseBin* Clone() override; + +private: + data_size_t num_data_; + int num_bin_; + int num_feature_; + std::vector> data_; + + MultiValDenseBin(const MultiValDenseBin& other) + : num_data_(other.num_data_), num_bin_(other.num_bin_), num_feature_(other.num_feature_), data_(other.data_) { + } +}; + +template +MultiValDenseBin* MultiValDenseBin::Clone() { + return new MultiValDenseBin(*this); +} + + + +} // namespace LightGBM +#endif // LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp new file mode 100644 index 000000000000..7f6fc866f617 --- /dev/null +++ b/src/io/multi_val_sparse_bin.hpp @@ -0,0 +1,204 @@ +/*! + * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
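
The new multi_val_dense_bin.hpp above stores one bin id per (row, feature) in row-major order: RowPtr(idx) is simply idx * num_feature_, so the row-wise histogram pass loads each row's gradient and hessian once and streams across that row's (already offset-adjusted) bin ids. A reduced sketch, with types assumed:

    #include <cstdint>
    #include <vector>

    using data_size_t = int;
    using score_t = float;
    using hist_t = double;

    // Row-wise pass over a row-major multi-val layout: each row's
    // gradient/hessian is read once and reused across all its features.
    void RowWiseHistogram(const std::vector<uint8_t>& data, int num_feature,
                          data_size_t num_data, const score_t* grad,
                          const score_t* hess, hist_t* out) {
      for (data_size_t i = 0; i < num_data; ++i) {
        const int64_t row = static_cast<int64_t>(i) * num_feature;  // RowPtr(i)
        for (int j = 0; j < num_feature; ++j) {
          const int ti = static_cast<int>(data[row + j]) << 1;
          out[ti] += grad[i];
          out[ti + 1] += hess[i];
        }
      }
    }

This shape is what favors row-wise building on dense, many-feature data; the cost, as the Log::Info above warns, is that every thread accumulates into its own full-size histogram copy before a merge.
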
+ */ +#ifndef LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ +#define LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ + + +#include +#include + +#include +#include +#include + +namespace LightGBM { + + +template +class MultiValSparseBin : public MultiValBin { +public: + + explicit MultiValSparseBin(data_size_t num_data, int num_bin) + : num_data_(num_data), num_bin_(num_bin) { + row_ptr_.resize(num_data_ + 1, 0); + data_.reserve(num_data_); + int num_threads = 1; + #pragma omp parallel + #pragma omp master + { + num_threads = omp_get_num_threads(); + } + if (num_threads > 1) { + t_data_.resize(num_threads - 1); + } + } + + ~MultiValSparseBin() { + } + + data_size_t num_data() const override { + return num_data_; + } + + int num_bin() const override { + return num_bin_; + } + + + void PushOneRow(int tid, data_size_t idx, const std::vector & values) override { + row_ptr_[idx + 1] = static_cast(values.size()); + if (tid == 0) { + for (auto val : values) { + data_.push_back(static_cast(val)); + } + } else { + for (auto val : values) { + t_data_[tid - 1].push_back(static_cast(val)); + } + } + } + + void FinishLoad() override { + for (data_size_t i = 0; i < num_data_; ++i) { + row_ptr_[i + 1] += row_ptr_[i]; + } + if (t_data_.size() > 0) { + size_t offset = data_.size(); + data_.resize(row_ptr_[num_data_]); + for (size_t tid = 0; tid < t_data_.size(); ++tid) { + std::memcpy(data_.data() + offset, t_data_[tid].data(), t_data_[tid].size() * sizeof(VAL_T)); + offset += t_data_[tid].size(); + t_data_[tid].clear(); + } + } + row_ptr_.shrink_to_fit(); + data_.shrink_to_fit(); + t_data_.clear(); + t_data_.shrink_to_fit(); + } + + bool IsSparse() override { + return true; + } + + void ReSize(data_size_t num_data) override { + if (num_data_ != num_data) { + num_data_ = num_data; + } + } + + #define ACC_GH(hist, i, g, h) \ + const auto ti = static_cast(i) << 1; \ + hist[ti] += g; \ + hist[ti + 1] += h; \ + + template + void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, hist_t* out) const { + data_size_t i = start; + if (use_prefetch) { + const data_size_t pf_offset = 32 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + + for (; i < pf_end; ++i) { + const auto idx = use_indices ? data_indices[i] : i; + const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset; + PREFETCH_T0(gradients + pf_idx); + if (use_hessians) { + PREFETCH_T0(hessians + pf_idx); + } + PREFETCH_T0(row_ptr_.data() + pf_idx); + PREFETCH_T0(data_.data() + row_ptr_[pf_idx]); + const auto j_start = RowPtr(idx); + const auto j_end = RowPtr(idx + 1); + for (auto j = j_start; j < j_end; ++j) { + const VAL_T bin = data_[j]; + if (use_hessians) { + ACC_GH(out, bin, gradients[idx], hessians[idx]); + } else { + ACC_GH(out, bin, gradients[idx], 1.0f); + } + } + } + } + for (; i < end; ++i) { + const auto idx = use_indices ? 
data_indices[i] : i; + const auto j_start = RowPtr(idx); + const auto j_end = RowPtr(idx + 1); + for (auto j = j_start; j < j_end; ++j) { + const VAL_T bin = data_[j]; + if (use_hessians) { + ACC_GH(out, bin, gradients[idx], hessians[idx]); + } else { + ACC_GH(out, bin, gradients[idx], 1.0f); + } + } + } + } + #undef ACC_GH + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, gradients, hessians, out); + } + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, gradients, hessians, out); + } + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, gradients, nullptr, out); + } + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* gradients, + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, gradients, nullptr, out); + } + + void CopySubset(const Bin * full_bin, const data_size_t * used_indices, data_size_t num_used_indices) override { + auto other_bin = dynamic_cast*>(full_bin); + row_ptr_.resize(num_data_ + 1, 0); + data_.clear(); + for (data_size_t i = 0; i < num_used_indices; ++i) { + for (data_size_t j = other_bin->row_ptr_[used_indices[i]]; j < other_bin->row_ptr_[used_indices[i] + 1]; ++j) { + data_.push_back(other_bin->data_[j]); + } + row_ptr_[i + 1] = row_ptr_[i] + other_bin->row_ptr_[used_indices[i] + 1] - other_bin->row_ptr_[used_indices[i]]; + } + } + + inline data_size_t RowPtr(data_size_t idx) const { + return row_ptr_[idx]; + } + + MultiValSparseBin* Clone() override; + +private: + data_size_t num_data_; + int num_bin_; + std::vector> data_; + std::vector> row_ptr_; + std::vector> t_data_; + + MultiValSparseBin(const MultiValSparseBin & other) + : num_data_(other.num_data_), num_bin_(other.num_bin_), data_(other.data_), row_ptr_(other.row_ptr_) { + } +}; + +template +MultiValSparseBin* MultiValSparseBin::Clone() { + return new MultiValSparseBin(*this); +} + + + +} // namespace LightGBM +#endif // LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ diff --git a/src/io/ordered_sparse_bin.hpp b/src/io/ordered_sparse_bin.hpp deleted file mode 100644 index d1b8bdf61bf9..000000000000 --- a/src/io/ordered_sparse_bin.hpp +++ /dev/null @@ -1,156 +0,0 @@ -/*! - * Copyright (c) 2016 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_ -#define LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_ - -#include - -#include -#include -#include -#include -#include -#include - -#include "sparse_bin.hpp" - -namespace LightGBM { - -/*! -* \brief Interface for ordered bin data. efficient for construct histogram, especially for sparse bin -* There are 2 advantages by using ordered bin. -* 1. group the data by leafs to improve the cache hit. -* 2. only store the non-zero bin, which can speed up the histogram consturction for sparse features. -* However it brings additional cost: it need re-order the bins after every split, which will cost much for dense feature. -* So we only using ordered bin for sparse situations. 
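
The sparse counterpart above (multi_val_sparse_bin.hpp) keeps a CSR-style layout: PushOneRow records each row's non-default-bin count in row_ptr_[idx + 1] while staging values per thread in t_data_, and FinishLoad prefix-sums the counts into offsets and concatenates the per-thread buffers. That concatenation preserves row order because, as far as the patch shows, PushDataToMultiValBin hands each thread one contiguous, ordered block of rows. A reduced sketch of the count-to-offset step:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    using data_size_t = int;

    // CSR-style container: after FinishLoad, row i's bin ids live in
    // data[row_ptr[i] .. row_ptr[i + 1]).
    struct SparseRows {
      std::vector<data_size_t> row_ptr;  // size num_data + 1; starts as counts
      std::vector<uint16_t> data;        // concatenated per-row bin ids

      void FinishLoad() {
        for (std::size_t i = 1; i < row_ptr.size(); ++i) {
          row_ptr[i] += row_ptr[i - 1];  // counts -> cumulative offsets
        }
      }
    };
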
-*/ -template -class OrderedSparseBin: public OrderedBin { - public: - /*! \brief Pair to store one bin entry */ - struct SparsePair { - data_size_t ridx; // data(row) index - VAL_T bin; // bin for this data - SparsePair() : ridx(0), bin(0) {} - }; - - explicit OrderedSparseBin(const SparseBin* bin_data) - :bin_data_(bin_data) { - data_size_t cur_pos = 0; - data_size_t i_delta = -1; - int non_zero_cnt = 0; - while (bin_data_->NextNonzero(&i_delta, &cur_pos)) { - ++non_zero_cnt; - } - ordered_pair_.resize(non_zero_cnt); - leaf_cnt_.push_back(non_zero_cnt); - } - - ~OrderedSparseBin() { - } - - void Init(const char* used_idices, int num_leaves) override { - // initialize the leaf information - leaf_start_ = std::vector(num_leaves, 0); - leaf_cnt_ = std::vector(num_leaves, 0); - if (used_idices == nullptr) { - // if using all data, copy all non-zero pair - data_size_t j = 0; - data_size_t cur_pos = 0; - data_size_t i_delta = -1; - while (bin_data_->NextNonzero(&i_delta, &cur_pos)) { - ordered_pair_[j].ridx = cur_pos; - ordered_pair_[j].bin = bin_data_->vals_[i_delta]; - ++j; - } - leaf_cnt_[0] = static_cast(j); - } else { - // if using part of data(bagging) - data_size_t j = 0; - data_size_t cur_pos = 0; - data_size_t i_delta = -1; - while (bin_data_->NextNonzero(&i_delta, &cur_pos)) { - if (used_idices[cur_pos]) { - ordered_pair_[j].ridx = cur_pos; - ordered_pair_[j].bin = bin_data_->vals_[i_delta]; - ++j; - } - } - leaf_cnt_[0] = j; - } - } - - void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian, - HistogramBinEntry* out) const override { - // get current leaf boundary - const data_size_t start = leaf_start_[leaf]; - const data_size_t end = start + leaf_cnt_[leaf]; - for (data_size_t i = start; i < end; ++i) { - const VAL_T bin = ordered_pair_[i].bin; - const auto g = gradient[ordered_pair_[i].ridx]; - const auto h = hessian[ordered_pair_[i].ridx]; - - out[bin].sum_gradients += g; - out[bin].sum_hessians += h; - ++out[bin].cnt; - } - } - - void ConstructHistogram(int leaf, const score_t* gradient, - HistogramBinEntry* out) const override { - // get current leaf boundary - const data_size_t start = leaf_start_[leaf]; - const data_size_t end = start + leaf_cnt_[leaf]; - for (data_size_t i = start; i < end; ++i) { - const VAL_T bin = ordered_pair_[i].bin; - const auto g = gradient[ordered_pair_[i].ridx]; - out[bin].sum_gradients += g; - ++out[bin].cnt; - } - } - - void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) override { - // get current leaf boundary - const data_size_t l_start = leaf_start_[leaf]; - const data_size_t l_end = l_start + leaf_cnt_[leaf]; - // new left leaf end after split - data_size_t new_left_end = l_start; - - for (data_size_t i = l_start; i < l_end; ++i) { - if (is_in_leaf[ordered_pair_[i].ridx] == mark) { - std::swap(ordered_pair_[new_left_end], ordered_pair_[i]); - ++new_left_end; - } - } - - leaf_start_[right_leaf] = new_left_end; - leaf_cnt_[leaf] = new_left_end - l_start; - leaf_cnt_[right_leaf] = l_end - new_left_end; - } - data_size_t NonZeroCount(int leaf) const override { - return static_cast(leaf_cnt_[leaf]); - } - /*! \brief Disable copy */ - OrderedSparseBin& operator=(const OrderedSparseBin&) = delete; - /*! \brief Disable copy */ - OrderedSparseBin(const OrderedSparseBin&) = delete; - - private: - const SparseBin* bin_data_; - /*! \brief Store non-zero pair , group by leaf */ - std::vector ordered_pair_; - /*! \brief leaf_start_[i] means data in i-th leaf start from */ - std::vector leaf_start_; - /*! 
\brief leaf_cnt_[i] means number of data in i-th leaf */ - std::vector leaf_cnt_; -}; - -template -OrderedBin* SparseBin::CreateOrderedBin() const { - return new OrderedSparseBin(this); -} - -} // namespace LightGBM -#endif // LightGBM_IO_ORDERED_SPARSE_BIN_HPP_ diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index 7cd2d7c15e89..07898fa1ac65 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -24,7 +24,7 @@ const size_t kNumFastIndex = 64; template class SparseBinIterator: public BinIterator { - public: +public: SparseBinIterator(const SparseBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) : bin_data_(bin_data), min_bin_(static_cast(min_bin)), @@ -56,7 +56,7 @@ class SparseBinIterator: public BinIterator { inline void Reset(data_size_t idx) override; - private: +private: const SparseBin* bin_data_; data_size_t cur_pos_; data_size_t i_delta_; @@ -66,20 +66,16 @@ class SparseBinIterator: public BinIterator { uint8_t offset_; }; -template -class OrderedSparseBin; - template class SparseBin: public Bin { - public: +public: friend class SparseBinIterator; - friend class OrderedSparseBin; explicit SparseBin(data_size_t num_data) : num_data_(num_data) { int num_threads = 1; -#pragma omp parallel -#pragma omp master + #pragma omp parallel + #pragma omp master { num_threads = omp_get_num_threads(); } @@ -102,41 +98,97 @@ class SparseBin: public Bin { BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; - void ConstructHistogram(const data_size_t*, data_size_t, data_size_t, const score_t*, - const score_t*, HistogramBinEntry*) const override { - // Will use OrderedSparseBin->ConstructHistogram() instead - Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); + #define ACC_GH(hist, i, g, h) \ + const auto ti = static_cast(i) << 1; \ + hist[ti] += g; \ + hist[ti + 1] += h; \ + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override { + data_size_t i_delta, cur_pos; + InitIndex(data_indices[start], &i_delta, &cur_pos); + data_size_t i = start; + for (;;) { + if (cur_pos < data_indices[i]) { + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { break; } + } else if (cur_pos > data_indices[i]) { + if (++i >= end) { break; } + } else { + const VAL_T bin = vals_[i_delta]; + ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); + if (++i >= end) { break; } + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { break; } + } + } } - void ConstructHistogram(data_size_t, data_size_t, const score_t*, - const score_t*, HistogramBinEntry*) const override { - // Will use OrderedSparseBin->ConstructHistogram() instead - Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override { + data_size_t i_delta, cur_pos; + InitIndex(start, &i_delta, &cur_pos); + while (cur_pos < start && i_delta < num_vals_) { + cur_pos += deltas_[++i_delta]; + } + while (cur_pos < end && i_delta < num_vals_) { + const VAL_T bin = vals_[i_delta]; + ACC_GH(out, bin, ordered_gradients[cur_pos], ordered_hessians[cur_pos]); + cur_pos += deltas_[++i_delta]; + } } - void ConstructHistogram(const data_size_t*, data_size_t, data_size_t, const score_t*, - HistogramBinEntry*) const override { - // Will use 
OrderedSparseBin->ConstructHistogram() instead - Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + data_size_t i_delta, cur_pos; + InitIndex(data_indices[start], &i_delta, &cur_pos); + data_size_t i = start; + for (;;) { + if (cur_pos < data_indices[i]) { + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { break; } + } else if (cur_pos > data_indices[i]) { + if (++i >= end) { break; } + } else { + const VAL_T bin = vals_[i_delta]; + ACC_GH(out, bin, ordered_gradients[i], 1.0f); + if (++i >= end) { break; } + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { break; } + } + } } - void ConstructHistogram(data_size_t, data_size_t, const score_t*, - HistogramBinEntry*) const override { - // Will use OrderedSparseBin->ConstructHistogram() instead - Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + data_size_t i_delta, cur_pos; + InitIndex(start, &i_delta, &cur_pos); + while (cur_pos < start && i_delta < num_vals_) { + cur_pos += deltas_[++i_delta]; + } + while (cur_pos < end && i_delta < num_vals_) { + const VAL_T bin = vals_[i_delta]; + ACC_GH(out, bin, ordered_gradients[cur_pos], 1.0f); + cur_pos += deltas_[++i_delta]; + } } + #undef ACC_GH - inline bool NextNonzero(data_size_t* i_delta, - data_size_t* cur_pos) const { - ++(*i_delta); - data_size_t shift = 0; - data_size_t delta = deltas_[*i_delta]; - while (*i_delta < num_vals_ && vals_[*i_delta] == 0) { - ++(*i_delta); - shift += 8; - delta |= static_cast(deltas_[*i_delta]) << shift; + inline void NextNonzeroFast(data_size_t* i_delta, + data_size_t* cur_pos) const { + *cur_pos += deltas_[++(*i_delta)]; + if (*i_delta >= num_vals_) { + *cur_pos = num_data_; } - *cur_pos += delta; + } + + inline bool NextNonzero(data_size_t* i_delta, + data_size_t* cur_pos) const { + *cur_pos += deltas_[++(*i_delta)]; if (*i_delta < num_vals_) { return true; } else { @@ -257,8 +309,6 @@ class SparseBin: public Bin { data_size_t num_data() const override { return num_data_; } - OrderedBin* CreateOrderedBin() const override; - void FinishLoad() override { // get total non zero size size_t pair_cnt = 0; @@ -276,8 +326,8 @@ class SparseBin: public Bin { // sort by data index std::sort(idx_val_pairs.begin(), idx_val_pairs.end(), [](const std::pair& a, const std::pair& b) { - return a.first < b.first; - }); + return a.first < b.first; + }); // load delta array LoadFromPair(idx_val_pairs); } @@ -291,11 +341,12 @@ class SparseBin: public Bin { const data_size_t cur_idx = idx_val_pairs[i].first; const VAL_T bin = idx_val_pairs[i].second; data_size_t cur_delta = cur_idx - last_idx; + // disallow the multi-val in one row if (i > 0 && cur_delta == 0) { continue; } while (cur_delta >= 256) { - deltas_.push_back(cur_delta & 0xff); + deltas_.push_back(255); vals_.push_back(0); - cur_delta >>= 8; + cur_delta -= 255; } deltas_.push_back(static_cast(cur_delta)); vals_.push_back(bin); @@ -384,7 +435,7 @@ class SparseBin: public Bin { while (cur_pos < idx && j < num_vals_) { NextNonzero(&j, &cur_pos); } - if (cur_pos == idx && j < num_vals_) { + if (cur_pos == idx && j < num_vals_ && vals_[j] > 0) { // new row index is i tmp_pair.emplace_back(i, vals_[j]); } @@ -405,13 +456,13 @@ class SparseBin: public Bin { // transform to delta 
array data_size_t last_idx = 0; for (data_size_t i = 0; i < num_used_indices; ++i) { - VAL_T bin = iterator.InnerRawGet(used_indices[i]); + auto bin = iterator.InnerRawGet(used_indices[i]); if (bin > 0) { data_size_t cur_delta = i - last_idx; while (cur_delta >= 256) { - deltas_.push_back(cur_delta & 0xff); + deltas_.push_back(255); vals_.push_back(0); - cur_delta >>= 8; + cur_delta -= 255; } deltas_.push_back(static_cast(cur_delta)); vals_.push_back(bin); @@ -432,15 +483,29 @@ class SparseBin: public Bin { SparseBin* Clone() override; - protected: SparseBin(const SparseBin& other) : num_data_(other.num_data_), deltas_(other.deltas_), vals_(other.vals_), - num_vals_(other.num_vals_), push_buffers_(other.push_buffers_), - fast_index_(other.fast_index_), fast_index_shift_(other.fast_index_shift_) {} + num_vals_(other.num_vals_), push_buffers_(other.push_buffers_), + fast_index_(other.fast_index_), fast_index_shift_(other.fast_index_shift_) { + } + + void InitIndex(data_size_t start_idx, data_size_t * i_delta, data_size_t * cur_pos) const { + auto idx = start_idx >> fast_index_shift_; + if (static_cast(idx) < fast_index_.size()) { + const auto fast_pair = fast_index_[start_idx >> fast_index_shift_]; + *i_delta = fast_pair.first; + *cur_pos = fast_pair.second; + } else { + *i_delta = -1; + *cur_pos = 0; + } + } + +private: data_size_t num_data_; - std::vector deltas_; - std::vector vals_; + std::vector> deltas_; + std::vector> vals_; data_size_t num_vals_; std::vector>> push_buffers_; std::vector> fast_index_; @@ -460,7 +525,7 @@ inline uint32_t SparseBinIterator::RawGet(data_size_t idx) { template inline VAL_T SparseBinIterator::InnerRawGet(data_size_t idx) { while (cur_pos_ < idx) { - bin_data_->NextNonzero(&i_delta_, &cur_pos_); + bin_data_->NextNonzeroFast(&i_delta_, &cur_pos_); } if (cur_pos_ == idx) { return bin_data_->vals_[i_delta_]; @@ -471,15 +536,7 @@ inline VAL_T SparseBinIterator::InnerRawGet(data_size_t idx) { template inline void SparseBinIterator::Reset(data_size_t start_idx) { - auto idx = start_idx >> bin_data_->fast_index_shift_; - if (static_cast(idx) < bin_data_->fast_index_.size()) { - const auto fast_pair = bin_data_->fast_index_[start_idx >> bin_data_->fast_index_shift_]; - i_delta_ = fast_pair.first; - cur_pos_ = fast_pair.second; - } else { - i_delta_ = -1; - cur_pos_ = 0; - } + bin_data_->InitIndex(start_idx, &i_delta_, &cur_pos_); } template diff --git a/src/objective/rank_xendcg_objective.hpp b/src/objective/rank_xendcg_objective.hpp index 81c8ab70f33f..1f9d4ae75327 100644 --- a/src/objective/rank_xendcg_objective.hpp +++ b/src/objective/rank_xendcg_objective.hpp @@ -73,9 +73,9 @@ class RankXENDCG: public ObjectiveFunction { // Skip query if sum of labels is 0. 
float sum_labels = 0; for (data_size_t i = 0; i < cnt; ++i) { - sum_labels += phi(label[i], gammas[i]); + sum_labels += static_cast(phi(label[i], gammas[i])); } - if (sum_labels == 0) { + if (std::fabs(sum_labels) < kEpsilon) { return; } @@ -111,7 +111,7 @@ class RankXENDCG: public ObjectiveFunction { } double phi(const label_t l, double g) const { - return Common::Pow(2, l) - g; + return Common::Pow(2, static_cast(l)) - g; } const char* GetName() const override { diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index f415e4e59991..a728a17b7193 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -27,7 +27,7 @@ void DataParallelTreeLearner::Init(const Dataset* train_data, boo rank_ = Network::rank(); num_machines_ = Network::num_machines(); // allocate buffer for communication - size_t buffer_size = this->train_data_->NumTotalBin() * sizeof(HistogramBinEntry); + size_t buffer_size = this->train_data_->NumTotalBin() * KHistEntrySize; input_buffer_.resize(buffer_size); output_buffer_.resize(buffer_size); @@ -82,7 +82,7 @@ void DataParallelTreeLearner::BeforeTrain() { if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { num_bin -= 1; } - block_len_[i] += num_bin * sizeof(HistogramBinEntry); + block_len_[i] += num_bin * KHistEntrySize; } reduce_scatter_size_ += block_len_[i]; } @@ -101,7 +101,7 @@ void DataParallelTreeLearner::BeforeTrain() { if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { num_bin -= 1; } - bin_size += num_bin * sizeof(HistogramBinEntry); + bin_size += num_bin * KHistEntrySize; } } @@ -113,7 +113,7 @@ void DataParallelTreeLearner::BeforeTrain() { if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { num_bin -= 1; } - bin_size += num_bin * sizeof(HistogramBinEntry); + bin_size += num_bin * KHistEntrySize; } // sync global data sumup info @@ -158,8 +158,8 @@ void DataParallelTreeLearner::FindBestSplits() { this->smaller_leaf_histogram_array_[feature_index].SizeOfHistgram()); } // Reduce scatter for histogram - Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(), - block_len_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramBinEntry::SumReducer); + Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(), + block_len_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramSumReducer); this->FindBestSplitsFromHistograms(this->is_feature_used_, true); } @@ -186,7 +186,6 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms(const this->train_data_->FixHistogram(feature_index, this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), - GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()), this->smaller_leaf_histogram_array_[feature_index].RawData()); SplitInfo smaller_split; // find best threshold for smaller child diff --git a/src/treelearner/data_partition.hpp b/src/treelearner/data_partition.hpp index 4cf9b5b22756..98847683f5dc 100644 --- a/src/treelearner/data_partition.hpp +++ b/src/treelearner/data_partition.hpp @@ -108,58 +108,70 @@ class DataPartition { * \param threshold threshold that want to split * \param right_leaf index of right leaf */ - void Split(int leaf, const Dataset* dataset, int feature, const uint32_t* threshold, int num_threshold, bool default_left, int right_leaf) { + void 
Split(int leaf, const Dataset* dataset, int feature, + const uint32_t* threshold, int num_threshold, bool default_left, + int right_leaf) { + Common::FunctionTimer fun_timer("DataPartition::Split", global_timer); const data_size_t min_inner_size = 512; // get leaf boundary const data_size_t begin = leaf_begin_[leaf]; const data_size_t cnt = leaf_count_[leaf]; - data_size_t inner_size = (cnt + num_threads_ - 1) / num_threads_; - if (inner_size < min_inner_size) { inner_size = min_inner_size; } + const int nblock = + std::min(num_threads_, (cnt + min_inner_size - 1) / min_inner_size); + data_size_t inner_size = SIZE_ALIGNED((cnt + nblock - 1) / nblock); + auto left_start = indices_.data() + begin; + global_timer.Start("DataPartition::Split.MT"); // split data multi-threading OMP_INIT_EX(); - #pragma omp parallel for schedule(static, 1) - for (int i = 0; i < num_threads_; ++i) { +#pragma omp parallel for schedule(static, 1) + for (int i = 0; i < nblock; ++i) { OMP_LOOP_EX_BEGIN(); - left_cnts_buf_[i] = 0; - right_cnts_buf_[i] = 0; data_size_t cur_start = i * inner_size; - if (cur_start > cnt) { continue; } - data_size_t cur_cnt = inner_size; - if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; } + data_size_t cur_cnt = std::min(inner_size, cnt - cur_start); + if (cur_cnt <= 0) { + left_cnts_buf_[i] = 0; + right_cnts_buf_[i] = 0; + continue; + } // split data inner, reduce the times of function called - data_size_t cur_left_count = dataset->Split(feature, threshold, num_threshold, default_left, indices_.data() + begin + cur_start, cur_cnt, - temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start); + data_size_t cur_left_count = + dataset->Split(feature, threshold, num_threshold, default_left, + left_start + cur_start, cur_cnt, + temp_left_indices_.data() + cur_start, + temp_right_indices_.data() + cur_start); offsets_buf_[i] = cur_start; left_cnts_buf_[i] = cur_left_count; right_cnts_buf_[i] = cur_cnt - cur_left_count; OMP_LOOP_EX_END(); } OMP_THROW_EX(); - data_size_t left_cnt = 0; + global_timer.Stop("DataPartition::Split.MT"); + global_timer.Start("DataPartition::Split.Merge"); left_write_pos_buf_[0] = 0; right_write_pos_buf_[0] = 0; - for (int i = 1; i < num_threads_; ++i) { - left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1]; - right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1]; + for (int i = 1; i < nblock; ++i) { + left_write_pos_buf_[i] = + left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1]; + right_write_pos_buf_[i] = + right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1]; } - left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1]; - // copy back indices of right leaf to indices_ - #pragma omp parallel for schedule(static, 1) - for (int i = 0; i < num_threads_; ++i) { - if (left_cnts_buf_[i] > 0) { - std::memcpy(indices_.data() + begin + left_write_pos_buf_[i], - temp_left_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t)); - } - if (right_cnts_buf_[i] > 0) { - std::memcpy(indices_.data() + begin + left_cnt + right_write_pos_buf_[i], - temp_right_indices_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t)); - } + data_size_t left_cnt = + left_write_pos_buf_[nblock - 1] + left_cnts_buf_[nblock - 1]; + + auto right_start = left_start + left_cnt; +#pragma omp parallel for schedule(static) + for (int i = 0; i < nblock; ++i) { + std::copy_n(temp_left_indices_.data() + offsets_buf_[i], + left_cnts_buf_[i], left_start + 
left_write_pos_buf_[i]); + std::copy_n(temp_right_indices_.data() + offsets_buf_[i], + right_cnts_buf_[i], right_start + right_write_pos_buf_[i]); } // update leaf boundary leaf_count_[leaf] = left_cnt; leaf_begin_[right_leaf] = left_cnt + begin; leaf_count_[right_leaf] = cnt - left_cnt; + global_timer.Stop("DataPartition::Split.Merge"); } /*! @@ -201,11 +213,11 @@ class DataPartition { /*! \brief number of data on one leaf */ std::vector leaf_count_; /*! \brief Store all data's indices, order by leaf[data_in_leaf0,..,data_leaf1,..] */ - std::vector indices_; + std::vector> indices_; /*! \brief team indices buffer for split */ - std::vector temp_left_indices_; + std::vector> temp_left_indices_; /*! \brief team indices buffer for split */ - std::vector temp_right_indices_; + std::vector> temp_right_indices_; /*! \brief used data indices, used for bagging */ const data_size_t* used_data_indices_; /*! \brief used data count, used for bagging */ diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 8e0c3c585d7c..6dde6c4a541e 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -5,6 +5,7 @@ #ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ #define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ +#include #include #include @@ -20,7 +21,7 @@ namespace LightGBM { class FeatureMetainfo { - public: +public: int num_bin; MissingType missing_type; int8_t offset = 0; @@ -35,7 +36,7 @@ class FeatureMetainfo { * \brief FeatureHistogram is used to construct and store a histogram for a feature. */ class FeatureHistogram { - public: +public: FeatureHistogram() { data_ = nullptr; } @@ -53,19 +54,19 @@ class FeatureHistogram { * \param feature the feature data for this histogram * \param min_num_data_one_leaf minimal number of data in one leaf */ - void Init(HistogramBinEntry* data, const FeatureMetainfo* meta) { + void Init(hist_t* data, const FeatureMetainfo* meta) { meta_ = meta; data_ = data; if (meta_->bin_type == BinType::NumericalBin) { find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1 - , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); + , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); } else { find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1 - , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); + , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); } } - HistogramBinEntry* RawData() { + hist_t* RawData() { return data_; } /*! 
@@ -73,15 +74,13 @@ class FeatureHistogram { * \param other The histogram that want to subtract */ void Subtract(const FeatureHistogram& other) { - for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { - data_[i].cnt -= other.data_[i].cnt; - data_[i].sum_gradients -= other.data_[i].sum_gradients; - data_[i].sum_hessians -= other.data_[i].sum_hessians; + for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) { + data_[i] -= other.data_[i]; } } void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - SplitInfo* output) { + SplitInfo* output) { output->default_left = true; output->gain = kMinScore; find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output); @@ -89,10 +88,10 @@ class FeatureHistogram { } void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - SplitInfo* output) { + SplitInfo* output) { is_splittable_ = false; double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step); + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { if (meta_->missing_type == MissingType::Zero) { @@ -116,8 +115,8 @@ class FeatureHistogram { } void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data, - double min_constraint, double max_constraint, - SplitInfo* output) { + double min_constraint, double max_constraint, + SplitInfo* output) { output->default_left = false; double best_gain = kMinScore; data_size_t best_left_count = 0; @@ -134,25 +133,28 @@ class FeatureHistogram { bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot; int best_threshold = -1; int best_dir = 1; - + const double cnt_factor = num_data / sum_hessian; if (use_onehot) { for (int t = 0; t < used_bin; ++t) { + const auto grad = GET_GRAD(data_, t); + const auto hess = GET_HESS(data_, t); + data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); // if data not enough, or sum hessian too small - if (data_[t].cnt < meta_->config->min_data_in_leaf - || data_[t].sum_hessians < meta_->config->min_sum_hessian_in_leaf) continue; - data_size_t other_count = num_data - data_[t].cnt; + if (cnt < meta_->config->min_data_in_leaf + || hess < meta_->config->min_sum_hessian_in_leaf) continue; + data_size_t other_count = num_data - cnt; // if data not enough if (other_count < meta_->config->min_data_in_leaf) continue; - double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon; + double sum_other_hessian = sum_hessian - hess - kEpsilon; // if sum hessian too small if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) continue; - double sum_other_gradient = sum_gradient - data_[t].sum_gradients; + double sum_other_gradient = sum_gradient - grad; // current split gain - double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint, 0); + double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + 
min_constraint, max_constraint, 0); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -161,15 +163,15 @@ class FeatureHistogram { // better split point if (current_gain > best_gain) { best_threshold = t; - best_sum_left_gradient = data_[t].sum_gradients; - best_sum_left_hessian = data_[t].sum_hessians + kEpsilon; - best_left_count = data_[t].cnt; + best_sum_left_gradient = grad; + best_sum_left_hessian = hess + kEpsilon; + best_left_count = cnt; best_gain = current_gain; } } } else { for (int i = 0; i < used_bin; ++i) { - if (data_[i].cnt >= meta_->config->cat_smooth) { + if (Common::RoundInt(GET_HESS(data_, i) * cnt_factor) >= meta_->config->cat_smooth) { sorted_idx.push_back(i); } } @@ -181,9 +183,9 @@ class FeatureHistogram { return (sum_grad) / (sum_hess + meta_->config->cat_smooth); }; std::sort(sorted_idx.begin(), sorted_idx.end(), - [this, &ctr_fun](int i, int j) { - return ctr_fun(data_[i].sum_gradients, data_[i].sum_hessians) < ctr_fun(data_[j].sum_gradients, data_[j].sum_hessians); - }); + [this, &ctr_fun](int i, int j) { + return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) < ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j)); + }); std::vector find_direction(1, 1); std::vector start_position(1, 0); @@ -203,14 +205,17 @@ class FeatureHistogram { for (int i = 0; i < used_bin && i < max_num_cat; ++i) { auto t = sorted_idx[start_pos]; start_pos += dir; + const auto grad = GET_GRAD(data_, t); + const auto hess = GET_HESS(data_, t); + data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); - sum_left_gradient += data_[t].sum_gradients; - sum_left_hessian += data_[t].sum_hessians; - left_count += data_[t].cnt; - cnt_cur_group += data_[t].cnt; + sum_left_gradient += grad; + sum_left_hessian += hess; + left_count += cnt; + cnt_cur_group += cnt; if (left_count < meta_->config->min_data_in_leaf - || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue; + || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue; data_size_t right_count = num_data - left_count; if (right_count < meta_->config->min_data_in_leaf || right_count < min_data_per_group) break; @@ -223,8 +228,8 @@ class FeatureHistogram { double sum_right_gradient = sum_gradient - sum_left_gradient; double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint, 0); + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + min_constraint, max_constraint, 0); if (current_gain <= min_gain_shift) continue; is_splittable_ = true; if (current_gain > best_gain) { @@ -241,15 +246,15 @@ class FeatureHistogram { if (is_splittable_) { output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + min_constraint, max_constraint); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + sum_hessian - best_sum_left_hessian, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + 
min_constraint, max_constraint); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; @@ -279,22 +284,22 @@ class FeatureHistogram { } void GatherInfoForThreshold(double sum_gradient, double sum_hessian, - uint32_t threshold, data_size_t num_data, SplitInfo *output) { + uint32_t threshold, data_size_t num_data, SplitInfo* output) { if (meta_->bin_type == BinType::NumericalBin) { GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold, - num_data, output); + num_data, output); } else { GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold, - num_data, output); + num_data, output); } } void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian, - uint32_t threshold, data_size_t num_data, - SplitInfo *output) { + uint32_t threshold, data_size_t num_data, + SplitInfo* output) { double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step); + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; // do stuff here @@ -315,27 +320,29 @@ class FeatureHistogram { int t = meta_->num_bin - 1 - offset - use_na_as_missing; const int t_end = 1 - offset; - + const double cnt_factor = num_data / sum_hessian; // from right to left, and we don't need data in bin0 for (; t >= t_end; --t) { if (static_cast(t + offset) < threshold) { break; } // need to skip default bin if (skip_default_bin && (t + offset) == static_cast(meta_->default_bin)) { continue; } - - sum_right_gradient += data_[t].sum_gradients; - sum_right_hessian += data_[t].sum_hessians; - right_count += data_[t].cnt; + const auto grad = GET_GRAD(data_, t); + const auto hess = GET_HESS(data_, t); + data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); + sum_right_gradient += grad; + sum_right_hessian += hess; + right_count += cnt; } double sum_left_gradient = sum_gradient - sum_right_gradient; double sum_left_hessian = sum_hessian - sum_right_hessian; data_size_t left_count = num_data - right_count; double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step) - + GetLeafSplitGain(sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step); + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step) + + GetLeafSplitGain(sum_right_gradient, sum_right_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step); // gain with split is worse than without split if (std::isnan(current_gain) || current_gain <= min_gain_shift) { @@ -347,15 +354,15 @@ class FeatureHistogram { // update split information output->threshold = threshold; output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step); + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step); output->left_count = left_count; output->left_sum_gradient = sum_left_gradient; output->left_sum_hessian = sum_left_hessian - kEpsilon; output->right_output = CalculateSplittedLeafOutput(sum_gradient - sum_left_gradient, - sum_hessian - sum_left_hessian, - 
meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step); + sum_hessian - sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step); output->right_count = num_data - left_count; output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon; @@ -365,13 +372,13 @@ class FeatureHistogram { } void GatherInfoForThresholdCategorical(double sum_gradient, double sum_hessian, - uint32_t threshold, data_size_t num_data, SplitInfo *output) { + uint32_t threshold, data_size_t num_data, SplitInfo* output) { // get SplitInfo for a given one-hot categorical split. output->default_left = false; double gain_shift = GetLeafSplitGain( - sum_gradient, sum_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step); + sum_gradient, sum_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; bool is_full_categorical = meta_->missing_type == MissingType::None; int used_bin = meta_->num_bin - 1 + is_full_categorical; @@ -380,21 +387,25 @@ class FeatureHistogram { Log::Warning("Invalid categorical threshold split"); return; } + const double cnt_factor = num_data / sum_hessian; + const auto grad = GET_GRAD(data_, threshold); + const auto hess = GET_HESS(data_, threshold); + data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); double l2 = meta_->config->lambda_l2; - data_size_t left_count = data_[threshold].cnt; + data_size_t left_count = cnt; data_size_t right_count = num_data - left_count; - double sum_left_hessian = data_[threshold].sum_hessians + kEpsilon; + double sum_left_hessian = hess + kEpsilon; double sum_right_hessian = sum_hessian - sum_left_hessian; - double sum_left_gradient = data_[threshold].sum_gradients; + double sum_left_gradient = grad; double sum_right_gradient = sum_gradient - sum_left_gradient; // current split gain double current_gain = GetLeafSplitGain(sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step) - + GetLeafSplitGain(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step); + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step) + + GetLeafSplitGain(sum_left_gradient, sum_left_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step); if (std::isnan(current_gain) || current_gain <= min_gain_shift) { output->gain = kMinScore; Log::Warning("'Forced Split' will be ignored since the gain getting worse. 
"); @@ -402,14 +413,14 @@ class FeatureHistogram { } output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step); + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step); output->left_count = left_count; output->left_sum_gradient = sum_left_gradient; output->left_sum_hessian = sum_left_hessian - kEpsilon; output->right_output = CalculateSplittedLeafOutput(sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step); + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step); output->right_count = right_count; output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_hessian = sum_right_hessian - kEpsilon; @@ -423,14 +434,14 @@ class FeatureHistogram { * \brief Binary size of this histogram */ int SizeOfHistgram() const { - return (meta_->num_bin - meta_->offset) * sizeof(HistogramBinEntry); + return (meta_->num_bin - meta_->offset) * KHistEntrySize; } /*! * \brief Restore histogram from memory */ void FromMemory(char* memory_data) { - std::memcpy(data_, memory_data, (meta_->num_bin - meta_->offset) * sizeof(HistogramBinEntry)); + std::memcpy(data_, memory_data, (meta_->num_bin - meta_->offset) * KHistEntrySize); } /*! @@ -457,11 +468,11 @@ class FeatureHistogram { } } - private: +private: static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, - double sum_right_gradients, double sum_right_hessians, - double l1, double l2, double max_delta_step, - double min_constraint, double max_constraint, int8_t monotone_constraint) { + double sum_right_gradients, double sum_right_hessians, + double l1, double l2, double max_delta_step, + double min_constraint, double max_constraint, int8_t monotone_constraint) { double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint, max_constraint); double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint, max_constraint); if (((monotone_constraint > 0) && (left_output > right_output)) || @@ -479,7 +490,7 @@ class FeatureHistogram { * \return leaf output */ static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step, - double min_constraint, double max_constraint) { + double min_constraint, double max_constraint) { double ret = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step); if (ret < min_constraint) { ret = min_constraint; @@ -506,7 +517,7 @@ class FeatureHistogram { } void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) { + double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) { const int8_t offset = meta_->offset; double best_sum_left_gradient = NAN; @@ -514,7 +525,7 @@ class FeatureHistogram { double best_gain = kMinScore; data_size_t best_left_count = 0; uint32_t best_threshold = static_cast(meta_->num_bin); - + const double cnt_factor = num_data / sum_hessian; if (dir == -1) { double sum_right_gradient = 0.0f; double sum_right_hessian = kEpsilon; @@ -528,12 +539,15 @@ class FeatureHistogram { // need to skip default bin if (skip_default_bin && (t + offset) == static_cast(meta_->default_bin)) { 
continue; } - sum_right_gradient += data_[t].sum_gradients; - sum_right_hessian += data_[t].sum_hessians; - right_count += data_[t].cnt; + const auto grad = GET_GRAD(data_, t); + const auto hess = GET_HESS(data_, t); + data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); + sum_right_gradient += grad; + sum_right_hessian += hess; + right_count += cnt; // if data not enough, or sum hessian too small if (right_count < meta_->config->min_data_in_leaf - || sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) continue; + || sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) continue; data_size_t left_count = num_data - right_count; // if data not enough if (left_count < meta_->config->min_data_in_leaf) break; @@ -545,8 +559,8 @@ class FeatureHistogram { double sum_left_gradient = sum_gradient - sum_right_gradient; // current split gain double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint, meta_->monotone_type); + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, + min_constraint, max_constraint, meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -575,9 +589,12 @@ class FeatureHistogram { sum_left_hessian = sum_hessian - kEpsilon; left_count = num_data; for (int i = 0; i < meta_->num_bin - offset; ++i) { - sum_left_gradient -= data_[i].sum_gradients; - sum_left_hessian -= data_[i].sum_hessians; - left_count -= data_[i].cnt; + const auto grad = GET_GRAD(data_, i); + const auto hess = GET_HESS(data_, i); + data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); + sum_left_gradient -= grad; + sum_left_hessian -= hess; + left_count -= cnt; } t = -1; } @@ -586,13 +603,13 @@ class FeatureHistogram { // need to skip default bin if (skip_default_bin && (t + offset) == static_cast(meta_->default_bin)) { continue; } if (t >= 0) { - sum_left_gradient += data_[t].sum_gradients; - sum_left_hessian += data_[t].sum_hessians; - left_count += data_[t].cnt; + sum_left_gradient += GET_GRAD(data_, t); + sum_left_hessian += GET_HESS(data_, t); + left_count += static_cast(Common::RoundInt(GET_HESS(data_, t) * cnt_factor)); } // if data not enough, or sum hessian too small if (left_count < meta_->config->min_data_in_leaf - || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue; + || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue; data_size_t right_count = num_data - left_count; // if data not enough if (right_count < meta_->config->min_data_in_leaf) break; @@ -604,8 +621,8 @@ class FeatureHistogram { double sum_right_gradient = sum_gradient - sum_left_gradient; // current split gain double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint, meta_->monotone_type); + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, + min_constraint, max_constraint, meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -626,15 +643,15 @@ class FeatureHistogram { // update split information output->threshold = best_threshold; output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, - 
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, + min_constraint, max_constraint); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + sum_hessian - best_sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, + min_constraint, max_constraint); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; @@ -645,14 +662,13 @@ class FeatureHistogram { const FeatureMetainfo* meta_; /*! \brief sum of gradient of each bin */ - HistogramBinEntry* data_; - // std::vector data_; + hist_t* data_; bool is_splittable_ = true; std::function find_best_threshold_fun_; }; class HistogramPool { - public: +public: /*! * \brief Constructor */ @@ -698,7 +714,7 @@ class HistogramPool { } } - void DynamicChangeSize(const Dataset* train_data, const Config* config, int cache_size, int total_size) { + void DynamicChangeSize(const Dataset* train_data, bool is_hist_colwise, const Config* config, int cache_size, int total_size) { if (feature_metas_.empty()) { uint64_t bin_cnt_over_features = 0; int num_feature = train_data->num_features(); @@ -720,7 +736,6 @@ class HistogramPool { } Log::Info("Total Bins %d", bin_cnt_over_features); } - uint64_t num_total_bin = train_data->NumTotalBin(); int old_cache_size = static_cast(pool_.size()); Reset(cache_size, total_size); @@ -728,24 +743,39 @@ class HistogramPool { pool_.resize(cache_size); data_.resize(cache_size); } + int num_total_bin = static_cast(train_data->NumTotalBin()); + std::vector offsets; + if (is_hist_colwise) { + int offset = 0; + for (int j = 0; j < train_data->num_features(); ++j) { + offset += train_data->SubFeatureBinOffset(j); + offsets.push_back(offset); + auto num_bin = train_data->FeatureNumBin(j); + if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) { + num_bin -= 1; + } + offset += num_bin; + } + } else { + num_total_bin = 1; + for (int j = 0; j < train_data->num_features(); ++j) { + offsets.push_back(num_total_bin); + num_total_bin += train_data->FeatureBinMapper(j)->num_bin(); + if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) { + num_total_bin -= 1; + } + } + } OMP_INIT_EX(); #pragma omp parallel for schedule(static) for (int i = old_cache_size; i < cache_size; ++i) { OMP_LOOP_EX_BEGIN(); pool_[i].reset(new FeatureHistogram[train_data->num_features()]); - data_[i].resize(num_total_bin); - uint64_t offset = 0; + data_[i].resize(num_total_bin * 2); for (int j = 0; j < train_data->num_features(); ++j) { - offset += static_cast(train_data->SubFeatureBinOffset(j)); - pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j]); - auto num_bin = train_data->FeatureNumBin(j); - if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) { - num_bin -= 1; - } - offset += static_cast(num_bin); + pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]); } - CHECK(offset == num_total_bin); OMP_LOOP_EX_END(); } OMP_THROW_EX(); @@ -816,9 +846,9 @@ 
class HistogramPool { inverse_mapper_[slot] = dst_idx; } - private: +private: std::vector> pool_; - std::vector> data_; + std::vector>> data_; std::vector feature_metas_; int cache_size_; int total_size_; diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 9dd584574c9b..ceb8f87d3e3d 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -49,15 +49,15 @@ void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { // some functions used for debugging the GPU histogram construction #if GPU_DEBUG > 0 -void PrintHistograms(HistogramBinEntry* h, size_t size) { - size_t total = 0; +void PrintHistograms(hist_t* h, size_t size) { + double total_hess = 0; for (size_t i = 0; i < size; ++i) { - printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt); - total += h[i].cnt; - if ((i & 3) == 3) + printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i)); + if ((i & 2) == 2) printf("\n"); + total_hess += GET_HESS(h, i); } - printf("\nTotal examples: %lu\n", total); + printf("\nSum hessians: %9.3g\n", total_hess); } union Float_t { @@ -69,27 +69,23 @@ union Float_t { }; -void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) { +void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) { size_t i; Float_t a, b; for (i = 0; i < size; ++i) { - a.f = h1[i].sum_gradients; - b.f = h2[i].sum_gradients; + a.f = GET_GRAD(h1, i); + b.f = GET_GRAD(h2, i); int32_t ulps = Float_t::ulp_diff(a, b); - if (fabs(h1[i].cnt - h2[i].cnt != 0)) { - printf("%d != %d\n", h1[i].cnt, h2[i].cnt); - goto err; - } if (ulps > 0) { - // printf("grad %g != %g (%d ULPs)\n", h1[i].sum_gradients, h2[i].sum_gradients, ulps); + // printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps); // goto err; } - a.f = h1[i].sum_hessians; - b.f = h2[i].sum_hessians; + a.f = GET_HESS(h1, i); + b.f = GET_HESS(h2, i); ulps = Float_t::ulp_diff(a, b); - if (ulps > 0) { - // printf("hessian %g != %g (%d ULPs)\n", h1[i].sum_hessians, h2[i].sum_hessians, ulps); - // goto err; + if (std::fabs(a.f - b.f) >= 1e-20) { + printf("hessian %g != %g (%d ULPs)\n", GET_HESS(h1, i), GET_HESS(h2, i), ulps); + goto err; } } return; @@ -191,7 +187,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur } template -void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { +void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) { HistType* hist_outputs = reinterpret_cast(host_histogram_outputs_); // when the output is ready, the computation is done histograms_wait_obj_.wait(); @@ -201,29 +197,25 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { continue; } int dense_group_index = dense_feature_group_map_[i]; - auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index); + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); if (device_bin_mults_[i] == 1) { for (int j = 0; j < bin_size; ++j) { - old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients; - old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians; - old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt; + GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ 
j); + GET_HESS(old_histogram_array, j) = GET_HESS(hist_outputs, i * device_bin_size_+ j); } } else { // values of this feature has been redistributed to multiple bins; need a reduction here int ind = 0; for (int j = 0; j < bin_size; ++j) { double sum_g = 0.0, sum_h = 0.0; - size_t cnt = 0; for (int k = 0; k < device_bin_mults_[i]; ++k) { - sum_g += hist_outputs[i * device_bin_size_+ ind].sum_gradients; - sum_h += hist_outputs[i * device_bin_size_+ ind].sum_hessians; - cnt += hist_outputs[i * device_bin_size_ + ind].cnt; + sum_g += GET_GRAD(hist_outputs, i * device_bin_size_+ ind); + sum_h += GET_HESS(hist_outputs, i * device_bin_size_+ ind); ind++; } - old_histogram_array[j].sum_gradients = sum_g; - old_histogram_array[j].sum_hessians = sum_h; - old_histogram_array[j].cnt = (data_size_t)cnt; + GET_GRAD(old_histogram_array, j) = sum_g; + GET_HESS(old_histogram_array, j) = sum_h; } } } @@ -233,7 +225,7 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { void GPUTreeLearner::AllocateGPUMemory() { num_dense_feature_groups_ = 0; for (int i = 0; i < num_feature_groups_; ++i) { - if (ordered_bins_[i] == nullptr) { + if (!train_data_->IsMultiGroup(i)) { num_dense_feature_groups_++; } } @@ -303,7 +295,7 @@ void GPUTreeLearner::AllocateGPUMemory() { device_data_indices_ = std::unique_ptr>(new boost::compute::vector(allocated_num_data_, ctx_)); boost::compute::fill(device_data_indices_->begin(), device_data_indices_->end(), 0, queue_); // histogram bin entry size depends on the precision (single/double) - hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(HistogramBinEntry) : sizeof(GPUHistogramBinEntry); + hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(hist_t) * 2 : sizeof(gpu_hist_t) * 2; Log::Info("Size of histogram bin entry: %d", hist_bin_entry_sz_); // create output buffer, each feature has a histogram with device_bin_size_ bins, // each work group generates a sub-histogram of dword_features_ features. 
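// ---------------------------------------------------------------------------
// Illustrative sketch (editor addition; AccumulateBin and ApproxBinCount are
// hypothetical helpers, not part of this patch). The hunks above size GPU
// buffers for the flat histogram layout that replaces HistogramBinEntry
// {sum_gradients, sum_hessians, cnt}: bin i now occupies two interleaved
// hist_t slots, hence "2 * sizeof(hist_t)" per bin, and the dropped count is
// recovered from the hessian sum (cf. cnt_factor = num_data / sum_hessian in
// feature_histogram.hpp). Assumes hist_t is double and that GET_GRAD/GET_HESS
// index slots 2*i and 2*i+1, matching the ACC_GH macro earlier in the diff.
#include <cmath>
#include <vector>

typedef double hist_t_sketch;  // stand-in for hist_t; gpu_hist_t is float unless gpu_use_dp

inline void AccumulateBin(std::vector<hist_t_sketch>* hist, int bin,
                          double grad, double hess) {
  (*hist)[2 * bin] += grad;      // gradient slot, cf. GET_GRAD(hist, bin)
  (*hist)[2 * bin + 1] += hess;  // hessian slot,  cf. GET_HESS(hist, bin)
}

inline int ApproxBinCount(double bin_hess, int num_data, double sum_hessian) {
  // stands in for Common::RoundInt(hess * cnt_factor); every data point
  // contributes exactly one hessian per feature, so counts are implied
  return static_cast<int>(std::lround(bin_hess * num_data / sum_hessian));
}
// ---------------------------------------------------------------------------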
@@ -326,7 +318,7 @@ void GPUTreeLearner::AllocateGPUMemory() { std::vector dense_dword_ind(dword_features_); for (int i = 0; i < num_feature_groups_; ++i) { // looking for dword_features_ non-sparse feature-groups - if (ordered_bins_[i] == nullptr) { + if (!train_data_->IsMultiGroup(i)) { dense_dword_ind[k] = i; // decide if we need to redistribute the bin double t = device_bin_size_ / static_cast(train_data_->FeatureGroupNumBin(i)); @@ -682,6 +674,9 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) { printf("bin size: "); #endif for (int i = 0; i < num_feature_groups_; ++i) { + if (train_data_->IsMultiGroup(i)) { + continue; + } #if GPU_DEBUG >= 1 printf("%d, ", train_data_->FeatureGroupNumBin(i)); #endif @@ -960,35 +955,34 @@ void GPUTreeLearner::ConstructHistograms(const std::vector& is_feature_u for (int feature_index = 0; feature_index < num_features_; ++feature_index) { if (!is_feature_used_[feature_index]) continue; if (!is_feature_used[feature_index]) continue; - if (ordered_bins_[train_data_->Feature2Group(feature_index)]) { + if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) { is_sparse_feature_used[feature_index] = 1; } else { is_dense_feature_used[feature_index] = 1; } } // construct smaller leaf - HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset; // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, nullptr, smaller_leaf_splits_->num_data_in_leaf(), nullptr, nullptr, nullptr, nullptr); // then construct sparse features on CPU - // We set data_indices to null to avoid rebuilding ordered gradients/hessians train_data_->ConstructHistograms(is_sparse_feature_used, - nullptr, smaller_leaf_splits_->num_data_in_leaf(), - smaller_leaf_splits_->LeafIndex(), - &ordered_bins_, gradients_, hessians_, + smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), + gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + multi_val_bin_.get(), is_hist_colwise_, ptr_smaller_leaf_hist_data); // wait for GPU to finish, only if GPU is actually used if (is_gpu_used) { if (config_->gpu_use_dp) { // use double precision - WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + WaitAndGetHistograms(ptr_smaller_leaf_hist_data); } else { // use single precision - WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + WaitAndGetHistograms(ptr_smaller_leaf_hist_data); } } @@ -1000,48 +994,58 @@ void GPUTreeLearner::ConstructHistograms(const std::vector& is_feature_u continue; int dense_feature_group_index = dense_feature_group_map_[i]; size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); - HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; - HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); - HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size]; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset; + hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2; + hist_t* gpu_histogram = new hist_t[size * 2]; data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); printf("Comparing histogram for feature 
%d size %d, %lu bins\n", dense_feature_group_index, num_data, size); - std::copy(current_histogram, current_histogram + size, gpu_histogram); - std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry)); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - num_data != num_data_ ? smaller_leaf_splits_->data_indices() : nullptr, - num_data, - num_data != num_data_ ? ordered_gradients_.data() : gradients_, - num_data != num_data_ ? ordered_hessians_.data() : hessians_, - current_histogram); + std::copy(current_histogram, current_histogram + size * 2, gpu_histogram); + std::memset(current_histogram, 0, size * sizeof(hist_t) * 2); + if(train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr){continue;} + if (num_data != num_data_ ) { + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + smaller_leaf_splits_->data_indices(), + 0, + num_data, + ordered_gradients_.data(), + ordered_hessians_.data(), + current_histogram); + } else { + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + 0, + num_data, + gradients_, + hessians_, + current_histogram); + } CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); - std::copy(gpu_histogram, gpu_histogram + size, current_histogram); + std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; } #endif if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf - HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - KHistOffset; is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data()); // then construct sparse features on CPU - // We set data_indices to null to avoid rebuilding ordered gradients/hessians train_data_->ConstructHistograms(is_sparse_feature_used, - nullptr, larger_leaf_splits_->num_data_in_leaf(), - larger_leaf_splits_->LeafIndex(), - &ordered_bins_, gradients_, hessians_, + larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), + gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + multi_val_bin_.get(), is_hist_colwise_, ptr_larger_leaf_hist_data); // wait for GPU to finish, only if GPU is actually used if (is_gpu_used) { if (config_->gpu_use_dp) { // use double precision - WaitAndGetHistograms(ptr_larger_leaf_hist_data); + WaitAndGetHistograms(ptr_larger_leaf_hist_data); } else { // use single precision - WaitAndGetHistograms(ptr_larger_leaf_hist_data); + WaitAndGetHistograms(ptr_larger_leaf_hist_data); } } } diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index 8686a5057510..f6e03acb5fa4 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -76,12 +76,7 @@ class GPUTreeLearner: public SerialTreeLearner { uint8_t s[4]; }; - /*! \brief Single precision histogram entiry for GPU */ - struct GPUHistogramBinEntry { - score_t sum_gradients; - score_t sum_hessians; - uint32_t cnt; - }; + typedef float gpu_hist_t; /*! 
* \brief Find the best number of workgroups processing one feature for maximizing efficiency @@ -133,7 +128,7 @@ class GPUTreeLearner: public SerialTreeLearner { * \param histograms Destination of histogram results from GPU. */ template - void WaitAndGetHistograms(HistogramBinEntry* histograms); + void WaitAndGetHistograms(hist_t* histograms); /*! * \brief Construct GPU histogram asynchronously. diff --git a/src/treelearner/ocl/histogram16.cl b/src/treelearner/ocl/histogram16.cl index efe0d8462cb8..eb15916066eb 100644 --- a/src/treelearner/ocl/histogram16.cl +++ b/src/treelearner/ocl/histogram16.cl @@ -163,7 +163,7 @@ R""() void within_kernel_reduction16x8(uchar8 feature_mask, __global const acc_type* restrict feature4_sub_hist, const uint skip_id, - acc_type stat_val, uint cnt_val, + acc_type stat_val, const ushort num_sub_hist, __global acc_type* restrict output_buf, __local acc_type * restrict local_hist) { @@ -181,33 +181,21 @@ void within_kernel_reduction16x8(uchar8 feature_mask, // 256 threads working on 8 features' 16 bins, gradient and hessian stat_val += *p; p += NUM_BINS * DWORD_FEATURES * 2; - if (ltid < LOCAL_SIZE_0 / 2) { - cnt_val += as_acc_int_type(*p); - } - p += NUM_BINS * DWORD_FEATURES; } // skip the counters we already have - p += 3 * DWORD_FEATURES * NUM_BINS; + p += 2 * DWORD_FEATURES * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { stat_val += *p; p += NUM_BINS * DWORD_FEATURES * 2; - if (ltid < LOCAL_SIZE_0 / 2) { - cnt_val += as_acc_int_type(*p); - } - p += NUM_BINS * DWORD_FEATURES; } #endif // printf("thread %d:feature=%d, bin_id=%d, hessian=%d, stat_val=%f, cnt=%d", ltid, feature_id, bin_id, is_hessian_first, stat_val, cnt_val); // now overwrite the local_hist for final reduction and output // reverse the f7...f0 order to match the real order feature_id = DWORD_FEATURES_MASK - feature_id; - local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + is_hessian_first] = stat_val; - bin_id = ltid >> (LOG2_DWORD_FEATURES); // range 0 - 16, for counter - if (ltid < LOCAL_SIZE_0 / 2) { - local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val); - } + local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + is_hessian_first] = stat_val; barrier(CLK_LOCAL_MEM_FENCE); - for (i = ltid; i < DWORD_FEATURES * 3 * NUM_BINS; i += lsize) { + for (i = ltid; i < DWORD_FEATURES * 2 * NUM_BINS; i += lsize) { output_buf[i] = local_hist[i]; } } @@ -335,7 +323,9 @@ __kernel void histogram16(__global const uchar4* feature_data_base, bk7_c_f0_bin16 bk7_c_f1_bin16 bk7_c_f2_bin16 bk7_c_f3_bin16 bk7_c_f4_bin16 bk7_c_f5_bin16 bk7_c_f6_bin16 bk7_c_f7_bin0 ----------------------------------------------- */ + #if CONST_HESSIAN == 1 __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS); + #endif // thread 0, 1, 2, 3, 4, 5, 6, 7 compute histograms for gradients first // thread 8, 9, 10, 11, 12, 13, 14, 15 compute histograms for hessians first @@ -547,7 +537,7 @@ R""() atomic_local_add_f(gh_hist + addr2, stat2); #endif } - + #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter // there are 8 counters for 8 features // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6, 7's counts for example 0, 1, 2, 3, 4, 5, 6, 7 @@ -614,6 +604,7 @@ R""() // printf("thread %x add counter %d feature %d (7)\n", ltid, bin, offset); atom_inc(cnt_hist + addr); } + #endif stat1 = stat1_next; stat2 = stat2_next; feature4 = feature4_next; @@ -642,6 +633,7 @@ R""() ushort bank_id = (i + offset) & BANK_MASK; stat_val += gh_hist[bin_id 
* HG_BIN_MULT + bank_id * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + feature_id]; } + #if CONST_HESSIAN == 1 if (ltid < LOCAL_SIZE_0 / 2) { // first 128 threads accumulate the 8 * 16 = 128 counter values bin_id = ltid >> LOG2_DWORD_FEATURES; // bits 3 - 6 range 0 - 16 is bin ID @@ -651,6 +643,7 @@ R""() cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * DWORD_FEATURES + feature_id]; } } + #endif // now thread 0 - 7 holds feature 0 - 7's gradient for bin 0 and counter bin 0 // now thread 8 - 15 holds feature 0 - 7's hessian for bin 0 and counter bin 1 @@ -687,7 +680,7 @@ R""() // write to output // write gradients and hessians histogram for all 4 features // output data in linear order for further reduction - // output size = 4 (features) * 3 (counters) * 64 (bins) * sizeof(float) + // output size = 4 (features) * 2 (counters) * 64 (bins) * sizeof(float) /* memory layout of output: g_f0_bin0 g_f1_bin0 g_f2_bin0 g_f3_bin0 g_f4_bin0 g_f5_bin0 g_f6_bin0 g_f7_bin0 h_f0_bin0 h_f1_bin0 h_f2_bin0 h_f3_bin0 h_f4_bin0 h_f5_bin0 h_f6_bin0 h_f7_bin0 @@ -705,14 +698,10 @@ R""() // if there is only one workgroup processing this feature4, don't even need to write uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS); #if POWER_FEATURE_WORKGROUPS != 0 - __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 3 * NUM_BINS; + __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 2 * NUM_BINS; // if g_val and h_val are double, they are converted to float here // write gradients and hessians for 8 features output[0 * DWORD_FEATURES * NUM_BINS + ltid] = stat_val; - // write counts for 8 features - if (ltid < LOCAL_SIZE_0 / 2) { - output[2 * DWORD_FEATURES * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val); - } barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE); // To avoid the cost of an extra reduction kernel, we have to deal with some @@ -738,7 +727,7 @@ R""() // This is done by using a global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL.
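Before the workgroup-synchronization code that follows, note that every 3 * NUM_BINS → 2 * NUM_BINS stride change in these kernels traces back to the same host-side layout change: a histogram bin no longer stores an explicit count, only a gradient/hessian pair, with HistogramBinEntry giving way to a flat hist_t array. A minimal C++ sketch of the before/after layout (field types are approximate, and the accessor helpers are illustrative rather than LightGBM API):

// Old layout (removed): array-of-structs, three fields per histogram bin.
struct HistogramBinEntry {
  double sum_gradients;
  double sum_hessians;
  int cnt;  // per-bin data count, now gone
};

// New layout: one flat array, two doubles per bin, interleaved as
// [g0, h0, g1, h1, ...]. Offsets are expressed in hist_t elements,
// which is why GroupBinBoundary(...) is multiplied by 2 and buffers
// are allocated as new hist_t[size * 2] in the GPU learner above.
typedef double hist_t;

inline hist_t BinGrad(const hist_t* hist, int bin) { return hist[2 * bin]; }
inline hist_t BinHess(const hist_t* hist, int bin) { return hist[2 * bin + 1]; }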
- __local uint * counter_val = cnt_hist; + __local uint * counter_val = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS); if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atom_inc(sync_counters + feature4_id); @@ -762,12 +751,12 @@ R""() // locate our feature4's block in output memory uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS); __global acc_type const * restrict feature4_subhists = - (__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 3 * NUM_BINS; + (__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 2 * NUM_BINS; // skip reading the data already in local memory uint skip_id = group_id ^ output_offset; // locate output histogram location for this feature4 - __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 3 * NUM_BINS; - within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val, cnt_val, + __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 2 * NUM_BINS; + within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val, 1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array); } } @@ -776,4 +765,3 @@ R""() // the +9 skips extra characters ")", newline, "#endif" and newline at the beginning // )"" "\n#endif" + 9 #endif - diff --git a/src/treelearner/ocl/histogram256.cl b/src/treelearner/ocl/histogram256.cl index 0fa4bed88412..6030044614a4 100644 --- a/src/treelearner/ocl/histogram256.cl +++ b/src/treelearner/ocl/histogram256.cl @@ -155,15 +155,6 @@ void within_kernel_reduction256x4(uchar4 feature_mask, acc_type f1_hess_bin = local_hist[ltid * 8 + 5]; acc_type f2_hess_bin = local_hist[ltid * 8 + 6]; acc_type f3_hess_bin = local_hist[ltid * 8 + 7]; - __local uint* restrict local_cnt = (__local uint *)(local_hist + 4 * 2 * NUM_BINS); - #if POWER_FEATURE_WORKGROUPS != 0 - uint f0_cont_bin = ltid ? 
local_cnt[ltid * 4] : old_val_f0_cont_bin0; - #else - uint f0_cont_bin = local_cnt[ltid * 4]; - #endif - uint f1_cont_bin = local_cnt[ltid * 4 + 1]; - uint f2_cont_bin = local_cnt[ltid * 4 + 2]; - uint f3_cont_bin = local_cnt[ltid * 4 + 3]; ushort i; // printf("%d-pre(skip %d): %f %f %f %f %f %f %f %f %d %d %d %d", ltid, skip_id, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin); #if POWER_FEATURE_WORKGROUPS != 0 @@ -173,70 +164,62 @@ void within_kernel_reduction256x4(uchar4 feature_mask, if (feature_mask.s3) { f0_grad_bin += *p; p += NUM_BINS; f0_hess_bin += *p; p += NUM_BINS; - f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } if (feature_mask.s2) { f1_grad_bin += *p; p += NUM_BINS; f1_hess_bin += *p; p += NUM_BINS; - f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } if (feature_mask.s1) { f2_grad_bin += *p; p += NUM_BINS; f2_hess_bin += *p; p += NUM_BINS; - f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } if (feature_mask.s0) { f3_grad_bin += *p; p += NUM_BINS; f3_hess_bin += *p; p += NUM_BINS; - f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } } // skip the counters we already have - p += 3 * 4 * NUM_BINS; + p += 2 * 4 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { if (feature_mask.s3) { f0_grad_bin += *p; p += NUM_BINS; f0_hess_bin += *p; p += NUM_BINS; - f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } if (feature_mask.s2) { f1_grad_bin += *p; p += NUM_BINS; f1_hess_bin += *p; p += NUM_BINS; - f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } if (feature_mask.s1) { f2_grad_bin += *p; p += NUM_BINS; f2_hess_bin += *p; p += NUM_BINS; - f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } if (feature_mask.s0) { f3_grad_bin += *p; p += NUM_BINS; f3_hess_bin += *p; p += NUM_BINS; - f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } } // printf("%d-aft: %f %f %f %f %f %f %f %f %d %d %d %d", ltid, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin); @@ -245,18 +228,14 @@ void within_kernel_reduction256x4(uchar4 feature_mask, barrier(CLK_LOCAL_MEM_FENCE); #if USE_DP_FLOAT == 0 // reverse the f3...f0 order to match the real order - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin; - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin; - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin); - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin; - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin; - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin); - local_hist[2 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin; - local_hist[2 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin; - local_hist[2 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin); - local_hist[3 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin; - local_hist[3 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin; - local_hist[3 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin); + 
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f3_grad_bin; + local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f3_hess_bin; + local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f2_grad_bin; + local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f2_hess_bin; + local_hist[2 * 2 * NUM_BINS + ltid * 2 + 0] = f1_grad_bin; + local_hist[2 * 2 * NUM_BINS + ltid * 2 + 1] = f1_hess_bin; + local_hist[3 * 2 * NUM_BINS + ltid * 2 + 0] = f0_grad_bin; + local_hist[3 * 2 * NUM_BINS + ltid * 2 + 1] = f0_hess_bin; barrier(CLK_LOCAL_MEM_FENCE); /* for (ushort i = ltid; i < 4 * 3 * NUM_BINS; i += lsize) { @@ -267,34 +246,28 @@ void within_kernel_reduction256x4(uchar4 feature_mask, if (feature_mask.s0) { output_buf[i] = local_hist[i]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; } - i += 1 * 3 * NUM_BINS; + i += 1 * 2 * NUM_BINS; if (feature_mask.s1) { output_buf[i] = local_hist[i]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; } - i += 1 * 3 * NUM_BINS; + i += 1 * 2 * NUM_BINS; if (feature_mask.s2) { output_buf[i] = local_hist[i]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; } - i += 1 * 3 * NUM_BINS; - if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) { + i += 1 * 2 * NUM_BINS; + if (feature_mask.s3 && i < 4 * 2 * NUM_BINS) { output_buf[i] = local_hist[i]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; } #else // when double precision is used, we need to write twice, because local memory size is not enough - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin; - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin; - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin); - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin; - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin; - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin); + local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f3_grad_bin; + local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f3_hess_bin; + local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f2_grad_bin; + local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f2_hess_bin; barrier(CLK_LOCAL_MEM_FENCE); /* for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) { @@ -305,21 +278,17 @@ void within_kernel_reduction256x4(uchar4 feature_mask, if (feature_mask.s0) { output_buf[i] = local_hist[i]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; } - i += 1 * 3 * NUM_BINS; + i += 1 * 2 * NUM_BINS; if (feature_mask.s1) { output_buf[i] = local_hist[i]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; } barrier(CLK_LOCAL_MEM_FENCE); - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin; - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin; - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin); - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin; - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin; - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin); + local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f1_grad_bin; + local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f1_hess_bin; + local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f0_grad_bin; + local_hist[1 * 2 * 
NUM_BINS + ltid * 2 + 1] = f0_hess_bin; barrier(CLK_LOCAL_MEM_FENCE); /* for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) { @@ -328,15 +297,13 @@ void within_kernel_reduction256x4(uchar4 feature_mask, */ i = ltid; if (feature_mask.s2) { - output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i]; - output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; + output_buf[i + 2 * 2 * NUM_BINS] = local_hist[i]; + output_buf[i + 2 * 2 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS]; } - i += 1 * 3 * NUM_BINS; + i += 1 * 2 * NUM_BINS; if (feature_mask.s3) { - output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i]; - output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; + output_buf[i + 2 * 2 * NUM_BINS] = local_hist[i]; + output_buf[i + 2 * 2 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS]; } #endif } @@ -401,7 +368,9 @@ __kernel void histogram256(__global const uchar4* feature_data_base, __local acc_type * gh_hist = (__local acc_type *)shared_array; // counter histogram // total size: 4 * 256 * sizeof(uint) = 4 KB + #if CONST_HESSIAN == 1 __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS); + #endif // thread 0, 1, 2, 3 compute histograms for gradients first // thread 4, 5, 6, 7 compute histograms for hessians first @@ -602,7 +571,7 @@ R""() s0_stat1 += stat1; s0_stat2 += stat2; } - + #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter // there are 4 counters for 4 features // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3 @@ -633,6 +602,7 @@ R""() addr = bin * 4 + offset; atom_inc(cnt_hist + addr); } + #endif stat1 = stat1_next; stat2 = stat2_next; feature4 = feature4_next; @@ -741,7 +711,7 @@ R""() uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS); // if there is only one workgroup processing this feature4, don't even need to write #if POWER_FEATURE_WORKGROUPS != 0 - __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS; + __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 2 * NUM_BINS; // write gradients and hessians __global acc_type * restrict ptr_f = output; for (ushort j = 0; j < 4; ++j) { @@ -751,17 +721,7 @@ R""() acc_type value = gh_hist[i * 4 + j]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - ptr_f += 3 * NUM_BINS; - } - // write counts - __global acc_int_type * restrict ptr_i = (__global acc_int_type * restrict)(output + 2 * NUM_BINS); - for (ushort j = 0; j < 4; ++j) { - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i * 4 + j]; - ptr_i[i] = value; - } - ptr_i += 3 * NUM_BINS; + ptr_f += 2 * NUM_BINS; } barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE); @@ -788,7 +748,7 @@ R""() // This is done by using a global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL.
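The #if CONST_HESSIAN == 1 guards above keep the counter histogram only in the constant-hessian build, where per-bin counts remain the cheapest way to recover the hessian sums: with a uniform hessian h_const, sum_hessians[bin] == cnt[bin] * h_const. A sketch of that reconstruction under the two-entries-per-bin layout (the function and its parameters are illustrative, not the library's API):

#include <cstdint>

typedef double hist_t;  // two entries per bin: [grad, hess]

// Rebuild the hessian slots of an interleaved histogram from bin
// counts when every data point carries the same hessian h_const.
void FillConstantHessians(hist_t* hist, const uint32_t* cnt,
                          int num_bins, double h_const) {
  for (int bin = 0; bin < num_bins; ++bin) {
    hist[2 * bin + 1] = static_cast<double>(cnt[bin]) * h_const;
  }
}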
- __local uint * counter_val = cnt_hist; + __local uint * counter_val = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS); // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -814,11 +774,11 @@ R""() // locate our feature4's block in output memory uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS); __global acc_type const * restrict feature4_subhists = - (__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS; + (__global acc_type *)output_buf + output_offset * 4 * 2 * NUM_BINS; // skip reading the data already in local memory uint skip_id = group_id ^ output_offset; // locate output histogram location for this feature4 - __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS; + __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 2 * NUM_BINS; within_kernel_reduction256x4(feature_mask, feature4_subhists, skip_id, old_val, 1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array); // if (ltid == 0) diff --git a/src/treelearner/ocl/histogram64.cl b/src/treelearner/ocl/histogram64.cl index 5b265abda703..d3c4d48c729f 100644 --- a/src/treelearner/ocl/histogram64.cl +++ b/src/treelearner/ocl/histogram64.cl @@ -157,7 +157,7 @@ R""() void within_kernel_reduction64x4(uchar4 feature_mask, __global const acc_type* restrict feature4_sub_hist, const uint skip_id, - acc_type g_val, acc_type h_val, uint cnt_val, + acc_type g_val, acc_type h_val, const ushort num_sub_hist, __global acc_type* restrict output_buf, __local acc_type * restrict local_hist) { @@ -173,38 +173,35 @@ void within_kernel_reduction64x4(uchar4 feature_mask, for (i = 0; i < skip_id; ++i) { g_val += *p; p += NUM_BINS * 4; // 256 threads working on 4 features' 64 bins h_val += *p; p += NUM_BINS * 4; - cnt_val += as_acc_int_type(*p); p += NUM_BINS * 4; } // skip the counters we already have - p += 3 * 4 * NUM_BINS; + p += 2 * 4 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { g_val += *p; p += NUM_BINS * 4; h_val += *p; p += NUM_BINS * 4; - cnt_val += as_acc_int_type(*p); p += NUM_BINS * 4; } #endif // printf("thread %d: g_val=%f, h_val=%f cnt=%d", ltid, g_val, h_val, cnt_val); // now overwrite the local_hist for final reduction and output // reverse the f3...f0 order to match the real order feature_id = 3 - feature_id; - local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 0] = g_val; - local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 1] = h_val; - local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val); + local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + 0] = g_val; + local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + 1] = h_val; barrier(CLK_LOCAL_MEM_FENCE); i = ltid; - if (feature_mask.s0 && i < 1 * 3 * NUM_BINS) { + if (feature_mask.s0 && i < 1 * 2 * NUM_BINS) { output_buf[i] = local_hist[i]; } - i += 1 * 3 * NUM_BINS; - if (feature_mask.s1 && i < 2 * 3 * NUM_BINS) { + i += 1 * 2 * NUM_BINS; + if (feature_mask.s1 && i < 2 * 2 * NUM_BINS) { output_buf[i] = local_hist[i]; } - i += 1 * 3 * NUM_BINS; - if (feature_mask.s2 && i < 3 * 3 * NUM_BINS) { + i += 1 * 2 * NUM_BINS; + if (feature_mask.s2 && i < 3 * 2 * NUM_BINS) { output_buf[i] = local_hist[i]; } - i += 1 * 3 * NUM_BINS; - if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) { + i += 1 * 2 * NUM_BINS; + if (feature_mask.s3 && i < 4 * 2 * NUM_BINS) { output_buf[i] = local_hist[i]; } } @@ -306,7 +303,9 @@ __kernel void histogram64(__global const uchar4* feature_data_base, bk3_c_f0_bin64 bk3_c_f1_bin64 bk3_c_f2_bin64 bk3_c_f3_bin64
----------------------------------------------- */ + #if CONST_HESSIAN == 1 __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS); + #endif // thread 0, 1, 2, 3 compute histograms for gradients first // thread 4, 5, 6, 7 compute histograms for hessians first @@ -509,7 +508,7 @@ R""() s0_stat1 += stat1; s0_stat2 += stat2; } - + #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter // there are 4 counters for 4 features // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3 @@ -540,6 +539,7 @@ R""() addr = bin * CNT_BIN_MULT + bank * 4 + offset; atom_inc(cnt_hist + addr); } + #endif stat1 = stat1_next; stat2 = stat2_next; feature4 = feature4_next; @@ -639,7 +639,9 @@ R""() ushort bank_id = (i + offset) & BANK_MASK; g_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id]; h_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id + 4]; + #if CONST_HESSIAN == 1 cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * 4 + feature_id]; + #endif } // now thread 0 - 3 holds feature 0, 1, 2, 3's gradient, hessian and count bin 0 // now thread 4 - 7 holds feature 0, 1, 2, 3's gradient, hessian and count bin 1 @@ -670,14 +672,12 @@ R""() // if there is only one workgroup processing this feature4, don't even need to write uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS); #if POWER_FEATURE_WORKGROUPS != 0 - __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS; + __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 2 * NUM_BINS; // if g_val and h_val are double, they are converted to float here // write gradients for 4 features output[0 * 4 * NUM_BINS + ltid] = g_val; // write hessians for 4 features output[1 * 4 * NUM_BINS + ltid] = h_val; - // write counts for 4 features - output[2 * 4 * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val); barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE); // To avoid the cost of an extra reduction kernel, we have to deal with some @@ -703,7 +703,7 @@ R""() // This is done by using a global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL.
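The comment above and the counter_val lines that follow describe how all three kernels avoid a separate reduction pass: each workgroup writes its partial histogram to global memory and atomically increments a per-feature counter, and whichever workgroup observes the final value knows every partial is published and performs the reduction itself. A CPU analogue of the pattern in C++, assuming the partials are already visible once the increment is observed (names are illustrative; out must be pre-sized and zeroed):

#include <atomic>
#include <cstddef>
#include <vector>

// Called by each worker after its partial histogram is written.
// The last worker to arrive sums all partials into *out.
bool ReduceIfLast(std::atomic<unsigned>* sync_counter, unsigned num_workers,
                  const std::vector<std::vector<double>>& partials,
                  std::vector<double>* out) {
  if (sync_counter->fetch_add(1, std::memory_order_acq_rel) + 1 < num_workers) {
    return false;  // another worker will run the reduction
  }
  for (const auto& part : partials) {
    for (std::size_t i = 0; i < part.size(); ++i) {
      (*out)[i] += part[i];
    }
  }
  return true;
}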
- __local uint * counter_val = cnt_hist; + __local uint * counter_val = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS); if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atom_inc(sync_counters + feature4_id); @@ -727,12 +727,12 @@ R""() // locate our feature4's block in output memory uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS); __global acc_type const * restrict feature4_subhists = - (__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS; + (__global acc_type *)output_buf + output_offset * 4 * 2 * NUM_BINS; // skip reading the data already in local memory uint skip_id = group_id ^ output_offset; // locate output histogram location for this feature4 - __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS; - within_kernel_reduction64x4(feature_mask, feature4_subhists, skip_id, g_val, h_val, cnt_val, + __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 2 * NUM_BINS; + within_kernel_reduction64x4(feature_mask, feature4_subhists, skip_id, g_val, h_val, 1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array); } } diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index c6754b517397..dde47d4989da 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -181,8 +181,8 @@ class VotingParallelTreeLearner: public TREELEARNER_T { /*! \brief Store global histogram for larger leaf */ std::unique_ptr larger_leaf_histogram_array_global_; - std::vector<HistogramBinEntry> smaller_leaf_histogram_data_; - std::vector<HistogramBinEntry> larger_leaf_histogram_data_; + std::vector<hist_t> smaller_leaf_histogram_data_; + std::vector<hist_t> larger_leaf_histogram_data_; std::vector feature_metas_; }; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 252ce5fdca28..84ff5f2ee5f3 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -18,14 +18,6 @@ namespace LightGBM { -#ifdef TIMETAG -std::chrono::duration init_train_time; -std::chrono::duration init_split_time; -std::chrono::duration hist_time; -std::chrono::duration find_split_time; -std::chrono::duration split_time; -std::chrono::duration ordered_bin_time; -#endif // TIMETAG SerialTreeLearner::SerialTreeLearner(const Config* config) :config_(config) { @@ -38,14 +30,7 @@ SerialTreeLearner::SerialTreeLearner(const Config* config) } SerialTreeLearner::~SerialTreeLearner() { - #ifdef TIMETAG - Log::Info("SerialTreeLearner::init_train costs %f", init_train_time * 1e-3); - Log::Info("SerialTreeLearner::init_split costs %f", init_split_time * 1e-3); - Log::Info("SerialTreeLearner::hist_build costs %f", hist_time * 1e-3); - Log::Info("SerialTreeLearner::find_split costs %f", find_split_time * 1e-3); - Log::Info("SerialTreeLearner::split costs %f", split_time * 1e-3); - Log::Info("SerialTreeLearner::ordered_bin costs %f", ordered_bin_time * 1e-3); - #endif + } void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { @@ -60,7 +45,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian } else { size_t total_histogram_size = 0; for (int i = 0; i < train_data_->num_features(); ++i) { - total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i); + total_histogram_size += KHistEntrySize * train_data_->FeatureNumBin(i); } max_cache_size = static_cast(config_->histogram_pool_size * 1024 * 1024 /
total_histogram_size); } @@ -68,19 +53,10 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian max_cache_size = std::max(2, max_cache_size); max_cache_size = std::min(max_cache_size, config_->num_leaves); - histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves); + // push split information for all leaves best_split_per_leaf_.resize(config_->num_leaves); - // get ordered bin - train_data_->CreateOrderedBins(&ordered_bins_); - // check existing for ordered bin - for (int i = 0; i < static_cast(ordered_bins_.size()); ++i) { - if (ordered_bins_[i] != nullptr) { - has_ordered_bin_ = true; - break; - } - } // initialize splits for leaf smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_data())); larger_leaf_splits_.reset(new LeafSplits(train_data_->num_data())); @@ -92,17 +68,10 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian // initialize ordered gradients and hessians ordered_gradients_.resize(num_data_); ordered_hessians_.resize(num_data_); - // if has ordered bin, need to allocate a buffer to fast split - if (has_ordered_bin_) { - is_data_in_leaf_.resize(num_data_); - std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), static_cast(0)); - ordered_bin_indices_.clear(); - for (int i = 0; i < static_cast(ordered_bins_.size()); i++) { - if (ordered_bins_[i] != nullptr) { - ordered_bin_indices_.push_back(i); - } - } - } + + GetMultiValBin(train_data_, true); + + histogram_pool_.DynamicChangeSize(train_data_, is_hist_colwise_, config_, max_cache_size, config_->num_leaves); Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_); if (CostEfficientGradientBoosting::IsEnable(config_)) { cegb_.reset(new CostEfficientGradientBoosting(this)); @@ -110,14 +79,23 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian } } +void SerialTreeLearner::GetMultiValBin(const Dataset* dataset, bool is_first_time) { + if (is_first_time) { + auto used_feature = GetUsedFeatures(true); + multi_val_bin_.reset(dataset->TestMultiThreadingMethod(ordered_gradients_.data(), ordered_hessians_.data(), used_feature, + is_constant_hessian_, config_->force_col_wise, config_->force_row_wise, &is_hist_colwise_)); + } else { + // cannot change is_hist_col_wise during training + multi_val_bin_.reset(dataset->TestMultiThreadingMethod(ordered_gradients_.data(), ordered_hessians_.data(), is_feature_used_, + is_constant_hessian_, is_hist_colwise_, !is_hist_colwise_, &is_hist_colwise_)); + } +} + void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { train_data_ = train_data; num_data_ = train_data_->num_data(); CHECK(num_features_ == train_data_->num_features()); - // get ordered bin - train_data_->CreateOrderedBins(&ordered_bins_); - // initialize splits for leaf smaller_leaf_splits_->ResetNumData(num_data_); larger_leaf_splits_->ResetNumData(num_data_); @@ -125,14 +103,12 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { // initialize data partition data_partition_->ResetNumData(num_data_); + GetMultiValBin(train_data_, false); + // initialize ordered gradients and hessians ordered_gradients_.resize(num_data_); ordered_hessians_.resize(num_data_); - // if has ordered bin, need to allocate a buffer to fast split - if (has_ordered_bin_) { - is_data_in_leaf_.resize(num_data_); - std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), static_cast(0)); - } + if (cegb_ != nullptr) { cegb_->Init(); } 
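The histogram_pool_size arithmetic in the surrounding hunks is simple enough to state on its own: the configured budget in megabytes is divided by the footprint of one complete set of per-feature histograms, then clamped to [2, num_leaves]. A standalone C++ sketch, assuming KHistEntrySize is the two-doubles-per-bin footprint (2 * sizeof(double) = 16 bytes) and at least one feature is present; the function and parameter names are illustrative:

#include <algorithm>
#include <cstddef>
#include <vector>

int MaxCacheSize(double histogram_pool_size_mb,
                 const std::vector<int>& bins_per_feature, int num_leaves) {
  const std::size_t kHistEntrySize = 2 * sizeof(double);  // assumed footprint
  std::size_t total_histogram_size = 0;
  for (int num_bin : bins_per_feature) {
    total_histogram_size += kHistEntrySize * num_bin;
  }
  int max_cache_size = static_cast<int>(
      histogram_pool_size_mb * 1024 * 1024 / total_histogram_size);
  max_cache_size = std::max(2, max_cache_size);  // always keep two leaves
  return std::min(max_cache_size, num_leaves);   // one slot per leaf at most
}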
@@ -148,14 +124,14 @@ void SerialTreeLearner::ResetConfig(const Config* config) { } else { size_t total_histogram_size = 0; for (int i = 0; i < train_data_->num_features(); ++i) { - total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i); + total_histogram_size += KHistEntrySize * train_data_->FeatureNumBin(i); } max_cache_size = static_cast(config_->histogram_pool_size * 1024 * 1024 / total_histogram_size); } // at least need 2 leaves max_cache_size = std::max(2, max_cache_size); max_cache_size = std::min(max_cache_size, config_->num_leaves); - histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves); + histogram_pool_.DynamicChangeSize(train_data_, is_hist_colwise_, config_, max_cache_size, config_->num_leaves); // push split information for all leaves best_split_per_leaf_.resize(config_->num_leaves); @@ -171,19 +147,14 @@ void SerialTreeLearner::ResetConfig(const Config* config) { } Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) { + Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); gradients_ = gradients; hessians_ = hessians; is_constant_hessian_ = is_constant_hessian; - #ifdef TIMETAG - auto start_time = std::chrono::steady_clock::now(); - #endif + // some initial works before training BeforeTrain(); - #ifdef TIMETAG - init_train_time += std::chrono::steady_clock::now() - start_time; - #endif - auto tree = std::unique_ptr(new Tree(config_->num_leaves)); // root leaf int left_leaf = 0; @@ -199,14 +170,8 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians } for (int split = init_splits; split < config_->num_leaves - 1; ++split) { - #ifdef TIMETAG - start_time = std::chrono::steady_clock::now(); - #endif // some initial works before finding best split if (!aborted_last_force_split && BeforeFindBestSplit(tree.get(), left_leaf, right_leaf)) { - #ifdef TIMETAG - init_split_time += std::chrono::steady_clock::now() - start_time; - #endif // find best threshold for every feature FindBestSplits(); } else if (aborted_last_force_split) { @@ -222,14 +187,8 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians Log::Warning("No further splits with positive gain, best gain: %f", best_leaf_SplitInfo.gain); break; } - #ifdef TIMETAG - start_time = std::chrono::steady_clock::now(); - #endif // split tree with best leaf Split(tree.get(), best_leaf, &left_leaf, &right_leaf); - #ifdef TIMETAG - split_time += std::chrono::steady_clock::now() - start_time; - #endif cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf)); } Log::Debug("Trained a tree with leaves = %d and max_depth = %d", tree->num_leaves(), cur_depth); @@ -319,6 +278,7 @@ std::vector SerialTreeLearner::GetUsedFeatures(bool is_tree_level) { } void SerialTreeLearner::BeforeTrain() { + Common::FunctionTimer fun_timer("SerialTreeLearner::BeforeTrain", global_timer); // reset histogram pool histogram_pool_.ResetMap(); @@ -350,54 +310,10 @@ void SerialTreeLearner::BeforeTrain() { } larger_leaf_splits_->Init(); - - // if has ordered bin, need to initialize the ordered bin - if (has_ordered_bin_) { - #ifdef TIMETAG - auto start_time = std::chrono::steady_clock::now(); - #endif - if (data_partition_->leaf_count(0) == num_data_) { - // use all data, pass nullptr - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (int i = 0; i < static_cast(ordered_bin_indices_.size()); ++i) { - 
OMP_LOOP_EX_BEGIN(); - ordered_bins_[ordered_bin_indices_[i]]->Init(nullptr, config_->num_leaves); - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - } else { - // bagging, only use part of data - - // mark used data - const data_size_t* indices = data_partition_->indices(); - data_size_t begin = data_partition_->leaf_begin(0); - data_size_t end = begin + data_partition_->leaf_count(0); - #pragma omp parallel for schedule(static, 512) if (end - begin >= 1024) - for (data_size_t i = begin; i < end; ++i) { - is_data_in_leaf_[indices[i]] = 1; - } - OMP_INIT_EX(); - // initialize ordered bin - #pragma omp parallel for schedule(static) - for (int i = 0; i < static_cast(ordered_bin_indices_.size()); ++i) { - OMP_LOOP_EX_BEGIN(); - ordered_bins_[ordered_bin_indices_[i]]->Init(is_data_in_leaf_.data(), config_->num_leaves); - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - #pragma omp parallel for schedule(static, 512) if (end - begin >= 1024) - for (data_size_t i = begin; i < end; ++i) { - is_data_in_leaf_[indices[i]] = 0; - } - } - #ifdef TIMETAG - ordered_bin_time += std::chrono::steady_clock::now() - start_time; - #endif - } } bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { + Common::FunctionTimer fun_timer("SerialTreeLearner::BeforeFindBestSplit", global_timer); // check depth of current leaf if (config_->max_depth > 0) { // only need to check left leaf, since right leaf is in same level of left leaf @@ -435,44 +351,6 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; } histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_); } - // split for the ordered bin - if (has_ordered_bin_ && right_leaf >= 0) { - #ifdef TIMETAG - auto start_time = std::chrono::steady_clock::now(); - #endif - // mark data that at left-leaf - const data_size_t* indices = data_partition_->indices(); - const auto left_cnt = data_partition_->leaf_count(left_leaf); - const auto right_cnt = data_partition_->leaf_count(right_leaf); - char mark = 1; - data_size_t begin = data_partition_->leaf_begin(left_leaf); - data_size_t end = begin + left_cnt; - if (left_cnt > right_cnt) { - begin = data_partition_->leaf_begin(right_leaf); - end = begin + right_cnt; - mark = 0; - } - #pragma omp parallel for schedule(static, 512) if (end - begin >= 1024) - for (data_size_t i = begin; i < end; ++i) { - is_data_in_leaf_[indices[i]] = 1; - } - OMP_INIT_EX(); - // split the ordered bin - #pragma omp parallel for schedule(static) - for (int i = 0; i < static_cast(ordered_bin_indices_.size()); ++i) { - OMP_LOOP_EX_BEGIN(); - ordered_bins_[ordered_bin_indices_[i]]->Split(left_leaf, right_leaf, is_data_in_leaf_.data(), mark); - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - #pragma omp parallel for schedule(static, 512) if (end - begin >= 1024) - for (data_size_t i = begin; i < end; ++i) { - is_data_in_leaf_[indices[i]] = 0; - } - #ifdef TIMETAG - ordered_bin_time += std::chrono::steady_clock::now() - start_time; - #endif - } return true; } @@ -494,37 +372,30 @@ void SerialTreeLearner::FindBestSplits() { } void SerialTreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { - #ifdef TIMETAG - auto start_time = std::chrono::steady_clock::now(); - #endif + Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms", global_timer); // construct smaller leaf - HistogramBinEntry* ptr_smaller_leaf_hist_data = 
smaller_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset; train_data_->ConstructHistograms(is_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), - smaller_leaf_splits_->LeafIndex(), - &ordered_bins_, gradients_, hessians_, + gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + multi_val_bin_.get(), is_hist_colwise_, ptr_smaller_leaf_hist_data); if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf - HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - KHistOffset; train_data_->ConstructHistograms(is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), - larger_leaf_splits_->LeafIndex(), - &ordered_bins_, gradients_, hessians_, + gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + multi_val_bin_.get(), is_hist_colwise_, ptr_larger_leaf_hist_data); } - #ifdef TIMETAG - hist_time += std::chrono::steady_clock::now() - start_time; - #endif } void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& is_feature_used, bool use_subtract) { - #ifdef TIMETAG - auto start_time = std::chrono::steady_clock::now(); - #endif + Common::FunctionTimer fun_timer("SerialTreeLearner::FindBestSplitsFromHistograms", global_timer); std::vector smaller_best(num_threads_); std::vector larger_best(num_threads_); std::vector smaller_node_used_features(num_features_, 1); @@ -534,7 +405,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& larger_node_used_features = GetUsedFeatures(false); } OMP_INIT_EX(); - // find splits + // find splits #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); @@ -543,7 +414,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& SplitInfo smaller_split; train_data_->FixHistogram(feature_index, smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), - smaller_leaf_splits_->num_data_in_leaf(), smaller_leaf_histogram_array_[feature_index].RawData()); int real_fidx = train_data_->RealFeatureIndex(feature_index); smaller_leaf_histogram_array_[feature_index].FindBestThreshold( @@ -567,7 +437,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]); } else { train_data_->FixHistogram(feature_index, larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(), - larger_leaf_splits_->num_data_in_leaf(), larger_leaf_histogram_array_[feature_index].RawData()); } SplitInfo larger_split; @@ -589,7 +458,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& OMP_LOOP_EX_END(); } OMP_THROW_EX(); - auto smaller_best_idx = ArrayArgs::ArgMax(smaller_best); int leaf = smaller_leaf_splits_->LeafIndex(); best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx]; @@ -599,9 +467,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& auto larger_best_idx = ArrayArgs::ArgMax(larger_best); best_split_per_leaf_[leaf] = larger_best[larger_best_idx]; } - #ifdef TIMETAG - find_split_time += std::chrono::steady_clock::now() - start_time; - #endif } int32_t 
SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf, @@ -769,69 +634,80 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json } void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) { - const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; + Common::FunctionTimer fun_timer("SerialTreeLearner::Split", global_timer); + SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature); if (cegb_ != nullptr) { cegb_->UpdateLeafBestSplits(tree, best_leaf, &best_split_info, &best_split_per_leaf_); } - // left = parent *left_leaf = best_leaf; + auto next_leaf_id = tree->NextLeafId(); + bool is_numerical_split = train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin; if (is_numerical_split) { auto threshold_double = train_data_->RealThreshold(inner_feature_index, best_split_info.threshold); + data_partition_->Split(best_leaf, train_data_, inner_feature_index, + &best_split_info.threshold, 1, best_split_info.default_left, next_leaf_id); + best_split_info.left_count = data_partition_->leaf_count(*left_leaf); + best_split_info.right_count = data_partition_->leaf_count(next_leaf_id); // split tree, will return right leaf *right_leaf = tree->Split(best_leaf, - inner_feature_index, - best_split_info.feature, - best_split_info.threshold, - threshold_double, - static_cast(best_split_info.left_output), - static_cast(best_split_info.right_output), - static_cast(best_split_info.left_count), - static_cast(best_split_info.right_count), - static_cast(best_split_info.left_sum_hessian), - static_cast(best_split_info.right_sum_hessian), - static_cast(best_split_info.gain), - train_data_->FeatureBinMapper(inner_feature_index)->missing_type(), - best_split_info.default_left); - data_partition_->Split(best_leaf, train_data_, inner_feature_index, - &best_split_info.threshold, 1, best_split_info.default_left, *right_leaf); + inner_feature_index, + best_split_info.feature, + best_split_info.threshold, + threshold_double, + static_cast(best_split_info.left_output), + static_cast(best_split_info.right_output), + static_cast(best_split_info.left_count), + static_cast(best_split_info.right_count), + static_cast(best_split_info.left_sum_hessian), + static_cast(best_split_info.right_sum_hessian), + static_cast(best_split_info.gain), + train_data_->FeatureBinMapper(inner_feature_index)->missing_type(), + best_split_info.default_left); + } else { + std::vector cat_bitset_inner = Common::ConstructBitset(best_split_info.cat_threshold.data(), best_split_info.num_cat_threshold); std::vector threshold_int(best_split_info.num_cat_threshold); for (int i = 0; i < best_split_info.num_cat_threshold; ++i) { threshold_int[i] = static_cast(train_data_->RealThreshold(inner_feature_index, best_split_info.cat_threshold[i])); } std::vector cat_bitset = Common::ConstructBitset(threshold_int.data(), best_split_info.num_cat_threshold); - *right_leaf = tree->SplitCategorical(best_leaf, - inner_feature_index, - best_split_info.feature, - cat_bitset_inner.data(), - static_cast(cat_bitset_inner.size()), - cat_bitset.data(), - static_cast(cat_bitset.size()), - static_cast(best_split_info.left_output), - static_cast(best_split_info.right_output), - static_cast(best_split_info.left_count), - static_cast(best_split_info.right_count), - static_cast(best_split_info.left_sum_hessian), - 
static_cast(best_split_info.right_sum_hessian), - static_cast(best_split_info.gain), - train_data_->FeatureBinMapper(inner_feature_index)->missing_type()); + data_partition_->Split(best_leaf, train_data_, inner_feature_index, - cat_bitset_inner.data(), static_cast(cat_bitset_inner.size()), best_split_info.default_left, *right_leaf); - } + cat_bitset_inner.data(), static_cast(cat_bitset_inner.size()), best_split_info.default_left, next_leaf_id); + + best_split_info.left_count = data_partition_->leaf_count(*left_leaf); + best_split_info.right_count = data_partition_->leaf_count(next_leaf_id); + + *right_leaf = tree->SplitCategorical(best_leaf, + inner_feature_index, + best_split_info.feature, + cat_bitset_inner.data(), + static_cast(cat_bitset_inner.size()), + cat_bitset.data(), + static_cast(cat_bitset.size()), + static_cast(best_split_info.left_output), + static_cast(best_split_info.right_output), + static_cast(best_split_info.left_count), + static_cast(best_split_info.right_count), + static_cast(best_split_info.left_sum_hessian), + static_cast(best_split_info.right_sum_hessian), + static_cast(best_split_info.gain), + train_data_->FeatureBinMapper(inner_feature_index)->missing_type()); + } + CHECK(*right_leaf == next_leaf_id); - #ifdef DEBUG - CHECK(best_split_info.left_count == data_partition_->leaf_count(best_leaf)); - #endif auto p_left = smaller_leaf_splits_.get(); auto p_right = larger_leaf_splits_.get(); // init the leaves that used on next iteration if (best_split_info.left_count < best_split_info.right_count) { + CHECK(best_split_info.left_count > 0); smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); } else { + CHECK(best_split_info.right_count > 0); smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); p_right = smaller_leaf_splits_.get(); diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 31743933a780..0fedefc5a15d 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -79,7 +79,12 @@ class SerialTreeLearner: public TreeLearner { void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; + bool IsHistColWise() const override { return is_hist_colwise_; } + protected: + + void GetMultiValBin(const Dataset* dataset, bool is_first_time); + virtual std::vector GetUsedFeatures(bool is_tree_level); /*! * \brief Some initial works before training @@ -161,17 +166,13 @@ class SerialTreeLearner: public TreeLearner { std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_hessians_; #else /*! \brief gradients of current iteration, ordered for cache optimized */ - std::vector<score_t> ordered_gradients_; + std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized */ - std::vector<score_t> ordered_hessians_; + std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_hessians_; #endif - /*! \brief Store ordered bin */ - std::vector<std::unique_ptr<OrderedBin>> ordered_bins_; - /*! \brief True if has ordered bin */ - bool has_ordered_bin_ = false; /*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */ - std::vector<char> is_data_in_leaf_; + std::vector<char, Common::AlignmentAllocator<char, kAlignedSize>> is_data_in_leaf_; /*! \brief used to cache historical histogram to speed up*/ HistogramPool histogram_pool_; /*! \brief config of tree learner*/ @@ -179,6 +180,8 @@ class SerialTreeLearner: public TreeLearner { int num_threads_; std::vector ordered_bin_indices_; bool is_constant_hessian_; + std::unique_ptr multi_val_bin_; + bool is_hist_colwise_; std::unique_ptr cegb_; }; diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index e13210b3c600..3fc644e540a2 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -36,7 +36,7 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data, b } } // calculate buffer size - size_t buffer_size = 2 * top_k_ * std::max(max_bin * sizeof(HistogramBinEntry), sizeof(LightSplitInfo) * num_machines_); + size_t buffer_size = 2 * top_k_ * std::max(max_bin * KHistEntrySize, sizeof(LightSplitInfo) * num_machines_); // left and right on same time, so need double size input_buffer_.resize(buffer_size); output_buffer_.resize(buffer_size); @@ -290,7 +290,6 @@ void VotingParallelTreeLearner::FindBestSplits() { const int real_feature_index = this->train_data_->RealFeatureIndex(feature_index); this->train_data_->FixHistogram(feature_index, this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), - this->smaller_leaf_splits_->num_data_in_leaf(), this->smaller_leaf_histogram_array_[feature_index].RawData()); this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold( @@ -308,7 +307,6 @@ this->larger_leaf_histogram_array_[feature_index].Subtract(this->smaller_leaf_histogram_array_[feature_index]); } else { this->train_data_->FixHistogram(feature_index, this->larger_leaf_splits_->sum_gradients(), this->larger_leaf_splits_->sum_hessians(), - this->larger_leaf_splits_->num_data_in_leaf(), this->larger_leaf_histogram_array_[feature_index].RawData()); } // find best threshold for larger child @@ -367,8 +365,8 @@ void VotingParallelTreeLearner::FindBestSplits() { CopyLocalHistogram(smaller_top_features, larger_top_features); // Reduce scatter for histogram - Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(), block_len_.data(), - output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramBinEntry::SumReducer); + Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(), block_len_.data(), + output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramSumReducer); this->FindBestSplitsFromHistograms(is_feature_used, false); } @@ -399,7 +397,6 @@ void VotingParallelTreeLearner::FindBestSplitsFromHistograms(cons this->train_data_->FixHistogram(feature_index, smaller_leaf_splits_global_->sum_gradients(), smaller_leaf_splits_global_->sum_hessians(), - GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()), smaller_leaf_histogram_array_global_[feature_index].RawData()); // find best threshold @@ -423,7 +420,6 @@ this->train_data_->FixHistogram(feature_index, larger_leaf_splits_global_->sum_gradients(), larger_leaf_splits_global_->sum_hessians(), - GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()),
larger_leaf_histogram_array_global_[feature_index].RawData()); // find best threshold diff --git a/tests/python_package_test/test_consistency.py b/tests/python_package_test/test_consistency.py index cc284aa9076a..63a5834cf619 100644 --- a/tests/python_package_test/test_consistency.py +++ b/tests/python_package_test/test_consistency.py @@ -38,7 +38,9 @@ def load_cpp_result(self, result_file='LightGBM_predict_result.txt'): return np.loadtxt(os.path.join(self.directory, result_file)) def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred): - gbm = lgb.train(self.params, lgb_train) + params = dict(self.params) + params['force_row_wise'] = True + gbm = lgb.train(params, lgb_train) y_pred = gbm.predict(X_test) cpp_pred = gbm.predict(X_test_fn) np.testing.assert_allclose(y_pred, cpp_pred) @@ -105,7 +107,9 @@ def test_lambdarank(self): X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True) group_train = fd.load_field('.train.query') lgb_train = lgb.Dataset(X_train, y_train, group=group_train) - gbm = lgb.LGBMRanker(**fd.params) + params = dict(fd.params) + params['force_col_wise'] = True + gbm = lgb.LGBMRanker(**params) gbm.fit(X_train, y_train, group=group_train) sk_pred = gbm.predict(X_test) fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 5d1ce43b03b5..72d263238ce7 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -66,7 +66,7 @@ def test_binary(self): verbose_eval=False, evals_result=evals_result) ret = log_loss(y_test, gbm.predict(X_test)) - self.assertLess(ret, 0.11) + self.assertLess(ret, 0.14) self.assertEqual(len(evals_result['valid_0']['binary_logloss']), 50) self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5) @@ -328,7 +328,7 @@ def test_multiclass(self): verbose_eval=False, evals_result=evals_result) ret = multi_logloss(y_test, gbm.predict(X_test)) - self.assertLess(ret, 0.15) + self.assertLess(ret, 0.16) self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5) def test_multiclass_rf(self): @@ -518,7 +518,7 @@ def test_early_stopping(self): valid_names=valid_set_name, verbose_eval=False, early_stopping_rounds=5) - self.assertLessEqual(gbm.best_iteration, 31) + self.assertLessEqual(gbm.best_iteration, 39) self.assertIn(valid_set_name, gbm.best_score) self.assertIn('binary_logloss', gbm.best_score[valid_set_name]) @@ -1740,7 +1740,7 @@ def test_node_level_subcol(self): verbose_eval=False, evals_result=evals_result) ret = log_loss(y_test, gbm.predict(X_test)) - self.assertLess(ret, 0.13) + self.assertLess(ret, 0.14) self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5) params['feature_fraction'] = 0.5 gbm2 = lgb.train(params, lgb_train, num_boost_round=25) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 8af3edf8f120..d9b9c872a2b5 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -77,7 +77,7 @@ def test_binary(self): gbm = lgb.LGBMClassifier(n_estimators=50, silent=True) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) ret = log_loss(y_test, gbm.predict_proba(X_test)) - self.assertLess(ret, 0.11) + self.assertLess(ret, 0.12) self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1], places=5) def 
test_regression(self): @@ -97,7 +97,7 @@ def test_multiclass(self): ret = multi_error(y_test, gbm.predict(X_test)) self.assertLess(ret, 0.05) ret = multi_logloss(y_test, gbm.predict_proba(X_test)) - self.assertLess(ret, 0.15) + self.assertLess(ret, 0.16) self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1], places=5) def test_lambdarank(self): @@ -114,8 +114,8 @@ def test_lambdarank(self): eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=10, verbose=False, callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))]) self.assertLessEqual(gbm.best_iteration_, 24) - self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6333) - self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6048) + self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.5769) + self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.5920) def test_xendcg(self): dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -129,7 +129,7 @@ def test_xendcg(self): eval_metric='ndcg', callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))]) self.assertLessEqual(gbm.best_iteration_, 24) - self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6579) + self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6559) self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6421) def test_regression_with_custom_objective(self): diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index c923fad9be6c..f899f97e4dc4 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -30,24 +30,24 @@ SAK SAK LightGBM - 8.1 + 10.0 - v140 + v142 - v140 + v142 - v140 + v142 DynamicLibrary - v140 + v142 - v140 + v142 @@ -95,6 +95,8 @@ false Disabled MultiThreadedDebugDLL + true + AdvancedVectorExtensions2 @@ -116,6 +118,8 @@ false Disabled MultiThreadedDebugDLL + true + AdvancedVectorExtensions2 @@ -137,6 +141,8 @@ true true MultiThreadedDLL + true + AdvancedVectorExtensions2 @@ -162,6 +168,8 @@ MultiThreadedDLL true true + true + AdvancedVectorExtensions2 @@ -181,6 +189,8 @@ MultiThreadedDLL true true + true + AdvancedVectorExtensions2 @@ -224,7 +234,8 @@ - + + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index c5f5c94a93d0..4f706fb17c42 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -57,9 +57,6 @@ src\io - - src\io - src\io @@ -213,6 +210,12 @@ src\treelearner + + src\io + + + src\io +