Support both row-wise and col-wise multi-threading (#2699)
* commit

* fix a bug

* fix bug

* reset to track changes

* refine the auto choose logic

* sort the time stats output

* fix include

* change  multi_val_bin_sparse_threshold

* add cmake

* add _mm_malloc and _mm_free for cross platform

* fix cmake bug

* timer for split

* try to fix cmake

* fix tests

* refactor DataPartition::Split

* fix test

* typo

* formating

* Revert "formating"

This reverts commit 5b8de4f.

* add document

* [R-package] Added tests on use of force_col_wise and force_row_wise in training (#2719)

* naming

* fix gpu code

* Update include/LightGBM/bin.h

Co-Authored-By: James Lamb <jaylamb20@gmail.com>

* Update src/treelearner/ocl/histogram16.cl

* test: swap compilers for CI

* fix omp

* not avx2

* no aligned for feature histogram

* Revert "refactor DataPartition::Split"

This reverts commit 256e6d9.

* slightly refactor data partition

* reduce the memory cost

Co-authored-by: James Lamb <jaylamb20@gmail.com>
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
3 people authored Feb 2, 2020
1 parent bc7bc4a commit 509c2e5
Showing 50 changed files with 2,195 additions and 1,499 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
@@ -44,11 +44,11 @@ before_install:
- export BUILD_DIRECTORY="$TRAVIS_BUILD_DIR"
- if [[ $TRAVIS_OS_NAME == "osx" ]]; then
export OS_NAME="macos";
-export COMPILER="gcc";
+export COMPILER="clang";
export R_MAC_VERSION=3.6.1;
else
export OS_NAME="linux";
-export COMPILER="clang";
+export COMPILER="gcc";
export R_TRAVIS_LINUX_VERSION=3.6.1-3bionic;
fi
- export CONDA="$HOME/miniconda"
4 changes: 2 additions & 2 deletions .vsts-ci.yml
@@ -17,7 +17,7 @@ jobs:
- job: Linux
###########################################
variables:
-COMPILER: gcc
+COMPILER: clang
pool:
vmImage: 'ubuntu-16.04'
container: ubuntu1404
@@ -72,7 +72,7 @@ jobs:
- job: MacOS
###########################################
variables:
-COMPILER: clang
+COMPILER: gcc
pool:
vmImage: 'macOS-10.13'
strategy:
19 changes: 19 additions & 0 deletions CMakeLists.txt
@@ -68,6 +68,10 @@ if(USE_R35)
ADD_DEFINITIONS(-DR_VER_ABOVE_35)
endif(USE_R35)

if(USE_TIMETAG)
ADD_DEFINITIONS(-DTIMETAG)
endif(USE_TIMETAG)

if(USE_MPI)
find_package(MPI REQUIRED)
ADD_DEFINITIONS(-DUSE_MPI)
@@ -130,6 +134,21 @@ if(${MM_PREFETCH})
ADD_DEFINITIONS(-DMM_PREFETCH)
endif()

include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
#include <mm_malloc.h>
int main() {
char *a = (char*)_mm_malloc(8, 16);
_mm_free(a);
return 0;
}
" MM_MALLOC)

if(${MM_MALLOC})
message(STATUS "Use _mm_malloc")
ADD_DEFINITIONS(-DMM_MALLOC)
endif()

if(UNIX OR MINGW OR CYGWIN)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -O3 -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type")
if(USE_SWIG)
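
An aside before the test changes: the MM_MALLOC define introduced by the check above matches the "add _mm_malloc and _mm_free for cross platform" entry in the commit log. A hedged C++ sketch of how such a define could gate an aligned-allocation wrapper (``AlignedAlloc``/``AlignedFree`` are hypothetical names, not necessarily what the commit uses):

#include <cstddef>
#include <cstdlib>
#ifdef MM_MALLOC
#include <mm_malloc.h>
#endif

inline void* AlignedAlloc(std::size_t size, std::size_t alignment) {
#ifdef MM_MALLOC
  return _mm_malloc(size, alignment);  // aligned allocation is available
#else
  (void)alignment;                     // portable fallback: unaligned
  return std::malloc(size);
#endif
}

inline void AlignedFree(void* ptr) {
#ifdef MM_MALLOC
  _mm_free(ptr);                       // must pair with _mm_malloc
#else
  std::free(ptr);
#endif
}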
43 changes: 43 additions & 0 deletions R-package/tests/testthat/test_basic.R
@@ -252,3 +252,46 @@ test_that("lgb.train() throws an informative error if 'valids' contains lgb.Data
)
}, regexp = "each element of valids must have a name")
})

test_that("lgb.train() works with force_col_wise and force_row_wise", {
set.seed(1234L)
nrounds <- 10L
dtrain <- lgb.Dataset(
train$data
, label = train$label
)
params <- list(
objective = "binary"
, metric = "binary_error"
, force_col_wise = TRUE
)
bst_colwise <- lgb.train(
params = params
, data = dtrain
, nrounds = nrounds
)

params <- list(
objective = "binary"
, metric = "binary_error"
, force_row_wise = TRUE
)
bst_row_wise <- lgb.train(
params = params
, data = dtrain
, nrounds = nrounds
)

expected_error <- 0.003070782
expect_equal(bst_colwise$eval_train()[[1L]][["value"]], expected_error)
expect_equal(bst_row_wise$eval_train()[[1L]][["value"]], expected_error)

# check some basic details of the boosters just to be sure force_col_wise
# and force_row_wise are not causing any weird side effects
for (bst in list(bst_row_wise, bst_colwise)) {
expect_equal(bst$current_iter(), nrounds)
parsed_model <- jsonlite::fromJSON(bst$dump_model())
expect_equal(parsed_model$objective, "binary sigmoid:1")
expect_false(parsed_model$average_output)
}
})
4 changes: 2 additions & 2 deletions R-package/tests/testthat/test_learning_to_rank.R
@@ -47,8 +47,8 @@ test_that("learning-to-rank with lgb.train() works as expected", {
}
expect_identical(sapply(eval_results, function(x) {x$name}), eval_names)
expect_equal(eval_results[[1L]][["value"]], 0.825)
-expect_true(abs(eval_results[[2L]][["value"]] - 0.795986) < TOLERANCE)
-expect_true(abs(eval_results[[3L]][["value"]] - 0.7734639) < TOLERANCE)
+expect_true(abs(eval_results[[2L]][["value"]] - 0.7766434) < TOLERANCE)
+expect_true(abs(eval_results[[3L]][["value"]] - 0.7527939) < TOLERANCE)
})

test_that("learning-to-rank with lgb.cv() works as expected", {
48 changes: 32 additions & 16 deletions docs/Parameters.rst
@@ -190,6 +190,38 @@ Core Parameters
Learning Control Parameters
---------------------------

- ``force_col_wise`` :raw-html:`<a id="force_col_wise" title="Permalink to this parameter" href="#force_col_wise">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

- setting ``force_col_wise=true`` forces LightGBM to use the col-wise histogram build

- ``force_col_wise=true`` is recommended when:

- the number of columns is large, or the total number of bins is large

- ``num_threads`` is large, e.g. ``> 20``

- you want to use a small ``feature_fraction``, e.g. ``0.5``, to speed up training

- you want to reduce the memory cost

- when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both and use the faster one

- ``force_row_wise`` :raw-html:`<a id="force_row_wise" title="Permalink to this parameter" href="#force_row_wise">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

- setting ``force_row_wise=true`` forces LightGBM to use the row-wise histogram build

- ``force_row_wise=true`` is recommended when:

- the number of data points is large, and the total number of bins is relatively small

- you want to use a small ``bagging_fraction``, or ``goss``, to speed up training

- ``num_threads`` is relatively small, e.g. ``<= 16``

- setting ``force_row_wise=true`` will double the memory cost of the Dataset object; if you run out of memory, try ``force_col_wise=true`` instead

- when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both and use the faster one (a sketch of this selection idea follows below).
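
For intuition only, here is a hedged C++ sketch of the "try both, keep the faster one" selection described above; the names (``HistMode``, ``TimeOnce``, ``ChooseHistMode``) are hypothetical and do not come from this commit:

#include <chrono>

enum class HistMode { kColWise, kRowWise };

// Run one histogram-construction pass and measure the wall-clock time.
template <typename BuildFn>
double TimeOnce(BuildFn&& build) {
  const auto start = std::chrono::steady_clock::now();
  build();
  return std::chrono::duration<double>(
      std::chrono::steady_clock::now() - start).count();
}

// Try the col-wise and row-wise builds once each on the first iteration
// and keep whichever finished faster.
template <typename ColFn, typename RowFn>
HistMode ChooseHistMode(ColFn&& build_col_wise, RowFn&& build_row_wise) {
  const double col_time = TimeOnce(build_col_wise);
  const double row_time = TimeOnce(build_row_wise);
  return col_time <= row_time ? HistMode::kColWise : HistMode::kRowWise;
}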

- ``max_depth`` :raw-html:`<a id="max_depth" title="Permalink to this parameter" href="#max_depth">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int

- limit the max depth for tree model. This is used to deal with over-fitting when ``#data`` is small. Tree still grows leaf-wise
@@ -559,22 +591,6 @@ IO Parameters

- **Note**: disabling this may cause the slow training speed for sparse datasets

- ``max_conflict_rate`` :raw-html:`<a id="max_conflict_rate" title="Permalink to this parameter" href="#max_conflict_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.0``, type = double, constraints: ``0.0 <= max_conflict_rate < 1.0``

- max conflict rate for bundles in EFB

- set this to ``0.0`` to disallow the conflict and provide more accurate results

- set this to a larger value to achieve faster speed

- ``is_enable_sparse`` :raw-html:`<a id="is_enable_sparse" title="Permalink to this parameter" href="#is_enable_sparse">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool, aliases: ``is_sparse``, ``enable_sparse``, ``sparse``

- used to enable/disable sparse optimization

- ``sparse_threshold`` :raw-html:`<a id="sparse_threshold" title="Permalink to this parameter" href="#sparse_threshold">&#x1F517;&#xFE0E;</a>`, default = ``0.8``, type = double, constraints: ``0.0 < sparse_threshold <= 1.0``

- the threshold of zero elements percentage for treating a feature as a sparse one

- ``use_missing`` :raw-html:`<a id="use_missing" title="Permalink to this parameter" href="#use_missing">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool

- set this to ``false`` to disable the special handling of missing values
122 changes: 68 additions & 54 deletions include/LightGBM/bin.h
@@ -29,36 +29,29 @@ enum MissingType {
NaN
};

/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
public:
/*! \brief Sum of gradients on this bin */
double sum_gradients = 0.0f;
/*! \brief Sum of hessians on this bin */
double sum_hessians = 0.0f;
/*! \brief Number of data on this bin */
data_size_t cnt = 0;
/*!
* \brief Sum up (reducers) functions for histogram bin
*/
inline static void SumReducer(const char *src, char *dst, int type_size, comm_size_t len) {
comm_size_t used_size = 0;
const HistogramBinEntry* p1;
HistogramBinEntry* p2;
while (used_size < len) {
// convert
p1 = reinterpret_cast<const HistogramBinEntry*>(src);
p2 = reinterpret_cast<HistogramBinEntry*>(dst);
// add
p2->cnt += p1->cnt;
p2->sum_gradients += p1->sum_gradients;
p2->sum_hessians += p1->sum_hessians;
src += type_size;
dst += type_size;
used_size += type_size;
}
typedef double hist_t;

const size_t KHistEntrySize = 2 * sizeof(hist_t);
const int KHistOffset = 2;
const double kSparseThreshold = 0.7;

#define GET_GRAD(hist, i) hist[(i) << 1]
#define GET_HESS(hist, i) hist[((i) << 1) + 1]

inline static void HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
comm_size_t used_size = 0;
const hist_t* p1;
hist_t* p2;
while (used_size < len) {
// convert
p1 = reinterpret_cast<const hist_t*>(src);
p2 = reinterpret_cast<hist_t*>(dst);
*p2 += *p1;
src += type_size;
dst += type_size;
used_size += type_size;
}
};
}
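
To make the new layout concrete: a histogram is now a flat ``hist_t`` array holding one gradient/hessian pair per bin (hence ``KHistEntrySize = 2 * sizeof(hist_t)``), and the ``GET_GRAD``/``GET_HESS`` macros index into it. A self-contained illustration of that indexing follows; ``NaiveConstructHistogram`` and ``bin_of`` are hypothetical stand-ins, not code from this diff:

#include <cstddef>
#include <vector>

typedef double hist_t;
#define GET_GRAD(hist, i) hist[(i) << 1]
#define GET_HESS(hist, i) hist[((i) << 1) + 1]

// out must hold 2 * num_bins entries: [grad0, hess0, grad1, hess1, ...]
void NaiveConstructHistogram(const std::vector<int>& bin_of,
                             const std::vector<double>& gradients,
                             const std::vector<double>& hessians,
                             int num_bins, hist_t* out) {
  for (int i = 0; i < 2 * num_bins; ++i) out[i] = 0.0;
  for (std::size_t row = 0; row < bin_of.size(); ++row) {
    const int bin = bin_of[row];
    GET_GRAD(out, bin) += gradients[row];  // even slot: gradient sum
    GET_HESS(out, bin) += hessians[row];   // odd slot: hessian sum
  }
}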

/*! \brief This class used to convert feature values into bin,
* and store some meta information for bin*/
@@ -252,7 +245,7 @@ class OrderedBin {
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients,
-const score_t* hessians, HistogramBinEntry* out) const = 0;
+const score_t* hessians, hist_t* out) const = 0;

/*!
* \brief Construct histogram by using this bin
@@ -262,7 +255,7 @@
* \param gradients Gradients, Note:non-ordered by leaf
* \param out Output Result
*/
-virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
+virtual void ConstructHistogram(int leaf, const score_t* gradients, hist_t* out) const = 0;

/*!
* \brief Split current bin, and perform re-order by leaf
@@ -360,11 +353,11 @@ class Bin {
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
-HistogramBinEntry* out) const = 0;
+hist_t* out) const = 0;

virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
-HistogramBinEntry* out) const = 0;
+hist_t* out) const = 0;

/*!
* \brief Construct histogram of this feature,
@@ -380,10 +373,10 @@
* \param out Output Result
*/
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
-const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
+const score_t* ordered_gradients, hist_t* out) const = 0;

virtual void ConstructHistogram(data_size_t start, data_size_t end,
-const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
+const score_t* ordered_gradients, hist_t* out) const = 0;

/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
@@ -423,30 +416,11 @@ class Bin {
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const = 0;

/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual OrderedBin* CreateOrderedBin() const = 0;

/*!
* \brief After pushed all feature data, call this could have better refactor for bin data
*/
virtual void FinishLoad() = 0;

/*!
* \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
* \param num_data Total number of data
* \param num_bin Number of bin
* \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
* \param is_sparse Will set to true if this bin is sparse
* \return The bin data object
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse);

/*!
* \brief Create object for bin data of one feature, used for dense feature
* \param num_data Total number of data
@@ -469,6 +443,46 @@
virtual Bin* Clone() = 0;
};


class MultiValBin {
public:

virtual ~MultiValBin() {}

virtual data_size_t num_data() const = 0;

virtual int32_t num_bin() const = 0;

virtual void ReSize(data_size_t num_data) = 0;

virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;

virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;

virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const = 0;

virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const = 0;

virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;

virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;

virtual void FinishLoad() = 0;

virtual bool IsSparse() = 0;

static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate);

virtual MultiValBin* Clone() = 0;
};
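
The ``MultiValBin`` interface above is what enables the row-wise path: one object stores the bin values of many features per row, so a single sweep over the rows updates one shared histogram buffer. A hedged sketch of that access pattern, assuming a dense row-major layout with pre-offset bin values (``RowWiseConstructHistogram`` and its parameters are illustrative, not from this diff):

#include <cstddef>
#include <cstdint>
#include <vector>

typedef double hist_t;

// bins is row-major: row r occupies bins[r * k .. r * k + k - 1], and each
// stored value is assumed already offset into the shared histogram space.
void RowWiseConstructHistogram(const std::vector<uint32_t>& bins, int k,
                               const std::vector<double>& gradients,
                               const std::vector<double>& hessians,
                               hist_t* out) {
  const std::size_t num_rows = bins.size() / static_cast<std::size_t>(k);
  for (std::size_t row = 0; row < num_rows; ++row) {
    const uint32_t* row_bins = &bins[row * k];
    for (int j = 0; j < k; ++j) {
      // each bin owns two adjacent hist_t slots: gradient, then hessian
      hist_t* pos = out + (static_cast<std::size_t>(row_bins[j]) << 1);
      pos[0] += gradients[row];
      pos[1] += hessians[row];
    }
  }
}

Col-wise construction instead walks one feature's bin column at a time, which is why the docs above recommend it when there are many columns or bins and many threads.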

inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) {
