diff --git a/.travis.yml b/.travis.yml
index bbd72bb9b207..aff0084711d2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -44,11 +44,11 @@ before_install:
   - export BUILD_DIRECTORY="$TRAVIS_BUILD_DIR"
   - if [[ $TRAVIS_OS_NAME == "osx" ]]; then
       export OS_NAME="macos";
-      export COMPILER="gcc";
+      export COMPILER="clang";
       export R_MAC_VERSION=3.6.1;
     else
       export OS_NAME="linux";
-      export COMPILER="clang";
+      export COMPILER="gcc";
       export R_TRAVIS_LINUX_VERSION=3.6.1-3bionic;
     fi
   - export CONDA="$HOME/miniconda"
diff --git a/.vsts-ci.yml b/.vsts-ci.yml
index 98a4f591e6ef..0ae390e07ab4 100644
--- a/.vsts-ci.yml
+++ b/.vsts-ci.yml
@@ -17,7 +17,7 @@ jobs:
 - job: Linux
 ###########################################
   variables:
-    COMPILER: gcc
+    COMPILER: clang
   pool:
     vmImage: 'ubuntu-16.04'
   container: ubuntu1404
@@ -72,7 +72,7 @@ jobs:
 - job: MacOS
 ###########################################
   variables:
-    COMPILER: clang
+    COMPILER: gcc
   pool:
     vmImage: 'macOS-10.13'
   strategy:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd1d57e46553..53efb3bc6fed 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,6 +68,10 @@ if(USE_R35)
     ADD_DEFINITIONS(-DR_VER_ABOVE_35)
 endif(USE_R35)
 
+if(USE_TIMETAG)
+    ADD_DEFINITIONS(-DTIMETAG)
+endif(USE_TIMETAG)
+
 if(USE_MPI)
     find_package(MPI REQUIRED)
     ADD_DEFINITIONS(-DUSE_MPI)
@@ -130,6 +134,21 @@ if(${MM_PREFETCH})
     ADD_DEFINITIONS(-DMM_PREFETCH)
 endif()
 
+include(CheckCXXSourceCompiles)
+check_cxx_source_compiles("
+#include <mm_malloc.h>
+int main() {
+  char *a = (char*)_mm_malloc(8, 16);
+  _mm_free(a);
+  return 0;
+}
+" MM_MALLOC)
+
+if(${MM_MALLOC})
+    message(STATUS "Use _mm_malloc")
+    ADD_DEFINITIONS(-DMM_MALLOC)
+endif()
+
 if(UNIX OR MINGW OR CYGWIN)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -O3 -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type")
     if(USE_SWIG)
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index d74739150aa4..7d43706cf45d 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -252,3 +252,46 @@ test_that("lgb.train() throws an informative error if 'valids' contains lgb.Data
     )
   }, regexp = "each element of valids must have a name")
 })
+
+test_that("lgb.train() works with force_col_wise and force_row_wise", {
+  set.seed(1234L)
+  nrounds <- 10L
+  dtrain <- lgb.Dataset(
+    train$data
+    , label = train$label
+  )
+  params <- list(
+    objective = "binary"
+    , metric = "binary_error"
+    , force_col_wise = TRUE
+  )
+  bst_col_wise <- lgb.train(
+    params = params
+    , data = dtrain
+    , nrounds = nrounds
+  )
+
+  params <- list(
+    objective = "binary"
+    , metric = "binary_error"
+    , force_row_wise = TRUE
+  )
+  bst_row_wise <- lgb.train(
+    params = params
+    , data = dtrain
+    , nrounds = nrounds
+  )
+
+  expected_error <- 0.003070782
+  expect_equal(bst_col_wise$eval_train()[[1L]][["value"]], expected_error)
+  expect_equal(bst_row_wise$eval_train()[[1L]][["value"]], expected_error)
+
+  # check some basic details of the boosters just to be sure force_col_wise
+  # and force_row_wise are not causing any weird side effects
+  for (bst in list(bst_row_wise, bst_col_wise)) {
+    expect_equal(bst$current_iter(), nrounds)
+    parsed_model <- jsonlite::fromJSON(bst$dump_model())
+    expect_equal(parsed_model$objective, "binary sigmoid:1")
+    expect_false(parsed_model$average_output)
+  }
+})
diff --git a/R-package/tests/testthat/test_learning_to_rank.R b/R-package/tests/testthat/test_learning_to_rank.R
index 049ba53c78f6..65768a9ae178 100644
--- a/R-package/tests/testthat/test_learning_to_rank.R
+++ b/R-package/tests/testthat/test_learning_to_rank.R
@@ -47,8 +47,8 @@ test_that("learning-to-rank with lgb.train() works as expected", {
   }
   expect_identical(sapply(eval_results, function(x) {x$name}), eval_names)
   expect_equal(eval_results[[1L]][["value"]], 0.825)
-  expect_true(abs(eval_results[[2L]][["value"]] - 0.795986) < TOLERANCE)
-  expect_true(abs(eval_results[[3L]][["value"]] - 0.7734639) < TOLERANCE)
+  expect_true(abs(eval_results[[2L]][["value"]] - 0.7766434) < TOLERANCE)
+  expect_true(abs(eval_results[[3L]][["value"]] - 0.7527939) < TOLERANCE)
 })
 
 test_that("learning-to-rank with lgb.cv() works as expected", {
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 7ce7fdad2306..4bc708c222bf 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -190,6 +190,38 @@ Core Parameters
 Learning Control Parameters
 ---------------------------
 
+- ``force_col_wise`` :raw-html:`🔗︎`, default = ``false``, type = bool
+
+   - set ``force_col_wise=true`` to force LightGBM to use the col-wise histogram build
+
+   - ``force_col_wise=true`` is recommended when:
+
+      - the number of columns is large, or the total number of bins is large
+
+      - ``num_threads`` is large, e.g. ``> 20``
+
+      - you want to use a small ``feature_fraction``, e.g. ``0.5``, to speed up training
+
+      - you want to reduce memory cost
+
+   - when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them and then use the faster one
+
+- ``force_row_wise`` :raw-html:`🔗︎`, default = ``false``, type = bool
+
+   - set ``force_row_wise=true`` to force LightGBM to use the row-wise histogram build
+
+   - ``force_row_wise=true`` is recommended when:
+
+      - the number of data points is large, and the total number of bins is relatively small
+
+      - you want to use a small ``bagging_fraction`` or ``goss`` to speed up training
+
+      - ``num_threads`` is relatively small, e.g. ``<= 16``
+
+   - setting ``force_row_wise=true`` will double the memory cost of the Dataset object; if you do not have enough memory, use ``force_col_wise=true`` instead
+
+   - when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them and then use the faster one
+
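Since these two new parameters are plain key=value config strings, they can be exercised end-to-end through the existing C API. A minimal sketch (the file name, parameter strings, and iteration count are illustrative assumptions, not part of this patch):

```cpp
#include <LightGBM/c_api.h>
#include <cstdio>

int main() {
  DatasetHandle train_data;
  // "train.svm" is a placeholder file name
  if (LGBM_DatasetCreateFromFile("train.svm", "max_bin=255", nullptr, &train_data) != 0) {
    std::fprintf(stderr, "failed to load dataset\n");
    return 1;
  }
  BoosterHandle booster;
  // force the col-wise histogram build; with both force_* flags false,
  // LightGBM would time both strategies and keep the faster one
  LGBM_BoosterCreate(train_data, "objective=binary force_col_wise=true", &booster);
  int is_finished = 0;
  for (int i = 0; i < 10 && !is_finished; ++i) {
    LGBM_BoosterUpdateOneIter(booster, &is_finished);
  }
  LGBM_BoosterFree(booster);
  LGBM_DatasetFree(train_data);
  return 0;
}
```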
 - ``max_depth`` :raw-html:`🔗︎`, default = ``-1``, type = int
 
    - limit the max depth for tree model. This is used to deal with over-fitting when ``#data`` is small. Tree still grows leaf-wise
@@ -559,22 +591,6 @@ IO Parameters
 
    - **Note**: disabling this may cause the slow training speed for sparse datasets
 
-- ``max_conflict_rate`` :raw-html:`🔗︎`, default = ``0.0``, type = double, constraints: ``0.0 <= max_conflict_rate < 1.0``
-
-   - max conflict rate for bundles in EFB
-
-   - set this to ``0.0`` to disallow the conflict and provide more accurate results
-
-   - set this to a larger value to achieve faster speed
-
-- ``is_enable_sparse`` :raw-html:`🔗︎`, default = ``true``, type = bool, aliases: ``is_sparse``, ``enable_sparse``, ``sparse``
-
-   - used to enable/disable sparse optimization
-
-- ``sparse_threshold`` :raw-html:`🔗︎`, default = ``0.8``, type = double, constraints: ``0.0 < sparse_threshold <= 1.0``
-
-   - the threshold of zero elements percentage for treating a feature as a sparse one
-
 - ``use_missing`` :raw-html:`🔗︎`, default = ``true``, type = bool
 
    - set this to ``false`` to disable the special handle of missing value
diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h
index d263f516c7cb..4c7e79787cb6 100644
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -29,36 +29,29 @@ enum MissingType {
   NaN
 };
 
-/*! \brief Store data for one histogram bin */
-struct HistogramBinEntry {
- public:
-  /*! \brief Sum of gradients on this bin */
-  double sum_gradients = 0.0f;
-  /*! \brief Sum of hessians on this bin */
-  double sum_hessians = 0.0f;
-  /*! \brief Number of data on this bin */
-  data_size_t cnt = 0;
-  /*!
-   * \brief Sum up (reducers) functions for histogram bin
-   */
-  inline static void SumReducer(const char *src, char *dst, int type_size, comm_size_t len) {
-    comm_size_t used_size = 0;
-    const HistogramBinEntry* p1;
-    HistogramBinEntry* p2;
-    while (used_size < len) {
-      // convert
-      p1 = reinterpret_cast<const HistogramBinEntry*>(src);
-      p2 = reinterpret_cast<HistogramBinEntry*>(dst);
-      // add
-      p2->cnt += p1->cnt;
-      p2->sum_gradients += p1->sum_gradients;
-      p2->sum_hessians += p1->sum_hessians;
-      src += type_size;
-      dst += type_size;
-      used_size += type_size;
-    }
-  }
-};
+typedef double hist_t;
+
+const size_t KHistEntrySize = 2 * sizeof(hist_t);
+const int KHistOffset = 2;
+const double kSparseThreshold = 0.7;
+
+#define GET_GRAD(hist, i) hist[(i) << 1]
+#define GET_HESS(hist, i) hist[((i) << 1) + 1]
+
+inline static void HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
+  comm_size_t used_size = 0;
+  const hist_t* p1;
+  hist_t* p2;
+  while (used_size < len) {
+    // convert
+    p1 = reinterpret_cast<const hist_t*>(src);
+    p2 = reinterpret_cast<hist_t*>(dst);
+    *p2 += *p1;
+    src += type_size;
+    dst += type_size;
+    used_size += type_size;
+  }
+}
 
 /*! \brief This class used to convert feature values into bin,
  *         and store some meta information for bin*/
@@ -252,7 +245,7 @@ class OrderedBin {
    * \param out Output Result
    */
   virtual void ConstructHistogram(int leaf, const score_t* gradients,
-    const score_t* hessians, HistogramBinEntry* out) const = 0;
+    const score_t* hessians, hist_t* out) const = 0;
 
   /*!
    * \brief Construct histogram by using this bin
@@ -262,7 +255,7 @@ class OrderedBin {
    * \param gradients Gradients, Note:non-ordered by leaf
    * \param out Output Result
    */
-  virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
+  virtual void ConstructHistogram(int leaf, const score_t* gradients, hist_t* out) const = 0;
 
   /*!
    * \brief Split current bin, and perform re-order by leaf
@@ -360,11 +353,11 @@ class Bin {
   virtual void ConstructHistogram(
     const data_size_t* data_indices, data_size_t start, data_size_t end,
     const score_t* ordered_gradients, const score_t* ordered_hessians,
-    HistogramBinEntry* out) const = 0;
+    hist_t* out) const = 0;
 
   virtual void ConstructHistogram(data_size_t start, data_size_t end,
     const score_t* ordered_gradients, const score_t* ordered_hessians,
-    HistogramBinEntry* out) const = 0;
+    hist_t* out) const = 0;
 
   /*!
    * \brief Construct histogram of this feature,
@@ -380,10 +373,10 @@ class Bin {
    * \param out Output Result
    */
   virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
-    const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
+    const score_t* ordered_gradients, hist_t* out) const = 0;
 
   virtual void ConstructHistogram(data_size_t start, data_size_t end,
-    const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
+    const score_t* ordered_gradients, hist_t* out) const = 0;
 
   /*!
    * \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
@@ -423,30 +416,11 @@ class Bin {
     data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
-  /*!
-   * \brief Create the ordered bin for this bin
-   * \return Pointer to ordered bin
-   */
-  virtual OrderedBin* CreateOrderedBin() const = 0;
-
   /*!
    * \brief After pushed all feature data, call this could have better refactor for bin data
    */
   virtual void FinishLoad() = 0;
 
-  /*!
-   * \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
-   * \param num_data Total number of data
-   * \param num_bin Number of bin
-   * \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
-   * \param is_enable_sparse True if enable sparse feature
-   * \param sparse_threshold Threshold for treating a feature as a sparse feature
-   * \param is_sparse Will set to true if this bin is sparse
-   * \return The bin data object
-   */
-  static Bin* CreateBin(data_size_t num_data, int num_bin,
-    double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse);
-
   /*!
    * \brief Create object for bin data of one feature, used for dense feature
    * \param num_data Total number of data
@@ -469,6 +443,46 @@ class Bin {
   virtual Bin* Clone() = 0;
 };
 
+
+class MultiValBin {
+ public:
+  virtual ~MultiValBin() {}
+
+  virtual data_size_t num_data() const = 0;
+
+  virtual int32_t num_bin() const = 0;
+
+  virtual void ReSize(data_size_t num_data) = 0;
+
+  virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
+
+  virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
+
+  virtual void ConstructHistogram(
+    const data_size_t* data_indices, data_size_t start, data_size_t end,
+    const score_t* gradients, const score_t* hessians,
+    hist_t* out) const = 0;
+
+  virtual void ConstructHistogram(data_size_t start, data_size_t end,
+    const score_t* gradients, const score_t* hessians,
+    hist_t* out) const = 0;
+
+  virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
+    const score_t* ordered_gradients, hist_t* out) const = 0;
+
+  virtual void ConstructHistogram(data_size_t start, data_size_t end,
+    const score_t* ordered_gradients, hist_t* out) const = 0;
+
+  virtual void FinishLoad() = 0;
+
+  virtual bool IsSparse() = 0;
+
+  static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate);
+
+  virtual MultiValBin* Clone() = 0;
+};
+
 inline uint32_t BinMapper::ValueToBin(double value) const {
   if (std::isnan(value)) {
     if (missing_type_ == MissingType::NaN) {
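The bin.h changes above replace the `HistogramBinEntry` struct with a flat `hist_t` array in which the gradient and hessian sums for bin `i` sit at positions `2*i` and `2*i + 1` (hence `KHistEntrySize = 2 * sizeof(hist_t)`), accessed through the `GET_GRAD`/`GET_HESS` macros. A self-contained sketch of that layout (the toy data and names are illustrative):

```cpp
#include <cstdint>
#include <vector>

typedef double hist_t;

// same macros as in the patched bin.h
#define GET_GRAD(hist, i) hist[(i) << 1]
#define GET_HESS(hist, i) hist[((i) << 1) + 1]

int main() {
  const int num_bin = 4;
  // one histogram = 2 * num_bin doubles: [grad0, hess0, grad1, hess1, ...]
  std::vector<hist_t> hist(2 * num_bin, 0.0);

  // toy per-row data: bin index, gradient, hessian
  const uint32_t bins[] = {0, 2, 2, 3};
  const double grads[] = {0.5, -0.25, 0.75, 1.0};
  const double hesss[] = {1.0, 1.0, 1.0, 1.0};

  for (int i = 0; i < 4; ++i) {
    GET_GRAD(hist.data(), bins[i]) += grads[i];
    GET_HESS(hist.data(), bins[i]) += hesss[i];
  }
  // hist now holds the per-bin sums; e.g. GET_GRAD(hist.data(), 2) == 0.5
  return 0;
}
```

The interleaved layout is what lets `HistogramSumReducer` treat a whole histogram as one run of doubles to add element-wise.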
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index da10e23c8b5c..ff1646c67210 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -214,6 +214,24 @@ struct Config {
 
   #pragma region Learning Control Parameters
 
+  // desc = set ``force_col_wise=true`` to force LightGBM to use the col-wise histogram build
+  // desc = ``force_col_wise=true`` is recommended when:
+  // descl2 = the number of columns is large, or the total number of bins is large
+  // descl2 = ``num_threads`` is large, e.g. ``> 20``
+  // descl2 = you want to use a small ``feature_fraction``, e.g. ``0.5``, to speed up training
+  // descl2 = you want to reduce memory cost
+  // desc = when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them and then use the faster one
+  bool force_col_wise = false;
+
+  // desc = set ``force_row_wise=true`` to force LightGBM to use the row-wise histogram build
+  // desc = ``force_row_wise=true`` is recommended when:
+  // descl2 = the number of data points is large, and the total number of bins is relatively small
+  // descl2 = you want to use a small ``bagging_fraction`` or ``goss`` to speed up training
+  // descl2 = ``num_threads`` is relatively small, e.g. ``<= 16``
+  // desc = setting ``force_row_wise=true`` will double the memory cost of the Dataset object; if you do not have enough memory, use ``force_col_wise=true`` instead
+  // desc = when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them and then use the faster one
+  bool force_row_wise = false;
+
   // desc = limit the max depth for tree model. This is used to deal with over-fitting when ``#data`` is small. Tree still grows leaf-wise
   // desc = ``<= 0`` means no limit
   int max_depth = -1;
@@ -534,22 +552,6 @@ struct Config {
   // desc = **Note**: disabling this may cause the slow training speed for sparse datasets
   bool enable_bundle = true;
 
-  // check = >=0.0
-  // check = <1.0
-  // desc = max conflict rate for bundles in EFB
-  // desc = set this to ``0.0`` to disallow the conflict and provide more accurate results
-  // desc = set this to a larger value to achieve faster speed
-  double max_conflict_rate = 0.0;
-
-  // alias = is_sparse, enable_sparse, sparse
-  // desc = used to enable/disable sparse optimization
-  bool is_enable_sparse = true;
-
-  // check = >0.0
-  // check = <=1.0
-  // desc = the threshold of zero elements percentage for treating a feature as a sparse one
-  double sparse_threshold = 0.8;
-
   // desc = set this to ``false`` to disable the special handle of missing value
   bool use_missing = true;
 
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index 2c8dc97e2823..2c6f9deebfca 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -8,6 +8,7 @@
 #include <LightGBM/config.h>
 #include <LightGBM/feature_group.h>
 #include <LightGBM/meta.h>
+#include <LightGBM/utils/common.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 #include <LightGBM/utils/random.h>
 #include <LightGBM/utils/text_reader.h>
@@ -381,6 +382,7 @@ class Dataset {
   inline uint64_t NumTotalBin() const {
     return group_bin_boundaries_.back();
   }
+
   inline std::vector<int> ValidFeatureIndices() const {
     std::vector<int> ret;
     for (int i = 0; i < num_total_features_; ++i) {
@@ -394,6 +396,13 @@ class Dataset {
   void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
 
+  MultiValBin* GetMultiBinFromSparseFeatures() const;
+
+  MultiValBin* GetMultiBinFromAllFeatures() const;
+
+  MultiValBin* TestMultiThreadingMethod(score_t* gradients, score_t* hessians, const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
+                                        bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const;
+
   LIGHTGBM_EXPORT void FinishLoad();
 
   LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
@@ -423,15 +432,18 @@ class Dataset {
 
   void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
                            const data_size_t* data_indices, data_size_t num_data,
-                           int leaf_idx,
-                           std::vector<std::unique_ptr<OrderedBin>>* ordered_bins,
                            const score_t* gradients, const score_t* hessians,
                            score_t* ordered_gradients, score_t* ordered_hessians,
                            bool is_constant_hessian,
-                           HistogramBinEntry* histogram_data) const;
+                           const MultiValBin* multi_val_bin, bool is_colwise,
+                           hist_t* histogram_data) const;
+
+  void ConstructHistogramsMultiVal(const MultiValBin* multi_val_bin, const data_size_t* data_indices, data_size_t num_data,
+                                   const score_t* gradients, const score_t* hessians,
+                                   bool is_constant_hessian,
+                                   hist_t* histogram_data) const;
 
-  void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
-    HistogramBinEntry* data) const;
+  void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
 
   inline data_size_t Split(int feature,
                            const uint32_t* threshold,
                            int num_threshold, bool default_left,
@@ -496,19 +508,10 @@ class Dataset {
     return feature_groups_[group]->bin_mappers_[sub_feature].get();
   }
 
-  inline const Bin* FeatureBin(int i) const {
-    const int group = feature2group_[i];
-    return feature_groups_[group]->bin_data_.get();
-  }
-
   inline const Bin* FeatureGroupBin(int group) const {
     return feature_groups_[group]->bin_data_.get();
   }
 
-  inline bool FeatureGroupIsSparse(int group) const {
-    return feature_groups_[group]->is_sparse_;
-  }
-
   inline BinIterator* FeatureIterator(int i) const {
     const int group = feature2group_[i];
     const int sub_feature = feature2subfeature_[i];
@@ -519,6 +522,10 @@ class Dataset {
     return feature_groups_[group]->FeatureGroupIterator();
   }
 
+  inline bool IsMultiGroup(int i) const {
+    return feature_groups_[i]->is_multi_val_;
+  }
+
   inline double RealThreshold(int i, uint32_t threshold) const {
     const int group = feature2group_[i];
     const int sub_feature = feature2subfeature_[i];
@@ -532,18 +539,6 @@ class Dataset {
     return feature_groups_[group]->bin_mappers_[sub_feature]->ValueToBin(threshold_double);
   }
 
-  inline void CreateOrderedBins(std::vector<std::unique_ptr<OrderedBin>>* ordered_bins) const {
-    ordered_bins->resize(num_groups_);
-    OMP_INIT_EX();
-    #pragma omp parallel for schedule(guided)
-    for (int i = 0; i < num_groups_; ++i) {
-      OMP_LOOP_EX_BEGIN();
-      ordered_bins->at(i).reset(feature_groups_[i]->bin_data_->CreateOrderedBin());
-      OMP_LOOP_EX_END();
-    }
-    OMP_THROW_EX();
-  }
-
   /*!
    * \brief Get meta data pointer
    * \return Pointer of meta data
@@ -620,7 +615,7 @@ class Dataset {
   /*! \brief Disable copy */
   Dataset(const Dataset&) = delete;
 
-  void addFeaturesFrom(Dataset* other);
+  void AddFeaturesFrom(Dataset* other);
 
  private:
   std::string data_filename_;
@@ -638,8 +633,6 @@ class Dataset {
   Metadata metadata_;
   /*! \brief index of label column */
   int label_idx_ = 0;
-  /*! \brief Threshold for treating a feature as a sparse feature */
-  double sparse_threshold_;
   /*! \brief store feature names */
   std::vector<std::string> feature_names_;
   /*!
 \brief store feature names */
@@ -662,6 +655,8 @@ class Dataset {
   bool use_missing_;
   bool zero_as_missing_;
   std::vector<int> feature_need_push_zeros_;
+  mutable std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>> hist_buf_;
+
 };
 
 }  // namespace LightGBM
diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h
index 439a8aea74e4..a4d066462a0b 100644
--- a/include/LightGBM/feature_group.h
+++ b/include/LightGBM/feature_group.h
@@ -30,14 +30,13 @@ class FeatureGroup {
    * \param is_enable_sparse True if enable sparse feature
    * \param sparse_threshold Threshold for treating a feature as a sparse feature
    */
-  FeatureGroup(int num_feature,
+  FeatureGroup(int num_feature, bool is_multi_val,
     std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
-    data_size_t num_data, double sparse_threshold, bool is_enable_sparse) : num_feature_(num_feature) {
+    data_size_t num_data) : num_feature_(num_feature), is_multi_val_(is_multi_val), is_sparse_(false) {
     CHECK(static_cast<int>(bin_mappers->size()) == num_feature);
     // use bin at zero to store most_freq_bin
     num_total_bin_ = 1;
     bin_offsets_.emplace_back(num_total_bin_);
-    int cnt_non_zero = 0;
     for (int i = 0; i < num_feature_; ++i) {
       bin_mappers_.emplace_back(bin_mappers->at(i).release());
       auto num_bin = bin_mappers_[i]->num_bin();
@@ -46,18 +45,26 @@ class FeatureGroup {
       }
       num_total_bin_ += num_bin;
       bin_offsets_.emplace_back(num_total_bin_);
-      cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers_[i]->sparse_rate()));
     }
-    double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
-    bin_data_.reset(Bin::CreateBin(num_data, num_total_bin_,
-      sparse_rate, is_enable_sparse, sparse_threshold, &is_sparse_));
+    if (is_multi_val_) {
+      multi_bin_data_.clear();
+      for (int i = 0; i < num_feature_; ++i) {
+        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
+        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
+          multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        } else {
+          multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        }
+      }
+    } else {
+      bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
+    }
   }
 
-  FeatureGroup(int num_feature,
-    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
-    data_size_t num_data, bool is_sparse) : num_feature_(num_feature) {
-    CHECK(static_cast<int>(bin_mappers->size()) == num_feature);
-    // use bin at zero to store most_freq_bin
+  FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
+    data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
+    CHECK(static_cast<int>(bin_mappers->size()) == 1);
+    // use bin at zero to store default_bin
     num_total_bin_ = 1;
     bin_offsets_.emplace_back(num_total_bin_);
     for (int i = 0; i < num_feature_; ++i) {
@@ -69,13 +76,15 @@ class FeatureGroup {
       num_total_bin_ += num_bin;
       bin_offsets_.emplace_back(num_total_bin_);
     }
-    is_sparse_ = is_sparse;
-    if (is_sparse_) {
+    if (bin_mappers_[0]->sparse_rate() >= kSparseThreshold) {
+      is_sparse_ = true;
       bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
     } else {
+      is_sparse_ = false;
       bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
     }
   }
+
   /*!
   * \brief Constructor from memory
   * \param memory Pointer of memory
@@ -86,6 +95,8 @@ class FeatureGroup {
               const std::vector<data_size_t>& local_used_indices) {
     const char* memory_ptr = reinterpret_cast<const char*>(memory);
     // get is_sparse
+    is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
+    memory_ptr += sizeof(is_multi_val_);
     is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
    memory_ptr += sizeof(is_sparse_);
     num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
@@ -110,13 +121,26 @@ class FeatureGroup {
     if (!local_used_indices.empty()) {
       num_data = static_cast<data_size_t>(local_used_indices.size());
     }
-    if (is_sparse_) {
-      bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
+    if (is_multi_val_) {
+      for (int i = 0; i < num_feature_; ++i) {
+        int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
+        if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
+          multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        } else {
+          multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
+        }
+        multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
+        memory_ptr += multi_bin_data_.back()->SizesInByte();
+      }
     } else {
-      bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
+      if (is_sparse_) {
+        bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
+      } else {
+        bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
+      }
+      // get bin data
+      bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
     }
-    // get bin data
-    bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
   }
 
   /*! \brief Destructor */
   ~FeatureGroup() {
@@ -131,22 +155,54 @@ class FeatureGroup {
   inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
     uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
     if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) { return; }
-    bin += bin_offsets_[sub_feature_idx];
     if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
       bin -= 1;
     }
-    bin_data_->Push(tid, line_idx, bin);
+    if (is_multi_val_) {
+      multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
+    } else {
+      bin += bin_offsets_[sub_feature_idx];
+      bin_data_->Push(tid, line_idx, bin);
+    }
   }
 
   inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
-    bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
+    if (!is_multi_val_) {
+      bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
+    } else {
+      for (int i = 0; i < num_feature_; ++i) {
+        multi_bin_data_[i]->CopySubset(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
+      }
+    }
   }
 
   inline BinIterator* SubFeatureIterator(int sub_feature) {
-    uint32_t min_bin = bin_offsets_[sub_feature];
-    uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
     uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
-    return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
+    if (!is_multi_val_) {
+      uint32_t min_bin = bin_offsets_[sub_feature];
+      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
+      return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
+    } else {
+      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
+      uint32_t min_bin = 1;
+      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
+      return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, most_freq_bin);
+    }
+  }
+
+  inline void FinishLoad() {
+    if (is_multi_val_) {
+      OMP_INIT_EX();
+      #pragma omp parallel for schedule(guided)
+      for (int i = 0; i < num_feature_; ++i) {
+        OMP_LOOP_EX_BEGIN();
+        multi_bin_data_[i]->FinishLoad();
+        OMP_LOOP_EX_END();
+      }
+      OMP_THROW_EX();
+    } else {
+      bin_data_->FinishLoad();
+    }
   }
 
   /*!
@@ -155,6 +211,9 @@ class FeatureGroup {
    * \return A pointer to the BinIterator object
    */
   inline BinIterator* FeatureGroupIterator() {
+    if (is_multi_val_) {
+      return nullptr;
+    }
     uint32_t min_bin = bin_offsets_[0];
     uint32_t max_bin = bin_offsets_.back() - 1;
     uint32_t most_freq_bin = 0;
@@ -168,17 +227,29 @@ class FeatureGroup {
                            bool default_left, data_size_t* data_indices, data_size_t num_data,
                            data_size_t* lte_indices, data_size_t* gt_indices) const {
-
-    uint32_t min_bin = bin_offsets_[sub_feature];
-    uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
     uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
     uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
-    if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
-      auto missing_type = bin_mappers_[sub_feature]->missing_type();
-      return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
-                              *threshold, data_indices, num_data, lte_indices, gt_indices);
+    if (!is_multi_val_) {
+      uint32_t min_bin = bin_offsets_[sub_feature];
+      uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
+      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
+        auto missing_type = bin_mappers_[sub_feature]->missing_type();
+        return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
+                                *threshold, data_indices, num_data, lte_indices, gt_indices);
+      } else {
+        return bin_data_->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
+      }
     } else {
-      return bin_data_->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
+      int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
+      uint32_t min_bin = 1;
+      uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
+      if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
+        auto missing_type = bin_mappers_[sub_feature]->missing_type();
+        return multi_bin_data_[sub_feature]->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
+                                                   *threshold, data_indices, num_data, lte_indices, gt_indices);
+      } else {
+        return multi_bin_data_[sub_feature]->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
+      }
     }
   }
   /*!
@@ -195,22 +266,35 @@ class FeatureGroup {
    * \param file File want to write
    */
   void SaveBinaryToFile(const VirtualFileWriter* writer) const {
+    writer->Write(&is_multi_val_, sizeof(is_multi_val_));
     writer->Write(&is_sparse_, sizeof(is_sparse_));
     writer->Write(&num_feature_, sizeof(num_feature_));
     for (int i = 0; i < num_feature_; ++i) {
       bin_mappers_[i]->SaveBinaryToFile(writer);
     }
-    bin_data_->SaveBinaryToFile(writer);
+    if (is_multi_val_) {
+      for (int i = 0; i < num_feature_; ++i) {
+        multi_bin_data_[i]->SaveBinaryToFile(writer);
+      }
+    } else {
+      bin_data_->SaveBinaryToFile(writer);
+    }
   }
   /*!
   * \brief Get sizes in byte of this object
   */
   size_t SizesInByte() const {
-    size_t ret = sizeof(is_sparse_) + sizeof(num_feature_);
+    size_t ret = sizeof(is_multi_val_) + sizeof(is_sparse_) + sizeof(num_feature_);
     for (int i = 0; i < num_feature_; ++i) {
       ret += bin_mappers_[i]->SizesInByte();
     }
-    ret += bin_data_->SizesInByte();
+    if (!is_multi_val_) {
+      ret += bin_data_->SizesInByte();
+    } else {
+      for (int i = 0; i < num_feature_; ++i) {
+        ret += multi_bin_data_[i]->SizesInByte();
+      }
+    }
     return ret;
   }
   /*! \brief Disable copy */
@@ -218,6 +302,7 @@ class FeatureGroup {
   /*! \brief Deep copy */
   FeatureGroup(const FeatureGroup& other) {
     num_feature_ = other.num_feature_;
+    is_multi_val_ = other.is_multi_val_;
     is_sparse_ = other.is_sparse_;
     num_total_bin_ = other.num_total_bin_;
     bin_offsets_ = other.bin_offsets_;
@@ -226,8 +311,14 @@ class FeatureGroup {
     for (auto& bin_mapper : other.bin_mappers_) {
       bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
     }
-
-    bin_data_.reset(other.bin_data_->Clone());
+    if (!is_multi_val_) {
+      bin_data_.reset(other.bin_data_->Clone());
+    } else {
+      multi_bin_data_.clear();
+      for (int i = 0; i < num_feature_; ++i) {
+        multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
+      }
+    }
   }
 
  private:
@@ -239,7 +330,9 @@ class FeatureGroup {
   std::vector<uint32_t> bin_offsets_;
   /*! \brief Bin data of this feature */
   std::unique_ptr<Bin> bin_data_;
+  std::vector<std::unique_ptr<Bin>> multi_bin_data_;
   /*! \brief True if this feature is sparse */
+  bool is_multi_val_;
   bool is_sparse_;
   int num_total_bin_;
 };
diff --git a/include/LightGBM/meta.h b/include/LightGBM/meta.h
index 9b5f2ea313db..ea8315be1b65 100644
--- a/include/LightGBM/meta.h
+++ b/include/LightGBM/meta.h
@@ -71,8 +71,9 @@ typedef void(*AllgatherFunction)(char* input, comm_size_t input_size, const comm
 
 #define NO_SPECIFIC (-1)
 
-// Prefetch size is usually 64 bytes
-const int kCacheLineSize = 64;
+const int kAlignedSize = 32;
+
+#define SIZE_ALIGNED(t) ((t) + kAlignedSize - 1) / kAlignedSize * kAlignedSize
 
 }  // namespace LightGBM
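`SIZE_ALIGNED` rounds a count up to the next multiple of `kAlignedSize`, which the new bagging code uses so that each thread's partition starts on an aligned boundary for the `_mm_malloc`-backed buffers. A worked example with assumed values:

```cpp
#include <iostream>

const int kAlignedSize = 32;
#define SIZE_ALIGNED(t) ((t) + kAlignedSize - 1) / kAlignedSize * kAlignedSize

int main() {
  // e.g. GBDT::Bagging with num_data = 100000 split across n_block = 16:
  // ceil(100000 / 16) = 6250, rounded up to a multiple of 32 -> 6272
  std::cout << SIZE_ALIGNED((100000 + 16 - 1) / 16) << "\n";  // prints 6272
  // already-aligned sizes are unchanged
  std::cout << SIZE_ALIGNED(64) << "\n";  // prints 64
  return 0;
}
```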
diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h
index 9bc069b45a85..5a855f84c7a1 100644
--- a/include/LightGBM/tree.h
+++ b/include/LightGBM/tree.h
@@ -213,6 +213,7 @@ class Tree {
 
   void RecomputeMaxDepth();
 
+  int NextLeafId() const { return num_leaves_; }
 
  private:
   std::string NumericalDecisionIfElse(int node) const;
diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h
index 66efe3d20a03..dca104e7531e 100644
--- a/include/LightGBM/tree_learner.h
+++ b/include/LightGBM/tree_learner.h
@@ -71,6 +71,8 @@ class TreeLearner {
 
   virtual void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) = 0;
 
+  virtual bool IsHistColWise() const = 0;
+
   /*!
   * \brief Using last trained tree to predict score then adding to out_score;
   * \param out_score output score
diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h
index 3bacc9372329..bbcfe6328832 100644
--- a/include/LightGBM/utils/common.h
+++ b/include/LightGBM/utils/common.h
@@ -11,22 +11,36 @@
 #include <limits>
 #include <string>
 #include <algorithm>
+#include <chrono>
 #include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <cstring>
 #include <functional>
 #include <iomanip>
+#include <map>
 #include <memory>
 #include <sstream>
 #include <type_traits>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
-#ifdef _MSC_VER
-#include "intrin.h"
+#if defined(_MSC_VER)
+#include <malloc.h>
+#elif MM_MALLOC
+#include <mm_malloc.h>
+#elif defined(__GNUC__)
+#include <malloc.h>
+#define _mm_malloc(a, b) memalign(b, a)
+#define _mm_free(a) free(a)
+#else
+#include <stdlib.h>
+#define _mm_malloc(a, b) malloc(a)
+#define _mm_free(a) free(a)
 #endif
 
+
 namespace LightGBM {
 
 namespace Common {
@@ -946,8 +960,133 @@ inline bool CheckAllowedJSON(const std::string& s) {
   return true;
 }
 
+inline int RoundInt(double x) {
+  return static_cast<int>(x + 0.5f);
+}
+
+template <typename T, std::size_t N = 16>
+class AlignmentAllocator {
+ public:
+  typedef T value_type;
+  typedef std::size_t size_type;
+  typedef std::ptrdiff_t difference_type;
+
+  typedef T* pointer;
+  typedef const T* const_pointer;
+
+  typedef T& reference;
+  typedef const T& const_reference;
+
+ public:
+  inline AlignmentAllocator() throw () {}
+
+  template <typename T2>
+  inline AlignmentAllocator(const AlignmentAllocator<T2, N>&) throw () {}
+
+  inline ~AlignmentAllocator() throw () {}
+
+  inline pointer adress(reference r) {
+    return &r;
+  }
+
+  inline const_pointer adress(const_reference r) const {
+    return &r;
+  }
+
+  inline pointer allocate(size_type n) {
+    return (pointer)_mm_malloc(n * sizeof(value_type), N);
+  }
+
+  inline void deallocate(pointer p, size_type) {
+    _mm_free(p);
+  }
+
+  inline void construct(pointer p, const value_type& wert) {
+    new (p) value_type(wert);
+  }
+
+  inline void destroy(pointer p) {
+    p->~value_type();
+  }
+
+  inline size_type max_size() const throw () {
+    return size_type(-1) / sizeof(value_type);
+  }
+
+  template <typename T2>
+  struct rebind {
+    typedef AlignmentAllocator<T2, N> other;
+  };
+
+  bool operator!=(const AlignmentAllocator<T, N>& other) const {
+    return !(*this == other);
+  }
+
+  // Returns true if and only if storage allocated from *this
+  // can be deallocated from other, and vice versa.
+  // Always returns true for stateless allocators.
+  bool operator==(const AlignmentAllocator<T, N>&) const {
+    return true;
+  }
+};
+
+// Note: this class is not thread-safe, don't use it inside omp blocks
+class Timer {
+ public:
+  Timer() {}
+  ~Timer() {
+    Print();
+  }
+  #ifdef TIMETAG
+  void Start(const std::string& name) {
+    auto cur_time = std::chrono::steady_clock::now();
+    start_time_[name] = cur_time;
+  }
+  void Stop(const std::string& name) {
+    if (stats_.find(name) == stats_.end()) {
+      stats_[name] = std::chrono::duration<double, std::milli>(0);
+    }
+    stats_[name] += std::chrono::steady_clock::now() - start_time_[name];
+  }
+  #else
+  void Start(const std::string&) { }
+  void Stop(const std::string&) { }
+  #endif  // TIMETAG
+
+  void Print() const {
+    #ifdef TIMETAG
+    std::map<std::string, std::chrono::duration<double, std::milli>> ordered(stats_.begin(), stats_.end());
+    for (auto it = ordered.begin(); it != ordered.end(); ++it) {
+      Log::Info("%s costs:\t %f ", it->first.c_str(), it->second * 1e-3);
+    }
+    #endif
+  }
+  std::unordered_map<std::string, std::chrono::steady_clock::time_point> start_time_;
+  std::unordered_map<std::string, std::chrono::duration<double, std::milli>> stats_;
+};
+
+// Note: this class is not thread-safe, don't use it inside omp blocks
+class FunctionTimer {
+ public:
+  FunctionTimer(const std::string& name, Timer& timer) : timer_(timer) {
+    timer.Start(name);
+    #ifdef TIMETAG
+    name_ = name;
+    #endif  // TIMETAG
+  }
+  ~FunctionTimer() {
+    timer_.Stop(name_);
+  }
+
+ private:
+  std::string name_;
+  Timer& timer_;
+};
+
 }  // namespace Common
 
+extern Common::Timer global_timer;
+
 }  // namespace LightGBM
 
 #endif   // LightGBM_UTILS_COMMON_FUN_H_
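Putting the new common.h utilities together, a minimal usage sketch (the buffer name and timing label are illustrative; `FunctionTimer` compiles to a no-op unless the library is built with `-DTIMETAG`, and `global_timer` is the instance defined in application.cpp below):

```cpp
#include <vector>
// assumes the LightGBM headers patched above
#include <LightGBM/meta.h>
#include <LightGBM/utils/common.h>

using namespace LightGBM;

void Example() {
  // a 32-byte-aligned buffer, the same pattern gbdt.h now uses
  // for gradients_/hessians_
  std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>> my_buffer(1024, 0.0);

  // scoped timing: accumulates elapsed time into global_timer under the label,
  // printed from Timer::Print() at program exit when TIMETAG is defined
  Common::FunctionTimer fun_timer("Example::fill", global_timer);
  for (size_t i = 0; i < my_buffer.size(); ++i) {
    my_buffer[i] = static_cast<double>(i);
  }
}
```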
Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || need_re_bagging_) { need_re_bagging_ = false; - const data_size_t min_inner_size = 1000; - data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_; - if (inner_size < min_inner_size) { inner_size = min_inner_size; } + const data_size_t min_inner_size = 1024; + const int n_block = std::min( + num_threads_, (num_data_ + min_inner_size - 1) / min_inner_size); + data_size_t inner_size = SIZE_ALIGNED((num_data_ + n_block - 1) / n_block); OMP_INIT_EX(); #pragma omp parallel for schedule(static, 1) - for (int i = 0; i < num_threads_; ++i) { + for (int i = 0; i < n_block; ++i) { OMP_LOOP_EX_BEGIN(); - left_cnts_buf_[i] = 0; - right_cnts_buf_[i] = 0; data_size_t cur_start = i * inner_size; - if (cur_start > num_data_) { continue; } - data_size_t cur_cnt = inner_size; - if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; } + data_size_t cur_cnt = std::min(inner_size, num_data_ - cur_start); + if (cur_cnt <= 0) { + left_cnts_buf_[i] = 0; + right_cnts_buf_[i] = 0; + continue; + } Random cur_rand(config_->bagging_seed + iter * num_threads_ + i); data_size_t cur_left_count = 0; if (balanced_bagging_) { @@ -241,15 +245,14 @@ void GBDT::Bagging(int iter) { data_size_t left_cnt = 0; left_write_pos_buf_[0] = 0; right_write_pos_buf_[0] = 0; - for (int i = 1; i < num_threads_; ++i) { + for (int i = 1; i < n_block; ++i) { left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1]; right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1]; } - left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1]; + left_cnt = left_write_pos_buf_[n_block - 1] + left_cnts_buf_[n_block - 1]; #pragma omp parallel for schedule(static, 1) - for (int i = 0; i < num_threads_; ++i) { - OMP_LOOP_EX_BEGIN(); + for (int i = 0; i < n_block; ++i) { if (left_cnts_buf_[i] > 0) { std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i], tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t)); @@ -258,9 +261,7 @@ void GBDT::Bagging(int iter) { std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i], tmp_indices_.data() + offsets_buf_[i] + left_cnts_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t)); } - OMP_LOOP_EX_END(); } - OMP_THROW_EX(); bag_data_cnt_ = left_cnt; Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); // set bagging data to tree learner @@ -276,6 +277,7 @@ void GBDT::Bagging(int iter) { } void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { + Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; auto start_time = std::chrono::steady_clock::now(); for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) { @@ -342,6 +344,7 @@ double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj, int class_id) } double GBDT::BoostFromAverage(int class_id, bool update_scorer) { + Common::FunctionTimer fun_timer("GBDT::BoostFromAverage", global_timer); // boosting from average label; or customized "average" if implemented for the current objective if (models_.empty() && !train_score_updater_->has_init_score() && objective_function_ != nullptr) { if (config_->boost_from_average || (train_data_ != nullptr && train_data_->num_features() == 0)) { @@ -366,6 +369,7 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) { } bool 
 bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
+  Common::FunctionTimer fun_timer("GBDT::TrainOneIter", global_timer);
   std::vector<double> init_scores(num_tree_per_iteration_, 0.0);
   // boosting first
   if (gradients == nullptr || hessians == nullptr) {
@@ -486,6 +490,7 @@ bool GBDT::EvalAndCheckEarlyStopping() {
 }
 
 void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
+  Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer);
   // update training score
   if (!is_use_subset_) {
     train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id);
@@ -755,17 +760,10 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
     right_write_pos_buf_.resize(num_threads_);
 
     double average_bag_rate = (bag_data_cnt_ / num_data_) / config->bagging_freq;
-    int sparse_group = 0;
-    for (int i = 0; i < train_data_->num_feature_groups(); ++i) {
-      if (train_data_->FeatureGroupIsSparse(i)) {
-        ++sparse_group;
-      }
-    }
     is_use_subset_ = false;
     const int group_threshold_usesubset = 100;
-    const int sparse_group_threshold_usesubset = train_data_->num_feature_groups() / 4;
-    if (average_bag_rate <= 0.5
-        && (train_data_->num_feature_groups() < group_threshold_usesubset || sparse_group < sparse_group_threshold_usesubset)) {
+    if (tree_learner_->IsHistColWise() && average_bag_rate <= 0.5
+        && (train_data_->num_feature_groups() < group_threshold_usesubset)) {
       if (tmp_subset_ == nullptr || is_change_dataset) {
         tmp_subset_.reset(new Dataset(bag_data_cnt_));
         tmp_subset_->CopyFeatureMapperFrom(train_data_);
diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h
index 5103a93d540e..e891457dfada 100644
--- a/src/boosting/gbdt.h
+++ b/src/boosting/gbdt.h
@@ -457,11 +457,11 @@ class GBDT : public GBDTBase {
   /*! \brief Max feature index of training data*/
   int max_feature_idx_;
   /*! \brief First order derivative of training data */
-  std::vector<score_t> gradients_;
+  std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> gradients_;
   /*! \brief Second order derivative of training data */
-  std::vector<score_t> hessians_;
+  std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> hessians_;
   /*! \brief Store the indices of in-bag data */
-  std::vector<data_size_t> bag_data_indices_;
+  std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> bag_data_indices_;
   /*! \brief Number of in-bag data */
   data_size_t bag_data_cnt_;
   /*! \brief Store the indices of in-bag data */
diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp
index d9aa0fd64a5c..d12cb1f3a09a 100644
--- a/src/boosting/goss.hpp
+++ b/src/boosting/goss.hpp
@@ -22,10 +22,6 @@
 
 namespace LightGBM {
 
-#ifdef TIMETAG
-std::chrono::duration<double, std::milli> subset_time;
-std::chrono::duration<double, std::milli> re_init_tree_time;
-#endif
 
 class GOSS: public GBDT {
  public:
@@ -36,10 +32,7 @@ class GOSS: public GBDT {
   }
 
   ~GOSS() {
-    #ifdef TIMETAG
-    Log::Info("GOSS::subset costs %f", subset_time * 1e-3);
-    Log::Info("GOSS::re_init_tree costs %f", re_init_tree_time * 1e-3);
-    #endif
+
   }
 
   void Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
@@ -143,19 +136,21 @@ class GOSS: public GBDT {
     // not subsample for first iterations
     if (iter < static_cast<int>(1.0f / config_->learning_rate)) { return; }
-    const data_size_t min_inner_size = 100;
-    data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
-    if (inner_size < min_inner_size) { inner_size = min_inner_size; }
+    const data_size_t min_inner_size = 128;
+    const int n_block = std::min(
+        num_threads_, (num_data_ + min_inner_size - 1) / min_inner_size);
+    data_size_t inner_size = SIZE_ALIGNED((num_data_ + n_block - 1) / n_block);
     OMP_INIT_EX();
     #pragma omp parallel for schedule(static, 1)
-    for (int i = 0; i < num_threads_; ++i) {
+    for (int i = 0; i < n_block; ++i) {
       OMP_LOOP_EX_BEGIN();
-      left_cnts_buf_[i] = 0;
-      right_cnts_buf_[i] = 0;
       data_size_t cur_start = i * inner_size;
-      if (cur_start > num_data_) { continue; }
-      data_size_t cur_cnt = inner_size;
-      if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
+      data_size_t cur_cnt = std::min(inner_size, num_data_ - cur_start);
+      if (cur_cnt <= 0) {
+        left_cnts_buf_[i] = 0;
+        right_cnts_buf_[i] = 0;
+        continue;
+      }
       Random cur_rand(config_->bagging_seed + iter * num_threads_ + i);
       data_size_t cur_left_count = BaggingHelper(&cur_rand, cur_start, cur_cnt,
                                                  tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start);
@@ -168,14 +163,14 @@ class GOSS: public GBDT {
     data_size_t left_cnt = 0;
     left_write_pos_buf_[0] = 0;
     right_write_pos_buf_[0] = 0;
-    for (int i = 1; i < num_threads_; ++i) {
+    for (int i = 1; i < n_block; ++i) {
       left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
       right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
     }
-    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
+    left_cnt = left_write_pos_buf_[n_block - 1] + left_cnts_buf_[n_block - 1];
 
     #pragma omp parallel for schedule(static, 1)
-    for (int i = 0; i < num_threads_; ++i) {
+    for (int i = 0; i < n_block; ++i) {
       OMP_LOOP_EX_BEGIN();
       if (left_cnts_buf_[i] > 0) {
         std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
@@ -193,22 +188,10 @@ class GOSS: public GBDT {
     if (!is_use_subset_) {
       tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
     } else {
-      // get subset
-      #ifdef TIMETAG
-      auto start_time = std::chrono::steady_clock::now();
-      #endif
       tmp_subset_->ReSize(bag_data_cnt_);
       tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
-      #ifdef TIMETAG
-      subset_time += std::chrono::steady_clock::now() - start_time;
-      #endif
-      #ifdef TIMETAG
-      start_time = std::chrono::steady_clock::now();
-      #endif
       tree_learner_->ResetTrainingData(tmp_subset_.get());
-      #ifdef TIMETAG
-      re_init_tree_time += std::chrono::steady_clock::now() - start_time;
-      #endif
+
     }
   }
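A small illustration of the warm-up guard at the top of `GOSS::Bagging` above: no subsampling happens until `iter` reaches `1 / learning_rate`. With an assumed `learning_rate` of 0.1:

```cpp
#include <cstdio>

int main() {
  const double learning_rate = 0.1;  // illustrative value
  for (int iter = 0; iter < 15; ++iter) {
    // same guard as in GOSS::Bagging: skip subsampling for early iterations
    bool subsample = !(iter < static_cast<int>(1.0f / learning_rate));
    std::printf("iter %2d: %s\n", iter, subsample ? "subsample" : "use all data");
  }
  return 0;  // iterations 0-9 use all data, 10 and later subsample
}
```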
diff --git a/src/boosting/score_updater.hpp b/src/boosting/score_updater.hpp
index 0b2e208f3afe..d3f67a64819f 100644
--- a/src/boosting/score_updater.hpp
+++ b/src/boosting/score_updater.hpp
@@ -55,6 +55,7 @@ class ScoreUpdater {
   inline bool has_init_score() const { return has_init_score_; }
 
   inline void AddScore(double val, int cur_tree_id) {
+    Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
     const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
     #pragma omp parallel for schedule(static)
     for (int i = 0; i < num_data_; ++i) {
@@ -76,6 +77,7 @@ class ScoreUpdater {
    * \param cur_tree_id Current tree for multiclass training
    */
   inline void AddScore(const Tree* tree, int cur_tree_id) {
+    Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
     const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
     tree->AddPredictionToScore(data_, num_data_, score_.data() + offset);
   }
@@ -87,6 +89,7 @@ class ScoreUpdater {
    * \param cur_tree_id Current tree for multiclass training
    */
   inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) {
+    Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
     const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
     tree_learner->AddPredictionToScore(tree, score_.data() + offset);
   }
@@ -100,6 +103,7 @@ class ScoreUpdater {
    */
   inline void AddScore(const Tree* tree, const data_size_t* data_indices,
                        data_size_t data_cnt, int cur_tree_id) {
+    Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
     const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
     tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + offset);
   }
@@ -119,7 +123,7 @@ class ScoreUpdater {
   /*! \brief Pointer of data set */
   const Dataset* data_;
   /*! \brief Scores for data set */
-  std::vector<double> score_;
+  std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>> score_;
   bool has_init_score_;
 };
diff --git a/src/c_api.cpp b/src/c_api.cpp
index a06faa2ded68..1e060037de76 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -1065,7 +1065,7 @@ int LGBM_DatasetAddFeaturesFrom(DatasetHandle target,
   API_BEGIN();
   auto target_d = reinterpret_cast<Dataset*>(target);
   auto source_d = reinterpret_cast<Dataset*>(source);
-  target_d->addFeaturesFrom(source_d);
+  target_d->AddFeaturesFrom(source_d);
   API_END();
 }
 
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index db3b6e453954..f9ade8a91226 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -15,7 +15,8 @@
 
 #include "dense_bin.hpp"
 #include "dense_nbits_bin.hpp"
-#include "ordered_sparse_bin.hpp"
+#include "multi_val_dense_bin.hpp"
+#include "multi_val_sparse_bin.hpp"
 #include "sparse_bin.hpp"
 
 namespace LightGBM {
@@ -636,21 +637,10 @@ namespace LightGBM {
   template class SparseBin<uint16_t>;
   template class SparseBin<uint32_t>;
 
-  template class OrderedSparseBin<uint8_t>;
-  template class OrderedSparseBin<uint16_t>;
-  template class OrderedSparseBin<uint32_t>;
+  template class MultiValDenseBin<uint8_t>;
+  template class MultiValDenseBin<uint16_t>;
+  template class MultiValDenseBin<uint32_t>;
 
-  Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
-                      bool is_enable_sparse, double sparse_threshold, bool* is_sparse) {
-    // sparse threshold
-    if (sparse_rate >= sparse_threshold && is_enable_sparse) {
-      *is_sparse = true;
-      return CreateSparseBin(num_data, num_bin);
-    } else {
-      *is_sparse = false;
-      return CreateDenseBin(num_data, num_bin);
-    }
-  }
 
   Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
     if (num_bin <= 16) {
@@ -674,4 +664,25 @@ namespace LightGBM {
     }
   }
 
+  MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate) {
+    const double multi_val_bin_sparse_threshold = 0.25f;
+    if (sparse_rate >= multi_val_bin_sparse_threshold) {
+      if (num_bin <= 256) {
+        return new MultiValSparseBin<uint8_t>(num_data, num_bin);
+      } else if (num_bin <= 65536) {
+        return new MultiValSparseBin<uint16_t>(num_data, num_bin);
+      } else {
+        return new MultiValSparseBin<uint32_t>(num_data, num_bin);
+      }
+    } else {
+      if (num_bin <= 256) {
+        return new MultiValDenseBin<uint8_t>(num_data, num_bin, num_feature);
+      } else if (num_bin <= 65536) {
+        return new MultiValDenseBin<uint16_t>(num_data, num_bin, num_feature);
+      } else {
+        return new MultiValDenseBin<uint32_t>(num_data, num_bin, num_feature);
+      }
+    }
+  }
+
 }  // namespace LightGBM
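The factory picks a sparse or dense multi-value representation by `sparse_rate` and sizes the per-entry storage by the total bin count. A hypothetical call (the argument values are made up, not from the diff):

```cpp
#include <LightGBM/bin.h>

using namespace LightGBM;

void Example(data_size_t num_data) {
  // sparse_rate 0.5 >= 0.25 and 300 bins <= 65536,
  // so this returns a MultiValSparseBin<uint16_t>
  MultiValBin* sparse_bin = MultiValBin::CreateMultiValBin(num_data, 300, 10, 0.5);

  // below the sparsity threshold a dense layout is chosen;
  // 200 bins <= 256 fits one byte per value -> MultiValDenseBin<uint8_t>
  MultiValBin* dense_bin = MultiValBin::CreateMultiValBin(num_data, 200, 10, 0.1);

  delete sparse_bin;
  delete dense_bin;
}
```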
"]\n"; str_buf << "[two_round: " << two_round << "]\n"; diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 6dde7e23b211..8f5912016789 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -36,6 +36,7 @@ Dataset::Dataset(data_size_t num_data) { } Dataset::~Dataset() { + } std::vector> NoGroup( @@ -48,19 +49,20 @@ std::vector> NoGroup( return features_in_group; } -int GetConfilctCount(const std::vector& mark, const int* indices, int num_indices, int max_cnt) { +int GetConfilctCount(const std::vector& mark, const int* indices, int num_indices, data_size_t max_cnt) { int ret = 0; for (int i = 0; i < num_indices; ++i) { if (mark[indices[i]]) { ++ret; - if (ret > max_cnt) { - return -1; - } + } + if (ret > max_cnt) { + return -1; } } return ret; } -void MarkUsed(std::vector* mark, const int* indices, int num_indices) { + +void MarkUsed(std::vector* mark, const int* indices, data_size_t num_indices) { auto& ref_mark = *mark; for (int i = 0; i < num_indices; ++i) { ref_mark[indices[i]] = true; @@ -93,29 +95,31 @@ std::vector> FindGroups(const std::vector* multi_val_group) { const int max_search_group = 100; - const int gpu_max_bin_per_group = 256; + const int max_bin_per_group = 256; + const data_size_t single_val_max_conflict_cnt = static_cast(total_sample_cnt / 10000); + multi_val_group->clear(); + Random rand(num_data); std::vector> features_in_group; std::vector> conflict_marks; - std::vector group_conflict_cnt; - std::vector group_non_zero_cnt; + std::vector group_used_row_cnt; + std::vector group_total_data_cnt; std::vector group_num_bin; + // first round: fill the single val group for (auto fidx : find_order) { bool is_filtered_feature = fidx >= num_sample_col; - const size_t cur_non_zero_cnt = is_filtered_feature ? 0: num_per_col[fidx]; - bool need_new_group = true; + const data_size_t cur_non_zero_cnt = is_filtered_feature ? 0 : num_per_col[fidx]; std::vector available_groups; for (int gid = 0; gid < static_cast(features_in_group.size()); ++gid) { - if (group_non_zero_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + max_error_cnt) { - if (!is_use_gpu || group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0) - <= gpu_max_bin_per_group) { + auto cur_num_bin = group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0); + if (group_total_data_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + single_val_max_conflict_cnt) { + if (!is_use_gpu || cur_num_bin <= max_bin_per_group) { available_groups.push_back(gid); } } @@ -124,44 +128,82 @@ std::vector> FindGroups(const std::vector(available_groups.size()) - 1; auto indices = rand.Sample(last, std::min(last, max_search_group - 1)); + // always push the last group search_groups.push_back(available_groups.back()); for (auto idx : indices) { search_groups.push_back(available_groups[idx]); } } + int best_gid = -1; + int best_conflict_cnt = -1; for (auto gid : search_groups) { - const int rest_max_cnt = max_error_cnt - group_conflict_cnt[gid]; - const int cnt = is_filtered_feature ? 
0 : GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt); - if (cnt >= 0 && cnt <= rest_max_cnt) { - data_size_t rest_non_zero_data = static_cast( - static_cast(cur_non_zero_cnt - cnt) * num_data / total_sample_cnt); - if (rest_non_zero_data < filter_cnt) { continue; } - need_new_group = false; - features_in_group[gid].push_back(fidx); - group_conflict_cnt[gid] += cnt; - group_non_zero_cnt[gid] += cur_non_zero_cnt - cnt; - if (!is_filtered_feature) { - MarkUsed(&conflict_marks[gid], sample_indices[fidx], num_per_col[fidx]); - } - if (is_use_gpu) { - group_num_bin[gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0); - } + const data_size_t rest_max_cnt = single_val_max_conflict_cnt - group_total_data_cnt[gid] + group_used_row_cnt[gid]; + const data_size_t cnt = is_filtered_feature ? 0 : GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt); + if (cnt >= 0 && cnt <= rest_max_cnt && cnt <= cur_non_zero_cnt / 2) { + best_gid = gid; + best_conflict_cnt = cnt; break; } } - if (need_new_group) { + if (best_gid >= 0) { + features_in_group[best_gid].push_back(fidx); + group_total_data_cnt[best_gid] += cur_non_zero_cnt; + group_used_row_cnt[best_gid] += cur_non_zero_cnt - best_conflict_cnt; + if (!is_filtered_feature) { + MarkUsed(&conflict_marks[best_gid], sample_indices[fidx], num_per_col[fidx]); + } + group_num_bin[best_gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0); + } else { features_in_group.emplace_back(); features_in_group.back().push_back(fidx); - group_conflict_cnt.push_back(0); conflict_marks.emplace_back(total_sample_cnt, false); if (!is_filtered_feature) { MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]); } - group_non_zero_cnt.emplace_back(cur_non_zero_cnt); - if (is_use_gpu) { - group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0)); + group_total_data_cnt.emplace_back(cur_non_zero_cnt); + group_used_row_cnt.emplace_back(cur_non_zero_cnt); + group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0)); + } + } + std::vector second_round_features; + std::vector> features_in_group2; + std::vector> conflict_marks2; + + const double dense_threshold = 0.4; + for (int gid = 0; gid < static_cast(features_in_group.size()); ++gid) { + const double dense_rate = static_cast(group_used_row_cnt[gid]) / total_sample_cnt; + if (dense_rate >= dense_threshold) { + features_in_group2.push_back(std::move(features_in_group[gid])); + conflict_marks2.push_back(std::move(conflict_marks[gid])); + } else { + for (auto fidx : features_in_group[gid]) { + second_round_features.push_back(fidx); + } + } + } + + features_in_group = features_in_group2; + conflict_marks = conflict_marks2; + multi_val_group->resize(features_in_group.size(), false); + if (!second_round_features.empty()) { + features_in_group.emplace_back(); + conflict_marks.emplace_back(total_sample_cnt, false); + bool is_multi_val = is_use_gpu ? 
true : false; + int conflict_cnt = 0; + for (auto fidx : second_round_features) { + features_in_group.back().push_back(fidx); + if (!is_multi_val) { + const int rest_max_cnt = single_val_max_conflict_cnt - conflict_cnt; + const auto cnt = GetConfilctCount(conflict_marks.back(), sample_indices[fidx], num_per_col[fidx], rest_max_cnt); + conflict_cnt += cnt; + if (cnt < 0 || conflict_cnt > single_val_max_conflict_cnt) { + is_multi_val = true; + continue; + } + MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]); } } + multi_val_group->push_back(is_multi_val); } return features_in_group; } @@ -171,17 +213,12 @@ std::vector> FastFeatureBundling(const std::vector& used_features, - double max_conflict_rate, data_size_t num_data, - data_size_t min_data, - double sparse_threshold, - bool is_enable_sparse, - bool is_use_gpu) { - // filter is based on sampling data, so decrease its range - const data_size_t filter_cnt = static_cast(static_cast(0.95 * min_data) / num_data * total_sample_cnt); - const data_size_t max_error_cnt = static_cast(total_sample_cnt * max_conflict_rate); + bool is_use_gpu, + std::vector* multi_val_group) { + Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer); std::vector feature_non_zero_cnt; feature_non_zero_cnt.reserve(used_features.size()); // put dense feature first @@ -209,6 +246,7 @@ std::vector> FastFeatureBundling(const std::vector> tmp_indices; std::vector tmp_num_per_col(num_sample_col, 0); for (auto fidx : used_features) { @@ -224,42 +262,25 @@ std::vector> FastFeatureBundling(const std::vector group_is_multi_val, group_is_multi_val2; + auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val); + auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val2); + if (features_in_group.size() > group2.size()) { features_in_group = group2; - } - std::vector> ret; - for (size_t i = 0; i < features_in_group.size(); ++i) { - if (features_in_group[i].size() <= 1 || features_in_group[i].size() >= 5) { - ret.push_back(features_in_group[i]); - } else { - int cnt_non_zero = 0; - for (size_t j = 0; j < features_in_group[i].size(); ++j) { - const int fidx = features_in_group[i][j]; - cnt_non_zero += static_cast(num_data * (1.0f - bin_mappers[fidx]->sparse_rate())); - } - double sparse_rate = 1.0f - static_cast(cnt_non_zero) / (num_data); - // take apart small sparse group, due it will not gain on speed - if (sparse_rate >= sparse_threshold && is_enable_sparse) { - for (size_t j = 0; j < features_in_group[i].size(); ++j) { - const int fidx = features_in_group[i][j]; - ret.emplace_back(); - ret.back().push_back(fidx); - } - } else { - ret.push_back(features_in_group[i]); - } - } + group_is_multi_val = group_is_multi_val2; } // shuffle groups - int num_group = static_cast(ret.size()); - Random tmp_rand(12); + int num_group = static_cast(features_in_group.size()); + Random tmp_rand(num_data); for (int i = 0; i < num_group - 1; ++i) { int j = tmp_rand.NextShort(i + 1, num_group); - std::swap(ret[i], ret[j]); + std::swap(features_in_group[i], features_in_group[j]); + // Note: using std::swap on std::vector<bool> elements would give the wrong result.
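
The note above concerns std::vector<bool>: that specialization packs bits, and its operator[] returns a proxy object rather than a bool&, so swapping two elements through the generic std::swap either fails to compile (libstdc++) or relies on implementation-specific overloads. Keeping the flag vector on a plain byte-sized element type sidesteps this; a minimal standalone sketch of the pitfall (the int8_t element type is an assumption, not taken from the patch):

    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
      // std::vector<bool> b = {true, false};
      // std::swap(b[0], b[1]);  // b[i] is a proxy prvalue; the generic
      //                         // std::swap(T&, T&) cannot bind it
      std::vector<std::int8_t> flags = {1, 0};  // byte flags swap normally
      std::swap(flags[0], flags[1]);
      std::cout << int(flags[0]) << ' ' << int(flags[1]) << '\n';  // prints "0 1"
      return 0;
    }
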
+ std::swap(group_is_multi_val[i], group_is_multi_val[j]); } - return ret; + *multi_val_group = group_is_multi_val; + return features_in_group; } void Dataset::Construct( @@ -274,7 +295,6 @@ void Dataset::Construct( const Config& io_config) { num_total_features_ = num_total_features; CHECK(num_total_features_ == static_cast(bin_mappers->size())); - sparse_threshold_ = io_config.sparse_threshold; // get num_features std::vector used_features; auto& ref_bin_mappers = *bin_mappers; @@ -287,13 +307,11 @@ void Dataset::Construct( Log::Warning("There are no meaningful features, as all feature values are constant."); } auto features_in_group = NoGroup(used_features); - + std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { features_in_group = FastFeatureBundling(*bin_mappers, - sample_non_zero_indices, sample_values, num_per_col, num_sample_col, total_sample_cnt, - used_features, io_config.max_conflict_rate, - num_data_, io_config.min_data_in_leaf, - sparse_threshold_, io_config.is_enable_sparse, io_config.device_type == std::string("gpu")); + sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast(total_sample_cnt), + used_features, num_data_, io_config.device_type == std::string("gpu"), &group_is_multi_val); } num_features_ = 0; @@ -306,10 +324,14 @@ void Dataset::Construct( real_feature_idx_.resize(num_features_); feature2group_.resize(num_features_); feature2subfeature_.resize(num_features_); + int num_multi_val_group = 0; feature_need_push_zeros_.clear(); for (int i = 0; i < num_groups_; ++i) { auto cur_features = features_in_group[i]; int cur_cnt_features = static_cast(cur_features.size()); + if (group_is_multi_val[i]) { + ++num_multi_val_group; + } // get bin_mappers std::vector> cur_bin_mappers; for (int j = 0; j < cur_cnt_features; ++j) { @@ -325,8 +347,7 @@ void Dataset::Construct( ++cur_fidx; } feature_groups_.emplace_back(std::unique_ptr( - new FeatureGroup(cur_cnt_features, &cur_bin_mappers, num_data_, sparse_threshold_, - io_config.is_enable_sparse))); + new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_))); } feature_groups_.shrink_to_fit(); group_bin_boundaries_.clear(); @@ -414,9 +435,6 @@ void Dataset::ResetConfig(const char* parameters) { if (param.count("zero_as_missing") && io_config.zero_as_missing != zero_as_missing_) { Log::Warning("Cannot change zero_as_missing after constructed Dataset handle."); } - if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) { - Log::Warning("Cannot change sparse_threshold after constructed Dataset handle."); - } if (param.count("forcedbins_filename")) { Log::Warning("Cannot change forced bins after constructed Dataset handle."); } @@ -452,23 +470,229 @@ void Dataset::ResetConfig(const char* parameters) { void Dataset::FinishLoad() { if (is_finish_load_) { return; } if (num_groups_ > 0) { - OMP_INIT_EX(); -#pragma omp parallel for schedule(guided) for (int i = 0; i < num_groups_; ++i) { - OMP_LOOP_EX_BEGIN(); - feature_groups_[i]->bin_data_->FinishLoad(); - OMP_LOOP_EX_END(); + feature_groups_[i]->FinishLoad(); } - OMP_THROW_EX(); } is_finish_load_ = true; } +void PushDataToMultiValBin(int num_threads, data_size_t num_data, const std::vector most_freq_bins, + const std::vector offsets, std::vector>>& iters, MultiValBin* ret) { + Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin", global_timer); + const data_size_t min_block_size = 4096; + const int n_block = 
std::min(num_threads, (num_data + min_block_size - 1) / min_block_size); + const data_size_t block_size = (num_data + n_block - 1) / n_block; + if (ret->IsSparse()) { + #pragma omp parallel for schedule(static) + for (int tid = 0; tid < n_block; ++tid) { + std::vector cur_data; + data_size_t start = tid * block_size; + data_size_t end = std::min(num_data, start + block_size); + for (size_t j = 0; j < most_freq_bins.size(); ++j) { + iters[tid][j]->Reset(start); + } + for (data_size_t i = start; i < end; ++i) { + cur_data.clear(); + for (size_t j = 0; j < most_freq_bins.size(); ++j) { + auto cur_bin = iters[tid][j]->Get(i); + if (cur_bin == most_freq_bins[j]) { + continue; + } + cur_bin += offsets[j]; + if (most_freq_bins[j] == 0) { + cur_bin -= 1; + } + cur_data.push_back(cur_bin); + } + ret->PushOneRow(tid, i, cur_data); + } + } + } else { + #pragma omp parallel for schedule(static) + for (int tid = 0; tid < n_block; ++tid) { + std::vector cur_data; + data_size_t start = tid * block_size; + data_size_t end = std::min(num_data, start + block_size); + for (size_t j = 0; j < most_freq_bins.size(); ++j) { + iters[tid][j]->Reset(start); + } + for (data_size_t i = start; i < end; ++i) { + cur_data.clear(); + for (size_t j = 0; j < most_freq_bins.size(); ++j) { + auto cur_bin = iters[tid][j]->Get(i); + if (cur_bin == most_freq_bins[j]) { + cur_bin = 0; + } else { + cur_bin += offsets[j]; + if (most_freq_bins[j] == 0) { + cur_bin -= 1; + } + } + cur_data.push_back(cur_bin); + } + ret->PushOneRow(tid, i, cur_data); + } + } + } +} + +MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const { + Common::FunctionTimer fun_time("Dataset::GetMultiBinFromSparseFeatures", global_timer); + int multi_group_id = -1; + for (int i = 0; i < num_groups_; ++i) { + if (feature_groups_[i]->is_multi_val_) { + if (multi_group_id < 0) { + multi_group_id = i; + } else { + Log::Fatal("Bug. 
There should be only one multi-val group."); + } + } + } + if (multi_group_id < 0) { + return nullptr; + } + const auto& offsets = feature_groups_[multi_group_id]->bin_offsets_; + const int num_feature = feature_groups_[multi_group_id]->num_feature_; + int num_threads = 1; + #pragma omp parallel + #pragma omp master + { + num_threads = omp_get_num_threads(); + } + + std::vector>> iters(num_threads); + std::vector most_freq_bins; + double sum_sparse_rate = 0; + for (int i = 0; i < num_feature; ++i) { + for (int tid = 0; tid < num_threads; ++tid) { + iters[tid].emplace_back(feature_groups_[multi_group_id]->SubFeatureIterator(i)); + } + most_freq_bins.push_back(feature_groups_[multi_group_id]->bin_mappers_[i]->GetMostFreqBin()); + sum_sparse_rate += feature_groups_[multi_group_id]->bin_mappers_[i]->sparse_rate(); + } + sum_sparse_rate /= num_feature; + Log::Debug("GetMultiBinFromSparseFeatures:: sparse rate %f", sum_sparse_rate); + std::unique_ptr ret; + ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back(), num_feature, sum_sparse_rate)); + PushDataToMultiValBin(num_threads, num_data_, most_freq_bins, offsets, iters, ret.get()); + ret->FinishLoad(); + return ret.release(); +} + +MultiValBin* Dataset::GetMultiBinFromAllFeatures() const { + Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures", global_timer); + int num_threads = 1; + #pragma omp parallel + #pragma omp master + { + num_threads = omp_get_num_threads(); + } + double sum_dense_ratio = 0; + + std::unique_ptr ret; + std::vector>> iters(num_threads); + std::vector most_freq_bins; + std::vector offsets; + int num_total_bin = 1; + offsets.push_back(num_total_bin); + for (int gid = 0; gid < num_groups_; ++gid) { + if (feature_groups_[gid]->is_multi_val_) { + for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) { + const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid]; + sum_dense_ratio += 1.0f - bin_mapper->sparse_rate(); + most_freq_bins.push_back(bin_mapper->GetMostFreqBin()); + num_total_bin += bin_mapper->num_bin(); + if (most_freq_bins.back() == 0) { + num_total_bin -= 1; + } + offsets.push_back(num_total_bin); + for (int tid = 0; tid < num_threads; ++tid) { + iters[tid].emplace_back(feature_groups_[gid]->SubFeatureIterator(fid)); + } + } + } else { + most_freq_bins.push_back(0); + num_total_bin += feature_groups_[gid]->bin_offsets_.back() - 1; + for (int tid = 0; tid < num_threads; ++tid) { + iters[tid].emplace_back(feature_groups_[gid]->FeatureGroupIterator()); + } + offsets.push_back(num_total_bin); + for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) { + const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid]; + sum_dense_ratio += 1.0f - bin_mapper->sparse_rate(); + } + } + } + sum_dense_ratio /= static_cast(most_freq_bins.size()); + Log::Debug("GetMultiBinFromAllFeatures:: sparse rate %f", 1.0 - sum_dense_ratio); + ret.reset(MultiValBin::CreateMultiValBin(num_data_, num_total_bin, static_cast(most_freq_bins.size()), 1.0 - sum_dense_ratio)); + PushDataToMultiValBin(num_threads, num_data_, most_freq_bins, offsets, iters, ret.get()); + ret->FinishLoad(); + return ret.release(); +} + +MultiValBin* Dataset::TestMultiThreadingMethod(score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, + bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const { + int num_threads = 1; +#pragma omp parallel +#pragma omp master + { num_threads = omp_get_num_threads(); } + Common::FunctionTimer 
fun_timer("Dataset::TestMultiThreadingMethod", global_timer); + if (force_colwise && force_rowwise) { + Log::Fatal("cannot set both `force_col_wise` and `force_row_wise` to `true`."); + } + if (num_groups_ <= 0) { + return nullptr; + } + if (force_colwise) { + *is_hist_col_wise = true; + return GetMultiBinFromSparseFeatures(); + } else if (force_rowwise) { + *is_hist_col_wise = false; + auto ret = GetMultiBinFromAllFeatures(); + const int num_bin_aligned = + (ret->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize; + hist_buf_.resize(static_cast(num_bin_aligned) * 2 * num_threads); + return ret; + } else { + std::unique_ptr sparse_bin; + std::unique_ptr all_bin; + sparse_bin.reset(GetMultiBinFromSparseFeatures()); + all_bin.reset(GetMultiBinFromAllFeatures()); + std::vector> hist_data(NumTotalBin() * 2); + const int num_bin_aligned = + (all_bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize; + hist_buf_.resize(static_cast(num_bin_aligned) * 2 * num_threads); + std::chrono::duration col_wise_time, row_wise_time; + auto start_time = std::chrono::steady_clock::now(); + ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, hessians, gradients, hessians, is_constant_hessian, sparse_bin.get(), true, hist_data.data()); + col_wise_time = std::chrono::steady_clock::now() - start_time; + start_time = std::chrono::steady_clock::now(); + ConstructHistogramsMultiVal(all_bin.get(), nullptr, num_data_, gradients, hessians, is_constant_hessian, hist_data.data()); + row_wise_time = std::chrono::steady_clock::now() - start_time; + Log::Debug("colwise cost %f seconds, rowwise cost %f seconds", col_wise_time * 1e-3, row_wise_time * 1e-3); + if (col_wise_time < row_wise_time) { + *is_hist_col_wise = true; + hist_buf_.clear(); + return sparse_bin.release(); + } else { + *is_hist_col_wise = false; + Log::Info("Use row-wise multi-threading, may increase memory usage. 
If memory is not enough, you can set `force_col_wise=true`."); + if (all_bin->IsSparse()) { + Log::Debug("Use Sparse Multi-Val Bin"); + } else { + Log::Debug("Use Dense Multi-Val Bin"); + } + return all_bin.release(); + } + } +} + void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { feature_groups_.clear(); num_features_ = dataset->num_features_; num_groups_ = dataset->num_groups_; - sparse_threshold_ = dataset->sparse_threshold_; // copy feature bin mapper data for (int i = 0; i < num_groups_; ++i) { std::vector> bin_mappers; @@ -477,9 +701,9 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { } feature_groups_.emplace_back(new FeatureGroup( dataset->feature_groups_[i]->num_feature_, + dataset->feature_groups_[i]->is_multi_val_, &bin_mappers, - num_data_, - dataset->feature_groups_[i]->is_sparse_)); + num_data_)); } feature_groups_.shrink_to_fit(); used_feature_map_ = dataset->used_feature_map_; @@ -502,8 +726,6 @@ void Dataset::CreateValid(const Dataset* dataset) { feature_groups_.clear(); num_features_ = dataset->num_features_; num_groups_ = num_features_; - sparse_threshold_ = dataset->sparse_threshold_; - bool is_enable_sparse = true; feature2group_.clear(); feature2subfeature_.clear(); // copy feature bin mapper data @@ -514,12 +736,8 @@ void Dataset::CreateValid(const Dataset* dataset) { if (bin_mappers.back()->GetDefaultBin() != bin_mappers.back()->GetMostFreqBin()) { feature_need_push_zeros_.push_back(i); } - feature_groups_.emplace_back(new FeatureGroup( - 1, - &bin_mappers, - num_data_, - sparse_threshold_, - is_enable_sparse)); + feature_groups_.emplace_back(new FeatureGroup(&bin_mappers, + num_data_)); feature2group_.push_back(i); feature2subfeature_.push_back(0); } @@ -721,7 +939,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { writer->Write(binary_file_token, size_of_token); // get size of header size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_) - + sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_) + sizeof(sparse_threshold_) + + sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_) + 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ + sizeof(int8_t) * num_features_ + sizeof(double) * num_features_ + sizeof(int32_t) * num_total_features_ + sizeof(int) * 3 + sizeof(bool) * 2; // size of feature names @@ -743,7 +961,6 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { writer->Write(&min_data_in_bin_, sizeof(min_data_in_bin_)); writer->Write(&use_missing_, sizeof(use_missing_)); writer->Write(&zero_as_missing_, sizeof(zero_as_missing_)); - writer->Write(&sparse_threshold_, sizeof(sparse_threshold_)); writer->Write(used_feature_map_.data(), sizeof(int) * num_total_features_); writer->Write(&num_groups_, sizeof(num_groups_)); writer->Write(real_feature_idx_.data(), sizeof(int) * num_features_); @@ -866,20 +1083,110 @@ void Dataset::DumpTextFile(const char* text_filename) { fclose(file); } +void Dataset::ConstructHistogramsMultiVal(const MultiValBin* multi_val_bin, const data_size_t* data_indices, data_size_t num_data, + const score_t* gradients, const score_t* hessians, + bool is_constant_hessian, + hist_t* hist_data) const { + Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal", global_timer); + if (multi_val_bin == nullptr) { return; } + int num_threads = 1; + #pragma omp parallel + #pragma omp master + { + num_threads = omp_get_num_threads(); + } + + 
global_timer.Start("Dataset::sparse_bin_histogram"); + const int num_bin = multi_val_bin->num_bin(); + const int num_bin_aligned = (num_bin + kAlignedSize - 1) / kAlignedSize * kAlignedSize; + const int min_data_block_size = 1024; + const int n_data_block = std::min(num_threads, (num_data + min_data_block_size - 1) / min_data_block_size); + const int data_block_size = (num_data + n_data_block - 1) / n_data_block; + + const size_t buf_size = static_cast(n_data_block - 1)* num_bin_aligned * 2; + if (hist_buf_.size() < buf_size) { + hist_buf_.resize(buf_size); + } + + #pragma omp parallel for schedule(static) + for (int tid = 0; tid < n_data_block; ++tid) { + data_size_t start = tid * data_block_size; + data_size_t end = std::min(start + data_block_size, num_data); + auto data_ptr = hist_data; + if (tid > 0) { + data_ptr = hist_buf_.data() + static_cast(num_bin_aligned) * 2 * (tid - 1); + } + std::memset(reinterpret_cast(data_ptr), 0, num_bin* KHistEntrySize); + if (data_indices != nullptr && num_data < num_data_) { + if (!is_constant_hessian) { + multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, hessians, data_ptr); + } else { + multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, data_ptr); + } + } else { + if (!is_constant_hessian) { + multi_val_bin->ConstructHistogram(start, end, gradients, hessians, data_ptr); + } else { + multi_val_bin->ConstructHistogram(start, end, gradients, data_ptr); + } + } + } + global_timer.Stop("Dataset::sparse_bin_histogram"); + + global_timer.Start("Dataset::sparse_bin_histogram_merge"); + + const int min_bin_block_size = 512; + const int n_bin_block = std::min(num_threads, (num_bin + min_bin_block_size - 1) / min_bin_block_size); + const int bin_block_size = (num_bin + n_bin_block - 1) / n_bin_block; + if (!is_constant_hessian) { + #pragma omp parallel for schedule(static) + for (int t = 0; t < n_bin_block; ++t) { + const int start = t * bin_block_size; + const int end = std::min(start + bin_block_size, num_bin); + for (int tid = 1; tid < n_data_block; ++tid) { + auto src_ptr = hist_buf_.data() + static_cast(num_bin_aligned) * 2 * (tid - 1); + for (int i = start * 2; i < end * 2; ++i) { + hist_data[i] += src_ptr[i]; + } + } + } + } else { + #pragma omp parallel for schedule(static) + for (int t = 0; t < n_bin_block; ++t) { + const int start = t * bin_block_size; + const int end = std::min(start + bin_block_size, num_bin); + for (int tid = 1; tid < n_data_block; ++tid) { + auto src_ptr = hist_buf_.data() + static_cast(num_bin_aligned) * 2 * (tid - 1); + for (int i = start * 2; i < end * 2; ++i) { + hist_data[i] += src_ptr[i]; + } + } + for (int i = start; i < end; i++) { + GET_HESS(hist_data, i) = GET_HESS(hist_data, i) * hessians[0]; + } + } + } + global_timer.Stop("Dataset::sparse_bin_histogram_merge"); +} + void Dataset::ConstructHistograms(const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data, - int leaf_idx, - std::vector>* ordered_bins, const score_t* gradients, const score_t* hessians, score_t* ordered_gradients, score_t* ordered_hessians, bool is_constant_hessian, - HistogramBinEntry* hist_data) const { - if (leaf_idx < 0 || num_data < 0 || hist_data == nullptr) { + const MultiValBin* multi_val_bin, bool is_colwise, + hist_t* hist_data) const { + Common::FunctionTimer fun_timer("Dataset::ConstructHistograms", global_timer); + if (num_data < 0 || hist_data == nullptr) { return; } - - std::vector used_group; - used_group.reserve(num_groups_); + if (!is_colwise) { + return 
ConstructHistogramsMultiVal(multi_val_bin, data_indices, num_data, gradients, hessians, is_constant_hessian, hist_data); + } + global_timer.Start("Dataset::Get used group"); + std::vector used_dense_group; + int multi_val_groud_id = -1; + used_dense_group.reserve(num_groups_); for (int group = 0; group < num_groups_; ++group) { const int f_cnt = group_feature_cnt_[group]; bool is_group_used = false; @@ -891,172 +1198,137 @@ void Dataset::ConstructHistograms(const std::vector& is_feature_used, } } if (is_group_used) { - used_group.push_back(group); - } - } - int num_used_group = static_cast(used_group.size()); - auto ptr_ordered_grad = gradients; - auto ptr_ordered_hess = hessians; - auto& ref_ordered_bins = *ordered_bins; - if (data_indices != nullptr && num_data < num_data_) { - if (!is_constant_hessian) { - #pragma omp parallel for schedule(static) - for (data_size_t i = 0; i < num_data; ++i) { - ordered_gradients[i] = gradients[data_indices[i]]; - ordered_hessians[i] = hessians[data_indices[i]]; - } - } else { - #pragma omp parallel for schedule(static) - for (data_size_t i = 0; i < num_data; ++i) { - ordered_gradients[i] = gradients[data_indices[i]]; + if (feature_groups_[group]->is_multi_val_) { + multi_val_groud_id = group; + } else { + used_dense_group.push_back(group); } } - ptr_ordered_grad = ordered_gradients; - ptr_ordered_hess = ordered_hessians; - if (!is_constant_hessian) { - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (int gi = 0; gi < num_used_group; ++gi) { - OMP_LOOP_EX_BEGIN(); - int group = used_group[gi]; - // feature is not used - auto data_ptr = hist_data + group_bin_boundaries_[group]; - const int num_bin = feature_groups_[group]->num_total_bin_; - std::memset(reinterpret_cast(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry)); - // construct histograms for smaller leaf - if (ref_ordered_bins[group] == nullptr) { - // if not use ordered bin - feature_groups_[group]->bin_data_->ConstructHistogram( - data_indices, - 0, - num_data, - ptr_ordered_grad, - ptr_ordered_hess, - data_ptr); - } else { - // used ordered bin - ref_ordered_bins[group]->ConstructHistogram(leaf_idx, - gradients, - hessians, - data_ptr); + } + int num_used_dense_group = static_cast(used_dense_group.size()); + global_timer.Stop("Dataset::Get used group"); + global_timer.Start("Dataset::dense_bin_histogram"); + if (num_used_dense_group > 0) { + auto ptr_ordered_grad = gradients; + auto ptr_ordered_hess = hessians; + if (data_indices != nullptr && num_data < num_data_) { + if (!is_constant_hessian) { +#pragma omp parallel for schedule(static) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; + ordered_hessians[i] = hessians[data_indices[i]]; + } + } else { +#pragma omp parallel for schedule(static) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; } - OMP_LOOP_EX_END(); } - OMP_THROW_EX(); - } else { - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (int gi = 0; gi < num_used_group; ++gi) { - OMP_LOOP_EX_BEGIN(); - int group = used_group[gi]; - // feature is not used - auto data_ptr = hist_data + group_bin_boundaries_[group]; - const int num_bin = feature_groups_[group]->num_total_bin_; - std::memset(reinterpret_cast(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry)); - // construct histograms for smaller leaf - if (ref_ordered_bins[group] == nullptr) { - // if not use ordered bin + ptr_ordered_grad = ordered_gradients; + ptr_ordered_hess = 
ordered_hessians; + if (!is_constant_hessian) { + OMP_INIT_EX(); +#pragma omp parallel for schedule(static) + for (int gi = 0; gi < num_used_dense_group; ++gi) { + OMP_LOOP_EX_BEGIN(); + int group = used_dense_group[gi]; + // feature is not used + auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; + const int num_bin = feature_groups_[group]->num_total_bin_; + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * KHistEntrySize); + // construct histograms for smaller leaf feature_groups_[group]->bin_data_->ConstructHistogram( - data_indices, - 0, - num_data, - ptr_ordered_grad, - data_ptr); - } else { - // used ordered bin - ref_ordered_bins[group]->ConstructHistogram(leaf_idx, - gradients, - data_ptr); - } - // fixed hessian. - for (int i = 0; i < num_bin; ++i) { - data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0]; + data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, + data_ptr); + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - } - } else { - if (!is_constant_hessian) { - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (int gi = 0; gi < num_used_group; ++gi) { - OMP_LOOP_EX_BEGIN(); - int group = used_group[gi]; - // feature is not used - auto data_ptr = hist_data + group_bin_boundaries_[group]; - const int num_bin = feature_groups_[group]->num_total_bin_; - std::memset(reinterpret_cast(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry)); - // construct histograms for smaller leaf - if (ref_ordered_bins[group] == nullptr) { - // if not use ordered bin + OMP_THROW_EX(); + + } else { + OMP_INIT_EX(); +#pragma omp parallel for schedule(static) + for (int gi = 0; gi < num_used_dense_group; ++gi) { + OMP_LOOP_EX_BEGIN(); + int group = used_dense_group[gi]; + // feature is not used + auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; + const int num_bin = feature_groups_[group]->num_total_bin_; + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * KHistEntrySize); + // construct histograms for smaller leaf feature_groups_[group]->bin_data_->ConstructHistogram( - 0, - num_data, - ptr_ordered_grad, - ptr_ordered_hess, - data_ptr); - } else { - // used ordered bin - ref_ordered_bins[group]->ConstructHistogram(leaf_idx, - gradients, - hessians, - data_ptr); + data_indices, 0, num_data, ptr_ordered_grad, data_ptr); + // fixed hessian. 
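
The `* 2` offsets and `num_bin * KHistEntrySize` memsets in this function reflect the new histogram layout: the old HistogramBinEntry struct (sum_gradients, sum_hessians, cnt) is replaced by a flat hist_t buffer with two entries per bin, gradient at 2*i and hessian at 2*i+1, and no stored count, so KHistEntrySize is presumably 2 * sizeof(hist_t). A sketch of the accessors and of the "fixed hessian" pass under that reading (hist_t = double is an assumption):

    #include <vector>

    using hist_t = double;  // assumed histogram entry type
    using score_t = float;

    // assumed equivalents of the GET_GRAD / GET_HESS macros in the patch
    #define GET_GRAD(hist, i) (hist)[(i) << 1]
    #define GET_HESS(hist, i) (hist)[((i) << 1) + 1]

    // With a constant hessian, the gradient-only kernel accumulates 1.0 into
    // each bin's hessian slot (an implicit count); one cheap pass afterwards
    // rescales those counts into true hessian sums.
    void FixConstantHessian(hist_t* hist, int num_bin, score_t const_hessian) {
      for (int i = 0; i < num_bin; ++i) {
        GET_HESS(hist, i) = GET_HESS(hist, i) * const_hessian;
      }
    }

Dropping the per-bin count shrinks every histogram by a third and keeps a bin's gradient and hessian on the same cache line.
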
+ for (int i = 0; i < num_bin; ++i) { + GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0]; + } + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); + OMP_THROW_EX(); } - OMP_THROW_EX(); } else { - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (int gi = 0; gi < num_used_group; ++gi) { - OMP_LOOP_EX_BEGIN(); - int group = used_group[gi]; - // feature is not used - auto data_ptr = hist_data + group_bin_boundaries_[group]; - const int num_bin = feature_groups_[group]->num_total_bin_; - std::memset(reinterpret_cast(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry)); - // construct histograms for smaller leaf - if (ref_ordered_bins[group] == nullptr) { - // if not use ordered bin + if (!is_constant_hessian) { + OMP_INIT_EX(); +#pragma omp parallel for schedule(static) + for (int gi = 0; gi < num_used_dense_group; ++gi) { + OMP_LOOP_EX_BEGIN(); + int group = used_dense_group[gi]; + // feature is not used + auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; + const int num_bin = feature_groups_[group]->num_total_bin_; + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * KHistEntrySize); + // construct histograms for smaller leaf feature_groups_[group]->bin_data_->ConstructHistogram( - 0, - num_data, - ptr_ordered_grad, - data_ptr); - } else { - // used ordered bin - ref_ordered_bins[group]->ConstructHistogram(leaf_idx, - gradients, - data_ptr); + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + OMP_LOOP_EX_END(); } - // fixed hessian. - for (int i = 0; i < num_bin; ++i) { - data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0]; + OMP_THROW_EX(); + } else { + OMP_INIT_EX(); +#pragma omp parallel for schedule(static) + for (int gi = 0; gi < num_used_dense_group; ++gi) { + OMP_LOOP_EX_BEGIN(); + int group = used_dense_group[gi]; + // feature is not used + auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; + const int num_bin = feature_groups_[group]->num_total_bin_; + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * KHistEntrySize); + // construct histograms for smaller leaf + feature_groups_[group]->bin_data_->ConstructHistogram( + 0, num_data, ptr_ordered_grad, data_ptr); + // fixed hessian. 
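
Zooming out from these kernels: when the user forces neither layout, TestMultiThreadingMethod above settles the col-wise vs. row-wise question empirically. It builds one histogram each way, times both with std::chrono, and keeps the faster strategy, discarding the loser (and warning that the row-wise path enlarges hist_buf_). A reduced sketch of that try-both pattern; the callable parameters are illustrative, not the patch's API:

    #include <chrono>
    #include <functional>

    // Time one build per strategy and report whether col-wise won.
    bool ChooseColWise(const std::function<void()>& build_col_wise,
                       const std::function<void()>& build_row_wise) {
      using clock = std::chrono::steady_clock;
      const auto t0 = clock::now();
      build_col_wise();
      const auto t1 = clock::now();
      build_row_wise();
      const auto t2 = clock::now();
      return (t1 - t0) < (t2 - t1);  // keep the faster of the two
    }
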
+ for (int i = 0; i < num_bin; ++i) { + GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0]; + } + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); + OMP_THROW_EX(); } - OMP_THROW_EX(); } } + global_timer.Stop("Dataset::dense_bin_histogram"); + if (multi_val_groud_id >= 0) { + ConstructHistogramsMultiVal(multi_val_bin, data_indices, num_data, gradients, hessians, is_constant_hessian, + hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); + } } -void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data, - HistogramBinEntry* data) const { +void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const { const int group = feature2group_[feature_idx]; const int sub_feature = feature2subfeature_[feature_idx]; const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get(); const int most_freq_bin = bin_mapper->GetMostFreqBin(); if (most_freq_bin > 0) { const int num_bin = bin_mapper->num_bin(); - data[most_freq_bin].sum_gradients = sum_gradient; - data[most_freq_bin].sum_hessians = sum_hessian; - data[most_freq_bin].cnt = num_data; + GET_GRAD(data, most_freq_bin) = sum_gradient; + GET_HESS(data, most_freq_bin) = sum_hessian; for (int i = 0; i < num_bin; ++i) { if (i != most_freq_bin) { - data[most_freq_bin].sum_gradients -= data[i].sum_gradients; - data[most_freq_bin].sum_hessians -= data[i].sum_hessians; - data[most_freq_bin].cnt -= data[i].cnt; + GET_GRAD(data, most_freq_bin) -= GET_GRAD(data, i); + GET_HESS(data, most_freq_bin) -= GET_HESS(data, i); } } } @@ -1094,7 +1366,7 @@ void PushClearIfEmpty(std::vector* dest, const size_t dest_len, const std::ve } } -void Dataset::addFeaturesFrom(Dataset* other) { +void Dataset::AddFeaturesFrom(Dataset* other) { if (other->num_data_ != num_data_) { throw std::runtime_error("Cannot add features from other Dataset with a different number of rows"); } diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 31288aea6675..f1e8b749f799 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -335,8 +335,6 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b mem_ptr += sizeof(dataset->use_missing_); dataset->zero_as_missing_ = *(reinterpret_cast(mem_ptr)); mem_ptr += sizeof(dataset->zero_as_missing_); - dataset->sparse_threshold_ = *(reinterpret_cast(mem_ptr)); - mem_ptr += sizeof(dataset->sparse_threshold_); const int* tmp_feature_map = reinterpret_cast(mem_ptr); dataset->used_feature_map_.clear(); for (int i = 0; i < dataset->num_total_features_; ++i) { diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index fab014c28675..082065ce40bf 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -31,9 +31,9 @@ class DenseBinIterator: public BinIterator { } inline uint32_t RawGet(data_size_t idx) override; inline uint32_t Get(data_size_t idx) override; - inline void Reset(data_size_t) override { } + inline void Reset(data_size_t) override {} - private: +private: const DenseBin* bin_data_; VAL_T min_bin_; VAL_T max_bin_; @@ -46,7 +46,7 @@ class DenseBinIterator: public BinIterator { */ template class DenseBin: public Bin { - public: +public: friend DenseBinIterator; explicit DenseBin(data_size_t num_data) : num_data_(num_data), data_(num_data_, static_cast(0)) { @@ -68,84 +68,65 @@ class DenseBin: public Bin { BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; - void ConstructHistogram(const data_size_t* data_indices, 
data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64 / sizeof(VAL_T); - const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T); + #define ACC_GH(hist, i, g, h) \ + const auto ti = static_cast(i) << 1; \ + hist[ti] += g; \ + hist[ti + 1] += h; \ + + template + void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const { data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + data_indices[i + pf_offset]); - const VAL_T bin = data_[data_indices[i]]; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; + + if (use_prefetch) { + const data_size_t pf_offset = 64 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + for (; i < pf_end; ++i) { + const auto idx = use_indices ? data_indices[i] : i; + const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset; + PREFETCH_T0(data_.data() + pf_idx); + const VAL_T bin = data_[idx]; + if (use_hessians) { + ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); + } else { + ACC_GH(out, bin, ordered_gradients[i], 1.0f); + } + } } - for (; i < end; i++) { - const VAL_T bin = data_[data_indices[i]]; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; + for (; i < end; ++i) { + const auto idx = use_indices ? data_indices[i] : i; + const VAL_T bin = data_[idx]; + if (use_hessians) { + ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); + } else { + ACC_GH(out, bin, ordered_gradients[i], 1.0f); + } } } + #undef ACC_GH + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, ordered_gradients, ordered_hessians, out); + } void ConstructHistogram(data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64 / sizeof(VAL_T); - const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T); - data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + i + pf_offset); - const VAL_T bin = data_[i]; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; - } - for (; i < end; i++) { - const VAL_T bin = data_[i]; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; - } + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, ordered_gradients, ordered_hessians, out); } void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64 / sizeof(VAL_T); - const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T); - data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + data_indices[i + pf_offset]); - const VAL_T bin = data_[data_indices[i]]; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } - for (; i < end; i++) { - const VAL_T 
bin = data_[data_indices[i]]; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, ordered_gradients, nullptr, out); } void ConstructHistogram(data_size_t start, data_size_t end, const score_t* ordered_gradients, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64 / sizeof(VAL_T); - const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T); - data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + i + pf_offset); - const VAL_T bin = data_[i]; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } - for (; i < end; i++) { - const VAL_T bin = data_[i]; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, ordered_gradients, nullptr, out); } data_size_t Split( @@ -257,9 +238,6 @@ class DenseBin: public Bin { data_size_t num_data() const override { return num_data_; } - /*! \brief not ordered bin for dense feature */ - OrderedBin* CreateOrderedBin() const override { return nullptr; } - void FinishLoad() override {} void LoadFromMemory(const void* memory, const std::vector& local_used_indices) override { @@ -287,17 +265,18 @@ class DenseBin: public Bin { } size_t SizesInByte() const override { - return sizeof(VAL_T) * num_data_; + return sizeof(VAL_T)* num_data_; } DenseBin* Clone() override; - private: +private: data_size_t num_data_; - std::vector data_; + std::vector> data_; DenseBin(const DenseBin& other) - : num_data_(other.num_data_), data_(other.data_){} + : num_data_(other.num_data_), data_(other.data_) { + } }; template diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp index 8c3c18cdd707..c65540d9733b 100644 --- a/src/io/dense_nbits_bin.hpp +++ b/src/io/dense_nbits_bin.hpp @@ -16,7 +16,7 @@ namespace LightGBM { class Dense4bitsBin; class Dense4bitsBinIterator : public BinIterator { - public: +public: explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) : bin_data_(bin_data), min_bin_(static_cast(min_bin)), max_bin_(static_cast(max_bin)), @@ -31,7 +31,7 @@ class Dense4bitsBinIterator : public BinIterator { inline uint32_t Get(data_size_t idx) override; inline void Reset(data_size_t) override {} - private: +private: const Dense4bitsBin* bin_data_; uint8_t min_bin_; uint8_t max_bin_; @@ -40,12 +40,12 @@ class Dense4bitsBinIterator : public BinIterator { }; class Dense4bitsBin : public Bin { - public: +public: friend Dense4bitsBinIterator; explicit Dense4bitsBin(data_size_t num_data) : num_data_(num_data) { int len = (num_data_ + 1) / 2; - data_ = std::vector(len, static_cast(0)); + data_.resize(len, static_cast(0)); buf_ = std::vector(len, static_cast(0)); } @@ -73,88 +73,65 @@ class Dense4bitsBin : public Bin { inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; - void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, - const score_t* ordered_gradients, const score_t* ordered_hessians, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64; - const data_size_t pf_end = end - pf_offset - kCacheLineSize; + #define ACC_GH(hist, i, g, h) \ + const auto ti = (i) << 1; \ + hist[ti] += g; \ + hist[ti + 1] += h; \ + + template + void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, 
data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const { data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1)); - const data_size_t idx = data_indices[i]; - const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; + + if (use_prefetch) { + const data_size_t pf_offset = 64; + const data_size_t pf_end = end - pf_offset; + for (; i < pf_end; ++i) { + const auto idx = use_indices ? data_indices[i] : i; + const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset; + PREFETCH_T0(data_.data() + (pf_idx >> 1)); + const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; + if (use_hessians) { + ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); + } else { + ACC_GH(out, bin, ordered_gradients[i], 1.0f); + } + } } - for (; i < end; i++) { - const data_size_t idx = data_indices[i]; + for (; i < end; ++i) { + const auto idx = use_indices ? data_indices[i] : i; const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; + if (use_hessians) { + ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); + } else { + ACC_GH(out, bin, ordered_gradients[i], 1.0f); + } } } + #undef ACC_GH + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, ordered_gradients, ordered_hessians, out); + } void ConstructHistogram(data_size_t start, data_size_t end, const score_t* ordered_gradients, const score_t* ordered_hessians, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64; - const data_size_t pf_end = end - pf_offset - kCacheLineSize; - data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + ((i + pf_offset) >> 1)); - const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; - } - for (; i < end; i++) { - const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - out[bin].sum_hessians += ordered_hessians[i]; - ++out[bin].cnt; - } + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, ordered_gradients, ordered_hessians, out); } void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, const score_t* ordered_gradients, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64; - const data_size_t pf_end = end - pf_offset - kCacheLineSize; - data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1)); - const data_size_t idx = data_indices[i]; - const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } - for (; i < end; i++) { - const data_size_t idx = data_indices[i]; - const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, ordered_gradients, nullptr, out); } void 
ConstructHistogram(data_size_t start, data_size_t end, const score_t* ordered_gradients, - HistogramBinEntry* out) const override { - const data_size_t pf_offset = 64; - const data_size_t pf_end = end - pf_offset - kCacheLineSize; - data_size_t i = start; - for (; i < pf_end; i++) { - PREFETCH_T0(data_.data() + ((i + pf_offset) >> 1)); - const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } - for (; i < end; i++) { - const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; - out[bin].sum_gradients += ordered_gradients[i]; - ++out[bin].cnt; - } + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, ordered_gradients, nullptr, out); } data_size_t Split( @@ -266,8 +243,6 @@ class Dense4bitsBin : public Bin { data_size_t num_data() const override { return num_data_; } - /*! \brief not ordered bin for dense feature */ - OrderedBin* CreateOrderedBin() const override { return nullptr; } void FinishLoad() override { if (buf_.empty()) { return; } @@ -325,19 +300,20 @@ class Dense4bitsBin : public Bin { } size_t SizesInByte() const override { - return sizeof(uint8_t) * data_.size(); + return sizeof(uint8_t)* data_.size(); } Dense4bitsBin* Clone() override { return new Dense4bitsBin(*this); } - protected: +protected: Dense4bitsBin(const Dense4bitsBin& other) - : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {} + : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) { + } data_size_t num_data_; - std::vector data_; + std::vector> data_; std::vector buf_; }; diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp new file mode 100644 index 000000000000..e9589ce192b1 --- /dev/null +++ b/src/io/multi_val_dense_bin.hpp @@ -0,0 +1,168 @@ +/*! + * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
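
All four public ConstructHistogram overloads in dense_bin.hpp now forward to a single ConstructHistogramInner. Its template parameters (the angle brackets are lost in this rendering, but the body reads them as use_indices, use_prefetch, and use_hessians) are compile-time booleans, so every `if (use_...)` is folded away at instantiation and the hot loop carries no per-row branching. A reduced model of the pattern, with use_prefetch omitted for brevity and illustrative stand-in names:

    #include <cstdint>

    using data_size_t = int;
    using score_t = float;
    using hist_t = double;

    template <bool use_indices, bool use_hessians>
    void HistInner(const uint8_t* bins, const data_size_t* indices,
                   data_size_t start, data_size_t end,
                   const score_t* grad, const score_t* hess, hist_t* out) {
      for (data_size_t i = start; i < end; ++i) {
        const data_size_t idx = use_indices ? indices[i] : i;  // folded at compile time
        const int ti = static_cast<int>(bins[idx]) << 1;
        out[ti] += grad[i];
        out[ti + 1] += use_hessians ? hess[i] : 1.0f;  // 1.0f keeps an implicit count
      }
    }

    // Each public overload just picks an instantiation, e.g.:
    void Hist(const uint8_t* bins, data_size_t start, data_size_t end,
              const score_t* grad, hist_t* out) {
      HistInner<false, false>(bins, nullptr, start, end, grad, nullptr, out);
    }

The use_prefetch parameter additionally guards a prologue loop that issues PREFETCH_T0 hints a fixed distance ahead, leaving a plain tail loop for the last pf_offset rows.
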
+ */ +#ifndef LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ +#define LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ + + +#include +#include + +#include +#include +#include + +namespace LightGBM { + + +template +class MultiValDenseBin : public MultiValBin { +public: + + explicit MultiValDenseBin(data_size_t num_data, int num_bin, int num_feature) + : num_data_(num_data), num_bin_(num_bin), num_feature_(num_feature) { + data_.resize(static_cast(num_data_) * num_feature_, static_cast(0)); + } + + ~MultiValDenseBin() { + } + + data_size_t num_data() const override { + return num_data_; + } + + int num_bin() const override { + return num_bin_; + } + + + void PushOneRow(int , data_size_t idx, const std::vector& values) override { + auto start = RowPtr(idx); + CHECK(num_feature_ == static_cast(values.size())); + for (auto i = 0; i < num_feature_; ++i) { + data_[start + i] = static_cast(values[i]); + } + } + + void FinishLoad() override { + + } + + bool IsSparse() override{ + return false; + } + + void ReSize(data_size_t num_data) override { + if (num_data_ != num_data) { + num_data_ = num_data; + } + } + + #define ACC_GH(hist, i, g, h) \ + const auto ti = static_cast(i) << 1; \ + hist[ti] += g; \ + hist[ti + 1] += h; \ + + template + void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, hist_t* out) const { + data_size_t i = start; + if (use_prefetch) { + const data_size_t pf_offset = 32 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + + for (; i < pf_end; ++i) { + const auto idx = use_indices ? data_indices[i] : i; + const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset; + PREFETCH_T0(gradients + pf_idx); + if (use_hessians) { + PREFETCH_T0(hessians + pf_idx); + } + PREFETCH_T0(data_.data() + RowPtr(pf_idx)); + const auto j_start = RowPtr(idx); + for (auto j = j_start; j < j_start + num_feature_; ++j) { + const VAL_T bin = data_[j]; + if (use_hessians) { + ACC_GH(out, bin, gradients[idx], hessians[idx]); + } else { + ACC_GH(out, bin, gradients[idx], 1.0f); + } + } + } + } + for (; i < end; ++i) { + const auto idx = use_indices ? 
data_indices[i] : i; + const auto j_start = RowPtr(idx); + for (auto j = j_start; j < j_start + num_feature_; ++j) { + const VAL_T bin = data_[j]; + if (use_hessians) { + ACC_GH(out, bin, gradients[idx], hessians[idx]); + } else { + ACC_GH(out, bin, gradients[idx], 1.0f); + } + } + } + } + #undef ACC_GH + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, gradients, hessians, out); + } + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, gradients, hessians, out); + } + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, gradients, nullptr, out); + } + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* gradients, + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, gradients, nullptr, out); + } + + void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override { + auto other_bin = dynamic_cast*>(full_bin); + data_.clear(); + for (data_size_t i = 0; i < num_used_indices; ++i) { + for (int64_t j = other_bin->RowPtr(used_indices[i]); j < other_bin->RowPtr(used_indices[i] + 1); ++j) { + data_.push_back(other_bin->data_[j]); + } + } + } + + inline int64_t RowPtr(data_size_t idx) const { + return static_cast(idx) * num_feature_; + } + + MultiValDenseBin* Clone() override; + +private: + data_size_t num_data_; + int num_bin_; + int num_feature_; + std::vector> data_; + + MultiValDenseBin(const MultiValDenseBin& other) + : num_data_(other.num_data_), num_bin_(other.num_bin_), num_feature_(other.num_feature_), data_(other.data_) { + } +}; + +template +MultiValDenseBin* MultiValDenseBin::Clone() { + return new MultiValDenseBin(*this); +} + + + +} // namespace LightGBM +#endif // LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_ diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp new file mode 100644 index 000000000000..7f6fc866f617 --- /dev/null +++ b/src/io/multi_val_sparse_bin.hpp @@ -0,0 +1,204 @@ +/*! + * Copyright (c) 2020 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
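
The new multi_val_dense_bin.hpp above stores one bin id per (row, feature) in row-major order: RowPtr(idx) is simply idx * num_feature_, so the row-wise histogram pass loads each row's gradient and hessian once and streams across that row's (already offset-adjusted) bin ids. A reduced sketch, with types assumed:

    #include <cstdint>
    #include <vector>

    using data_size_t = int;
    using score_t = float;
    using hist_t = double;

    // Row-wise pass over a row-major multi-val layout: each row's
    // gradient/hessian is read once and reused across all its features.
    void RowWiseHistogram(const std::vector<uint8_t>& data, int num_feature,
                          data_size_t num_data, const score_t* grad,
                          const score_t* hess, hist_t* out) {
      for (data_size_t i = 0; i < num_data; ++i) {
        const int64_t row = static_cast<int64_t>(i) * num_feature;  // RowPtr(i)
        for (int j = 0; j < num_feature; ++j) {
          const int ti = static_cast<int>(data[row + j]) << 1;
          out[ti] += grad[i];
          out[ti + 1] += hess[i];
        }
      }
    }

This shape is what favors row-wise building on dense, many-feature data; the cost, as the Log::Info above warns, is that every thread accumulates into its own full-size histogram copy before a merge.
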
+ */ +#ifndef LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ +#define LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ + + +#include +#include + +#include +#include +#include + +namespace LightGBM { + + +template +class MultiValSparseBin : public MultiValBin { +public: + + explicit MultiValSparseBin(data_size_t num_data, int num_bin) + : num_data_(num_data), num_bin_(num_bin) { + row_ptr_.resize(num_data_ + 1, 0); + data_.reserve(num_data_); + int num_threads = 1; + #pragma omp parallel + #pragma omp master + { + num_threads = omp_get_num_threads(); + } + if (num_threads > 1) { + t_data_.resize(num_threads - 1); + } + } + + ~MultiValSparseBin() { + } + + data_size_t num_data() const override { + return num_data_; + } + + int num_bin() const override { + return num_bin_; + } + + + void PushOneRow(int tid, data_size_t idx, const std::vector & values) override { + row_ptr_[idx + 1] = static_cast(values.size()); + if (tid == 0) { + for (auto val : values) { + data_.push_back(static_cast(val)); + } + } else { + for (auto val : values) { + t_data_[tid - 1].push_back(static_cast(val)); + } + } + } + + void FinishLoad() override { + for (data_size_t i = 0; i < num_data_; ++i) { + row_ptr_[i + 1] += row_ptr_[i]; + } + if (t_data_.size() > 0) { + size_t offset = data_.size(); + data_.resize(row_ptr_[num_data_]); + for (size_t tid = 0; tid < t_data_.size(); ++tid) { + std::memcpy(data_.data() + offset, t_data_[tid].data(), t_data_[tid].size() * sizeof(VAL_T)); + offset += t_data_[tid].size(); + t_data_[tid].clear(); + } + } + row_ptr_.shrink_to_fit(); + data_.shrink_to_fit(); + t_data_.clear(); + t_data_.shrink_to_fit(); + } + + bool IsSparse() override { + return true; + } + + void ReSize(data_size_t num_data) override { + if (num_data_ != num_data) { + num_data_ = num_data; + } + } + + #define ACC_GH(hist, i, g, h) \ + const auto ti = static_cast(i) << 1; \ + hist[ti] += g; \ + hist[ti + 1] += h; \ + + template + void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, hist_t* out) const { + data_size_t i = start; + if (use_prefetch) { + const data_size_t pf_offset = 32 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + + for (; i < pf_end; ++i) { + const auto idx = use_indices ? data_indices[i] : i; + const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset; + PREFETCH_T0(gradients + pf_idx); + if (use_hessians) { + PREFETCH_T0(hessians + pf_idx); + } + PREFETCH_T0(row_ptr_.data() + pf_idx); + PREFETCH_T0(data_.data() + row_ptr_[pf_idx]); + const auto j_start = RowPtr(idx); + const auto j_end = RowPtr(idx + 1); + for (auto j = j_start; j < j_end; ++j) { + const VAL_T bin = data_[j]; + if (use_hessians) { + ACC_GH(out, bin, gradients[idx], hessians[idx]); + } else { + ACC_GH(out, bin, gradients[idx], 1.0f); + } + } + } + } + for (; i < end; ++i) { + const auto idx = use_indices ? 
data_indices[i] : i; + const auto j_start = RowPtr(idx); + const auto j_end = RowPtr(idx + 1); + for (auto j = j_start; j < j_end; ++j) { + const VAL_T bin = data_[j]; + if (use_hessians) { + ACC_GH(out, bin, gradients[idx], hessians[idx]); + } else { + ACC_GH(out, bin, gradients[idx], 1.0f); + } + } + } + } + #undef ACC_GH + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, gradients, hessians, out); + } + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* hessians, + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, gradients, hessians, out); + } + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients, + hist_t* out) const override { + ConstructHistogramInner(data_indices, start, end, gradients, nullptr, out); + } + + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* gradients, + hist_t* out) const override { + ConstructHistogramInner(nullptr, start, end, gradients, nullptr, out); + } + + void CopySubset(const Bin * full_bin, const data_size_t * used_indices, data_size_t num_used_indices) override { + auto other_bin = dynamic_cast*>(full_bin); + row_ptr_.resize(num_data_ + 1, 0); + data_.clear(); + for (data_size_t i = 0; i < num_used_indices; ++i) { + for (data_size_t j = other_bin->row_ptr_[used_indices[i]]; j < other_bin->row_ptr_[used_indices[i] + 1]; ++j) { + data_.push_back(other_bin->data_[j]); + } + row_ptr_[i + 1] = row_ptr_[i] + other_bin->row_ptr_[used_indices[i] + 1] - other_bin->row_ptr_[used_indices[i]]; + } + } + + inline data_size_t RowPtr(data_size_t idx) const { + return row_ptr_[idx]; + } + + MultiValSparseBin* Clone() override; + +private: + data_size_t num_data_; + int num_bin_; + std::vector> data_; + std::vector> row_ptr_; + std::vector> t_data_; + + MultiValSparseBin(const MultiValSparseBin & other) + : num_data_(other.num_data_), num_bin_(other.num_bin_), data_(other.data_), row_ptr_(other.row_ptr_) { + } +}; + +template +MultiValSparseBin* MultiValSparseBin::Clone() { + return new MultiValSparseBin(*this); +} + + + +} // namespace LightGBM +#endif // LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_ diff --git a/src/io/ordered_sparse_bin.hpp b/src/io/ordered_sparse_bin.hpp deleted file mode 100644 index d1b8bdf61bf9..000000000000 --- a/src/io/ordered_sparse_bin.hpp +++ /dev/null @@ -1,156 +0,0 @@ -/*! - * Copyright (c) 2016 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_ -#define LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_ - -#include - -#include -#include -#include -#include -#include -#include - -#include "sparse_bin.hpp" - -namespace LightGBM { - -/*! -* \brief Interface for ordered bin data. efficient for construct histogram, especially for sparse bin -* There are 2 advantages by using ordered bin. -* 1. group the data by leafs to improve the cache hit. -* 2. only store the non-zero bin, which can speed up the histogram consturction for sparse features. -* However it brings additional cost: it need re-order the bins after every split, which will cost much for dense feature. -* So we only using ordered bin for sparse situations. 
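
The sparse counterpart above (multi_val_sparse_bin.hpp) keeps a CSR-style layout: PushOneRow records each row's non-default-bin count in row_ptr_[idx + 1] while staging values per thread in t_data_, and FinishLoad prefix-sums the counts into offsets and concatenates the per-thread buffers. That concatenation preserves row order because, as far as the patch shows, PushDataToMultiValBin hands each thread one contiguous, ordered block of rows. A reduced sketch of the count-to-offset step:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    using data_size_t = int;

    // CSR-style container: after FinishLoad, row i's bin ids live in
    // data[row_ptr[i] .. row_ptr[i + 1]).
    struct SparseRows {
      std::vector<data_size_t> row_ptr;  // size num_data + 1; starts as counts
      std::vector<uint16_t> data;        // concatenated per-row bin ids

      void FinishLoad() {
        for (std::size_t i = 1; i < row_ptr.size(); ++i) {
          row_ptr[i] += row_ptr[i - 1];  // counts -> cumulative offsets
        }
      }
    };
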
-*/ -template -class OrderedSparseBin: public OrderedBin { - public: - /*! \brief Pair to store one bin entry */ - struct SparsePair { - data_size_t ridx; // data(row) index - VAL_T bin; // bin for this data - SparsePair() : ridx(0), bin(0) {} - }; - - explicit OrderedSparseBin(const SparseBin* bin_data) - :bin_data_(bin_data) { - data_size_t cur_pos = 0; - data_size_t i_delta = -1; - int non_zero_cnt = 0; - while (bin_data_->NextNonzero(&i_delta, &cur_pos)) { - ++non_zero_cnt; - } - ordered_pair_.resize(non_zero_cnt); - leaf_cnt_.push_back(non_zero_cnt); - } - - ~OrderedSparseBin() { - } - - void Init(const char* used_idices, int num_leaves) override { - // initialize the leaf information - leaf_start_ = std::vector(num_leaves, 0); - leaf_cnt_ = std::vector(num_leaves, 0); - if (used_idices == nullptr) { - // if using all data, copy all non-zero pair - data_size_t j = 0; - data_size_t cur_pos = 0; - data_size_t i_delta = -1; - while (bin_data_->NextNonzero(&i_delta, &cur_pos)) { - ordered_pair_[j].ridx = cur_pos; - ordered_pair_[j].bin = bin_data_->vals_[i_delta]; - ++j; - } - leaf_cnt_[0] = static_cast(j); - } else { - // if using part of data(bagging) - data_size_t j = 0; - data_size_t cur_pos = 0; - data_size_t i_delta = -1; - while (bin_data_->NextNonzero(&i_delta, &cur_pos)) { - if (used_idices[cur_pos]) { - ordered_pair_[j].ridx = cur_pos; - ordered_pair_[j].bin = bin_data_->vals_[i_delta]; - ++j; - } - } - leaf_cnt_[0] = j; - } - } - - void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian, - HistogramBinEntry* out) const override { - // get current leaf boundary - const data_size_t start = leaf_start_[leaf]; - const data_size_t end = start + leaf_cnt_[leaf]; - for (data_size_t i = start; i < end; ++i) { - const VAL_T bin = ordered_pair_[i].bin; - const auto g = gradient[ordered_pair_[i].ridx]; - const auto h = hessian[ordered_pair_[i].ridx]; - - out[bin].sum_gradients += g; - out[bin].sum_hessians += h; - ++out[bin].cnt; - } - } - - void ConstructHistogram(int leaf, const score_t* gradient, - HistogramBinEntry* out) const override { - // get current leaf boundary - const data_size_t start = leaf_start_[leaf]; - const data_size_t end = start + leaf_cnt_[leaf]; - for (data_size_t i = start; i < end; ++i) { - const VAL_T bin = ordered_pair_[i].bin; - const auto g = gradient[ordered_pair_[i].ridx]; - out[bin].sum_gradients += g; - ++out[bin].cnt; - } - } - - void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) override { - // get current leaf boundary - const data_size_t l_start = leaf_start_[leaf]; - const data_size_t l_end = l_start + leaf_cnt_[leaf]; - // new left leaf end after split - data_size_t new_left_end = l_start; - - for (data_size_t i = l_start; i < l_end; ++i) { - if (is_in_leaf[ordered_pair_[i].ridx] == mark) { - std::swap(ordered_pair_[new_left_end], ordered_pair_[i]); - ++new_left_end; - } - } - - leaf_start_[right_leaf] = new_left_end; - leaf_cnt_[leaf] = new_left_end - l_start; - leaf_cnt_[right_leaf] = l_end - new_left_end; - } - data_size_t NonZeroCount(int leaf) const override { - return static_cast(leaf_cnt_[leaf]); - } - /*! \brief Disable copy */ - OrderedSparseBin& operator=(const OrderedSparseBin&) = delete; - /*! \brief Disable copy */ - OrderedSparseBin(const OrderedSparseBin&) = delete; - - private: - const SparseBin* bin_data_; - /*! \brief Store non-zero pair , group by leaf */ - std::vector ordered_pair_; - /*! \brief leaf_start_[i] means data in i-th leaf start from */ - std::vector leaf_start_; - /*! 
\brief leaf_cnt_[i] means number of data in i-th leaf */ - std::vector leaf_cnt_; -}; - -template -OrderedBin* SparseBin::CreateOrderedBin() const { - return new OrderedSparseBin(this); -} - -} // namespace LightGBM -#endif // LightGBM_IO_ORDERED_SPARSE_BIN_HPP_ diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index 7cd2d7c15e89..07898fa1ac65 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -24,7 +24,7 @@ const size_t kNumFastIndex = 64; template class SparseBinIterator: public BinIterator { - public: +public: SparseBinIterator(const SparseBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) : bin_data_(bin_data), min_bin_(static_cast(min_bin)), @@ -56,7 +56,7 @@ class SparseBinIterator: public BinIterator { inline void Reset(data_size_t idx) override; - private: +private: const SparseBin* bin_data_; data_size_t cur_pos_; data_size_t i_delta_; @@ -66,20 +66,16 @@ class SparseBinIterator: public BinIterator { uint8_t offset_; }; -template -class OrderedSparseBin; - template class SparseBin: public Bin { - public: +public: friend class SparseBinIterator; - friend class OrderedSparseBin; explicit SparseBin(data_size_t num_data) : num_data_(num_data) { int num_threads = 1; -#pragma omp parallel -#pragma omp master + #pragma omp parallel + #pragma omp master { num_threads = omp_get_num_threads(); } @@ -102,41 +98,97 @@ class SparseBin: public Bin { BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; - void ConstructHistogram(const data_size_t*, data_size_t, data_size_t, const score_t*, - const score_t*, HistogramBinEntry*) const override { - // Will use OrderedSparseBin->ConstructHistogram() instead - Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); + #define ACC_GH(hist, i, g, h) \ + const auto ti = static_cast(i) << 1; \ + hist[ti] += g; \ + hist[ti + 1] += h; \ + + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override { + data_size_t i_delta, cur_pos; + InitIndex(data_indices[start], &i_delta, &cur_pos); + data_size_t i = start; + for (;;) { + if (cur_pos < data_indices[i]) { + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { break; } + } else if (cur_pos > data_indices[i]) { + if (++i >= end) { break; } + } else { + const VAL_T bin = vals_[i_delta]; + ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]); + if (++i >= end) { break; } + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { break; } + } + } } - void ConstructHistogram(data_size_t, data_size_t, const score_t*, - const score_t*, HistogramBinEntry*) const override { - // Will use OrderedSparseBin->ConstructHistogram() instead - Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const override { + data_size_t i_delta, cur_pos; + InitIndex(start, &i_delta, &cur_pos); + while (cur_pos < start && i_delta < num_vals_) { + cur_pos += deltas_[++i_delta]; + } + while (cur_pos < end && i_delta < num_vals_) { + const VAL_T bin = vals_[i_delta]; + ACC_GH(out, bin, ordered_gradients[cur_pos], ordered_hessians[cur_pos]); + cur_pos += deltas_[++i_delta]; + } } - void ConstructHistogram(const data_size_t*, data_size_t, data_size_t, const score_t*, - HistogramBinEntry*) const override { - // Will use 
OrderedSparseBin->ConstructHistogram() instead - Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); + void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + data_size_t i_delta, cur_pos; + InitIndex(data_indices[start], &i_delta, &cur_pos); + data_size_t i = start; + for (;;) { + if (cur_pos < data_indices[i]) { + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { break; } + } else if (cur_pos > data_indices[i]) { + if (++i >= end) { break; } + } else { + const VAL_T bin = vals_[i_delta]; + ACC_GH(out, bin, ordered_gradients[i], 1.0f); + if (++i >= end) { break; } + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { break; } + } + } } - void ConstructHistogram(data_size_t, data_size_t, const score_t*, - HistogramBinEntry*) const override { - // Will use OrderedSparseBin->ConstructHistogram() instead - Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); + void ConstructHistogram(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + data_size_t i_delta, cur_pos; + InitIndex(start, &i_delta, &cur_pos); + while (cur_pos < start && i_delta < num_vals_) { + cur_pos += deltas_[++i_delta]; + } + while (cur_pos < end && i_delta < num_vals_) { + const VAL_T bin = vals_[i_delta]; + ACC_GH(out, bin, ordered_gradients[cur_pos], 1.0f); + cur_pos += deltas_[++i_delta]; + } } + #undef ACC_GH - inline bool NextNonzero(data_size_t* i_delta, - data_size_t* cur_pos) const { - ++(*i_delta); - data_size_t shift = 0; - data_size_t delta = deltas_[*i_delta]; - while (*i_delta < num_vals_ && vals_[*i_delta] == 0) { - ++(*i_delta); - shift += 8; - delta |= static_cast(deltas_[*i_delta]) << shift; + inline void NextNonzeroFast(data_size_t* i_delta, + data_size_t* cur_pos) const { + *cur_pos += deltas_[++(*i_delta)]; + if (*i_delta >= num_vals_) { + *cur_pos = num_data_; } - *cur_pos += delta; + } + + inline bool NextNonzero(data_size_t* i_delta, + data_size_t* cur_pos) const { + *cur_pos += deltas_[++(*i_delta)]; if (*i_delta < num_vals_) { return true; } else { @@ -257,8 +309,6 @@ class SparseBin: public Bin { data_size_t num_data() const override { return num_data_; } - OrderedBin* CreateOrderedBin() const override; - void FinishLoad() override { // get total non zero size size_t pair_cnt = 0; @@ -276,8 +326,8 @@ class SparseBin: public Bin { // sort by data index std::sort(idx_val_pairs.begin(), idx_val_pairs.end(), [](const std::pair& a, const std::pair& b) { - return a.first < b.first; - }); + return a.first < b.first; + }); // load delta array LoadFromPair(idx_val_pairs); } @@ -291,11 +341,12 @@ class SparseBin: public Bin { const data_size_t cur_idx = idx_val_pairs[i].first; const VAL_T bin = idx_val_pairs[i].second; data_size_t cur_delta = cur_idx - last_idx; + // disallow the multi-val in one row if (i > 0 && cur_delta == 0) { continue; } while (cur_delta >= 256) { - deltas_.push_back(cur_delta & 0xff); + deltas_.push_back(255); vals_.push_back(0); - cur_delta >>= 8; + cur_delta -= 255; } deltas_.push_back(static_cast(cur_delta)); vals_.push_back(bin); @@ -384,7 +435,7 @@ class SparseBin: public Bin { while (cur_pos < idx && j < num_vals_) { NextNonzero(&j, &cur_pos); } - if (cur_pos == idx && j < num_vals_) { + if (cur_pos == idx && j < num_vals_ && vals_[j] > 0) { // new row index is i tmp_pair.emplace_back(i, vals_[j]); } @@ -405,13 +456,13 @@ class SparseBin: public Bin { // transform to delta 
array data_size_t last_idx = 0; for (data_size_t i = 0; i < num_used_indices; ++i) { - VAL_T bin = iterator.InnerRawGet(used_indices[i]); + auto bin = iterator.InnerRawGet(used_indices[i]); if (bin > 0) { data_size_t cur_delta = i - last_idx; while (cur_delta >= 256) { - deltas_.push_back(cur_delta & 0xff); + deltas_.push_back(255); vals_.push_back(0); - cur_delta >>= 8; + cur_delta -= 255; } deltas_.push_back(static_cast(cur_delta)); vals_.push_back(bin); @@ -432,15 +483,29 @@ class SparseBin: public Bin { SparseBin* Clone() override; - protected: SparseBin(const SparseBin& other) : num_data_(other.num_data_), deltas_(other.deltas_), vals_(other.vals_), - num_vals_(other.num_vals_), push_buffers_(other.push_buffers_), - fast_index_(other.fast_index_), fast_index_shift_(other.fast_index_shift_) {} + num_vals_(other.num_vals_), push_buffers_(other.push_buffers_), + fast_index_(other.fast_index_), fast_index_shift_(other.fast_index_shift_) { + } + + void InitIndex(data_size_t start_idx, data_size_t * i_delta, data_size_t * cur_pos) const { + auto idx = start_idx >> fast_index_shift_; + if (static_cast(idx) < fast_index_.size()) { + const auto fast_pair = fast_index_[start_idx >> fast_index_shift_]; + *i_delta = fast_pair.first; + *cur_pos = fast_pair.second; + } else { + *i_delta = -1; + *cur_pos = 0; + } + } + +private: data_size_t num_data_; - std::vector deltas_; - std::vector vals_; + std::vector> deltas_; + std::vector> vals_; data_size_t num_vals_; std::vector>> push_buffers_; std::vector> fast_index_; @@ -460,7 +525,7 @@ inline uint32_t SparseBinIterator::RawGet(data_size_t idx) { template inline VAL_T SparseBinIterator::InnerRawGet(data_size_t idx) { while (cur_pos_ < idx) { - bin_data_->NextNonzero(&i_delta_, &cur_pos_); + bin_data_->NextNonzeroFast(&i_delta_, &cur_pos_); } if (cur_pos_ == idx) { return bin_data_->vals_[i_delta_]; @@ -471,15 +536,7 @@ inline VAL_T SparseBinIterator::InnerRawGet(data_size_t idx) { template inline void SparseBinIterator::Reset(data_size_t start_idx) { - auto idx = start_idx >> bin_data_->fast_index_shift_; - if (static_cast(idx) < bin_data_->fast_index_.size()) { - const auto fast_pair = bin_data_->fast_index_[start_idx >> bin_data_->fast_index_shift_]; - i_delta_ = fast_pair.first; - cur_pos_ = fast_pair.second; - } else { - i_delta_ = -1; - cur_pos_ = 0; - } + bin_data_->InitIndex(start_idx, &i_delta_, &cur_pos_); } template diff --git a/src/objective/rank_xendcg_objective.hpp b/src/objective/rank_xendcg_objective.hpp index 81c8ab70f33f..1f9d4ae75327 100644 --- a/src/objective/rank_xendcg_objective.hpp +++ b/src/objective/rank_xendcg_objective.hpp @@ -73,9 +73,9 @@ class RankXENDCG: public ObjectiveFunction { // Skip query if sum of labels is 0. 
float sum_labels = 0; for (data_size_t i = 0; i < cnt; ++i) { - sum_labels += phi(label[i], gammas[i]); + sum_labels += static_cast(phi(label[i], gammas[i])); } - if (sum_labels == 0) { + if (std::fabs(sum_labels) < kEpsilon) { return; } @@ -111,7 +111,7 @@ class RankXENDCG: public ObjectiveFunction { } double phi(const label_t l, double g) const { - return Common::Pow(2, l) - g; + return Common::Pow(2, static_cast(l)) - g; } const char* GetName() const override { diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index f415e4e59991..a728a17b7193 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -27,7 +27,7 @@ void DataParallelTreeLearner::Init(const Dataset* train_data, boo rank_ = Network::rank(); num_machines_ = Network::num_machines(); // allocate buffer for communication - size_t buffer_size = this->train_data_->NumTotalBin() * sizeof(HistogramBinEntry); + size_t buffer_size = this->train_data_->NumTotalBin() * KHistEntrySize; input_buffer_.resize(buffer_size); output_buffer_.resize(buffer_size); @@ -82,7 +82,7 @@ void DataParallelTreeLearner::BeforeTrain() { if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { num_bin -= 1; } - block_len_[i] += num_bin * sizeof(HistogramBinEntry); + block_len_[i] += num_bin * KHistEntrySize; } reduce_scatter_size_ += block_len_[i]; } @@ -101,7 +101,7 @@ void DataParallelTreeLearner::BeforeTrain() { if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { num_bin -= 1; } - bin_size += num_bin * sizeof(HistogramBinEntry); + bin_size += num_bin * KHistEntrySize; } } @@ -113,7 +113,7 @@ void DataParallelTreeLearner::BeforeTrain() { if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { num_bin -= 1; } - bin_size += num_bin * sizeof(HistogramBinEntry); + bin_size += num_bin * KHistEntrySize; } // sync global data sumup info @@ -158,8 +158,8 @@ void DataParallelTreeLearner::FindBestSplits() { this->smaller_leaf_histogram_array_[feature_index].SizeOfHistgram()); } // Reduce scatter for histogram - Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(), - block_len_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramBinEntry::SumReducer); + Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(), + block_len_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramSumReducer); this->FindBestSplitsFromHistograms(this->is_feature_used_, true); } @@ -186,7 +186,6 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms(const this->train_data_->FixHistogram(feature_index, this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), - GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()), this->smaller_leaf_histogram_array_[feature_index].RawData()); SplitInfo smaller_split; // find best threshold for smaller child diff --git a/src/treelearner/data_partition.hpp b/src/treelearner/data_partition.hpp index 4cf9b5b22756..98847683f5dc 100644 --- a/src/treelearner/data_partition.hpp +++ b/src/treelearner/data_partition.hpp @@ -108,58 +108,70 @@ class DataPartition { * \param threshold threshold that want to split * \param right_leaf index of right leaf */ - void Split(int leaf, const Dataset* dataset, int feature, const uint32_t* threshold, int num_threshold, bool default_left, int right_leaf) { + void 
Split(int leaf, const Dataset* dataset, int feature, + const uint32_t* threshold, int num_threshold, bool default_left, + int right_leaf) { + Common::FunctionTimer fun_timer("DataPartition::Split", global_timer); const data_size_t min_inner_size = 512; // get leaf boundary const data_size_t begin = leaf_begin_[leaf]; const data_size_t cnt = leaf_count_[leaf]; - data_size_t inner_size = (cnt + num_threads_ - 1) / num_threads_; - if (inner_size < min_inner_size) { inner_size = min_inner_size; } + const int nblock = + std::min(num_threads_, (cnt + min_inner_size - 1) / min_inner_size); + data_size_t inner_size = SIZE_ALIGNED((cnt + nblock - 1) / nblock); + auto left_start = indices_.data() + begin; + global_timer.Start("DataPartition::Split.MT"); // split data multi-threading OMP_INIT_EX(); - #pragma omp parallel for schedule(static, 1) - for (int i = 0; i < num_threads_; ++i) { +#pragma omp parallel for schedule(static, 1) + for (int i = 0; i < nblock; ++i) { OMP_LOOP_EX_BEGIN(); - left_cnts_buf_[i] = 0; - right_cnts_buf_[i] = 0; data_size_t cur_start = i * inner_size; - if (cur_start > cnt) { continue; } - data_size_t cur_cnt = inner_size; - if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; } + data_size_t cur_cnt = std::min(inner_size, cnt - cur_start); + if (cur_cnt <= 0) { + left_cnts_buf_[i] = 0; + right_cnts_buf_[i] = 0; + continue; + } // split data inner, reduce the times of function called - data_size_t cur_left_count = dataset->Split(feature, threshold, num_threshold, default_left, indices_.data() + begin + cur_start, cur_cnt, - temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start); + data_size_t cur_left_count = + dataset->Split(feature, threshold, num_threshold, default_left, + left_start + cur_start, cur_cnt, + temp_left_indices_.data() + cur_start, + temp_right_indices_.data() + cur_start); offsets_buf_[i] = cur_start; left_cnts_buf_[i] = cur_left_count; right_cnts_buf_[i] = cur_cnt - cur_left_count; OMP_LOOP_EX_END(); } OMP_THROW_EX(); - data_size_t left_cnt = 0; + global_timer.Stop("DataPartition::Split.MT"); + global_timer.Start("DataPartition::Split.Merge"); left_write_pos_buf_[0] = 0; right_write_pos_buf_[0] = 0; - for (int i = 1; i < num_threads_; ++i) { - left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1]; - right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1]; + for (int i = 1; i < nblock; ++i) { + left_write_pos_buf_[i] = + left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1]; + right_write_pos_buf_[i] = + right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1]; } - left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1]; - // copy back indices of right leaf to indices_ - #pragma omp parallel for schedule(static, 1) - for (int i = 0; i < num_threads_; ++i) { - if (left_cnts_buf_[i] > 0) { - std::memcpy(indices_.data() + begin + left_write_pos_buf_[i], - temp_left_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t)); - } - if (right_cnts_buf_[i] > 0) { - std::memcpy(indices_.data() + begin + left_cnt + right_write_pos_buf_[i], - temp_right_indices_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t)); - } + data_size_t left_cnt = + left_write_pos_buf_[nblock - 1] + left_cnts_buf_[nblock - 1]; + + auto right_start = left_start + left_cnt; +#pragma omp parallel for schedule(static) + for (int i = 0; i < nblock; ++i) { + std::copy_n(temp_left_indices_.data() + offsets_buf_[i], + left_cnts_buf_[i], left_start + 
left_write_pos_buf_[i]); + std::copy_n(temp_right_indices_.data() + offsets_buf_[i], + right_cnts_buf_[i], right_start + right_write_pos_buf_[i]); } // update leaf boundary leaf_count_[leaf] = left_cnt; leaf_begin_[right_leaf] = left_cnt + begin; leaf_count_[right_leaf] = cnt - left_cnt; + global_timer.Stop("DataPartition::Split.Merge"); } /*! @@ -201,11 +213,11 @@ class DataPartition { /*! \brief number of data on one leaf */ std::vector leaf_count_; /*! \brief Store all data's indices, order by leaf[data_in_leaf0,..,data_leaf1,..] */ - std::vector indices_; + std::vector> indices_; /*! \brief team indices buffer for split */ - std::vector temp_left_indices_; + std::vector> temp_left_indices_; /*! \brief team indices buffer for split */ - std::vector temp_right_indices_; + std::vector> temp_right_indices_; /*! \brief used data indices, used for bagging */ const data_size_t* used_data_indices_; /*! \brief used data count, used for bagging */ diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 8e0c3c585d7c..6dde6c4a541e 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -5,6 +5,7 @@ #ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ #define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ +#include #include #include @@ -20,7 +21,7 @@ namespace LightGBM { class FeatureMetainfo { - public: +public: int num_bin; MissingType missing_type; int8_t offset = 0; @@ -35,7 +36,7 @@ class FeatureMetainfo { * \brief FeatureHistogram is used to construct and store a histogram for a feature. */ class FeatureHistogram { - public: +public: FeatureHistogram() { data_ = nullptr; } @@ -53,19 +54,19 @@ class FeatureHistogram { * \param feature the feature data for this histogram * \param min_num_data_one_leaf minimal number of data in one leaf */ - void Init(HistogramBinEntry* data, const FeatureMetainfo* meta) { + void Init(hist_t* data, const FeatureMetainfo* meta) { meta_ = meta; data_ = data; if (meta_->bin_type == BinType::NumericalBin) { find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1 - , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); + , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); } else { find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1 - , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); + , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); } } - HistogramBinEntry* RawData() { + hist_t* RawData() { return data_; } /*! 
@@ -73,15 +74,13 @@ class FeatureHistogram { * \param other The histogram that want to subtract */ void Subtract(const FeatureHistogram& other) { - for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { - data_[i].cnt -= other.data_[i].cnt; - data_[i].sum_gradients -= other.data_[i].sum_gradients; - data_[i].sum_hessians -= other.data_[i].sum_hessians; + for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) { + data_[i] -= other.data_[i]; } } void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - SplitInfo* output) { + SplitInfo* output) { output->default_left = true; output->gain = kMinScore; find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output); @@ -89,10 +88,10 @@ class FeatureHistogram { } void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - SplitInfo* output) { + SplitInfo* output) { is_splittable_ = false; double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step); + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { if (meta_->missing_type == MissingType::Zero) { @@ -116,8 +115,8 @@ class FeatureHistogram { } void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data, - double min_constraint, double max_constraint, - SplitInfo* output) { + double min_constraint, double max_constraint, + SplitInfo* output) { output->default_left = false; double best_gain = kMinScore; data_size_t best_left_count = 0; @@ -134,25 +133,28 @@ class FeatureHistogram { bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot; int best_threshold = -1; int best_dir = 1; - + const double cnt_factor = num_data / sum_hessian; if (use_onehot) { for (int t = 0; t < used_bin; ++t) { + const auto grad = GET_GRAD(data_, t); + const auto hess = GET_HESS(data_, t); + data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); // if data not enough, or sum hessian too small - if (data_[t].cnt < meta_->config->min_data_in_leaf - || data_[t].sum_hessians < meta_->config->min_sum_hessian_in_leaf) continue; - data_size_t other_count = num_data - data_[t].cnt; + if (cnt < meta_->config->min_data_in_leaf + || hess < meta_->config->min_sum_hessian_in_leaf) continue; + data_size_t other_count = num_data - cnt; // if data not enough if (other_count < meta_->config->min_data_in_leaf) continue; - double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon; + double sum_other_hessian = sum_hessian - hess - kEpsilon; // if sum hessian too small if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) continue; - double sum_other_gradient = sum_gradient - data_[t].sum_gradients; + double sum_other_gradient = sum_gradient - grad; // current split gain - double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint, 0); + double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + 
min_constraint, max_constraint, 0); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -161,15 +163,15 @@ class FeatureHistogram { // better split point if (current_gain > best_gain) { best_threshold = t; - best_sum_left_gradient = data_[t].sum_gradients; - best_sum_left_hessian = data_[t].sum_hessians + kEpsilon; - best_left_count = data_[t].cnt; + best_sum_left_gradient = grad; + best_sum_left_hessian = hess + kEpsilon; + best_left_count = cnt; best_gain = current_gain; } } } else { for (int i = 0; i < used_bin; ++i) { - if (data_[i].cnt >= meta_->config->cat_smooth) { + if (Common::RoundInt(GET_HESS(data_, i) * cnt_factor) >= meta_->config->cat_smooth) { sorted_idx.push_back(i); } } @@ -181,9 +183,9 @@ class FeatureHistogram { return (sum_grad) / (sum_hess + meta_->config->cat_smooth); }; std::sort(sorted_idx.begin(), sorted_idx.end(), - [this, &ctr_fun](int i, int j) { - return ctr_fun(data_[i].sum_gradients, data_[i].sum_hessians) < ctr_fun(data_[j].sum_gradients, data_[j].sum_hessians); - }); + [this, &ctr_fun](int i, int j) { + return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) < ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j)); + }); std::vector find_direction(1, 1); std::vector start_position(1, 0); @@ -203,14 +205,17 @@ class FeatureHistogram { for (int i = 0; i < used_bin && i < max_num_cat; ++i) { auto t = sorted_idx[start_pos]; start_pos += dir; + const auto grad = GET_GRAD(data_, t); + const auto hess = GET_HESS(data_, t); + data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); - sum_left_gradient += data_[t].sum_gradients; - sum_left_hessian += data_[t].sum_hessians; - left_count += data_[t].cnt; - cnt_cur_group += data_[t].cnt; + sum_left_gradient += grad; + sum_left_hessian += hess; + left_count += cnt; + cnt_cur_group += cnt; if (left_count < meta_->config->min_data_in_leaf - || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue; + || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue; data_size_t right_count = num_data - left_count; if (right_count < meta_->config->min_data_in_leaf || right_count < min_data_per_group) break; @@ -223,8 +228,8 @@ class FeatureHistogram { double sum_right_gradient = sum_gradient - sum_left_gradient; double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint, 0); + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + min_constraint, max_constraint, 0); if (current_gain <= min_gain_shift) continue; is_splittable_ = true; if (current_gain > best_gain) { @@ -241,15 +246,15 @@ class FeatureHistogram { if (is_splittable_) { output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + min_constraint, max_constraint); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + sum_hessian - best_sum_left_hessian, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + 
min_constraint, max_constraint); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; @@ -279,22 +284,22 @@ class FeatureHistogram { } void GatherInfoForThreshold(double sum_gradient, double sum_hessian, - uint32_t threshold, data_size_t num_data, SplitInfo *output) { + uint32_t threshold, data_size_t num_data, SplitInfo* output) { if (meta_->bin_type == BinType::NumericalBin) { GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold, - num_data, output); + num_data, output); } else { GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold, - num_data, output); + num_data, output); } } void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian, - uint32_t threshold, data_size_t num_data, - SplitInfo *output) { + uint32_t threshold, data_size_t num_data, + SplitInfo* output) { double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step); + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; // do stuff here @@ -315,27 +320,29 @@ class FeatureHistogram { int t = meta_->num_bin - 1 - offset - use_na_as_missing; const int t_end = 1 - offset; - + const double cnt_factor = num_data / sum_hessian; // from right to left, and we don't need data in bin0 for (; t >= t_end; --t) { if (static_cast(t + offset) < threshold) { break; } // need to skip default bin if (skip_default_bin && (t + offset) == static_cast(meta_->default_bin)) { continue; } - - sum_right_gradient += data_[t].sum_gradients; - sum_right_hessian += data_[t].sum_hessians; - right_count += data_[t].cnt; + const auto grad = GET_GRAD(data_, t); + const auto hess = GET_HESS(data_, t); + data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); + sum_right_gradient += grad; + sum_right_hessian += hess; + right_count += cnt; } double sum_left_gradient = sum_gradient - sum_right_gradient; double sum_left_hessian = sum_hessian - sum_right_hessian; data_size_t left_count = num_data - right_count; double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step) - + GetLeafSplitGain(sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step); + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step) + + GetLeafSplitGain(sum_right_gradient, sum_right_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step); // gain with split is worse than without split if (std::isnan(current_gain) || current_gain <= min_gain_shift) { @@ -347,15 +354,15 @@ class FeatureHistogram { // update split information output->threshold = threshold; output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step); + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step); output->left_count = left_count; output->left_sum_gradient = sum_left_gradient; output->left_sum_hessian = sum_left_hessian - kEpsilon; output->right_output = CalculateSplittedLeafOutput(sum_gradient - sum_left_gradient, - sum_hessian - sum_left_hessian, - 
meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step); + sum_hessian - sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step); output->right_count = num_data - left_count; output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon; @@ -365,13 +372,13 @@ class FeatureHistogram { } void GatherInfoForThresholdCategorical(double sum_gradient, double sum_hessian, - uint32_t threshold, data_size_t num_data, SplitInfo *output) { + uint32_t threshold, data_size_t num_data, SplitInfo* output) { // get SplitInfo for a given one-hot categorical split. output->default_left = false; double gain_shift = GetLeafSplitGain( - sum_gradient, sum_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, - meta_->config->max_delta_step); + sum_gradient, sum_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step); double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; bool is_full_categorical = meta_->missing_type == MissingType::None; int used_bin = meta_->num_bin - 1 + is_full_categorical; @@ -380,21 +387,25 @@ class FeatureHistogram { Log::Warning("Invalid categorical threshold split"); return; } + const double cnt_factor = num_data / sum_hessian; + const auto grad = GET_GRAD(data_, threshold); + const auto hess = GET_HESS(data_, threshold); + data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); double l2 = meta_->config->lambda_l2; - data_size_t left_count = data_[threshold].cnt; + data_size_t left_count = cnt; data_size_t right_count = num_data - left_count; - double sum_left_hessian = data_[threshold].sum_hessians + kEpsilon; + double sum_left_hessian = hess + kEpsilon; double sum_right_hessian = sum_hessian - sum_left_hessian; - double sum_left_gradient = data_[threshold].sum_gradients; + double sum_left_gradient = grad; double sum_right_gradient = sum_gradient - sum_left_gradient; // current split gain double current_gain = GetLeafSplitGain(sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step) - + GetLeafSplitGain(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step); + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step) + + GetLeafSplitGain(sum_left_gradient, sum_left_hessian, + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step); if (std::isnan(current_gain) || current_gain <= min_gain_shift) { output->gain = kMinScore; Log::Warning("'Forced Split' will be ignored since the gain getting worse. 
"); @@ -402,14 +413,14 @@ class FeatureHistogram { } output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step); + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step); output->left_count = left_count; output->left_sum_gradient = sum_left_gradient; output->left_sum_hessian = sum_left_hessian - kEpsilon; output->right_output = CalculateSplittedLeafOutput(sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, l2, - meta_->config->max_delta_step); + meta_->config->lambda_l1, l2, + meta_->config->max_delta_step); output->right_count = right_count; output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_hessian = sum_right_hessian - kEpsilon; @@ -423,14 +434,14 @@ class FeatureHistogram { * \brief Binary size of this histogram */ int SizeOfHistgram() const { - return (meta_->num_bin - meta_->offset) * sizeof(HistogramBinEntry); + return (meta_->num_bin - meta_->offset) * KHistEntrySize; } /*! * \brief Restore histogram from memory */ void FromMemory(char* memory_data) { - std::memcpy(data_, memory_data, (meta_->num_bin - meta_->offset) * sizeof(HistogramBinEntry)); + std::memcpy(data_, memory_data, (meta_->num_bin - meta_->offset) * KHistEntrySize); } /*! @@ -457,11 +468,11 @@ class FeatureHistogram { } } - private: +private: static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, - double sum_right_gradients, double sum_right_hessians, - double l1, double l2, double max_delta_step, - double min_constraint, double max_constraint, int8_t monotone_constraint) { + double sum_right_gradients, double sum_right_hessians, + double l1, double l2, double max_delta_step, + double min_constraint, double max_constraint, int8_t monotone_constraint) { double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint, max_constraint); double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint, max_constraint); if (((monotone_constraint > 0) && (left_output > right_output)) || @@ -479,7 +490,7 @@ class FeatureHistogram { * \return leaf output */ static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step, - double min_constraint, double max_constraint) { + double min_constraint, double max_constraint) { double ret = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step); if (ret < min_constraint) { ret = min_constraint; @@ -506,7 +517,7 @@ class FeatureHistogram { } void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, - double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) { + double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) { const int8_t offset = meta_->offset; double best_sum_left_gradient = NAN; @@ -514,7 +525,7 @@ class FeatureHistogram { double best_gain = kMinScore; data_size_t best_left_count = 0; uint32_t best_threshold = static_cast(meta_->num_bin); - + const double cnt_factor = num_data / sum_hessian; if (dir == -1) { double sum_right_gradient = 0.0f; double sum_right_hessian = kEpsilon; @@ -528,12 +539,15 @@ class FeatureHistogram { // need to skip default bin if (skip_default_bin && (t + offset) == static_cast(meta_->default_bin)) { 
continue; } - sum_right_gradient += data_[t].sum_gradients; - sum_right_hessian += data_[t].sum_hessians; - right_count += data_[t].cnt; + const auto grad = GET_GRAD(data_, t); + const auto hess = GET_HESS(data_, t); + data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); + sum_right_gradient += grad; + sum_right_hessian += hess; + right_count += cnt; // if data not enough, or sum hessian too small if (right_count < meta_->config->min_data_in_leaf - || sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) continue; + || sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) continue; data_size_t left_count = num_data - right_count; // if data not enough if (left_count < meta_->config->min_data_in_leaf) break; @@ -545,8 +559,8 @@ class FeatureHistogram { double sum_left_gradient = sum_gradient - sum_right_gradient; // current split gain double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint, meta_->monotone_type); + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, + min_constraint, max_constraint, meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -575,9 +589,12 @@ class FeatureHistogram { sum_left_hessian = sum_hessian - kEpsilon; left_count = num_data; for (int i = 0; i < meta_->num_bin - offset; ++i) { - sum_left_gradient -= data_[i].sum_gradients; - sum_left_hessian -= data_[i].sum_hessians; - left_count -= data_[i].cnt; + const auto grad = GET_GRAD(data_, i); + const auto hess = GET_HESS(data_, i); + data_size_t cnt = static_cast(Common::RoundInt(hess * cnt_factor)); + sum_left_gradient -= grad; + sum_left_hessian -= hess; + left_count -= cnt; } t = -1; } @@ -586,13 +603,13 @@ class FeatureHistogram { // need to skip default bin if (skip_default_bin && (t + offset) == static_cast(meta_->default_bin)) { continue; } if (t >= 0) { - sum_left_gradient += data_[t].sum_gradients; - sum_left_hessian += data_[t].sum_hessians; - left_count += data_[t].cnt; + sum_left_gradient += GET_GRAD(data_, t); + sum_left_hessian += GET_HESS(data_, t); + left_count += static_cast(Common::RoundInt(GET_HESS(data_, t) * cnt_factor)); } // if data not enough, or sum hessian too small if (left_count < meta_->config->min_data_in_leaf - || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue; + || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue; data_size_t right_count = num_data - left_count; // if data not enough if (right_count < meta_->config->min_data_in_leaf) break; @@ -604,8 +621,8 @@ class FeatureHistogram { double sum_right_gradient = sum_gradient - sum_left_gradient; // current split gain double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint, meta_->monotone_type); + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, + min_constraint, max_constraint, meta_->monotone_type); // gain with split is worse than without split if (current_gain <= min_gain_shift) continue; @@ -626,15 +643,15 @@ class FeatureHistogram { // update split information output->threshold = best_threshold; output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, - 
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, + min_constraint, max_constraint); output->left_count = best_left_count; output->left_sum_gradient = best_sum_left_gradient; output->left_sum_hessian = best_sum_left_hessian - kEpsilon; output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, - min_constraint, max_constraint); + sum_hessian - best_sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, + min_constraint, max_constraint); output->right_count = num_data - best_left_count; output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; @@ -645,14 +662,13 @@ class FeatureHistogram { const FeatureMetainfo* meta_; /*! \brief sum of gradient of each bin */ - HistogramBinEntry* data_; - // std::vector data_; + hist_t* data_; bool is_splittable_ = true; std::function find_best_threshold_fun_; }; class HistogramPool { - public: +public: /*! * \brief Constructor */ @@ -698,7 +714,7 @@ class HistogramPool { } } - void DynamicChangeSize(const Dataset* train_data, const Config* config, int cache_size, int total_size) { + void DynamicChangeSize(const Dataset* train_data, bool is_hist_colwise, const Config* config, int cache_size, int total_size) { if (feature_metas_.empty()) { uint64_t bin_cnt_over_features = 0; int num_feature = train_data->num_features(); @@ -720,7 +736,6 @@ class HistogramPool { } Log::Info("Total Bins %d", bin_cnt_over_features); } - uint64_t num_total_bin = train_data->NumTotalBin(); int old_cache_size = static_cast(pool_.size()); Reset(cache_size, total_size); @@ -728,24 +743,39 @@ class HistogramPool { pool_.resize(cache_size); data_.resize(cache_size); } + int num_total_bin = static_cast(train_data->NumTotalBin()); + std::vector offsets; + if (is_hist_colwise) { + int offset = 0; + for (int j = 0; j < train_data->num_features(); ++j) { + offset += train_data->SubFeatureBinOffset(j); + offsets.push_back(offset); + auto num_bin = train_data->FeatureNumBin(j); + if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) { + num_bin -= 1; + } + offset += num_bin; + } + } else { + num_total_bin = 1; + for (int j = 0; j < train_data->num_features(); ++j) { + offsets.push_back(num_total_bin); + num_total_bin += train_data->FeatureBinMapper(j)->num_bin(); + if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) { + num_total_bin -= 1; + } + } + } OMP_INIT_EX(); #pragma omp parallel for schedule(static) for (int i = old_cache_size; i < cache_size; ++i) { OMP_LOOP_EX_BEGIN(); pool_[i].reset(new FeatureHistogram[train_data->num_features()]); - data_[i].resize(num_total_bin); - uint64_t offset = 0; + data_[i].resize(num_total_bin * 2); for (int j = 0; j < train_data->num_features(); ++j) { - offset += static_cast(train_data->SubFeatureBinOffset(j)); - pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j]); - auto num_bin = train_data->FeatureNumBin(j); - if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) { - num_bin -= 1; - } - offset += static_cast(num_bin); + pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]); } - CHECK(offset == num_total_bin); OMP_LOOP_EX_END(); } OMP_THROW_EX(); @@ -816,9 +846,9 @@ 
class HistogramPool { inverse_mapper_[slot] = dst_idx; } - private: +private: std::vector> pool_; - std::vector> data_; + std::vector>> data_; std::vector feature_metas_; int cache_size_; int total_size_; diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 9dd584574c9b..ceb8f87d3e3d 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -49,15 +49,15 @@ void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { // some functions used for debugging the GPU histogram construction #if GPU_DEBUG > 0 -void PrintHistograms(HistogramBinEntry* h, size_t size) { - size_t total = 0; +void PrintHistograms(hist_t* h, size_t size) { + double total_hess = 0; for (size_t i = 0; i < size; ++i) { - printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt); - total += h[i].cnt; - if ((i & 3) == 3) + printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i)); + if ((i & 2) == 2) printf("\n"); + total_hess += GET_HESS(h, i); } - printf("\nTotal examples: %lu\n", total); + printf("\nSum hessians: %9.3g\n", total_hess); } union Float_t { @@ -69,27 +69,23 @@ union Float_t { }; -void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) { +void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) { size_t i; Float_t a, b; for (i = 0; i < size; ++i) { - a.f = h1[i].sum_gradients; - b.f = h2[i].sum_gradients; + a.f = GET_GRAD(h1, i); + b.f = GET_GRAD(h2, i); int32_t ulps = Float_t::ulp_diff(a, b); - if (fabs(h1[i].cnt - h2[i].cnt != 0)) { - printf("%d != %d\n", h1[i].cnt, h2[i].cnt); - goto err; - } if (ulps > 0) { - // printf("grad %g != %g (%d ULPs)\n", h1[i].sum_gradients, h2[i].sum_gradients, ulps); + // printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps); // goto err; } - a.f = h1[i].sum_hessians; - b.f = h2[i].sum_hessians; + a.f = GET_HESS(h1, i); + b.f = GET_HESS(h2, i); ulps = Float_t::ulp_diff(a, b); - if (ulps > 0) { - // printf("hessian %g != %g (%d ULPs)\n", h1[i].sum_hessians, h2[i].sum_hessians, ulps); - // goto err; + if (std::fabs(a.f - b.f) >= 1e-20) { + printf("hessian %g != %g (%d ULPs)\n", GET_HESS(h1, i), GET_HESS(h2, i), ulps); + goto err; } } return; @@ -191,7 +187,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur } template -void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { +void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) { HistType* hist_outputs = reinterpret_cast(host_histogram_outputs_); // when the output is ready, the computation is done histograms_wait_obj_.wait(); @@ -201,29 +197,25 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { continue; } int dense_group_index = dense_feature_group_map_[i]; - auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index); + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); if (device_bin_mults_[i] == 1) { for (int j = 0; j < bin_size; ++j) { - old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients; - old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians; - old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt; + GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ 
j); + GET_HESS(old_histogram_array, j) = GET_HESS(hist_outputs, i * device_bin_size_+ j); } } else { // values of this feature has been redistributed to multiple bins; need a reduction here int ind = 0; for (int j = 0; j < bin_size; ++j) { double sum_g = 0.0, sum_h = 0.0; - size_t cnt = 0; for (int k = 0; k < device_bin_mults_[i]; ++k) { - sum_g += hist_outputs[i * device_bin_size_+ ind].sum_gradients; - sum_h += hist_outputs[i * device_bin_size_+ ind].sum_hessians; - cnt += hist_outputs[i * device_bin_size_ + ind].cnt; + sum_g += GET_GRAD(hist_outputs, i * device_bin_size_+ ind); + sum_h += GET_HESS(hist_outputs, i * device_bin_size_+ ind); ind++; } - old_histogram_array[j].sum_gradients = sum_g; - old_histogram_array[j].sum_hessians = sum_h; - old_histogram_array[j].cnt = (data_size_t)cnt; + GET_GRAD(old_histogram_array, j) = sum_g; + GET_HESS(old_histogram_array, j) = sum_h; } } } @@ -233,7 +225,7 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { void GPUTreeLearner::AllocateGPUMemory() { num_dense_feature_groups_ = 0; for (int i = 0; i < num_feature_groups_; ++i) { - if (ordered_bins_[i] == nullptr) { + if (!train_data_->IsMultiGroup(i)) { num_dense_feature_groups_++; } } @@ -303,7 +295,7 @@ void GPUTreeLearner::AllocateGPUMemory() { device_data_indices_ = std::unique_ptr>(new boost::compute::vector(allocated_num_data_, ctx_)); boost::compute::fill(device_data_indices_->begin(), device_data_indices_->end(), 0, queue_); // histogram bin entry size depends on the precision (single/double) - hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(HistogramBinEntry) : sizeof(GPUHistogramBinEntry); + hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(hist_t) * 2 : sizeof(gpu_hist_t) * 2; Log::Info("Size of histogram bin entry: %d", hist_bin_entry_sz_); // create output buffer, each feature has a histogram with device_bin_size_ bins, // each work group generates a sub-histogram of dword_features_ features. 
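// ---------------------------------------------------------------------------
// Illustrative sketch (editor addition; AccumulateBin and ApproxBinCount are
// hypothetical helpers, not part of this patch). The hunks above size GPU
// buffers for the flat histogram layout that replaces HistogramBinEntry
// {sum_gradients, sum_hessians, cnt}: bin i now occupies two interleaved
// hist_t slots, hence "2 * sizeof(hist_t)" per bin, and the dropped count is
// recovered from the hessian sum (cf. cnt_factor = num_data / sum_hessian in
// feature_histogram.hpp). Assumes hist_t is double and that GET_GRAD/GET_HESS
// index slots 2*i and 2*i+1, matching the ACC_GH macro earlier in the diff.
#include <cmath>
#include <vector>

typedef double hist_t_sketch;  // stand-in for hist_t; gpu_hist_t is float unless gpu_use_dp

inline void AccumulateBin(std::vector<hist_t_sketch>* hist, int bin,
                          double grad, double hess) {
  (*hist)[2 * bin] += grad;      // gradient slot, cf. GET_GRAD(hist, bin)
  (*hist)[2 * bin + 1] += hess;  // hessian slot,  cf. GET_HESS(hist, bin)
}

inline int ApproxBinCount(double bin_hess, int num_data, double sum_hessian) {
  // stands in for Common::RoundInt(hess * cnt_factor); every data point
  // contributes exactly one hessian per feature, so counts are implied
  return static_cast<int>(std::lround(bin_hess * num_data / sum_hessian));
}
// ---------------------------------------------------------------------------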
@@ -326,7 +318,7 @@ void GPUTreeLearner::AllocateGPUMemory() { std::vector dense_dword_ind(dword_features_); for (int i = 0; i < num_feature_groups_; ++i) { // looking for dword_features_ non-sparse feature-groups - if (ordered_bins_[i] == nullptr) { + if (!train_data_->IsMultiGroup(i)) { dense_dword_ind[k] = i; // decide if we need to redistribute the bin double t = device_bin_size_ / static_cast(train_data_->FeatureGroupNumBin(i)); @@ -682,6 +674,9 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) { printf("bin size: "); #endif for (int i = 0; i < num_feature_groups_; ++i) { + if (train_data_->IsMultiGroup(i)) { + continue; + } #if GPU_DEBUG >= 1 printf("%d, ", train_data_->FeatureGroupNumBin(i)); #endif @@ -960,35 +955,34 @@ void GPUTreeLearner::ConstructHistograms(const std::vector& is_feature_u for (int feature_index = 0; feature_index < num_features_; ++feature_index) { if (!is_feature_used_[feature_index]) continue; if (!is_feature_used[feature_index]) continue; - if (ordered_bins_[train_data_->Feature2Group(feature_index)]) { + if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) { is_sparse_feature_used[feature_index] = 1; } else { is_dense_feature_used[feature_index] = 1; } } // construct smaller leaf - HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset; // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, nullptr, smaller_leaf_splits_->num_data_in_leaf(), nullptr, nullptr, nullptr, nullptr); // then construct sparse features on CPU - // We set data_indices to null to avoid rebuilding ordered gradients/hessians train_data_->ConstructHistograms(is_sparse_feature_used, - nullptr, smaller_leaf_splits_->num_data_in_leaf(), - smaller_leaf_splits_->LeafIndex(), - &ordered_bins_, gradients_, hessians_, + smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), + gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + multi_val_bin_.get(), is_hist_colwise_, ptr_smaller_leaf_hist_data); // wait for GPU to finish, only if GPU is actually used if (is_gpu_used) { if (config_->gpu_use_dp) { // use double precision - WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + WaitAndGetHistograms(ptr_smaller_leaf_hist_data); } else { // use single precision - WaitAndGetHistograms(ptr_smaller_leaf_hist_data); + WaitAndGetHistograms(ptr_smaller_leaf_hist_data); } } @@ -1000,48 +994,58 @@ void GPUTreeLearner::ConstructHistograms(const std::vector& is_feature_u continue; int dense_feature_group_index = dense_feature_group_map_[i]; size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); - HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; - HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); - HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size]; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset; + hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2; + hist_t* gpu_histogram = new hist_t[size * 2]; data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); printf("Comparing histogram for feature 
%d size %d, %lu bins\n", dense_feature_group_index, num_data, size); - std::copy(current_histogram, current_histogram + size, gpu_histogram); - std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry)); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - num_data != num_data_ ? smaller_leaf_splits_->data_indices() : nullptr, - num_data, - num_data != num_data_ ? ordered_gradients_.data() : gradients_, - num_data != num_data_ ? ordered_hessians_.data() : hessians_, - current_histogram); + std::copy(current_histogram, current_histogram + size * 2, gpu_histogram); + std::memset(current_histogram, 0, size * sizeof(hist_t) * 2); + if(train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr){continue;} + if (num_data != num_data_ ) { + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + smaller_leaf_splits_->data_indices(), + 0, + num_data, + ordered_gradients_.data(), + ordered_hessians_.data(), + current_histogram); + } else { + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + 0, + num_data, + gradients_, + hessians_, + current_histogram); + } CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); - std::copy(gpu_histogram, gpu_histogram + size, current_histogram); + std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); delete [] gpu_histogram; } #endif if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf - HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - KHistOffset; is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data()); // then construct sparse features on CPU - // We set data_indices to null to avoid rebuilding ordered gradients/hessians train_data_->ConstructHistograms(is_sparse_feature_used, - nullptr, larger_leaf_splits_->num_data_in_leaf(), - larger_leaf_splits_->LeafIndex(), - &ordered_bins_, gradients_, hessians_, + larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), + gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + multi_val_bin_.get(), is_hist_colwise_, ptr_larger_leaf_hist_data); // wait for GPU to finish, only if GPU is actually used if (is_gpu_used) { if (config_->gpu_use_dp) { // use double precision - WaitAndGetHistograms(ptr_larger_leaf_hist_data); + WaitAndGetHistograms(ptr_larger_leaf_hist_data); } else { // use single precision - WaitAndGetHistograms(ptr_larger_leaf_hist_data); + WaitAndGetHistograms(ptr_larger_leaf_hist_data); } } } diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h index 8686a5057510..f6e03acb5fa4 100644 --- a/src/treelearner/gpu_tree_learner.h +++ b/src/treelearner/gpu_tree_learner.h @@ -76,12 +76,7 @@ class GPUTreeLearner: public SerialTreeLearner { uint8_t s[4]; }; - /*! \brief Single precision histogram entiry for GPU */ - struct GPUHistogramBinEntry { - score_t sum_gradients; - score_t sum_hessians; - uint32_t cnt; - }; + typedef float gpu_hist_t; /*! 
* \brief Find the best number of workgroups processing one feature for maximizing efficiency @@ -133,7 +128,7 @@ class GPUTreeLearner: public SerialTreeLearner { * \param histograms Destination of histogram results from GPU. */ template - void WaitAndGetHistograms(HistogramBinEntry* histograms); + void WaitAndGetHistograms(hist_t* histograms); /*! * \brief Construct GPU histogram asynchronously. diff --git a/src/treelearner/ocl/histogram16.cl b/src/treelearner/ocl/histogram16.cl index efe0d8462cb8..eb15916066eb 100644 --- a/src/treelearner/ocl/histogram16.cl +++ b/src/treelearner/ocl/histogram16.cl @@ -163,7 +163,7 @@ R""() void within_kernel_reduction16x8(uchar8 feature_mask, __global const acc_type* restrict feature4_sub_hist, const uint skip_id, - acc_type stat_val, uint cnt_val, + acc_type stat_val, const ushort num_sub_hist, __global acc_type* restrict output_buf, __local acc_type * restrict local_hist) { @@ -181,33 +181,21 @@ void within_kernel_reduction16x8(uchar8 feature_mask, // 256 threads working on 8 features' 16 bins, gradient and hessian stat_val += *p; p += NUM_BINS * DWORD_FEATURES * 2; - if (ltid < LOCAL_SIZE_0 / 2) { - cnt_val += as_acc_int_type(*p); - } - p += NUM_BINS * DWORD_FEATURES; } // skip the counters we already have - p += 3 * DWORD_FEATURES * NUM_BINS; + p += 2 * DWORD_FEATURES * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { stat_val += *p; p += NUM_BINS * DWORD_FEATURES * 2; - if (ltid < LOCAL_SIZE_0 / 2) { - cnt_val += as_acc_int_type(*p); - } - p += NUM_BINS * DWORD_FEATURES; } #endif // printf("thread %d:feature=%d, bin_id=%d, hessian=%d, stat_val=%f, cnt=%d", ltid, feature_id, bin_id, is_hessian_first, stat_val, cnt_val); // now overwrite the local_hist for final reduction and output // reverse the f7...f0 order to match the real order feature_id = DWORD_FEATURES_MASK - feature_id; - local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + is_hessian_first] = stat_val; - bin_id = ltid >> (LOG2_DWORD_FEATURES); // range 0 - 16, for counter - if (ltid < LOCAL_SIZE_0 / 2) { - local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val); - } + local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + is_hessian_first] = stat_val; barrier(CLK_LOCAL_MEM_FENCE); - for (i = ltid; i < DWORD_FEATURES * 3 * NUM_BINS; i += lsize) { + for (i = ltid; i < DWORD_FEATURES * 2 * NUM_BINS; i += lsize) { output_buf[i] = local_hist[i]; } } @@ -335,7 +323,9 @@ __kernel void histogram16(__global const uchar4* feature_data_base, bk7_c_f0_bin16 bk7_c_f1_bin16 bk7_c_f2_bin16 bk7_c_f3_bin16 bk7_c_f4_bin16 bk7_c_f5_bin16 bk7_c_f6_bin16 bk7_c_f7_bin0 ----------------------------------------------- */ + #if CONST_HESSIAN == 1 __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS); + #endif // thread 0, 1, 2, 3, 4, 5, 6, 7 compute histograms for gradients first // thread 8, 9, 10, 11, 12, 13, 14, 15 compute histograms for hessians first @@ -547,7 +537,7 @@ R""() atomic_local_add_f(gh_hist + addr2, stat2); #endif } - + #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter // there are 8 counters for 8 features // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6, 7's counts for example 0, 1, 2, 3, 4, 5, 6, 7 @@ -614,6 +604,7 @@ R""() // printf("thread %x add counter %d feature %d (7)\n", ltid, bin, offset); atom_inc(cnt_hist + addr); } + #endif stat1 = stat1_next; stat2 = stat2_next; feature4 = feature4_next; @@ -642,6 +633,7 @@ R""() ushort bank_id = (i + offset) & BANK_MASK; stat_val += gh_hist[bin_id 
* HG_BIN_MULT + bank_id * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + feature_id]; } + #if CONST_HESSIAN == 1 if (ltid < LOCAL_SIZE_0 / 2) { // first 128 threads accumulate the 8 * 16 = 128 counter values bin_id = ltid >> LOG2_DWORD_FEATURES; // bits 3 - 6 range 0 - 16 is bin ID @@ -651,6 +643,7 @@ R""() cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * DWORD_FEATURES + feature_id]; } } + #endif // now thread 0 - 7 holds feature 0 - 7's gradient for bin 0 and counter bin 0 // now thread 8 - 15 holds feature 0 - 7's hessian for bin 0 and counter bin 1 @@ -687,7 +680,7 @@ R""() // write to output // write gradients and hessians histogram for all 4 features // output data in linear order for further reduction - // output size = 4 (features) * 3 (counters) * 64 (bins) * sizeof(float) + // output size = 4 (features) * 2 (counters) * 64 (bins) * sizeof(float) /* memory layout of output: g_f0_bin0 g_f1_bin0 g_f2_bin0 g_f3_bin0 g_f4_bin0 g_f5_bin0 g_f6_bin0 g_f7_bin0 h_f0_bin0 h_f1_bin0 h_f2_bin0 h_f3_bin0 h_f4_bin0 h_f5_bin0 h_f6_bin0 h_f7_bin0 @@ -705,14 +698,10 @@ R""() // if there is only one workgroup processing this feature4, don't even need to write uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS); #if POWER_FEATURE_WORKGROUPS != 0 - __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 3 * NUM_BINS; + __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 2 * NUM_BINS; // if g_val and h_val are double, they are converted to float here // write gradients and hessians for 8 features output[0 * DWORD_FEATURES * NUM_BINS + ltid] = stat_val; - // write counts for 8 features - if (ltid < LOCAL_SIZE_0 / 2) { - output[2 * DWORD_FEATURES * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val); - } barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE); // To avoid the cost of an extra reduction kernel, we have to deal with some @@ -738,7 +727,7 @@ R""() // This is done by using a global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL.
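Before the workgroup-synchronization code that follows, note that every 3 * NUM_BINS → 2 * NUM_BINS stride change in these kernels traces back to the same host-side layout change: a histogram bin no longer stores an explicit count, only a gradient/hessian pair, with HistogramBinEntry giving way to a flat hist_t array. A minimal C++ sketch of the before/after layout (field types are approximate, and the accessor helpers are illustrative rather than LightGBM API):

// Old layout (removed): array-of-structs, three fields per histogram bin.
struct HistogramBinEntry {
  double sum_gradients;
  double sum_hessians;
  int cnt;  // per-bin data count, now gone
};

// New layout: one flat array, two doubles per bin, interleaved as
// [g0, h0, g1, h1, ...]. Offsets are expressed in hist_t elements,
// which is why GroupBinBoundary(...) is multiplied by 2 and buffers
// are allocated as new hist_t[size * 2] in the GPU learner above.
typedef double hist_t;

inline hist_t BinGrad(const hist_t* hist, int bin) { return hist[2 * bin]; }
inline hist_t BinHess(const hist_t* hist, int bin) { return hist[2 * bin + 1]; }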
- __local uint * counter_val = cnt_hist; + __local uint * counter_val = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS); if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atom_inc(sync_counters + feature4_id); @@ -762,12 +751,12 @@ R""() // locate our feature4's block in output memory uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS); __global acc_type const * restrict feature4_subhists = - (__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 3 * NUM_BINS; + (__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 2 * NUM_BINS; // skip reading the data already in local memory uint skip_id = group_id ^ output_offset; // locate output histogram location for this feature4 - __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 3 * NUM_BINS; - within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val, cnt_val, + __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 2 * NUM_BINS; + within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val, 1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array); } } @@ -776,4 +765,3 @@ R""() // the +9 skips extra characters ")", newline, "#endif" and newline at the beginning // )"" "\n#endif" + 9 #endif - diff --git a/src/treelearner/ocl/histogram256.cl b/src/treelearner/ocl/histogram256.cl index 0fa4bed88412..6030044614a4 100644 --- a/src/treelearner/ocl/histogram256.cl +++ b/src/treelearner/ocl/histogram256.cl @@ -155,15 +155,6 @@ void within_kernel_reduction256x4(uchar4 feature_mask, acc_type f1_hess_bin = local_hist[ltid * 8 + 5]; acc_type f2_hess_bin = local_hist[ltid * 8 + 6]; acc_type f3_hess_bin = local_hist[ltid * 8 + 7]; - __local uint* restrict local_cnt = (__local uint *)(local_hist + 4 * 2 * NUM_BINS); - #if POWER_FEATURE_WORKGROUPS != 0 - uint f0_cont_bin = ltid ? 
local_cnt[ltid * 4] : old_val_f0_cont_bin0; - #else - uint f0_cont_bin = local_cnt[ltid * 4]; - #endif - uint f1_cont_bin = local_cnt[ltid * 4 + 1]; - uint f2_cont_bin = local_cnt[ltid * 4 + 2]; - uint f3_cont_bin = local_cnt[ltid * 4 + 3]; ushort i; // printf("%d-pre(skip %d): %f %f %f %f %f %f %f %f %d %d %d %d", ltid, skip_id, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin); #if POWER_FEATURE_WORKGROUPS != 0 @@ -173,70 +164,62 @@ void within_kernel_reduction256x4(uchar4 feature_mask, if (feature_mask.s3) { f0_grad_bin += *p; p += NUM_BINS; f0_hess_bin += *p; p += NUM_BINS; - f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } if (feature_mask.s2) { f1_grad_bin += *p; p += NUM_BINS; f1_hess_bin += *p; p += NUM_BINS; - f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } if (feature_mask.s1) { f2_grad_bin += *p; p += NUM_BINS; f2_hess_bin += *p; p += NUM_BINS; - f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } if (feature_mask.s0) { f3_grad_bin += *p; p += NUM_BINS; f3_hess_bin += *p; p += NUM_BINS; - f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } } // skip the counters we already have - p += 3 * 4 * NUM_BINS; + p += 2 * 4 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { if (feature_mask.s3) { f0_grad_bin += *p; p += NUM_BINS; f0_hess_bin += *p; p += NUM_BINS; - f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } if (feature_mask.s2) { f1_grad_bin += *p; p += NUM_BINS; f1_hess_bin += *p; p += NUM_BINS; - f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } if (feature_mask.s1) { f2_grad_bin += *p; p += NUM_BINS; f2_hess_bin += *p; p += NUM_BINS; - f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } if (feature_mask.s0) { f3_grad_bin += *p; p += NUM_BINS; f3_hess_bin += *p; p += NUM_BINS; - f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS; } else { - p += 3 * NUM_BINS; + p += 2 * NUM_BINS; } } // printf("%d-aft: %f %f %f %f %f %f %f %f %d %d %d %d", ltid, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin); @@ -245,18 +228,14 @@ void within_kernel_reduction256x4(uchar4 feature_mask, barrier(CLK_LOCAL_MEM_FENCE); #if USE_DP_FLOAT == 0 // reverse the f3...f0 order to match the real order - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin; - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin; - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin); - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin; - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin; - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin); - local_hist[2 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin; - local_hist[2 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin; - local_hist[2 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin); - local_hist[3 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin; - local_hist[3 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin; - local_hist[3 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin); + 
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f3_grad_bin; + local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f3_hess_bin; + local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f2_grad_bin; + local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f2_hess_bin; + local_hist[2 * 2 * NUM_BINS + ltid * 2 + 0] = f1_grad_bin; + local_hist[2 * 2 * NUM_BINS + ltid * 2 + 1] = f1_hess_bin; + local_hist[3 * 2 * NUM_BINS + ltid * 2 + 0] = f0_grad_bin; + local_hist[3 * 2 * NUM_BINS + ltid * 2 + 1] = f0_hess_bin; barrier(CLK_LOCAL_MEM_FENCE); /* for (ushort i = ltid; i < 4 * 3 * NUM_BINS; i += lsize) { @@ -267,34 +246,28 @@ void within_kernel_reduction256x4(uchar4 feature_mask, if (feature_mask.s0) { output_buf[i] = local_hist[i]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; } - i += 1 * 3 * NUM_BINS; + i += 1 * 2 * NUM_BINS; if (feature_mask.s1) { output_buf[i] = local_hist[i]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; } - i += 1 * 3 * NUM_BINS; + i += 1 * 2 * NUM_BINS; if (feature_mask.s2) { output_buf[i] = local_hist[i]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; } - i += 1 * 3 * NUM_BINS; - if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) { + i += 1 * 2 * NUM_BINS; + if (feature_mask.s3 && i < 4 * 2 * NUM_BINS) { output_buf[i] = local_hist[i]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; } #else // when double precision is used, we need to write twice, because local memory size is not enough - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin; - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin; - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin); - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin; - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin; - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin); + local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f3_grad_bin; + local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f3_hess_bin; + local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f2_grad_bin; + local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f2_hess_bin; barrier(CLK_LOCAL_MEM_FENCE); /* for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) { @@ -305,21 +278,17 @@ void within_kernel_reduction256x4(uchar4 feature_mask, if (feature_mask.s0) { output_buf[i] = local_hist[i]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; } - i += 1 * 3 * NUM_BINS; + i += 1 * 2 * NUM_BINS; if (feature_mask.s1) { output_buf[i] = local_hist[i]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; } barrier(CLK_LOCAL_MEM_FENCE); - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin; - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin; - local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin); - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin; - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin; - local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin); + local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f1_grad_bin; + local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f1_hess_bin; + local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f0_grad_bin; + local_hist[1 * 2 * 
NUM_BINS + ltid * 2 + 1] = f0_hess_bin; barrier(CLK_LOCAL_MEM_FENCE); /* for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) { @@ -328,15 +297,13 @@ void within_kernel_reduction256x4(uchar4 feature_mask, */ i = ltid; if (feature_mask.s2) { - output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i]; - output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; + output_buf[i + 2 * 2 * NUM_BINS] = local_hist[i]; + output_buf[i + 2 * 2 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS]; } - i += 1 * 3 * NUM_BINS; + i += 1 * 2 * NUM_BINS; if (feature_mask.s3) { - output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i]; - output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS]; - output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; + output_buf[i + 2 * 2 * NUM_BINS] = local_hist[i]; + output_buf[i + 2 * 2 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS]; } #endif } @@ -401,7 +368,9 @@ __kernel void histogram256(__global const uchar4* feature_data_base, __local acc_type * gh_hist = (__local acc_type *)shared_array; // counter histogram // total size: 4 * 256 * sizeof(uint) = 4 KB + #if CONST_HESSIAN == 1 __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS); + #endif // thread 0, 1, 2, 3 compute histograms for gradients first // thread 4, 5, 6, 7 compute histograms for hessians first @@ -602,7 +571,7 @@ R""() s0_stat1 += stat1; s0_stat2 += stat2; } - + #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter // there are 4 counters for 4 features // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3 @@ -633,6 +602,7 @@ R""() addr = bin * 4 + offset; atom_inc(cnt_hist + addr); } + #endif stat1 = stat1_next; stat2 = stat2_next; feature4 = feature4_next; @@ -741,7 +711,7 @@ R""() uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS); // if there is only one workgroup processing this feature4, don't even need to write #if POWER_FEATURE_WORKGROUPS != 0 - __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS; + __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 2 * NUM_BINS; // write gradients and hessians __global acc_type * restrict ptr_f = output; for (ushort j = 0; j < 4; ++j) { @@ -751,17 +721,7 @@ R""() acc_type value = gh_hist[i * 4 + j]; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; } - ptr_f += 3 * NUM_BINS; - } - // write counts - __global acc_int_type * restrict ptr_i = (__global acc_int_type * restrict)(output + 2 * NUM_BINS); - for (ushort j = 0; j < 4; ++j) { - for (ushort i = ltid; i < NUM_BINS; i += lsize) { - // FIXME: 2-way bank conflict - uint value = cnt_hist[i * 4 + j]; - ptr_i[i] = value; - } - ptr_i += 3 * NUM_BINS; + ptr_f += 2 * NUM_BINS; } barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE); @@ -788,7 +748,7 @@ R""() // This is done by using a global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL.
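The #if CONST_HESSIAN == 1 guards above keep the counter histogram only in the constant-hessian build, where per-bin counts remain the cheapest way to recover the hessian sums: with a uniform hessian h_const, sum_hessians[bin] == cnt[bin] * h_const. A sketch of that reconstruction under the two-entries-per-bin layout (the function and its parameters are illustrative, not the library's API):

#include <cstdint>

typedef double hist_t;  // two entries per bin: [grad, hess]

// Rebuild the hessian slots of an interleaved histogram from bin
// counts when every data point carries the same hessian h_const.
void FillConstantHessians(hist_t* hist, const uint32_t* cnt,
                          int num_bins, double h_const) {
  for (int bin = 0; bin < num_bins; ++bin) {
    hist[2 * bin + 1] = static_cast<double>(cnt[bin]) * h_const;
  }
}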
- __local uint * counter_val = cnt_hist; + __local uint * counter_val = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS); // backup the old value uint old_val = *counter_val; if (ltid == 0) { @@ -814,11 +774,11 @@ R""() // locate our feature4's block in output memory uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS); __global acc_type const * restrict feature4_subhists = - (__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS; + (__global acc_type *)output_buf + output_offset * 4 * 2 * NUM_BINS; // skip reading the data already in local memory uint skip_id = group_id ^ output_offset; // locate output histogram location for this feature4 - __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS; + __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 2 * NUM_BINS; within_kernel_reduction256x4(feature_mask, feature4_subhists, skip_id, old_val, 1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array); // if (ltid == 0) diff --git a/src/treelearner/ocl/histogram64.cl b/src/treelearner/ocl/histogram64.cl index 5b265abda703..d3c4d48c729f 100644 --- a/src/treelearner/ocl/histogram64.cl +++ b/src/treelearner/ocl/histogram64.cl @@ -157,7 +157,7 @@ R""() void within_kernel_reduction64x4(uchar4 feature_mask, __global const acc_type* restrict feature4_sub_hist, const uint skip_id, - acc_type g_val, acc_type h_val, uint cnt_val, + acc_type g_val, acc_type h_val, const ushort num_sub_hist, __global acc_type* restrict output_buf, __local acc_type * restrict local_hist) { @@ -173,38 +173,35 @@ void within_kernel_reduction64x4(uchar4 feature_mask, for (i = 0; i < skip_id; ++i) { g_val += *p; p += NUM_BINS * 4; // 256 threads working on 4 features' 64 bins h_val += *p; p += NUM_BINS * 4; - cnt_val += as_acc_int_type(*p); p += NUM_BINS * 4; } // skip the counters we already have - p += 3 * 4 * NUM_BINS; + p += 2 * 4 * NUM_BINS; for (i = i + 1; i < num_sub_hist; ++i) { g_val += *p; p += NUM_BINS * 4; h_val += *p; p += NUM_BINS * 4; - cnt_val += as_acc_int_type(*p); p += NUM_BINS * 4; } #endif // printf("thread %d: g_val=%f, h_val=%f cnt=%d", ltid, g_val, h_val, cnt_val); // now overwrite the local_hist for final reduction and output // reverse the f3...f0 order to match the real order feature_id = 3 - feature_id; - local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 0] = g_val; - local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 1] = h_val; - local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val); + local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + 0] = g_val; + local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + 1] = h_val; barrier(CLK_LOCAL_MEM_FENCE); i = ltid; - if (feature_mask.s0 && i < 1 * 3 * NUM_BINS) { + if (feature_mask.s0 && i < 1 * 2 * NUM_BINS) { output_buf[i] = local_hist[i]; } - i += 1 * 3 * NUM_BINS; - if (feature_mask.s1 && i < 2 * 3 * NUM_BINS) { + i += 1 * 2 * NUM_BINS; + if (feature_mask.s1 && i < 2 * 2 * NUM_BINS) { output_buf[i] = local_hist[i]; } - i += 1 * 3 * NUM_BINS; - if (feature_mask.s2 && i < 3 * 3 * NUM_BINS) { + i += 1 * 2 * NUM_BINS; + if (feature_mask.s2 && i < 3 * 2 * NUM_BINS) { output_buf[i] = local_hist[i]; } - i += 1 * 3 * NUM_BINS; - if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) { + i += 1 * 2 * NUM_BINS; + if (feature_mask.s3 && i < 4 * 2 * NUM_BINS) { output_buf[i] = local_hist[i]; } } @@ -306,7 +303,9 @@ __kernel void histogram64(__global const uchar4* feature_data_base, bk3_c_f0_bin64 bk3_c_f1_bin64 bk3_c_f2_bin64 bk3_c_f3_bin64
----------------------------------------------- */ + #if CONST_HESSIAN == 1 __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS); + #endif // thread 0, 1, 2, 3 compute histograms for gradients first // thread 4, 5, 6, 7 compute histograms for hessians first @@ -509,7 +508,7 @@ R""() s0_stat1 += stat1; s0_stat2 += stat2; } - + #if CONST_HESSIAN == 1 // STAGE 3: accumulate counter // there are 4 counters for 4 features // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3 @@ -540,6 +539,7 @@ R""() addr = bin * CNT_BIN_MULT + bank * 4 + offset; atom_inc(cnt_hist + addr); } + #endif stat1 = stat1_next; stat2 = stat2_next; feature4 = feature4_next; @@ -639,7 +639,9 @@ R""() ushort bank_id = (i + offset) & BANK_MASK; g_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id]; h_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id + 4]; + #if CONST_HESSIAN == 1 cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * 4 + feature_id]; + #endif } // now thread 0 - 3 holds feature 0, 1, 2, 3's gradient, hessian and count bin 0 // now thread 4 - 7 holds feature 0, 1, 2, 3's gradient, hessian and count bin 1 @@ -670,14 +672,12 @@ R""() // if there is only one workgroup processing this feature4, don't even need to write uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS); #if POWER_FEATURE_WORKGROUPS != 0 - __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS; + __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 2 * NUM_BINS; // if g_val and h_val are double, they are converted to float here // write gradients for 4 features output[0 * 4 * NUM_BINS + ltid] = g_val; // write hessians for 4 features output[1 * 4 * NUM_BINS + ltid] = h_val; - // write counts for 4 features - output[2 * 4 * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val); barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE); // To avoid the cost of an extra reduction kernel, we have to deal with some @@ -703,7 +703,7 @@ R""() // This is done by using a global atomic counter. // On AMD GPUs ideally this should be done in GDS, // but currently there is no easy way to access it via OpenCL.
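The comment above and the counter_val lines that follow describe how all three kernels avoid a separate reduction pass: each workgroup writes its partial histogram to global memory and atomically increments a per-feature counter, and whichever workgroup observes the final value knows every partial is published and performs the reduction itself. A CPU analogue of the pattern in C++, assuming the partials are already visible once the increment is observed (names are illustrative; out must be pre-sized and zeroed):

#include <atomic>
#include <cstddef>
#include <vector>

// Called by each worker after its partial histogram is written.
// The last worker to arrive sums all partials into *out.
bool ReduceIfLast(std::atomic<unsigned>* sync_counter, unsigned num_workers,
                  const std::vector<std::vector<double>>& partials,
                  std::vector<double>* out) {
  if (sync_counter->fetch_add(1, std::memory_order_acq_rel) + 1 < num_workers) {
    return false;  // another worker will run the reduction
  }
  for (const auto& part : partials) {
    for (std::size_t i = 0; i < part.size(); ++i) {
      (*out)[i] += part[i];
    }
  }
  return true;
}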
- __local uint * counter_val = cnt_hist; + __local uint * counter_val = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS); if (ltid == 0) { // all workgroups processing the same feature add this counter *counter_val = atom_inc(sync_counters + feature4_id); @@ -727,12 +727,12 @@ R""() // locate our feature4's block in output memory uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS); __global acc_type const * restrict feature4_subhists = - (__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS; + (__global acc_type *)output_buf + output_offset * 4 * 2 * NUM_BINS; // skip reading the data already in local memory uint skip_id = group_id ^ output_offset; // locate output histogram location for this feature4 - __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS; - within_kernel_reduction64x4(feature_mask, feature4_subhists, skip_id, g_val, h_val, cnt_val, + __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 2 * NUM_BINS; + within_kernel_reduction64x4(feature_mask, feature4_subhists, skip_id, g_val, h_val, 1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array); } } diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index c6754b517397..dde47d4989da 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -181,8 +181,8 @@ class VotingParallelTreeLearner: public TREELEARNER_T { /*! \brief Store global histogram for larger leaf */ std::unique_ptr larger_leaf_histogram_array_global_; - std::vector<HistogramBinEntry> smaller_leaf_histogram_data_; - std::vector<HistogramBinEntry> larger_leaf_histogram_data_; + std::vector<hist_t> smaller_leaf_histogram_data_; + std::vector<hist_t> larger_leaf_histogram_data_; std::vector feature_metas_; }; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 252ce5fdca28..84ff5f2ee5f3 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -18,14 +18,6 @@ namespace LightGBM { -#ifdef TIMETAG -std::chrono::duration init_train_time; -std::chrono::duration init_split_time; -std::chrono::duration hist_time; -std::chrono::duration find_split_time; -std::chrono::duration split_time; -std::chrono::duration ordered_bin_time; -#endif // TIMETAG SerialTreeLearner::SerialTreeLearner(const Config* config) :config_(config) { @@ -38,14 +30,7 @@ SerialTreeLearner::SerialTreeLearner(const Config* config) } SerialTreeLearner::~SerialTreeLearner() { - #ifdef TIMETAG - Log::Info("SerialTreeLearner::init_train costs %f", init_train_time * 1e-3); - Log::Info("SerialTreeLearner::init_split costs %f", init_split_time * 1e-3); - Log::Info("SerialTreeLearner::hist_build costs %f", hist_time * 1e-3); - Log::Info("SerialTreeLearner::find_split costs %f", find_split_time * 1e-3); - Log::Info("SerialTreeLearner::split costs %f", split_time * 1e-3); - Log::Info("SerialTreeLearner::ordered_bin costs %f", ordered_bin_time * 1e-3); - #endif + } void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { @@ -60,7 +45,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian } else { size_t total_histogram_size = 0; for (int i = 0; i < train_data_->num_features(); ++i) { - total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i); + total_histogram_size += KHistEntrySize * train_data_->FeatureNumBin(i); } max_cache_size = static_cast(config_->histogram_pool_size * 1024 * 1024 /
total_histogram_size); } @@ -68,19 +53,10 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian max_cache_size = std::max(2, max_cache_size); max_cache_size = std::min(max_cache_size, config_->num_leaves); - histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves); + // push split information for all leaves best_split_per_leaf_.resize(config_->num_leaves); - // get ordered bin - train_data_->CreateOrderedBins(&ordered_bins_); - // check existing for ordered bin - for (int i = 0; i < static_cast(ordered_bins_.size()); ++i) { - if (ordered_bins_[i] != nullptr) { - has_ordered_bin_ = true; - break; - } - } // initialize splits for leaf smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_data())); larger_leaf_splits_.reset(new LeafSplits(train_data_->num_data())); @@ -92,17 +68,10 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian // initialize ordered gradients and hessians ordered_gradients_.resize(num_data_); ordered_hessians_.resize(num_data_); - // if has ordered bin, need to allocate a buffer to fast split - if (has_ordered_bin_) { - is_data_in_leaf_.resize(num_data_); - std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), static_cast(0)); - ordered_bin_indices_.clear(); - for (int i = 0; i < static_cast(ordered_bins_.size()); i++) { - if (ordered_bins_[i] != nullptr) { - ordered_bin_indices_.push_back(i); - } - } - } + + GetMultiValBin(train_data_, true); + + histogram_pool_.DynamicChangeSize(train_data_, is_hist_colwise_, config_, max_cache_size, config_->num_leaves); Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_); if (CostEfficientGradientBoosting::IsEnable(config_)) { cegb_.reset(new CostEfficientGradientBoosting(this)); @@ -110,14 +79,23 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian } } +void SerialTreeLearner::GetMultiValBin(const Dataset* dataset, bool is_first_time) { + if (is_first_time) { + auto used_feature = GetUsedFeatures(true); + multi_val_bin_.reset(dataset->TestMultiThreadingMethod(ordered_gradients_.data(), ordered_hessians_.data(), used_feature, + is_constant_hessian_, config_->force_col_wise, config_->force_row_wise, &is_hist_colwise_)); + } else { + // cannot change is_hist_col_wise during training + multi_val_bin_.reset(dataset->TestMultiThreadingMethod(ordered_gradients_.data(), ordered_hessians_.data(), is_feature_used_, + is_constant_hessian_, is_hist_colwise_, !is_hist_colwise_, &is_hist_colwise_)); + } +} + void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { train_data_ = train_data; num_data_ = train_data_->num_data(); CHECK(num_features_ == train_data_->num_features()); - // get ordered bin - train_data_->CreateOrderedBins(&ordered_bins_); - // initialize splits for leaf smaller_leaf_splits_->ResetNumData(num_data_); larger_leaf_splits_->ResetNumData(num_data_); @@ -125,14 +103,12 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { // initialize data partition data_partition_->ResetNumData(num_data_); + GetMultiValBin(train_data_, false); + // initialize ordered gradients and hessians ordered_gradients_.resize(num_data_); ordered_hessians_.resize(num_data_); - // if has ordered bin, need to allocate a buffer to fast split - if (has_ordered_bin_) { - is_data_in_leaf_.resize(num_data_); - std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), static_cast(0)); - } + if (cegb_ != nullptr) { cegb_->Init(); } 
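The histogram_pool_size arithmetic in the surrounding hunks is simple enough to state on its own: the configured budget in megabytes is divided by the footprint of one complete set of per-feature histograms, then clamped to [2, num_leaves]. A standalone C++ sketch, assuming KHistEntrySize is the two-doubles-per-bin footprint (2 * sizeof(double) = 16 bytes) and at least one feature is present; the function and parameter names are illustrative:

#include <algorithm>
#include <cstddef>
#include <vector>

int MaxCacheSize(double histogram_pool_size_mb,
                 const std::vector<int>& bins_per_feature, int num_leaves) {
  const std::size_t kHistEntrySize = 2 * sizeof(double);  // assumed footprint
  std::size_t total_histogram_size = 0;
  for (int num_bin : bins_per_feature) {
    total_histogram_size += kHistEntrySize * num_bin;
  }
  int max_cache_size = static_cast<int>(
      histogram_pool_size_mb * 1024 * 1024 / total_histogram_size);
  max_cache_size = std::max(2, max_cache_size);  // always keep two leaves
  return std::min(max_cache_size, num_leaves);   // one slot per leaf at most
}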
@@ -148,14 +124,14 @@ void SerialTreeLearner::ResetConfig(const Config* config) { } else { size_t total_histogram_size = 0; for (int i = 0; i < train_data_->num_features(); ++i) { - total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i); + total_histogram_size += KHistEntrySize * train_data_->FeatureNumBin(i); } max_cache_size = static_cast(config_->histogram_pool_size * 1024 * 1024 / total_histogram_size); } // at least need 2 leaves max_cache_size = std::max(2, max_cache_size); max_cache_size = std::min(max_cache_size, config_->num_leaves); - histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves); + histogram_pool_.DynamicChangeSize(train_data_, is_hist_colwise_, config_, max_cache_size, config_->num_leaves); // push split information for all leaves best_split_per_leaf_.resize(config_->num_leaves); @@ -171,19 +147,14 @@ void SerialTreeLearner::ResetConfig(const Config* config) { } Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian, const Json& forced_split_json) { + Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer); gradients_ = gradients; hessians_ = hessians; is_constant_hessian_ = is_constant_hessian; - #ifdef TIMETAG - auto start_time = std::chrono::steady_clock::now(); - #endif + // some initial works before training BeforeTrain(); - #ifdef TIMETAG - init_train_time += std::chrono::steady_clock::now() - start_time; - #endif - auto tree = std::unique_ptr(new Tree(config_->num_leaves)); // root leaf int left_leaf = 0; @@ -199,14 +170,8 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians } for (int split = init_splits; split < config_->num_leaves - 1; ++split) { - #ifdef TIMETAG - start_time = std::chrono::steady_clock::now(); - #endif // some initial works before finding best split if (!aborted_last_force_split && BeforeFindBestSplit(tree.get(), left_leaf, right_leaf)) { - #ifdef TIMETAG - init_split_time += std::chrono::steady_clock::now() - start_time; - #endif // find best threshold for every feature FindBestSplits(); } else if (aborted_last_force_split) { @@ -222,14 +187,8 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians Log::Warning("No further splits with positive gain, best gain: %f", best_leaf_SplitInfo.gain); break; } - #ifdef TIMETAG - start_time = std::chrono::steady_clock::now(); - #endif // split tree with best leaf Split(tree.get(), best_leaf, &left_leaf, &right_leaf); - #ifdef TIMETAG - split_time += std::chrono::steady_clock::now() - start_time; - #endif cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf)); } Log::Debug("Trained a tree with leaves = %d and max_depth = %d", tree->num_leaves(), cur_depth); @@ -319,6 +278,7 @@ std::vector SerialTreeLearner::GetUsedFeatures(bool is_tree_level) { } void SerialTreeLearner::BeforeTrain() { + Common::FunctionTimer fun_timer("SerialTreeLearner::BeforeTrain", global_timer); // reset histogram pool histogram_pool_.ResetMap(); @@ -350,54 +310,10 @@ void SerialTreeLearner::BeforeTrain() { } larger_leaf_splits_->Init(); - - // if has ordered bin, need to initialize the ordered bin - if (has_ordered_bin_) { - #ifdef TIMETAG - auto start_time = std::chrono::steady_clock::now(); - #endif - if (data_partition_->leaf_count(0) == num_data_) { - // use all data, pass nullptr - OMP_INIT_EX(); - #pragma omp parallel for schedule(static) - for (int i = 0; i < static_cast(ordered_bin_indices_.size()); ++i) { - 
OMP_LOOP_EX_BEGIN(); - ordered_bins_[ordered_bin_indices_[i]]->Init(nullptr, config_->num_leaves); - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - } else { - // bagging, only use part of data - - // mark used data - const data_size_t* indices = data_partition_->indices(); - data_size_t begin = data_partition_->leaf_begin(0); - data_size_t end = begin + data_partition_->leaf_count(0); - #pragma omp parallel for schedule(static, 512) if (end - begin >= 1024) - for (data_size_t i = begin; i < end; ++i) { - is_data_in_leaf_[indices[i]] = 1; - } - OMP_INIT_EX(); - // initialize ordered bin - #pragma omp parallel for schedule(static) - for (int i = 0; i < static_cast(ordered_bin_indices_.size()); ++i) { - OMP_LOOP_EX_BEGIN(); - ordered_bins_[ordered_bin_indices_[i]]->Init(is_data_in_leaf_.data(), config_->num_leaves); - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - #pragma omp parallel for schedule(static, 512) if (end - begin >= 1024) - for (data_size_t i = begin; i < end; ++i) { - is_data_in_leaf_[indices[i]] = 0; - } - } - #ifdef TIMETAG - ordered_bin_time += std::chrono::steady_clock::now() - start_time; - #endif - } } bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { + Common::FunctionTimer fun_timer("SerialTreeLearner::BeforeFindBestSplit", global_timer); // check depth of current leaf if (config_->max_depth > 0) { // only need to check left leaf, since right leaf is in same level of left leaf @@ -435,44 +351,6 @@ bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; } histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_); } - // split for the ordered bin - if (has_ordered_bin_ && right_leaf >= 0) { - #ifdef TIMETAG - auto start_time = std::chrono::steady_clock::now(); - #endif - // mark data that at left-leaf - const data_size_t* indices = data_partition_->indices(); - const auto left_cnt = data_partition_->leaf_count(left_leaf); - const auto right_cnt = data_partition_->leaf_count(right_leaf); - char mark = 1; - data_size_t begin = data_partition_->leaf_begin(left_leaf); - data_size_t end = begin + left_cnt; - if (left_cnt > right_cnt) { - begin = data_partition_->leaf_begin(right_leaf); - end = begin + right_cnt; - mark = 0; - } - #pragma omp parallel for schedule(static, 512) if (end - begin >= 1024) - for (data_size_t i = begin; i < end; ++i) { - is_data_in_leaf_[indices[i]] = 1; - } - OMP_INIT_EX(); - // split the ordered bin - #pragma omp parallel for schedule(static) - for (int i = 0; i < static_cast(ordered_bin_indices_.size()); ++i) { - OMP_LOOP_EX_BEGIN(); - ordered_bins_[ordered_bin_indices_[i]]->Split(left_leaf, right_leaf, is_data_in_leaf_.data(), mark); - OMP_LOOP_EX_END(); - } - OMP_THROW_EX(); - #pragma omp parallel for schedule(static, 512) if (end - begin >= 1024) - for (data_size_t i = begin; i < end; ++i) { - is_data_in_leaf_[indices[i]] = 0; - } - #ifdef TIMETAG - ordered_bin_time += std::chrono::steady_clock::now() - start_time; - #endif - } return true; } @@ -494,37 +372,30 @@ void SerialTreeLearner::FindBestSplits() { } void SerialTreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { - #ifdef TIMETAG - auto start_time = std::chrono::steady_clock::now(); - #endif + Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms", global_timer); // construct smaller leaf - HistogramBinEntry* ptr_smaller_leaf_hist_data = 
smaller_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset; train_data_->ConstructHistograms(is_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), - smaller_leaf_splits_->LeafIndex(), - &ordered_bins_, gradients_, hessians_, + gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + multi_val_bin_.get(), is_hist_colwise_, ptr_smaller_leaf_hist_data); if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { // construct larger leaf - HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; + hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - KHistOffset; train_data_->ConstructHistograms(is_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), - larger_leaf_splits_->LeafIndex(), - &ordered_bins_, gradients_, hessians_, + gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + multi_val_bin_.get(), is_hist_colwise_, ptr_larger_leaf_hist_data); } - #ifdef TIMETAG - hist_time += std::chrono::steady_clock::now() - start_time; - #endif } void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& is_feature_used, bool use_subtract) { - #ifdef TIMETAG - auto start_time = std::chrono::steady_clock::now(); - #endif + Common::FunctionTimer fun_timer("SerialTreeLearner::FindBestSplitsFromHistograms", global_timer); std::vector smaller_best(num_threads_); std::vector larger_best(num_threads_); std::vector smaller_node_used_features(num_features_, 1); @@ -534,7 +405,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& larger_node_used_features = GetUsedFeatures(false); } OMP_INIT_EX(); - // find splits + // find splits #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); @@ -543,7 +414,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& SplitInfo smaller_split; train_data_->FixHistogram(feature_index, smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), - smaller_leaf_splits_->num_data_in_leaf(), smaller_leaf_histogram_array_[feature_index].RawData()); int real_fidx = train_data_->RealFeatureIndex(feature_index); smaller_leaf_histogram_array_[feature_index].FindBestThreshold( @@ -567,7 +437,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]); } else { train_data_->FixHistogram(feature_index, larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(), - larger_leaf_splits_->num_data_in_leaf(), larger_leaf_histogram_array_[feature_index].RawData()); } SplitInfo larger_split; @@ -589,7 +458,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& OMP_LOOP_EX_END(); } OMP_THROW_EX(); - auto smaller_best_idx = ArrayArgs::ArgMax(smaller_best); int leaf = smaller_leaf_splits_->LeafIndex(); best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx]; @@ -599,9 +467,6 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& auto larger_best_idx = ArrayArgs::ArgMax(larger_best); best_split_per_leaf_[leaf] = larger_best[larger_best_idx]; } - #ifdef TIMETAG - find_split_time += std::chrono::steady_clock::now() - start_time; - #endif } int32_t 
SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json, int* left_leaf, @@ -769,69 +634,80 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, const Json& forced_split_json } void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) { - const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; + Common::FunctionTimer fun_timer("SerialTreeLearner::Split", global_timer); + SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature); if (cegb_ != nullptr) { cegb_->UpdateLeafBestSplits(tree, best_leaf, &best_split_info, &best_split_per_leaf_); } - // left = parent *left_leaf = best_leaf; + auto next_leaf_id = tree->NextLeafId(); + bool is_numerical_split = train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin; if (is_numerical_split) { auto threshold_double = train_data_->RealThreshold(inner_feature_index, best_split_info.threshold); + data_partition_->Split(best_leaf, train_data_, inner_feature_index, + &best_split_info.threshold, 1, best_split_info.default_left, next_leaf_id); + best_split_info.left_count = data_partition_->leaf_count(*left_leaf); + best_split_info.right_count = data_partition_->leaf_count(next_leaf_id); // split tree, will return right leaf *right_leaf = tree->Split(best_leaf, - inner_feature_index, - best_split_info.feature, - best_split_info.threshold, - threshold_double, - static_cast(best_split_info.left_output), - static_cast(best_split_info.right_output), - static_cast(best_split_info.left_count), - static_cast(best_split_info.right_count), - static_cast(best_split_info.left_sum_hessian), - static_cast(best_split_info.right_sum_hessian), - static_cast(best_split_info.gain), - train_data_->FeatureBinMapper(inner_feature_index)->missing_type(), - best_split_info.default_left); - data_partition_->Split(best_leaf, train_data_, inner_feature_index, - &best_split_info.threshold, 1, best_split_info.default_left, *right_leaf); + inner_feature_index, + best_split_info.feature, + best_split_info.threshold, + threshold_double, + static_cast(best_split_info.left_output), + static_cast(best_split_info.right_output), + static_cast(best_split_info.left_count), + static_cast(best_split_info.right_count), + static_cast(best_split_info.left_sum_hessian), + static_cast(best_split_info.right_sum_hessian), + static_cast(best_split_info.gain), + train_data_->FeatureBinMapper(inner_feature_index)->missing_type(), + best_split_info.default_left); + } else { + std::vector cat_bitset_inner = Common::ConstructBitset(best_split_info.cat_threshold.data(), best_split_info.num_cat_threshold); std::vector threshold_int(best_split_info.num_cat_threshold); for (int i = 0; i < best_split_info.num_cat_threshold; ++i) { threshold_int[i] = static_cast(train_data_->RealThreshold(inner_feature_index, best_split_info.cat_threshold[i])); } std::vector cat_bitset = Common::ConstructBitset(threshold_int.data(), best_split_info.num_cat_threshold); - *right_leaf = tree->SplitCategorical(best_leaf, - inner_feature_index, - best_split_info.feature, - cat_bitset_inner.data(), - static_cast(cat_bitset_inner.size()), - cat_bitset.data(), - static_cast(cat_bitset.size()), - static_cast(best_split_info.left_output), - static_cast(best_split_info.right_output), - static_cast(best_split_info.left_count), - static_cast(best_split_info.right_count), - static_cast(best_split_info.left_sum_hessian), - 
static_cast(best_split_info.right_sum_hessian), - static_cast(best_split_info.gain), - train_data_->FeatureBinMapper(inner_feature_index)->missing_type()); + data_partition_->Split(best_leaf, train_data_, inner_feature_index, - cat_bitset_inner.data(), static_cast(cat_bitset_inner.size()), best_split_info.default_left, *right_leaf); - } + cat_bitset_inner.data(), static_cast(cat_bitset_inner.size()), best_split_info.default_left, next_leaf_id); + + best_split_info.left_count = data_partition_->leaf_count(*left_leaf); + best_split_info.right_count = data_partition_->leaf_count(next_leaf_id); + + *right_leaf = tree->SplitCategorical(best_leaf, + inner_feature_index, + best_split_info.feature, + cat_bitset_inner.data(), + static_cast(cat_bitset_inner.size()), + cat_bitset.data(), + static_cast(cat_bitset.size()), + static_cast(best_split_info.left_output), + static_cast(best_split_info.right_output), + static_cast(best_split_info.left_count), + static_cast(best_split_info.right_count), + static_cast(best_split_info.left_sum_hessian), + static_cast(best_split_info.right_sum_hessian), + static_cast(best_split_info.gain), + train_data_->FeatureBinMapper(inner_feature_index)->missing_type()); + } + CHECK(*right_leaf == next_leaf_id); - #ifdef DEBUG - CHECK(best_split_info.left_count == data_partition_->leaf_count(best_leaf)); - #endif auto p_left = smaller_leaf_splits_.get(); auto p_right = larger_leaf_splits_.get(); // init the leaves that used on next iteration if (best_split_info.left_count < best_split_info.right_count) { + CHECK(best_split_info.left_count > 0); smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); } else { + CHECK(best_split_info.right_count > 0); smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); p_right = smaller_leaf_splits_.get(); diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 31743933a780..0fedefc5a15d 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -79,7 +79,12 @@ class SerialTreeLearner: public TreeLearner { void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override; + bool IsHistColWise() const override { return is_hist_colwise_; } + protected: + + void GetMultiValBin(const Dataset* dataset, bool is_first_time); + virtual std::vector GetUsedFeatures(bool is_tree_level); /*! * \brief Some initial works before training @@ -161,17 +166,13 @@ class SerialTreeLearner: public TreeLearner { std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_hessians_; #else /*! \brief gradients of current iteration, ordered for cache optimized */ - std::vector<score_t> ordered_gradients_; + std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized */ - std::vector<score_t> ordered_hessians_; + std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_hessians_; #endif - /*! \brief Store ordered bin */ - std::vector<std::unique_ptr<OrderedBin>> ordered_bins_; - /*! \brief True if has ordered bin */ - bool has_ordered_bin_ = false; /*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */ - std::vector<char> is_data_in_leaf_; + std::vector<char, Common::AlignmentAllocator<char, kAlignedSize>> is_data_in_leaf_; /*! \brief used to cache historical histogram to speed up*/ HistogramPool histogram_pool_; /*! \brief config of tree learner*/ @@ -179,6 +180,8 @@ class SerialTreeLearner: public TreeLearner { int num_threads_; std::vector ordered_bin_indices_; bool is_constant_hessian_; + std::unique_ptr multi_val_bin_; + bool is_hist_colwise_; std::unique_ptr cegb_; }; diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index e13210b3c600..3fc644e540a2 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -36,7 +36,7 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data, b } } // calculate buffer size - size_t buffer_size = 2 * top_k_ * std::max(max_bin * sizeof(HistogramBinEntry), sizeof(LightSplitInfo) * num_machines_); + size_t buffer_size = 2 * top_k_ * std::max(max_bin * KHistEntrySize, sizeof(LightSplitInfo) * num_machines_); // left and right on same time, so need double size input_buffer_.resize(buffer_size); output_buffer_.resize(buffer_size); @@ -290,7 +290,6 @@ void VotingParallelTreeLearner::FindBestSplits() { const int real_feature_index = this->train_data_->RealFeatureIndex(feature_index); this->train_data_->FixHistogram(feature_index, this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), - this->smaller_leaf_splits_->num_data_in_leaf(), this->smaller_leaf_histogram_array_[feature_index].RawData()); this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold( @@ -308,7 +307,6 @@ this->larger_leaf_histogram_array_[feature_index].Subtract(this->smaller_leaf_histogram_array_[feature_index]); } else { this->train_data_->FixHistogram(feature_index, this->larger_leaf_splits_->sum_gradients(), this->larger_leaf_splits_->sum_hessians(), - this->larger_leaf_splits_->num_data_in_leaf(), this->larger_leaf_histogram_array_[feature_index].RawData()); } // find best threshold for larger child @@ -367,8 +365,8 @@ void VotingParallelTreeLearner::FindBestSplits() { CopyLocalHistogram(smaller_top_features, larger_top_features); // Reduce scatter for histogram - Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(), block_len_.data(), - output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramBinEntry::SumReducer); + Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(), block_len_.data(), + output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramSumReducer); this->FindBestSplitsFromHistograms(is_feature_used, false); } @@ -399,7 +397,6 @@ void VotingParallelTreeLearner::FindBestSplitsFromHistograms(cons this->train_data_->FixHistogram(feature_index, smaller_leaf_splits_global_->sum_gradients(), smaller_leaf_splits_global_->sum_hessians(), - GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()), smaller_leaf_histogram_array_global_[feature_index].RawData()); // find best threshold @@ -423,7 +420,6 @@ this->train_data_->FixHistogram(feature_index, larger_leaf_splits_global_->sum_gradients(), larger_leaf_splits_global_->sum_hessians(), - GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()),
larger_leaf_histogram_array_global_[feature_index].RawData()); // find best threshold diff --git a/tests/python_package_test/test_consistency.py b/tests/python_package_test/test_consistency.py index cc284aa9076a..63a5834cf619 100644 --- a/tests/python_package_test/test_consistency.py +++ b/tests/python_package_test/test_consistency.py @@ -38,7 +38,9 @@ def load_cpp_result(self, result_file='LightGBM_predict_result.txt'): return np.loadtxt(os.path.join(self.directory, result_file)) def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred): - gbm = lgb.train(self.params, lgb_train) + params = dict(self.params) + params['force_row_wise'] = True + gbm = lgb.train(params, lgb_train) y_pred = gbm.predict(X_test) cpp_pred = gbm.predict(X_test_fn) np.testing.assert_allclose(y_pred, cpp_pred) @@ -105,7 +107,9 @@ def test_lambdarank(self): X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True) group_train = fd.load_field('.train.query') lgb_train = lgb.Dataset(X_train, y_train, group=group_train) - gbm = lgb.LGBMRanker(**fd.params) + params = dict(fd.params) + params['force_col_wise'] = True + gbm = lgb.LGBMRanker(**params) gbm.fit(X_train, y_train, group=group_train) sk_pred = gbm.predict(X_test) fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 5d1ce43b03b5..72d263238ce7 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -66,7 +66,7 @@ def test_binary(self): verbose_eval=False, evals_result=evals_result) ret = log_loss(y_test, gbm.predict(X_test)) - self.assertLess(ret, 0.11) + self.assertLess(ret, 0.14) self.assertEqual(len(evals_result['valid_0']['binary_logloss']), 50) self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5) @@ -328,7 +328,7 @@ def test_multiclass(self): verbose_eval=False, evals_result=evals_result) ret = multi_logloss(y_test, gbm.predict(X_test)) - self.assertLess(ret, 0.15) + self.assertLess(ret, 0.16) self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5) def test_multiclass_rf(self): @@ -518,7 +518,7 @@ def test_early_stopping(self): valid_names=valid_set_name, verbose_eval=False, early_stopping_rounds=5) - self.assertLessEqual(gbm.best_iteration, 31) + self.assertLessEqual(gbm.best_iteration, 39) self.assertIn(valid_set_name, gbm.best_score) self.assertIn('binary_logloss', gbm.best_score[valid_set_name]) @@ -1740,7 +1740,7 @@ def test_node_level_subcol(self): verbose_eval=False, evals_result=evals_result) ret = log_loss(y_test, gbm.predict(X_test)) - self.assertLess(ret, 0.13) + self.assertLess(ret, 0.14) self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5) params['feature_fraction'] = 0.5 gbm2 = lgb.train(params, lgb_train, num_boost_round=25) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 8af3edf8f120..d9b9c872a2b5 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -77,7 +77,7 @@ def test_binary(self): gbm = lgb.LGBMClassifier(n_estimators=50, silent=True) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False) ret = log_loss(y_test, gbm.predict_proba(X_test)) - self.assertLess(ret, 0.11) + self.assertLess(ret, 0.12) self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1], places=5) def 
test_regression(self): @@ -97,7 +97,7 @@ def test_multiclass(self): ret = multi_error(y_test, gbm.predict(X_test)) self.assertLess(ret, 0.05) ret = multi_logloss(y_test, gbm.predict_proba(X_test)) - self.assertLess(ret, 0.15) + self.assertLess(ret, 0.16) self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1], places=5) def test_lambdarank(self): @@ -114,8 +114,8 @@ def test_lambdarank(self): eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=10, verbose=False, callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))]) self.assertLessEqual(gbm.best_iteration_, 24) - self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6333) - self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6048) + self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.5769) + self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.5920) def test_xendcg(self): dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -129,7 +129,7 @@ def test_xendcg(self): eval_metric='ndcg', callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))]) self.assertLessEqual(gbm.best_iteration_, 24) - self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6579) + self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6559) self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6421) def test_regression_with_custom_objective(self): diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index c923fad9be6c..f899f97e4dc4 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -30,24 +30,24 @@ SAK SAK LightGBM - 8.1 + 10.0 - v140 + v142 - v140 + v142 - v140 + v142 DynamicLibrary - v140 + v142 - v140 + v142 @@ -95,6 +95,8 @@ false Disabled MultiThreadedDebugDLL + true + AdvancedVectorExtensions2 @@ -116,6 +118,8 @@ false Disabled MultiThreadedDebugDLL + true + AdvancedVectorExtensions2 @@ -137,6 +141,8 @@ true true MultiThreadedDLL + true + AdvancedVectorExtensions2 @@ -162,6 +168,8 @@ MultiThreadedDLL true true + true + AdvancedVectorExtensions2 @@ -181,6 +189,8 @@ MultiThreadedDLL true true + true + AdvancedVectorExtensions2 @@ -224,7 +234,8 @@ - + + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index c5f5c94a93d0..4f706fb17c42 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -57,9 +57,6 @@ src\io - - src\io - src\io @@ -213,6 +210,12 @@ src\treelearner + + src\io + + + src\io +