From 0bb4a825af9b65184070ce83e529d1c92d7b3ef3 Mon Sep 17 00:00:00 2001 From: Huan Zhang Date: Sun, 9 Apr 2017 06:53:14 -0700 Subject: [PATCH] Initial GPU acceleration support for LightGBM (#368) * add dummy gpu solver code * initial GPU code * fix crash bug * first working version * use asynchronous copy * use a better kernel for root * parallel read histogram * sparse features now work, but no acceleration; compute on CPU * compute sparse feature on CPU simultaneously * fix big bug; add gpu selection; add kernel selection * better debugging * clean up * add feature scatter * Add sparse_threshold control * fix a bug in feature scatter * clean up debug * temporarily add OpenCL kernels for k=64,256 * fix up CMakeList and definition USE_GPU * add OpenCL kernels as string literals * Add boost.compute as a submodule * add boost dependency into CMakeList * fix opencl pragma * use pinned memory for histogram * use pinned buffer for gradients and hessians * better debugging message * add double precision support on GPU * fix boost version in CMakeList * Add a README * reconstruct GPU initialization code for ResetTrainingData * move data to GPU in parallel * fix a bug during feature copy * update gpu kernels * update gpu code * initial port to LightGBM v2 * speed up GPU data loading process * Add 4-bit bin support to GPU * re-add sparse_threshold parameter * remove kMaxNumWorkgroups and allow an unlimited number of features * add feature mask support for skipping unused features * enable kernel cache * use GPU kernels without feature masks when all features are used * README. * README. * update README * fix typos (#349) * change compile to gcc on Apple as default * clean vscode related file * refine api of constructing from sampling data. * fix bug in the last commit. * more efficient algorithm to sample k from n. * fix bug in filter bin * change to boost from average output. * fix tests. * only stop training when all classes are finished in multi-class. * limit the max tree output. change hessian in multi-class objective. * robust tree model loading. * fix test. * convert the probabilities to raw score in boost_from_average of classification. * fix the average label for binary classification. * Add boost_from_average to docs (#354) * don't use "ConvertToRawScore" for self-defined objective function. * boost_from_average doesn't seem to work well in binary classification; remove it. * For a better jump link (#355) * Update Python-API.md * for a better jump in page: a space is needed between `#` and the header's content according to GitHub's markdown format [guideline](https://guides.github.com/features/mastering-markdown/). After adding the spaces, we can jump to the exact position in the page by clicking the link. * fixed something mentioned by @wxchan * Update Python-API.md * add FitByExistingTree. * adapt GPU tree learner for FitByExistingTree * avoid NaN output.
* update boost.compute * fix typos (#361) * fix broken links (#359) * update README * disable GPU acceleration by default * fix image url * cleanup debug macro * remove old README * do not save sparse_threshold_ in FeatureGroup * add details for new GPU settings * ignore submodule when doing pep8 check * allocate workspace for at least one thread during builing Feature4 * move sparse_threshold to class Dataset * remove duplicated code in GPUTreeLearner::Split * Remove duplicated code in FindBestThresholds and BeforeFindBestSplit * do not rebuild ordered gradients and hessians for sparse features * support feature groups in GPUTreeLearner * Initial parallel learners with GPU support * add option device, cleanup code * clean up FindBestThresholds; add some omp parallel * constant hessian optimization for GPU * Fix GPUTreeLearner crash when there is zero feature * use np.testing.assert_almost_equal() to compare lists of floats in tests * travis for GPU --- .gitmodules | 3 + .travis.yml | 28 +- .travis/amd_sdk.sh | 38 + CMakeLists.txt | 19 +- compute | 1 + include/LightGBM/bin.h | 5 +- include/LightGBM/config.h | 18 + include/LightGBM/dataset.h | 25 + include/LightGBM/feature_group.h | 17 +- include/LightGBM/tree_learner.h | 9 +- src/boosting/gbdt.cpp | 4 +- src/io/bin.cpp | 6 +- src/io/config.cpp | 22 + src/io/dataset.cpp | 7 +- src/io/dense_bin.hpp | 6 + src/io/dense_nbits_bin.hpp | 9 +- src/io/sparse_bin.hpp | 16 +- .../data_parallel_tree_learner.cpp | 153 +-- .../feature_parallel_tree_learner.cpp | 51 +- src/treelearner/gpu_tree_learner.cpp | 1093 +++++++++++++++++ src/treelearner/gpu_tree_learner.h | 282 +++++ src/treelearner/ocl/histogram16.cl | 756 ++++++++++++ src/treelearner/ocl/histogram256.cl | 813 ++++++++++++ src/treelearner/ocl/histogram64.cl | 724 +++++++++++ src/treelearner/parallel_tree_learner.h | 16 +- src/treelearner/serial_tree_learner.cpp | 3 +- src/treelearner/serial_tree_learner.h | 16 +- src/treelearner/tree_learner.cpp | 32 +- .../voting_parallel_tree_learner.cpp | 229 ++-- tests/python_package_test/test_engine.py | 8 +- 30 files changed, 4163 insertions(+), 246 deletions(-) create mode 100644 .gitmodules create mode 100644 .travis/amd_sdk.sh create mode 160000 compute create mode 100644 src/treelearner/gpu_tree_learner.cpp create mode 100644 src/treelearner/gpu_tree_learner.h create mode 100644 src/treelearner/ocl/histogram16.cl create mode 100644 src/treelearner/ocl/histogram256.cl create mode 100644 src/treelearner/ocl/histogram64.cl diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000000..133ceb3889da --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "include/boost/compute"] + path = compute + url = https://github.com/boostorg/compute diff --git a/.travis.yml b/.travis.yml index d9316cd1b26e..297b065b52ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,24 +11,48 @@ before_install: - export PATH="$HOME/miniconda/bin:$PATH" - conda config --set always_yes yes --set changeps1 no - conda update -q conda +- sudo add-apt-repository ppa:george-edison55/cmake-3.x -y +- sudo apt-get update -q +- bash .travis/amd_sdk.sh; +- tar -xjf AMD-SDK.tar.bz2; +- AMDAPPSDK=${HOME}/AMDAPPSDK; +- export OPENCL_VENDOR_PATH=${AMDAPPSDK}/etc/OpenCL/vendors; +- mkdir -p ${OPENCL_VENDOR_PATH}; +- sh AMD-APP-SDK*.sh --tar -xf -C ${AMDAPPSDK}; +- echo libamdocl64.so > ${OPENCL_VENDOR_PATH}/amdocl64.icd; +- export LD_LIBRARY_PATH=${AMDAPPSDK}/lib/x86_64:${LD_LIBRARY_PATH}; +- chmod +x ${AMDAPPSDK}/bin/x86_64/clinfo; +- ${AMDAPPSDK}/bin/x86_64/clinfo; +- export 
LIBRARY_PATH="$HOME/miniconda/lib:$LIBRARY_PATH" +- export LD_RUN_PATH="$HOME/miniconda/lib:$LD_RUN_PATH" +- export CPLUS_INCLUDE_PATH="$HOME/miniconda/include:$AMDAPPSDK/include/:$CPLUS_INCLUDE_PATH" install: - sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential +- sudo apt-get install -y cmake - conda install --yes atlas numpy scipy scikit-learn pandas matplotlib +- conda install --yes -c conda-forge boost=1.63.0 - pip install pep8 - script: - cd $TRAVIS_BUILD_DIR - mkdir build && cd build && cmake .. && make -j - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py -- cd $TRAVIS_BUILD_DIR && pep8 --ignore=E501 . +- cd $TRAVIS_BUILD_DIR && pep8 --ignore=E501 --exclude=./compute . - rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py +- cd $TRAVIS_BUILD_DIR +- rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=ON -DBOOST_ROOT="$HOME/miniconda/" -DOpenCL_INCLUDE_DIR=$AMDAPPSDK/include/ .. +- sed -i 's/std::string device_type = "cpu";/std::string device_type = "gpu";/' ../include/LightGBM/config.h +- make -j$(nproc) +- sed -i 's/std::string device_type = "gpu";/std::string device_type = "cpu";/' ../include/LightGBM/config.h +- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py +- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install +- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py notifications: email: false diff --git a/.travis/amd_sdk.sh b/.travis/amd_sdk.sh new file mode 100644 index 000000000000..f5fea984e59f --- /dev/null +++ b/.travis/amd_sdk.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# Original script from https://github.com/gregvw/amd_sdk/ + +# Location from which get nonce and file name from +URL="http://developer.amd.com/tools-and-sdks/opencl-zone/opencl-tools-sdks/amd-accelerated-parallel-processing-app-sdk/" +URLDOWN="http://developer.amd.com/amd-license-agreement-appsdk/" + +NONCE1_STRING='name="amd_developer_central_downloads_page_nonce"' +FILE_STRING='name="f"' +POSTID_STRING='name="post_id"' +NONCE2_STRING='name="amd_developer_central_nonce"' + +#For newest FORM=`wget -qO - $URL | sed -n '/download-2/,/64-bit/p'` +FORM=`wget -qO - $URL | sed -n '/download-5/,/64-bit/p'` + +# Get nonce from form +NONCE1=`echo $FORM | awk -F ${NONCE1_STRING} '{print $2}'` +NONCE1=`echo $NONCE1 | awk -F'"' '{print $2}'` +echo $NONCE1 + +# get the postid +POSTID=`echo $FORM | awk -F ${POSTID_STRING} '{print $2}'` +POSTID=`echo $POSTID | awk -F'"' '{print $2}'` +echo $POSTID + +# get file name +FILE=`echo $FORM | awk -F ${FILE_STRING} '{print $2}'` +FILE=`echo $FILE | awk -F'"' '{print $2}'` +echo $FILE + +FORM=`wget -qO - $URLDOWN --post-data "amd_developer_central_downloads_page_nonce=${NONCE1}&f=${FILE}&post_id=${POSTID}"` + +NONCE2=`echo $FORM | awk -F ${NONCE2_STRING} '{print $2}'` +NONCE2=`echo $NONCE2 | awk -F'"' '{print $2}'` +echo $NONCE2 + +wget --content-disposition --trust-server-names $URLDOWN --post-data 
"amd_developer_central_nonce=${NONCE2}&f=${FILE}" -O AMD-SDK.tar.bz2; diff --git a/CMakeLists.txt b/CMakeLists.txt index cab0468187b8..242bd3cee96c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,7 @@ PROJECT(lightgbm) OPTION(USE_MPI "MPI based parallel learning" OFF) OPTION(USE_OPENMP "Enable OpenMP" ON) +OPTION(USE_GPU "Enable GPU-acclerated training (EXPERIMENTAL)" OFF) if(APPLE) OPTION(APPLE_OUTPUT_DYLIB "Output dylib shared library" OFF) @@ -34,8 +35,17 @@ else() endif() endif(USE_OPENMP) +if(USE_GPU) + find_package(OpenCL REQUIRED) + include_directories(${OpenCL_INCLUDE_DIRS}) + MESSAGE(STATUS "OpenCL include directory:" ${OpenCL_INCLUDE_DIRS}) + find_package(Boost 1.56.0 COMPONENTS filesystem system REQUIRED) + include_directories(${Boost_INCLUDE_DIRS}) + ADD_DEFINITIONS(-DUSE_GPU) +endif(USE_GPU) + if(UNIX OR MINGW OR CYGWIN) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O3 -Wall -std=c++11") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O3 -Wall -std=c++11 -Wno-ignored-attributes") endif() if(MSVC) @@ -65,11 +75,13 @@ endif() SET(LightGBM_HEADER_DIR ${PROJECT_SOURCE_DIR}/include) +SET(BOOST_COMPUTE_HEADER_DIR ${PROJECT_SOURCE_DIR}/compute/include) SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) SET(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) include_directories (${LightGBM_HEADER_DIR}) +include_directories (${BOOST_COMPUTE_HEADER_DIR}) if(APPLE) if (APPLE_OUTPUT_DYLIB) @@ -105,6 +117,11 @@ if(USE_MPI) TARGET_LINK_LIBRARIES(_lightgbm ${MPI_CXX_LIBRARIES}) endif(USE_MPI) +if(USE_GPU) + TARGET_LINK_LIBRARIES(lightgbm ${OpenCL_LIBRARY} ${Boost_LIBRARIES}) + TARGET_LINK_LIBRARIES(_lightgbm ${OpenCL_LIBRARY} ${Boost_LIBRARIES}) +endif(USE_GPU) + if(WIN32 AND (MINGW OR CYGWIN)) TARGET_LINK_LIBRARIES(lightgbm Ws2_32) TARGET_LINK_LIBRARIES(_lightgbm Ws2_32) diff --git a/compute b/compute new file mode 160000 index 000000000000..1380a0458208 --- /dev/null +++ b/compute @@ -0,0 +1 @@ +Subproject commit 1380a04582080bbe2364352b336270bc4bfa3025 diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 38dac7cdbe36..0ead09ee54bb 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -59,7 +59,6 @@ class BinMapper { explicit BinMapper(const void* memory); ~BinMapper(); - static double kSparseThreshold; bool CheckAlign(const BinMapper& other) const { if (num_bin_ != other.num_bin_) { return false; @@ -258,6 +257,7 @@ class BinIterator { * \return Bin data */ virtual uint32_t Get(data_size_t idx) = 0; + virtual uint32_t RawGet(data_size_t idx) = 0; virtual void Reset(data_size_t idx) = 0; virtual ~BinIterator() = default; }; @@ -383,12 +383,13 @@ class Bin { * \param num_bin Number of bin * \param sparse_rate Sparse rate of this bins( num_bin0/num_data ) * \param is_enable_sparse True if enable sparse feature + * \param sparse_threshold Threshold for treating a feature as a sparse feature * \param is_sparse Will set to true if this bin is sparse * \param default_bin Default bin for zeros value * \return The bin data object */ static Bin* CreateBin(data_size_t num_data, int num_bin, - double sparse_rate, bool is_enable_sparse, bool* is_sparse); + double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse); /*! 
* \brief Create object for bin data of one feature, used for dense feature diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 2b1c7d3de6b9..5f1540d29700 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -97,6 +97,11 @@ struct IOConfig: public ConfigBase { int num_iteration_predict = -1; bool is_pre_partition = false; bool is_enable_sparse = true; + /*! \brief The threshold of zero elements precentage for treating a feature as a sparse feature. + * Default is 0.8, where a feature is treated as a sparse feature when there are over 80% zeros. + * When setting to 1.0, all features are processed as dense features. + */ + double sparse_threshold = 0.8; bool use_two_round_loading = false; bool is_save_binary_file = false; bool enable_load_from_binary_file = true; @@ -188,6 +193,16 @@ struct TreeConfig: public ConfigBase { // max_depth < 0 means no limit int max_depth = -1; int top_k = 20; + /*! \brief OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform. + * Default value is -1, using the system-wide default platform + */ + int gpu_platform_id = -1; + /*! \brief OpenCL device ID in the specified platform. Each GPU in the selected platform has a + * unique device ID. Default value is -1, using the default device in the selected platform + */ + int gpu_device_id = -1; + /*! \brief Set to true to use double precision math on GPU (default using single precision) */ + bool gpu_use_dp = false; LIGHTGBM_EXPORT void Set(const std::unordered_map& params) override; }; @@ -216,11 +231,14 @@ struct BoostingConfig: public ConfigBase { // only used for the regression. Will boost from the average labels. bool boost_from_average = true; std::string tree_learner_type = "serial"; + std::string device_type = "cpu"; TreeConfig tree_config; LIGHTGBM_EXPORT void Set(const std::unordered_map& params) override; private: void GetTreeLearnerType(const std::unordered_map& params); + void GetDeviceType(const std::unordered_map& params); }; /*! 
\brief Config for Network */ diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 7687d3bcd3dc..8ced85365a79 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -355,6 +355,9 @@ class Dataset { inline int Feture2SubFeature(int feature_idx) const { return feature2subfeature_[feature_idx]; } + inline uint64_t GroupBinBoundary(int group_idx) const { + return group_bin_boundaries_[group_idx]; + } inline uint64_t NumTotalBin() const { return group_bin_boundaries_.back(); } @@ -421,6 +424,10 @@ class Dataset { const int sub_feature = feature2subfeature_[i]; return feature_groups_[group]->bin_mappers_[sub_feature]->num_bin(); } + + inline int FeatureGroupNumBin(int group) const { + return feature_groups_[group]->num_total_bin_; + } inline const BinMapper* FeatureBinMapper(int i) const { const int group = feature2group_[i]; @@ -428,12 +435,25 @@ class Dataset { return feature_groups_[group]->bin_mappers_[sub_feature].get(); } + inline const Bin* FeatureBin(int i) const { + const int group = feature2group_[i]; + return feature_groups_[group]->bin_data_.get(); + } + + inline const Bin* FeatureGroupBin(int group) const { + return feature_groups_[group]->bin_data_.get(); + } + inline BinIterator* FeatureIterator(int i) const { const int group = feature2group_[i]; const int sub_feature = feature2subfeature_[i]; return feature_groups_[group]->SubFeatureIterator(sub_feature); } + inline BinIterator* FeatureGroupIterator(int group) const { + return feature_groups_[group]->FeatureGroupIterator(); + } + inline double RealThreshold(int i, uint32_t threshold) const { const int group = feature2group_[i]; const int sub_feature = feature2subfeature_[i]; @@ -461,6 +481,9 @@ class Dataset { /*! \brief Get Number of used features */ inline int num_features() const { return num_features_; } + /*! \brief Get Number of feature groups */ + inline int num_feature_groups() const { return num_groups_;} + /*! \brief Get Number of total features */ inline int num_total_features() const { return num_total_features_; } @@ -516,6 +539,8 @@ class Dataset { Metadata metadata_; /*! \brief index of label column */ int label_idx_ = 0; + /*! \brief Threshold for treating a feature as a sparse feature */ + double sparse_threshold_; /*! \brief store feature names */ std::vector feature_names_; /*! \brief store feature names */ diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 5a2d518c3ee0..ca23588af94e 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -25,10 +25,11 @@ class FeatureGroup { * \param bin_mappers Bin mapper for features * \param num_data Total number of data * \param is_enable_sparse True if enable sparse feature + * \param sparse_threshold Threshold for treating a feature as a sparse feature */ FeatureGroup(int num_feature, std::vector>& bin_mappers, - data_size_t num_data, bool is_enable_sparse) : num_feature_(num_feature) { + data_size_t num_data, double sparse_threshold, bool is_enable_sparse) : num_feature_(num_feature) { CHECK(static_cast(bin_mappers.size()) == num_feature); // use bin at zero to store default_bin num_total_bin_ = 1; @@ -46,7 +47,7 @@ class FeatureGroup { } double sparse_rate = 1.0f - static_cast(cnt_non_zero) / (num_data); bin_data_.reset(Bin::CreateBin(num_data, num_total_bin_, - sparse_rate, is_enable_sparse, &is_sparse_)); + sparse_rate, is_enable_sparse, sparse_threshold, &is_sparse_)); } /*! 
* \brief Constructor from memory @@ -120,6 +121,18 @@ class FeatureGroup { uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin(); return bin_data_->GetIterator(min_bin, max_bin, default_bin); } + + /*! + * \brief Returns a BinIterator that can access the entire feature group's raw data. + * The RawGet() function of the iterator should be called for best efficiency. + * \return A pointer to the BinIterator object + */ + inline BinIterator* FeatureGroupIterator() { + uint32_t min_bin = bin_offsets_[0]; + uint32_t max_bin = bin_offsets_.back() - 1; + uint32_t default_bin = 0; + return bin_data_->GetIterator(min_bin, max_bin, default_bin); + } inline data_size_t Split( int sub_feature, diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 682ae640c2c8..e40ca3e1f8f9 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -24,8 +24,9 @@ class TreeLearner { /*! * \brief Initialize tree learner with training dataset * \param train_data The used training data + * \param is_constant_hessian True if all hessians share the same value */ - virtual void Init(const Dataset* train_data) = 0; + virtual void Init(const Dataset* train_data, bool is_constant_hessian) = 0; virtual void ResetTrainingData(const Dataset* train_data) = 0; @@ -71,10 +72,12 @@ class TreeLearner { /*! * \brief Create object of tree learner - * \param type Type of tree learner + * \param learner_type Type of tree learner + * \param device_type Type of tree learner * \param tree_config config of tree */ - static TreeLearner* CreateTreeLearner(const std::string& type, + static TreeLearner* CreateTreeLearner(const std::string& learner_type, + const std::string& device_type, const TreeConfig* tree_config); }; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 2ebce1b40213..9288ea746828 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -92,10 +92,10 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_ if (train_data_ != train_data && train_data != nullptr) { if (tree_learner_ == nullptr) { - tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(new_config->tree_learner_type, &new_config->tree_config)); + tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(new_config->tree_learner_type, new_config->device_type, &new_config->tree_config)); } // init tree learner - tree_learner_->Init(train_data); + tree_learner_->Init(train_data, is_constant_hessian_); // push training metrics training_metrics_.clear(); diff --git a/src/io/bin.cpp b/src/io/bin.cpp index f83a339ebcfc..df77cfdd363a 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -339,12 +339,10 @@ template class OrderedSparseBin; template class OrderedSparseBin; template class OrderedSparseBin; -double BinMapper::kSparseThreshold = 0.8f; - Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, - bool is_enable_sparse, bool* is_sparse) { + bool is_enable_sparse, double sparse_threshold, bool* is_sparse) { // sparse threshold - if (sparse_rate >= BinMapper::kSparseThreshold && is_enable_sparse) { + if (sparse_rate >= sparse_threshold && is_enable_sparse) { *is_sparse = true; return CreateSparseBin(num_data, num_bin); } else { diff --git a/src/io/config.cpp b/src/io/config.cpp index a3c3d4e31bc8..65eb484f734e 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -201,6 +201,7 @@ void IOConfig::Set(const std::unordered_map& params) { GetInt(params, "bin_construct_sample_cnt", &bin_construct_sample_cnt); GetBool(params, 
"is_pre_partition", &is_pre_partition); GetBool(params, "is_enable_sparse", &is_enable_sparse); + GetDouble(params, "sparse_threshold", &sparse_threshold); GetBool(params, "use_two_round_loading", &use_two_round_loading); GetBool(params, "is_save_binary_file", &is_save_binary_file); GetBool(params, "enable_load_from_binary_file", &enable_load_from_binary_file); @@ -305,6 +306,9 @@ void TreeConfig::Set(const std::unordered_map& params) GetDouble(params, "histogram_pool_size", &histogram_pool_size); GetInt(params, "max_depth", &max_depth); GetInt(params, "top_k", &top_k); + GetInt(params, "gpu_platform_id", &gpu_platform_id); + GetInt(params, "gpu_device_id", &gpu_device_id); + GetBool(params, "gpu_use_dp", &gpu_use_dp); } @@ -336,6 +340,7 @@ void BoostingConfig::Set(const std::unordered_map& par GetBool(params, "boost_from_average", &boost_from_average); CHECK(drop_rate <= 1.0 && drop_rate >= 0.0); CHECK(skip_drop <= 1.0 && skip_drop >= 0.0); + GetDeviceType(params); GetTreeLearnerType(params); tree_config.Set(params); } @@ -346,6 +351,9 @@ void BoostingConfig::GetTreeLearnerType(const std::unordered_map& params) { + std::string value; + if (GetString(params, "device", &value)) { + std::transform(value.begin(), value.end(), value.begin(), Common::tolower); + if (value == std::string("cpu")) { + device_type = "cpu"; + } else if (value == std::string("gpu")) { + device_type = "gpu"; + } else { + Log::Fatal("Unknown device type %s", value.c_str()); + } + } +} + void NetworkConfig::Set(const std::unordered_map& params) { GetInt(params, "num_machines", &num_machines); CHECK(num_machines >= 1); diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 3a02a6818573..40f5b4268daf 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -50,6 +50,7 @@ void Dataset::Construct( size_t, const IOConfig& io_config) { num_total_features_ = static_cast(bin_mappers.size()); + sparse_threshold_ = io_config.sparse_threshold; // get num_features std::vector used_features; for (int i = 0; i < static_cast(bin_mappers.size()); ++i) { @@ -85,7 +86,7 @@ void Dataset::Construct( ++cur_fidx; } feature_groups_.emplace_back(std::unique_ptr( - new FeatureGroup(cur_cnt_features, cur_bin_mappers, num_data_, io_config.is_enable_sparse))); + new FeatureGroup(cur_cnt_features, cur_bin_mappers, num_data_, sparse_threshold_, io_config.is_enable_sparse))); } feature_groups_.shrink_to_fit(); group_bin_boundaries_.clear(); @@ -129,6 +130,7 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { feature_groups_.clear(); num_features_ = dataset->num_features_; num_groups_ = dataset->num_groups_; + sparse_threshold_ = dataset->sparse_threshold_; bool is_enable_sparse = false; for (int i = 0; i < num_groups_; ++i) { if (dataset->feature_groups_[i]->is_sparse_) { @@ -146,6 +148,7 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { dataset->feature_groups_[i]->num_feature_, bin_mappers, num_data_, + dataset->sparse_threshold_, is_enable_sparse)); } feature_groups_.shrink_to_fit(); @@ -165,6 +168,7 @@ void Dataset::CreateValid(const Dataset* dataset) { feature_groups_.clear(); num_features_ = dataset->num_features_; num_groups_ = num_features_; + sparse_threshold_ = dataset->sparse_threshold_; bool is_enable_sparse = true; feature2group_.clear(); feature2subfeature_.clear(); @@ -176,6 +180,7 @@ void Dataset::CreateValid(const Dataset* dataset) { 1, bin_mappers, num_data_, + dataset->sparse_threshold_, is_enable_sparse)); feature2group_.push_back(i); feature2subfeature_.push_back(0); diff --git 
a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index b6ed973a3398..0be476fc0610 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -25,6 +25,7 @@ class DenseBinIterator: public BinIterator { bias_ = 0; } } + inline uint32_t RawGet(data_size_t idx) override; inline uint32_t Get(data_size_t idx) override; inline void Reset(data_size_t) override { } private: @@ -284,6 +285,11 @@ uint32_t DenseBinIterator::Get(data_size_t idx) { } } +template +inline uint32_t DenseBinIterator::RawGet(data_size_t idx) { + return bin_data_->data_[idx]; +} + template BinIterator* DenseBin::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const { return new DenseBinIterator(this, min_bin, max_bin, default_bin); diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp index c5a8c67cb964..30aa3dbcf2f5 100644 --- a/src/io/dense_nbits_bin.hpp +++ b/src/io/dense_nbits_bin.hpp @@ -23,6 +23,7 @@ class Dense4bitsBinIterator: public BinIterator { bias_ = 0; } } + inline uint32_t RawGet(data_size_t idx) override; inline uint32_t Get(data_size_t idx) override; inline void Reset(data_size_t) override { } private: @@ -74,7 +75,7 @@ class Dense4bitsBin: public Bin { } } - BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override; + inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override; void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, const score_t* ordered_gradients, const score_t* ordered_hessians, @@ -357,7 +358,11 @@ uint32_t Dense4bitsBinIterator::Get(data_size_t idx) { } } -BinIterator* Dense4bitsBin::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const { +uint32_t Dense4bitsBinIterator::RawGet(data_size_t idx) { + return (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; +} + +inline BinIterator* Dense4bitsBin::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const { return new Dense4bitsBinIterator(this, min_bin, max_bin, default_bin); } diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index 245644794938..01659ebff531 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -38,7 +38,8 @@ class SparseBinIterator: public BinIterator { Reset(start_idx); } - inline VAL_T RawGet(data_size_t idx); + inline uint32_t RawGet(data_size_t idx) override; + inline VAL_T InnerRawGet(data_size_t idx); inline uint32_t Get( data_size_t idx) override { VAL_T ret = RawGet(idx); @@ -152,7 +153,7 @@ class SparseBin: public Bin { } for (data_size_t i = 0; i < num_data; ++i) { const data_size_t idx = data_indices[i]; - VAL_T bin = iterator.RawGet(idx); + VAL_T bin = iterator.InnerRawGet(idx); if (bin > maxb || bin < minb) { default_indices[(*default_count)++] = idx; } else if (bin > th) { @@ -168,7 +169,7 @@ class SparseBin: public Bin { } for (data_size_t i = 0; i < num_data; ++i) { const data_size_t idx = data_indices[i]; - VAL_T bin = iterator.RawGet(idx); + VAL_T bin = iterator.InnerRawGet(idx); if (bin > maxb || bin < minb) { default_indices[(*default_count)++] = idx; } else if (bin != th) { @@ -327,7 +328,7 @@ class SparseBin: public Bin { // transform to delta array data_size_t last_idx = 0; for (data_size_t i = 0; i < num_used_indices; ++i) { - VAL_T bin = iterator.RawGet(used_indices[i]); + VAL_T bin = iterator.InnerRawGet(used_indices[i]); if (bin > 0) { data_size_t cur_delta = i - last_idx; while (cur_delta >= 256) { @@ -363,7 +364,12 @@ class SparseBin: public Bin { }; template -inline VAL_T 
SparseBinIterator::RawGet(data_size_t idx) { +inline uint32_t SparseBinIterator::RawGet(data_size_t idx) { + return InnerRawGet(idx); +} + +template +inline VAL_T SparseBinIterator::InnerRawGet(data_size_t idx) { while (cur_pos_ < idx) { bin_data_->NextNonzero(&i_delta_, &cur_pos_); } diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index a8415283c092..5961fe6557e4 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -7,54 +7,59 @@ namespace LightGBM { -DataParallelTreeLearner::DataParallelTreeLearner(const TreeConfig* tree_config) - :SerialTreeLearner(tree_config) { +template +DataParallelTreeLearner::DataParallelTreeLearner(const TreeConfig* tree_config) + :TREELEARNER_T(tree_config) { } -DataParallelTreeLearner::~DataParallelTreeLearner() { +template +DataParallelTreeLearner::~DataParallelTreeLearner() { } -void DataParallelTreeLearner::Init(const Dataset* train_data) { +template +void DataParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { // initialize SerialTreeLearner - SerialTreeLearner::Init(train_data); + TREELEARNER_T::Init(train_data, is_constant_hessian); // Get local rank and global machine size rank_ = Network::rank(); num_machines_ = Network::num_machines(); // allocate buffer for communication - size_t buffer_size = train_data_->NumTotalBin() * sizeof(HistogramBinEntry); + size_t buffer_size = this->train_data_->NumTotalBin() * sizeof(HistogramBinEntry); input_buffer_.resize(buffer_size); output_buffer_.resize(buffer_size); - is_feature_aggregated_.resize(num_features_); + is_feature_aggregated_.resize(this->num_features_); block_start_.resize(num_machines_); block_len_.resize(num_machines_); - buffer_write_start_pos_.resize(num_features_); - buffer_read_start_pos_.resize(num_features_); - global_data_count_in_leaf_.resize(tree_config_->num_leaves); + buffer_write_start_pos_.resize(this->num_features_); + buffer_read_start_pos_.resize(this->num_features_); + global_data_count_in_leaf_.resize(this->tree_config_->num_leaves); } -void DataParallelTreeLearner::ResetConfig(const TreeConfig* tree_config) { - SerialTreeLearner::ResetConfig(tree_config); - global_data_count_in_leaf_.resize(tree_config_->num_leaves); +template +void DataParallelTreeLearner::ResetConfig(const TreeConfig* tree_config) { + TREELEARNER_T::ResetConfig(tree_config); + global_data_count_in_leaf_.resize(this->tree_config_->num_leaves); } -void DataParallelTreeLearner::BeforeTrain() { - SerialTreeLearner::BeforeTrain(); +template +void DataParallelTreeLearner::BeforeTrain() { + TREELEARNER_T::BeforeTrain(); // generate feature partition for current tree std::vector> feature_distribution(num_machines_, std::vector()); std::vector num_bins_distributed(num_machines_, 0); - for (int i = 0; i < train_data_->num_total_features(); ++i) { - int inner_feature_index = train_data_->InnerFeatureIndex(i); + for (int i = 0; i < this->train_data_->num_total_features(); ++i) { + int inner_feature_index = this->train_data_->InnerFeatureIndex(i); if (inner_feature_index == -1) { continue; } - if (is_feature_used_[inner_feature_index]) { + if (this->is_feature_used_[inner_feature_index]) { int cur_min_machine = static_cast(ArrayArgs::ArgMin(num_bins_distributed)); feature_distribution[cur_min_machine].push_back(inner_feature_index); - auto num_bin = train_data_->FeatureNumBin(inner_feature_index); - if (train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin() == 0) 
{ + auto num_bin = this->train_data_->FeatureNumBin(inner_feature_index); + if (this->train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin() == 0) { num_bin -= 1; } num_bins_distributed[cur_min_machine] += num_bin; @@ -71,8 +76,8 @@ void DataParallelTreeLearner::BeforeTrain() { for (int i = 0; i < num_machines_; ++i) { block_len_[i] = 0; for (auto fid : feature_distribution[i]) { - auto num_bin = train_data_->FeatureNumBin(fid); - if (train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) { + auto num_bin = this->train_data_->FeatureNumBin(fid); + if (this->train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) { num_bin -= 1; } block_len_[i] += num_bin * sizeof(HistogramBinEntry); @@ -90,8 +95,8 @@ void DataParallelTreeLearner::BeforeTrain() { for (int i = 0; i < num_machines_; ++i) { for (auto fid : feature_distribution[i]) { buffer_write_start_pos_[fid] = bin_size; - auto num_bin = train_data_->FeatureNumBin(fid); - if (train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) { + auto num_bin = this->train_data_->FeatureNumBin(fid); + if (this->train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) { num_bin -= 1; } bin_size += num_bin * sizeof(HistogramBinEntry); @@ -102,16 +107,16 @@ void DataParallelTreeLearner::BeforeTrain() { bin_size = 0; for (auto fid : feature_distribution[rank_]) { buffer_read_start_pos_[fid] = bin_size; - auto num_bin = train_data_->FeatureNumBin(fid); - if (train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) { + auto num_bin = this->train_data_->FeatureNumBin(fid); + if (this->train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) { num_bin -= 1; } bin_size += num_bin * sizeof(HistogramBinEntry); } // sync global data sumup info - std::tuple data(smaller_leaf_splits_->num_data_in_leaf(), - smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians()); + std::tuple data(this->smaller_leaf_splits_->num_data_in_leaf(), + this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians()); int size = sizeof(data); std::memcpy(input_buffer_.data(), &data, size); // global sumup reduce @@ -134,93 +139,95 @@ void DataParallelTreeLearner::BeforeTrain() { // copy back std::memcpy(&data, output_buffer_.data(), size); // set global sumup info - smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data)); + this->smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data)); // init global data count in leaf global_data_count_in_leaf_[0] = std::get<0>(data); } -void DataParallelTreeLearner::FindBestThresholds() { - ConstructHistograms(is_feature_used_, true); +template +void DataParallelTreeLearner::FindBestThresholds() { + this->ConstructHistograms(this->is_feature_used_, true); // construct local histograms #pragma omp parallel for schedule(static) - for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - if ((!is_feature_used_.empty() && is_feature_used_[feature_index] == false)) continue; + for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { + if ((!this->is_feature_used_.empty() && this->is_feature_used_[feature_index] == false)) continue; // copy to buffer std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index], - smaller_leaf_histogram_array_[feature_index].RawData(), - smaller_leaf_histogram_array_[feature_index].SizeOfHistgram()); + this->smaller_leaf_histogram_array_[feature_index].RawData(), + this->smaller_leaf_histogram_array_[feature_index].SizeOfHistgram()); } // Reduce scatter for histogram 
Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, block_start_.data(), block_len_.data(), output_buffer_.data(), &HistogramBinEntry::SumReducer); - std::vector smaller_best(num_threads_, SplitInfo()); - std::vector larger_best(num_threads_, SplitInfo()); + std::vector smaller_best(this->num_threads_, SplitInfo()); + std::vector larger_best(this->num_threads_, SplitInfo()); OMP_INIT_EX(); #pragma omp parallel for schedule(static) - for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); if (!is_feature_aggregated_[feature_index]) continue; const int tid = omp_get_thread_num(); // restore global histograms from buffer - smaller_leaf_histogram_array_[feature_index].FromMemory( + this->smaller_leaf_histogram_array_[feature_index].FromMemory( output_buffer_.data() + buffer_read_start_pos_[feature_index]); - train_data_->FixHistogram(feature_index, - smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), - GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()), - smaller_leaf_histogram_array_[feature_index].RawData()); + this->train_data_->FixHistogram(feature_index, + this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), + GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()), + this->smaller_leaf_histogram_array_[feature_index].RawData()); SplitInfo smaller_split; // find best threshold for smaller child - smaller_leaf_histogram_array_[feature_index].FindBestThreshold( - smaller_leaf_splits_->sum_gradients(), - smaller_leaf_splits_->sum_hessians(), - GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()), + this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold( + this->smaller_leaf_splits_->sum_gradients(), + this->smaller_leaf_splits_->sum_hessians(), + GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()), &smaller_split); if (smaller_split.gain > smaller_best[tid].gain) { smaller_best[tid] = smaller_split; - smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index); + smaller_best[tid].feature = this->train_data_->RealFeatureIndex(feature_index); } // only root leaf - if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue; + if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->LeafIndex() < 0) continue; // construct histgroms for large leaf, we init larger leaf as the parent, so we can just subtract the smaller leaf's histograms - larger_leaf_histogram_array_[feature_index].Subtract( - smaller_leaf_histogram_array_[feature_index]); + this->larger_leaf_histogram_array_[feature_index].Subtract( + this->smaller_leaf_histogram_array_[feature_index]); SplitInfo larger_split; // find best threshold for larger child - larger_leaf_histogram_array_[feature_index].FindBestThreshold( - larger_leaf_splits_->sum_gradients(), - larger_leaf_splits_->sum_hessians(), - GetGlobalDataCountInLeaf(larger_leaf_splits_->LeafIndex()), + this->larger_leaf_histogram_array_[feature_index].FindBestThreshold( + this->larger_leaf_splits_->sum_gradients(), + this->larger_leaf_splits_->sum_hessians(), + GetGlobalDataCountInLeaf(this->larger_leaf_splits_->LeafIndex()), &larger_split); if (larger_split.gain > larger_best[tid].gain) { larger_best[tid] = larger_split; - larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index); + larger_best[tid].feature = 
this->train_data_->RealFeatureIndex(feature_index); } OMP_LOOP_EX_END(); } OMP_THROW_EX(); auto smaller_best_idx = ArrayArgs::ArgMax(smaller_best); - int leaf = smaller_leaf_splits_->LeafIndex(); - best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx]; + int leaf = this->smaller_leaf_splits_->LeafIndex(); + this->best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx]; - if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { return; } + if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->LeafIndex() < 0) { return; } - leaf = larger_leaf_splits_->LeafIndex(); + leaf = this->larger_leaf_splits_->LeafIndex(); auto larger_best_idx = ArrayArgs::ArgMax(larger_best); - best_split_per_leaf_[leaf] = larger_best[larger_best_idx]; + this->best_split_per_leaf_[leaf] = larger_best[larger_best_idx]; } -void DataParallelTreeLearner::FindBestSplitsForLeaves() { +template +void DataParallelTreeLearner::FindBestSplitsForLeaves() { SplitInfo smaller_best, larger_best; - smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()]; + smaller_best = this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()]; // find local best split for larger leaf - if (larger_leaf_splits_->LeafIndex() >= 0) { - larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()]; + if (this->larger_leaf_splits_->LeafIndex() >= 0) { + larger_best = this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()]; } // sync global best info @@ -234,19 +241,23 @@ void DataParallelTreeLearner::FindBestSplitsForLeaves() { std::memcpy(&larger_best, output_buffer_.data() + sizeof(SplitInfo), sizeof(SplitInfo)); // set best split - best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()] = smaller_best; - if (larger_leaf_splits_->LeafIndex() >= 0) { - best_split_per_leaf_[larger_leaf_splits_->LeafIndex()] = larger_best; + this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()] = smaller_best; + if (this->larger_leaf_splits_->LeafIndex() >= 0) { + this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()] = larger_best; } } -void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { - SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf); - const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf]; +template +void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { + TREELEARNER_T::Split(tree, best_Leaf, left_leaf, right_leaf); + const SplitInfo& best_split_info = this->best_split_per_leaf_[best_Leaf]; // need update global number of data in leaf global_data_count_in_leaf_[*left_leaf] = best_split_info.left_count; global_data_count_in_leaf_[*right_leaf] = best_split_info.right_count; } +// instantiate template classes, otherwise linker cannot find the code +template class DataParallelTreeLearner; +template class DataParallelTreeLearner; } // namespace LightGBM diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index 5ad541fc00b4..b6f0831fbe9f 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -6,15 +6,20 @@ namespace LightGBM { -FeatureParallelTreeLearner::FeatureParallelTreeLearner(const TreeConfig* tree_config) - :SerialTreeLearner(tree_config) { + +template +FeatureParallelTreeLearner::FeatureParallelTreeLearner(const TreeConfig* tree_config) + :TREELEARNER_T(tree_config) { } 
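The hunks above and below fold the MPI-based learners into class templates parameterized by the underlying tree learner, so the same data-parallel and feature-parallel logic can sit on top of either SerialTreeLearner or GPUTreeLearner: members inherited from the dependent base must be spelled `this->...`, and each .cpp ends with explicit `template class ...` instantiations so the linker can find the generated code. A minimal, self-contained sketch of that pattern (class names here are illustrative, not LightGBM's actual API):

```cpp
#include <iostream>

// Sketch of the pattern used by the parallel tree learners: the learner-specific
// base class is a template parameter, and the derived template is explicitly
// instantiated once per base. Names are illustrative only.
class CpuLearnerBase {
 public:
  virtual ~CpuLearnerBase() = default;
  virtual void Init(int num_features) { num_features_ = num_features; }
 protected:
  int num_features_ = 0;
};

class GpuLearnerBase : public CpuLearnerBase {
 public:
  void Init(int num_features) override {
    CpuLearnerBase::Init(num_features);
    std::cout << "GPU-specific setup\n";
  }
};

template <typename LEARNER_T>
class ParallelLearner : public LEARNER_T {
 public:
  void Init(int num_features) override {
    LEARNER_T::Init(num_features);  // reuse the base learner's initialization
    // Members of the dependent base must be qualified with this->.
    std::cout << "parallel setup for " << this->num_features_ << " features\n";
  }
};

// Explicit instantiation, mirroring the "template class ..." lines in the patch,
// so the definitions are emitted even though they live in a separate .cpp file.
template class ParallelLearner<CpuLearnerBase>;
template class ParallelLearner<GpuLearnerBase>;

int main() {
  ParallelLearner<GpuLearnerBase> learner;
  learner.Init(4);
}
```

Keeping the template definitions in the .cpp and instantiating them explicitly emits code for exactly the two bases that are used, which is why the patch adds the two `template class` lines at the end of each parallel learner file.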
-FeatureParallelTreeLearner::~FeatureParallelTreeLearner() { +template +FeatureParallelTreeLearner::~FeatureParallelTreeLearner() { } -void FeatureParallelTreeLearner::Init(const Dataset* train_data) { - SerialTreeLearner::Init(train_data); + +template +void FeatureParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { + TREELEARNER_T::Init(train_data, is_constant_hessian); rank_ = Network::rank(); num_machines_ = Network::num_machines(); input_buffer_.resize(sizeof(SplitInfo) * 2); @@ -22,35 +27,36 @@ void FeatureParallelTreeLearner::Init(const Dataset* train_data) { } - -void FeatureParallelTreeLearner::BeforeTrain() { - SerialTreeLearner::BeforeTrain(); +template +void FeatureParallelTreeLearner::BeforeTrain() { + TREELEARNER_T::BeforeTrain(); // get feature partition std::vector> feature_distribution(num_machines_, std::vector()); std::vector num_bins_distributed(num_machines_, 0); - for (int i = 0; i < train_data_->num_total_features(); ++i) { - int inner_feature_index = train_data_->InnerFeatureIndex(i); + for (int i = 0; i < this->train_data_->num_total_features(); ++i) { + int inner_feature_index = this->train_data_->InnerFeatureIndex(i); if (inner_feature_index == -1) { continue; } - if (is_feature_used_[inner_feature_index]) { + if (this->is_feature_used_[inner_feature_index]) { int cur_min_machine = static_cast(ArrayArgs::ArgMin(num_bins_distributed)); feature_distribution[cur_min_machine].push_back(inner_feature_index); - num_bins_distributed[cur_min_machine] += train_data_->FeatureNumBin(inner_feature_index); - is_feature_used_[inner_feature_index] = false; + num_bins_distributed[cur_min_machine] += this->train_data_->FeatureNumBin(inner_feature_index); + this->is_feature_used_[inner_feature_index] = false; } } // get local used features for (auto fid : feature_distribution[rank_]) { - is_feature_used_[fid] = true; + this->is_feature_used_[fid] = true; } } -void FeatureParallelTreeLearner::FindBestSplitsForLeaves() { +template +void FeatureParallelTreeLearner::FindBestSplitsForLeaves() { SplitInfo smaller_best, larger_best; // get best split at smaller leaf - smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()]; + smaller_best = this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()]; // find local best split for larger leaf - if (larger_leaf_splits_->LeafIndex() >= 0) { - larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()]; + if (this->larger_leaf_splits_->LeafIndex() >= 0) { + larger_best = this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()]; } // sync global best info std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo)); @@ -62,10 +68,13 @@ void FeatureParallelTreeLearner::FindBestSplitsForLeaves() { std::memcpy(&smaller_best, output_buffer_.data(), sizeof(SplitInfo)); std::memcpy(&larger_best, output_buffer_.data() + sizeof(SplitInfo), sizeof(SplitInfo)); // update best split - best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()] = smaller_best; - if (larger_leaf_splits_->LeafIndex() >= 0) { - best_split_per_leaf_[larger_leaf_splits_->LeafIndex()] = larger_best; + this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()] = smaller_best; + if (this->larger_leaf_splits_->LeafIndex() >= 0) { + this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()] = larger_best; } } +// instantiate template classes, otherwise linker cannot find the code +template class FeatureParallelTreeLearner; +template class FeatureParallelTreeLearner; } // namespace LightGBM 
diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp new file mode 100644 index 000000000000..e4b732e35376 --- /dev/null +++ b/src/treelearner/gpu_tree_learner.cpp @@ -0,0 +1,1093 @@ +#ifdef USE_GPU +#include "gpu_tree_learner.h" +#include "../io/dense_bin.hpp" +#include "../io/dense_nbits_bin.hpp" + +#include +#include +#include + +#include +#include + + +#define GPU_DEBUG 0 + +namespace LightGBM { + +GPUTreeLearner::GPUTreeLearner(const TreeConfig* tree_config) + :SerialTreeLearner(tree_config) { + use_bagging_ = false; + Log::Info("This is the GPU trainer!!"); +} + +GPUTreeLearner::~GPUTreeLearner() { + if (ptr_pinned_gradients_) { + queue_.enqueue_unmap_buffer(pinned_gradients_, ptr_pinned_gradients_); + } + if (ptr_pinned_hessians_) { + queue_.enqueue_unmap_buffer(pinned_hessians_, ptr_pinned_hessians_); + } + if (ptr_pinned_feature_masks_) { + queue_.enqueue_unmap_buffer(pinned_feature_masks_, ptr_pinned_feature_masks_); + } +} + +void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { + // initialize SerialTreeLearner + SerialTreeLearner::Init(train_data, is_constant_hessian); + // some additional variables needed for GPU trainer + num_feature_groups_ = train_data_->num_feature_groups(); + // Initialize GPU buffers and kernels + InitGPU(tree_config_->gpu_platform_id, tree_config_->gpu_device_id); +} + +// some functions used for debugging the GPU histogram construction +#if GPU_DEBUG > 0 + +void PrintHistograms(HistogramBinEntry* h, size_t size) { + size_t total = 0; + for (size_t i = 0; i < size; ++i) { + printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt); + total += h[i].cnt; + if ((i & 3) == 3) + printf("\n"); + } + printf("\nTotal examples: %lu\n", total); +} + +union Float_t +{ + int64_t i; + double f; + static int64_t ulp_diff(Float_t a, Float_t b) { + return abs(a.i - b.i); + } +}; + + +void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) { + size_t i; + Float_t a, b; + for (i = 0; i < size; ++i) { + a.f = h1[i].sum_gradients; + b.f = h2[i].sum_gradients; + int32_t ulps = Float_t::ulp_diff(a, b); + if (fabs(h1[i].cnt - h2[i].cnt != 0)) { + printf("%d != %d\n", h1[i].cnt, h2[i].cnt); + goto err; + } + if (ulps > 0) { + // printf("grad %g != %g (%d ULPs)\n", h1[i].sum_gradients, h2[i].sum_gradients, ulps); + // goto err; + } + a.f = h1[i].sum_hessians; + b.f = h2[i].sum_hessians; + ulps = Float_t::ulp_diff(a, b); + if (ulps > 0) { + // printf("hessian %g != %g (%d ULPs)\n", h1[i].sum_hessians, h2[i].sum_hessians, ulps); + // goto err; + } + } + return; +err: + Log::Warning("Mismatched histograms found for feature %d at location %lu.", feature_id, i); + std::cin.get(); + PrintHistograms(h1, size); + printf("\n"); + PrintHistograms(h2, size); + std::cin.get(); +} +#endif + +int GPUTreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { + // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples. 
+ // also guarantee that there are at least 2K examples per workgroup + double x = 256.0 / num_dense_feature4_; + int exp_workgroups_per_feature = ceil(log2(x)); + double t = leaf_num_data / 1024.0; + #if GPU_DEBUG >= 4 + printf("Computing histogram for %d examples and (%d * %d) feature groups\n", leaf_num_data, dword_features_, num_dense_feature4_); + printf("We can have at most %d workgroups per feature4 for efficiency reasons.\n" + "Best workgroup size per feature for full utilization is %d\n", (int)ceil(t), (1 << exp_workgroups_per_feature)); + #endif + exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, (int)ceil(log((double)t)/log(2.0))); + if (exp_workgroups_per_feature < 0) + exp_workgroups_per_feature = 0; + if (exp_workgroups_per_feature > kMaxLogWorkgroupsPerFeature) + exp_workgroups_per_feature = kMaxLogWorkgroupsPerFeature; + // return 0; + return exp_workgroups_per_feature; +} + +void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) { + // we have already copied ordered gradients, ordered hessians and indices to GPU + // decide the best number of workgroups working on one feature4 tuple + // set work group size based on feature size + // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple + int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); + int num_workgroups = (1 << exp_workgroups_per_feature) * num_dense_feature4_; + if (num_workgroups > preallocd_max_num_wg_) { + preallocd_max_num_wg_ = num_workgroups; + Log::Info("Increasing preallocd_max_num_wg_ to %d for launching more workgroups.", preallocd_max_num_wg_); + device_subhistograms_.reset(new boost::compute::vector( + preallocd_max_num_wg_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_, ctx_)); + // we need to refresh the kernel arguments after reallocating + for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) { + // The only argument that needs to be changed later is num_data_ + histogram_kernels_[i].set_arg(7, *device_subhistograms_); + histogram_allfeats_kernels_[i].set_arg(7, *device_subhistograms_); + histogram_fulldata_kernels_[i].set_arg(7, *device_subhistograms_); + } + } + #if GPU_DEBUG >= 4 + printf("setting exp_workgroups_per_feature to %d, using %u work groups\n", exp_workgroups_per_feature, num_workgroups); + printf("Constructing histogram with %d examples\n", leaf_num_data); + #endif + + // the GPU kernel will process all features in one call, and each + // 2^exp_workgroups_per_feature (compile time constant) workgroup will + // process one feature4 tuple + + if (use_all_features) { + histogram_allfeats_kernels_[exp_workgroups_per_feature].set_arg(4, leaf_num_data); + } + else { + histogram_kernels_[exp_workgroups_per_feature].set_arg(4, leaf_num_data); + } + // for the root node, indices are not copied + if (leaf_num_data != num_data_) { + indices_future_.wait(); + } + // for constant hessian, hessians are not copied except for the root node + if (!is_constant_hessian_) { + hessians_future_.wait(); + } + gradients_future_.wait(); + // there will be 2^exp_workgroups_per_feature = num_workgroups / num_dense_feature4 sub-histogram per feature4 + // and we will launch num_feature workgroups for this kernel + // will launch threads for all features + // the queue should be asynchrounous, and we will can WaitAndGetHistograms() before we start processing dense feature groups + if (leaf_num_data == num_data_) { + kernel_wait_obj_ = 
boost::compute::wait_list(queue_.enqueue_1d_range_kernel(histogram_fulldata_kernels_[exp_workgroups_per_feature], 0, num_workgroups * 256, 256)); + } + else { + if (use_all_features) { + kernel_wait_obj_ = boost::compute::wait_list( + queue_.enqueue_1d_range_kernel(histogram_allfeats_kernels_[exp_workgroups_per_feature], 0, num_workgroups * 256, 256)); + } + else { + kernel_wait_obj_ = boost::compute::wait_list( + queue_.enqueue_1d_range_kernel(histogram_kernels_[exp_workgroups_per_feature], 0, num_workgroups * 256, 256)); + } + } + // copy the results asynchronously. Size depends on if double precision is used + size_t output_size = num_dense_feature4_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; + boost::compute::event histogram_wait_event; + host_histogram_outputs_ = (void*)queue_.enqueue_map_buffer_async(device_histogram_outputs_, boost::compute::command_queue::map_read, + 0, output_size, histogram_wait_event, kernel_wait_obj_); + // we will wait for this object in WaitAndGetHistograms + histograms_wait_obj_ = boost::compute::wait_list(histogram_wait_event); +} + +template +void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms, const std::vector& is_feature_used) { + HistType* hist_outputs = (HistType*) host_histogram_outputs_; + // when the output is ready, the computation is done + histograms_wait_obj_.wait(); + #pragma omp parallel for schedule(static) + for(int i = 0; i < num_dense_feature_groups_; ++i) { + if (!feature_masks_[i]) { + continue; + } + int dense_group_index = dense_feature_group_map_[i]; + auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index); + int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); + if (device_bin_mults_[i] == 1) { + for (int j = 0; j < bin_size; ++j) { + old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients; + old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians; + old_histogram_array[j].cnt = hist_outputs[i * device_bin_size_ + j].cnt; + } + } + else { + // values of this feature has been redistributed to multiple bins; need a reduction here + int ind = 0; + for (int j = 0; j < bin_size; ++j) { + double sum_g = 0.0, sum_h = 0.0; + size_t cnt = 0; + for (int k = 0; k < device_bin_mults_[i]; ++k) { + sum_g += hist_outputs[i * device_bin_size_+ ind].sum_gradients; + sum_h += hist_outputs[i * device_bin_size_+ ind].sum_hessians; + cnt += hist_outputs[i * device_bin_size_ + ind].cnt; + ind++; + } + old_histogram_array[j].sum_gradients = sum_g; + old_histogram_array[j].sum_hessians = sum_h; + old_histogram_array[j].cnt = cnt; + } + } + } + queue_.enqueue_unmap_buffer(device_histogram_outputs_, host_histogram_outputs_); +} + +void GPUTreeLearner::AllocateGPUMemory() { + num_dense_feature_groups_ = 0; + for (int i = 0; i < num_feature_groups_; ++i) { + if (ordered_bins_[i] == nullptr) { + num_dense_feature_groups_++; + } + } + // how many feature-group tuples we have + num_dense_feature4_ = (num_dense_feature_groups_ + (dword_features_ - 1)) / dword_features_; + // leave some safe margin for prefetching + // 256 work-items per workgroup. 
Each work-item prefetches one tuple for that feature + int allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature); + // clear sparse/dense maps + dense_feature_group_map_.clear(); + device_bin_mults_.clear(); + sparse_feature_group_map_.clear(); + // do nothing if no features can be processed on GPU + if (!num_dense_feature_groups_) { + Log::Warning("GPU acceleration is disabled because no non-trival dense features can be found"); + return; + } + // allocate memory for all features (FIXME: 4 GB barrier on some devices, need to split to multiple buffers) + device_features_.reset(); + device_features_ = std::unique_ptr>(new boost::compute::vector(num_dense_feature4_ * num_data_, ctx_)); + // unpin old buffer if necessary before destructing them + if (ptr_pinned_gradients_) { + queue_.enqueue_unmap_buffer(pinned_gradients_, ptr_pinned_gradients_); + } + if (ptr_pinned_hessians_) { + queue_.enqueue_unmap_buffer(pinned_hessians_, ptr_pinned_hessians_); + } + if (ptr_pinned_feature_masks_) { + queue_.enqueue_unmap_buffer(pinned_feature_masks_, ptr_pinned_feature_masks_); + } + // make ordered_gradients and hessians larger (including extra room for prefetching), and pin them + ordered_gradients_.reserve(allocated_num_data_); + ordered_hessians_.reserve(allocated_num_data_); + pinned_gradients_ = boost::compute::buffer(); // deallocate + pinned_gradients_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t), + boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr, + ordered_gradients_.data()); + ptr_pinned_gradients_ = queue_.enqueue_map_buffer(pinned_gradients_, boost::compute::command_queue::map_write_invalidate_region, + 0, allocated_num_data_ * sizeof(score_t)); + pinned_hessians_ = boost::compute::buffer(); // deallocate + pinned_hessians_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t), + boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr, + ordered_hessians_.data()); + ptr_pinned_hessians_ = queue_.enqueue_map_buffer(pinned_hessians_, boost::compute::command_queue::map_write_invalidate_region, + 0, allocated_num_data_ * sizeof(score_t)); + // allocate space for gradients and hessians on device + // we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed + device_gradients_ = boost::compute::buffer(); // deallocate + device_gradients_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t), + boost::compute::memory_object::read_only, nullptr); + device_hessians_ = boost::compute::buffer(); // deallocate + device_hessians_ = boost::compute::buffer(ctx_, allocated_num_data_ * sizeof(score_t), + boost::compute::memory_object::read_only, nullptr); + // allocate feature mask, for disabling some feature-groups' histogram calculation + feature_masks_.resize(num_dense_feature4_ * dword_features_); + device_feature_masks_ = boost::compute::buffer(); // deallocate + device_feature_masks_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_, + boost::compute::memory_object::read_only, nullptr); + pinned_feature_masks_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_, + boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr, + feature_masks_.data()); + ptr_pinned_feature_masks_ = queue_.enqueue_map_buffer(pinned_feature_masks_, boost::compute::command_queue::map_write_invalidate_region, + 0, num_dense_feature4_ * dword_features_); + 
memset(ptr_pinned_feature_masks_, 0, num_dense_feature4_ * dword_features_); + // copy indices to the device + device_data_indices_.reset(); + device_data_indices_ = std::unique_ptr>(new boost::compute::vector(allocated_num_data_, ctx_)); + boost::compute::fill(device_data_indices_->begin(), device_data_indices_->end(), 0, queue_); + // histogram bin entry size depends on the precision (single/double) + hist_bin_entry_sz_ = tree_config_->gpu_use_dp ? sizeof(HistogramBinEntry) : sizeof(GPUHistogramBinEntry); + Log::Info("Size of histogram bin entry: %d", hist_bin_entry_sz_); + // create output buffer, each feature has a histogram with device_bin_size_ bins, + // each work group generates a sub-histogram of dword_features_ features. + if (!device_subhistograms_) { + // only initialize once here, as this will not need to change when ResetTrainingData() is called + device_subhistograms_ = std::unique_ptr>(new boost::compute::vector( + preallocd_max_num_wg_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_, ctx_)); + } + // create atomic counters for inter-group coordination + sync_counters_.reset(); + sync_counters_ = std::unique_ptr>(new boost::compute::vector( + num_dense_feature4_, ctx_)); + boost::compute::fill(sync_counters_->begin(), sync_counters_->end(), 0, queue_); + // The output buffer is allocated to host directly, to overlap compute and data transfer + device_histogram_outputs_ = boost::compute::buffer(); // deallocate + device_histogram_outputs_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_, + boost::compute::memory_object::write_only | boost::compute::memory_object::alloc_host_ptr, nullptr); + // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes) + int i, k, copied_feature4 = 0, dense_ind[dword_features_]; + for (i = 0, k = 0; i < num_feature_groups_; ++i) { + // looking for dword_features_ non-sparse feature-groups + if (ordered_bins_[i] == nullptr) { + dense_ind[k] = i; + // decide if we need to redistribute the bin + double t = device_bin_size_ / (double)train_data_->FeatureGroupNumBin(i); + // multiplier must be a power of 2 + device_bin_mults_.push_back((int)round(pow(2, floor(log2(t))))); + // device_bin_mults_.push_back(1); + #if GPU_DEBUG >= 1 + printf("feature-group %d using multiplier %d\n", i, device_bin_mults_.back()); + #endif + k++; + } + else { + sparse_feature_group_map_.push_back(i); + } + // found + if (k == dword_features_) { + k = 0; + for (int j = 0; j < dword_features_; ++j) { + dense_feature_group_map_.push_back(dense_ind[j]); + } + copied_feature4++; + } + } + // for data transfer time + auto start_time = std::chrono::steady_clock::now(); + // Now generate new data structure feature4, and copy data to the device + int nthreads = std::min(omp_get_max_threads(), (int)dense_feature_group_map_.size() / dword_features_); + nthreads = std::max(nthreads, 1); + std::vector host4_vecs(nthreads); + std::vector host4_bufs(nthreads); + std::vector host4_ptrs(nthreads); + // preallocate arrays for all threads, and pin them + for (int i = 0; i < nthreads; ++i) { + host4_vecs[i] = (Feature4*)boost::alignment::aligned_alloc(4096, num_data_ * sizeof(Feature4)); + host4_bufs[i] = boost::compute::buffer(ctx_, num_data_ * sizeof(Feature4), + boost::compute::memory_object::read_write | boost::compute::memory_object::use_host_ptr, + host4_vecs[i]); + host4_ptrs[i] = (Feature4*)queue_.enqueue_map_buffer(host4_bufs[i], 
boost::compute::command_queue::map_write_invalidate_region, + 0, num_data_ * sizeof(Feature4)); + } + // building Feature4 bundles; each thread handles dword_features_ features + #pragma omp parallel for schedule(static) + for (unsigned int i = 0; i < dense_feature_group_map_.size() / dword_features_; ++i) { + int tid = omp_get_thread_num(); + Feature4* host4 = host4_ptrs[tid]; + auto dense_ind = dense_feature_group_map_.begin() + i * dword_features_; + auto dev_bin_mult = device_bin_mults_.begin() + i * dword_features_; + #if GPU_DEBUG >= 1 + printf("Copying feature group "); + for (int l = 0; l < dword_features_; ++l) { + printf("%d ", dense_ind[l]); + } + printf("to devices\n"); + #endif + if (dword_features_ == 8) { + // one feature datapoint is 4 bits + BinIterator* bin_iters[8]; + for (int s_idx = 0; s_idx < 8; ++s_idx) { + bin_iters[s_idx] = train_data_->FeatureGroupIterator(dense_ind[s_idx]); + if (dynamic_cast(bin_iters[s_idx]) == 0) { + Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not.", dense_ind[s_idx]); + } + } + // this guarantees that the RawGet() function is inlined, rather than using virtual function dispatching + Dense4bitsBinIterator iters[8] = { + *static_cast(bin_iters[0]), + *static_cast(bin_iters[1]), + *static_cast(bin_iters[2]), + *static_cast(bin_iters[3]), + *static_cast(bin_iters[4]), + *static_cast(bin_iters[5]), + *static_cast(bin_iters[6]), + *static_cast(bin_iters[7])}; + for (int j = 0; j < num_data_; ++j) { + host4[j].s0 = (iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1))) + |((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4); + host4[j].s1 = (iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1))) + |((iters[3].RawGet(j) * dev_bin_mult[3] + ((j+3) & (dev_bin_mult[3] - 1))) << 4); + host4[j].s2 = (iters[4].RawGet(j) * dev_bin_mult[4] + ((j+4) & (dev_bin_mult[4] - 1))) + |((iters[5].RawGet(j) * dev_bin_mult[5] + ((j+5) & (dev_bin_mult[5] - 1))) << 4); + host4[j].s3 = (iters[6].RawGet(j) * dev_bin_mult[6] + ((j+6) & (dev_bin_mult[6] - 1))) + |((iters[7].RawGet(j) * dev_bin_mult[7] + ((j+7) & (dev_bin_mult[7] - 1))) << 4); + } + } + else if (dword_features_ == 4) { + // one feature datapoint is one byte + for (int s_idx = 0; s_idx < 4; ++s_idx) { + BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_ind[s_idx]); + // this guarantees that the RawGet() function is inlined, rather than using virtual function dispatching + if (dynamic_cast*>(bin_iter) != 0) { + // Dense bin + DenseBinIterator iter = *static_cast*>(bin_iter); + for (int j = 0; j < num_data_; ++j) { + host4[j].s[s_idx] = iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)); + } + } + else if (dynamic_cast(bin_iter) != 0) { + // Dense 4-bit bin + Dense4bitsBinIterator iter = *static_cast(bin_iter); + for (int j = 0; j < num_data_; ++j) { + host4[j].s[s_idx] = iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)); + } + } + else { + Log::Fatal("Bug in GPU tree builder: only DenseBin and Dense4bitsBin are supported!"); + } + } + } + else { + Log::Fatal("Bug in GPU tree builder: dword_features_ can only be 4 or 8!"); + } + queue_.enqueue_write_buffer(device_features_->get_buffer(), + i * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4); + #if GPU_DEBUG >= 1 + printf("first example of feature-group tuple is: %d %d %d %d\n", host4[0].s0, host4[0].s1, host4[0].s2, host4[0].s3); + 
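// ---------------------------------------------------------------------------
// Illustrative check (not part of the patch) of the two tricks used in the
// packing loops above, with made-up values: a bin value b of a feature with a
// power-of-two multiplier m is stored as b*m + (row & (m-1)), which spreads
// consecutive rows over m device bins (fewer conflicting atomic updates) while
// still reducing back to b, and two 4-bit bin codes share one byte in the
// dword_features_ == 8 path.
#include <cassert>
#include <cstdint>

int main() {
  const int m = 4;                                  // example power-of-two multiplier
  for (int row = 0; row < 64; ++row) {
    int bin = row % 3;                              // some "real" bin id
    int device_bin = bin * m + (row & (m - 1));     // value written by the host packing
    assert(device_bin / m == bin);                  // WaitAndGetHistograms' reduction undoes it
  }
  std::uint8_t lo = 0x5, hi = 0xA;                  // two 4-bit bin codes
  std::uint8_t packed = static_cast<std::uint8_t>(lo | (hi << 4));
  assert((packed & 0xF) == lo && (packed >> 4) == hi);
  return 0;
}
// ---------------------------------------------------------------------------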
printf("Feature-groups copied to device with multipliers "); + for (int l = 0; l < dword_features_; ++l) { + printf("%d ", dev_bin_mult[l]); + } + printf("\n"); + #endif + } + // working on the remaining (less than dword_features_) feature groups + if (k != 0) { + Feature4* host4 = host4_ptrs[0]; + if (dword_features_ == 8) { + memset(host4, 0, num_data_ * sizeof(Feature4)); + } + #if GPU_DEBUG >= 1 + printf("%d features left\n", k); + #endif + for (i = 0; i < k; ++i) { + if (dword_features_ == 8) { + BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_ind[i]); + if (dynamic_cast(bin_iter) != 0) { + Dense4bitsBinIterator iter = *static_cast(bin_iter); + #pragma omp parallel for schedule(static) + for (int j = 0; j < num_data_; ++j) { + host4[j].s[i >> 1] |= ((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i] + + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1))) + << ((i & 1) << 2)); + } + } + else { + Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not.", dense_ind[i]); + } + } + else if (dword_features_ == 4) { + BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_ind[i]); + if (dynamic_cast*>(bin_iter) != 0) { + DenseBinIterator iter = *static_cast*>(bin_iter); + #pragma omp parallel for schedule(static) + for (int j = 0; j < num_data_; ++j) { + host4[j].s[i] = iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i] + + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)); + } + } + else if (dynamic_cast(bin_iter) != 0) { + Dense4bitsBinIterator iter = *static_cast(bin_iter); + #pragma omp parallel for schedule(static) + for (int j = 0; j < num_data_; ++j) { + host4[j].s[i] = iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i] + + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)); + } + } + else { + Log::Fatal("BUG in GPU tree builder: only DenseBin and Dense4bitsBin are supported!"); + } + } + else { + Log::Fatal("Bug in GPU tree builder: dword_features_ can only be 4 or 8!"); + } + } + // fill the leftover features + if (dword_features_ == 8) { + #pragma omp parallel for schedule(static) + for (int j = 0; j < num_data_; ++j) { + for (i = k; i < dword_features_; ++i) { + // fill this empty feature with some "random" value + host4[j].s[i >> 1] |= ((j & 0xf) << ((i & 1) << 2)); + } + } + } + else if (dword_features_ == 4) { + #pragma omp parallel for schedule(static) + for (int j = 0; j < num_data_; ++j) { + for (i = k; i < dword_features_; ++i) { + // fill this empty feature with some "random" value + host4[j].s[i] = j; + } + } + } + // copying the last 1 to (dword_features - 1) feature-groups in the last tuple + queue_.enqueue_write_buffer(device_features_->get_buffer(), + (num_dense_feature4_ - 1) * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4); + #if GPU_DEBUG >= 1 + printf("Last features copied to device\n"); + #endif + for (i = 0; i < k; ++i) { + dense_feature_group_map_.push_back(dense_ind[i]); + } + } + // deallocate pinned space for feature copying + for (int i = 0; i < nthreads; ++i) { + queue_.enqueue_unmap_buffer(host4_bufs[i], host4_ptrs[i]); + host4_bufs[i] = boost::compute::buffer(); + boost::alignment::aligned_free(host4_vecs[i]); + } + // data transfer time + std::chrono::duration end_time = std::chrono::steady_clock::now() - start_time; + Log::Info("%d dense feature groups (%.2f MB) transfered to GPU in %f secs. 
%d sparse feature groups.", + dense_feature_group_map_.size(), ((dense_feature_group_map_.size() + (dword_features_ - 1)) / dword_features_) * num_data_ * sizeof(Feature4) / (1024.0 * 1024.0), + end_time * 1e-3, sparse_feature_group_map_.size()); + #if GPU_DEBUG >= 1 + printf("Dense feature group list (size %lu): ", dense_feature_group_map_.size()); + for (i = 0; i < num_dense_feature_groups_; ++i) { + printf("%d ", dense_feature_group_map_[i]); + } + printf("\n"); + printf("Sparse feature group list (size %lu): ", sparse_feature_group_map_.size()); + for (i = 0; i < num_feature_groups_ - num_dense_feature_groups_; ++i) { + printf("%d ", sparse_feature_group_map_[i]); + } + printf("\n"); + #endif +} + +void GPUTreeLearner::BuildGPUKernels() { + Log::Info("Compiling OpenCL Kernel with %d bins...", device_bin_size_); + // destroy any old kernels + histogram_kernels_.clear(); + histogram_allfeats_kernels_.clear(); + histogram_fulldata_kernels_.clear(); + // create OpenCL kernels for different number of workgroups per feature + histogram_kernels_.resize(kMaxLogWorkgroupsPerFeature+1); + histogram_allfeats_kernels_.resize(kMaxLogWorkgroupsPerFeature+1); + histogram_fulldata_kernels_.resize(kMaxLogWorkgroupsPerFeature+1); + // currently we don't use constant memory + int use_constants = 0; + #pragma omp parallel for schedule(guided) + for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) { + boost::compute::program program; + std::ostringstream opts; + // compile the GPU kernel depending if double precision is used, constant hessian is used, etc + opts << " -D POWER_FEATURE_WORKGROUPS=" << i + << " -D USE_CONSTANT_BUF=" << use_constants << " -D USE_DP_FLOAT=" << int(tree_config_->gpu_use_dp) + << " -D CONST_HESSIAN=" << int(is_constant_hessian_) + << " -cl-strict-aliasing -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math"; + #if GPU_DEBUG >= 1 + std::cout << "Building GPU kernels with options: " << opts.str() << std::endl; + #endif + // kernel with indices in an array + try { + program = boost::compute::program::build_with_source(kernel_source_, ctx_, opts.str()); + } + catch (boost::compute::opencl_error &e) { + if (program.build_log().size() > 0) { + Log::Fatal("GPU program built failure:\n %s", program.build_log().c_str()); + } + else { + Log::Fatal("GPU program built failure, log unavailable"); + } + } + histogram_kernels_[i] = program.create_kernel(kernel_name_); + + // kernel with all features enabled, with elimited branches + opts << " -D ENABLE_ALL_FEATURES=1"; + try { + program = boost::compute::program::build_with_source(kernel_source_, ctx_, opts.str()); + } + catch (boost::compute::opencl_error &e) { + if (program.build_log().size() > 0) { + Log::Fatal("GPU program built failure:\n %s", program.build_log().c_str()); + } + else { + Log::Fatal("GPU program built failure, log unavailable"); + } + } + histogram_allfeats_kernels_[i] = program.create_kernel(kernel_name_); + + // kernel with all data indices (for root node, and assumes that root node always uses all features) + opts << " -D IGNORE_INDICES=1"; + try { + program = boost::compute::program::build_with_source(kernel_source_, ctx_, opts.str()); + } + catch (boost::compute::opencl_error &e) { + if (program.build_log().size() > 0) { + Log::Fatal("GPU program built failure:\n %s", program.build_log().c_str()); + } + else { + Log::Fatal("GPU program built failure, log unavailable"); + } + } + histogram_fulldata_kernels_[i] = program.create_kernel(kernel_name_); + } + Log::Info("GPU programs have been built"); +} + +void 
GPUTreeLearner::SetupKernelArguments() { + // do nothing if no features can be processed on GPU + if (!num_dense_feature_groups_) { + return; + } + for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) { + // The only argument that needs to be changed later is num_data_ + if (is_constant_hessian_) { + // hessian is passed as a parameter, but it is not available now. + // hessian will be set in BeforeTrain() + histogram_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_, + *device_data_indices_, num_data_, device_gradients_, 0.0f, + *device_subhistograms_, *sync_counters_, device_histogram_outputs_); + histogram_allfeats_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_, + *device_data_indices_, num_data_, device_gradients_, 0.0f, + *device_subhistograms_, *sync_counters_, device_histogram_outputs_); + histogram_fulldata_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_, + *device_data_indices_, num_data_, device_gradients_, 0.0f, + *device_subhistograms_, *sync_counters_, device_histogram_outputs_); + } + else { + histogram_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_, + *device_data_indices_, num_data_, device_gradients_, device_hessians_, + *device_subhistograms_, *sync_counters_, device_histogram_outputs_); + histogram_allfeats_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_, + *device_data_indices_, num_data_, device_gradients_, device_hessians_, + *device_subhistograms_, *sync_counters_, device_histogram_outputs_); + histogram_fulldata_kernels_[i].set_args(*device_features_, device_feature_masks_, num_data_, + *device_data_indices_, num_data_, device_gradients_, device_hessians_, + *device_subhistograms_, *sync_counters_, device_histogram_outputs_); + } + } +} + +void GPUTreeLearner::InitGPU(int platform_id, int device_id) { + // Get the max bin size, used for selecting best GPU kernel + max_num_bin_ = 0; + #if GPU_DEBUG >= 1 + printf("bin size: "); + #endif + for (int i = 0; i < num_feature_groups_; ++i) { + #if GPU_DEBUG >= 1 + printf("%d, ", train_data_->FeatureGroupNumBin(i)); + #endif + max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); + } + #if GPU_DEBUG >= 1 + printf("\n"); + #endif + // initialize GPU + dev_ = boost::compute::system::default_device(); + if (platform_id >= 0 && device_id >= 0) { + const std::vector platforms = boost::compute::system::platforms(); + if ((int)platforms.size() > platform_id) { + const std::vector platform_devices = platforms[platform_id].devices(); + if ((int)platform_devices.size() > device_id) { + Log::Info("Using requested OpenCL platform %d device %d", platform_id, device_id); + dev_ = platform_devices[device_id]; + } + } + } + // determine which kernel to use based on the max number of bins + if (max_num_bin_ <= 16) { + kernel_source_ = kernel16_src_; + kernel_name_ = "histogram16"; + device_bin_size_ = 16; + dword_features_ = 8; + } + else if (max_num_bin_ <= 64) { + kernel_source_ = kernel64_src_; + kernel_name_ = "histogram64"; + device_bin_size_ = 64; + dword_features_ = 4; + } + else if ( max_num_bin_ <= 256) { + kernel_source_ = kernel256_src_; + kernel_name_ = "histogram256"; + device_bin_size_ = 256; + dword_features_ = 4; + } + else { + Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); + } + if(max_num_bin_ == 65) { + Log::Warning("Setting max_bin to 63 is sugguested for best performance"); + } + if(max_num_bin_ == 17) { + Log::Warning("Setting max_bin to 15 is sugguested for best 
performance"); + } + ctx_ = boost::compute::context(dev_); + queue_ = boost::compute::command_queue(ctx_, dev_); + Log::Info("Using GPU Device: %s, Vendor: %s", dev_.name().c_str(), dev_.vendor().c_str()); + BuildGPUKernels(); + AllocateGPUMemory(); + // setup GPU kernel arguments after we allocating all the buffers + SetupKernelArguments(); +} + +Tree* GPUTreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian) { + // check if we need to recompile the GPU kernel (is_constant_hessian changed) + // this should rarely occur + if (is_constant_hessian != is_constant_hessian_) { + Log::Info("Recompiling GPU kernel because hessian is %sa constant now", is_constant_hessian ? "" : "not "); + is_constant_hessian_ = is_constant_hessian; + BuildGPUKernels(); + SetupKernelArguments(); + } + return SerialTreeLearner::Train(gradients, hessians, is_constant_hessian); +} + +void GPUTreeLearner::ResetTrainingData(const Dataset* train_data) { + SerialTreeLearner::ResetTrainingData(train_data); + num_feature_groups_ = train_data_->num_feature_groups(); + // GPU memory has to been reallocated because data may have been changed + AllocateGPUMemory(); + // setup GPU kernel arguments after we allocating all the buffers + SetupKernelArguments(); +} + +void GPUTreeLearner::BeforeTrain() { + + #if GPU_DEBUG >= 2 + printf("Copying intial full gradients and hessians to device\n"); + #endif + // Copy initial full hessians and gradients to GPU. + // We start copying as early as possible, instead of at ConstructHistogram(). + if (!use_bagging_ && num_dense_feature_groups_) { + if (!is_constant_hessian_) { + hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data_ * sizeof(score_t), hessians_); + } + else { + // setup hessian parameters only + score_t const_hessian = hessians_[0]; + for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) { + // hessian is passed as a parameter + histogram_kernels_[i].set_arg(6, const_hessian); + histogram_allfeats_kernels_[i].set_arg(6, const_hessian); + histogram_fulldata_kernels_[i].set_arg(6, const_hessian); + } + } + gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, num_data_ * sizeof(score_t), gradients_); + } + + SerialTreeLearner::BeforeTrain(); + + // use bagging + if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) { + // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() + // copy used gradients and hessians to ordered buffer + const data_size_t* indices = data_partition_->indices(); + data_size_t cnt = data_partition_->leaf_count(0); + #if GPU_DEBUG > 0 + printf("Using bagging, examples count = %d\n", cnt); + #endif + // transfer the indices to GPU + indices_future_ = boost::compute::copy_async(indices, indices + cnt, device_data_indices_->begin(), queue_); + if (!is_constant_hessian_) { + #pragma omp parallel for schedule(static) + for (data_size_t i = 0; i < cnt; ++i) { + ordered_hessians_[i] = hessians_[indices[i]]; + } + // transfer hessian to GPU + hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, cnt * sizeof(score_t), ordered_hessians_.data()); + } + else { + // setup hessian parameters only + score_t const_hessian = hessians_[indices[0]]; + for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) { + // hessian is passed as a parameter + histogram_kernels_[i].set_arg(6, const_hessian); + histogram_allfeats_kernels_[i].set_arg(6, const_hessian); + histogram_fulldata_kernels_[i].set_arg(6, 
const_hessian); + } + } + #pragma omp parallel for schedule(static) + for (data_size_t i = 0; i < cnt; ++i) { + ordered_gradients_[i] = gradients_[indices[i]]; + } + // transfer gradients to GPU + gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, cnt * sizeof(score_t), ordered_gradients_.data()); + } +} + +bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { + int smaller_leaf; + data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf); + data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf); + // only have root + if (right_leaf < 0) { + smaller_leaf = -1; + } else if (num_data_in_left_child < num_data_in_right_child) { + smaller_leaf = left_leaf; + } else { + smaller_leaf = right_leaf; + } + + // Copy indices, gradients and hessians as early as possible + if (smaller_leaf >= 0 && num_dense_feature_groups_) { + // only need to initialize for smaller leaf + // Get leaf boundary + const data_size_t* indices = data_partition_->indices(); + data_size_t begin = data_partition_->leaf_begin(smaller_leaf); + data_size_t end = begin + data_partition_->leaf_count(smaller_leaf); + + // copy indices to the GPU: + #if GPU_DEBUG >= 2 + Log::Info("Copying indices, gradients and hessians to GPU..."); + printf("indices size %d being copied (left = %d, right = %d)\n", end - begin,num_data_in_left_child,num_data_in_right_child); + #endif + indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_); + + if (!is_constant_hessian_) { + #pragma omp parallel for schedule(static) + for (data_size_t i = begin; i < end; ++i) { + ordered_hessians_[i - begin] = hessians_[indices[i]]; + } + // copy ordered hessians to the GPU: + hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, (end - begin) * sizeof(score_t), ptr_pinned_hessians_); + } + + #pragma omp parallel for schedule(static) + for (data_size_t i = begin; i < end; ++i) { + ordered_gradients_[i - begin] = gradients_[indices[i]]; + } + // copy ordered gradients to the GPU: + gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, (end - begin) * sizeof(score_t), ptr_pinned_gradients_); + + #if GPU_DEBUG >= 2 + Log::Info("gradients/hessians/indiex copied to device with size %d", end - begin); + #endif + } + return SerialTreeLearner::BeforeFindBestSplit(tree, left_leaf, right_leaf); +} + +bool GPUTreeLearner::ConstructGPUHistogramsAsync( + const std::vector& is_feature_used, + const data_size_t* data_indices, data_size_t num_data, + const score_t* gradients, const score_t* hessians, + score_t* ordered_gradients, score_t* ordered_hessians) { + + if (num_data <= 0) { + return false; + } + // do nothing if no features can be processed on GPU + if (!num_dense_feature_groups_) { + return false; + } + + // copy data indices if it is not null + if (data_indices != nullptr && num_data != num_data_) { + indices_future_ = boost::compute::copy_async(data_indices, data_indices + num_data, device_data_indices_->begin(), queue_); + } + // generate and copy ordered_gradients if gradients is not null + if (gradients != nullptr) { + if (num_data != num_data_) { + #pragma omp parallel for schedule(static) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_gradients[i] = gradients[data_indices[i]]; + } + gradients_future_ = queue_.enqueue_write_buffer_async(device_gradients_, 0, num_data * sizeof(score_t), ptr_pinned_gradients_); + } + else { + gradients_future_ = 
queue_.enqueue_write_buffer_async(device_gradients_, 0, num_data * sizeof(score_t), gradients); + } + } + // generate and copy ordered_hessians if hessians is not null + if (hessians != nullptr && !is_constant_hessian_) { + if (num_data != num_data_) { + #pragma omp parallel for schedule(static) + for (data_size_t i = 0; i < num_data; ++i) { + ordered_hessians[i] = hessians[data_indices[i]]; + } + hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data * sizeof(score_t), ptr_pinned_hessians_); + } + else { + hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, num_data * sizeof(score_t), hessians); + } + } + // converted indices in is_feature_used to feature-group indices + std::vector is_feature_group_used(num_feature_groups_, 0); + #pragma omp parallel for schedule(static,1024) if (num_features_ >= 2048) + for (int i = 0; i < num_features_; ++i) { + if(is_feature_used[i]) { + is_feature_group_used[train_data_->Feature2Group(i)] = 1; + } + } + // construct the feature masks for dense feature-groups + int used_dense_feature_groups = 0; + #pragma omp parallel for schedule(static,1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) + for (int i = 0; i < num_dense_feature_groups_; ++i) { + if (is_feature_group_used[dense_feature_group_map_[i]]) { + feature_masks_[i] = 1; + ++used_dense_feature_groups; + } + else { + feature_masks_[i] = 0; + } + } + bool use_all_features = used_dense_feature_groups == num_dense_feature_groups_; + // if no feature group is used, just return and do not use GPU + if (used_dense_feature_groups == 0) { + return false; + } +#if GPU_DEBUG >= 1 + printf("feature masks:\n"); + for (unsigned int i = 0; i < feature_masks_.size(); ++i) { + printf("%d ", feature_masks_[i]); + } + printf("\n"); + printf("%d feature groups, %d used, %d\n", num_dense_feature_groups_, used_dense_feature_groups, use_all_features); +#endif + // if not all feature groups are used, we need to transfer the feature mask to GPU + // otherwise, we will use a specialized GPU kernel with all feature groups enabled + if (!use_all_features) { + queue_.enqueue_write_buffer(device_feature_masks_, 0, num_dense_feature4_ * dword_features_, ptr_pinned_feature_masks_); + } + // All data have been prepared, now run the GPU kernel + GPUHistogram(num_data, use_all_features); + return true; +} + +void GPUTreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { + std::vector is_sparse_feature_used(num_features_, 0); + std::vector is_dense_feature_used(num_features_, 0); + #pragma omp parallel for schedule(static) + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + if (!is_feature_used_[feature_index]) continue; + if (!is_feature_used[feature_index]) continue; + if (ordered_bins_[train_data_->Feature2Group(feature_index)]) { + is_sparse_feature_used[feature_index] = 1; + } + else { + is_dense_feature_used[feature_index] = 1; + } + } + // construct smaller leaf + HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU + bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, + nullptr, smaller_leaf_splits_->num_data_in_leaf(), + nullptr, nullptr, + nullptr, nullptr); + // then construct sparse features on CPU + // We set data_indices to null to avoid rebuilding ordered gradients/hessians + 
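// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the control flow here overlaps
// device and host work: ConstructGPUHistogramsAsync() launches the dense
// histograms on the GPU, the sparse-feature ConstructHistograms() call just
// below is the CPU half that runs while the kernel executes, and
// WaitAndGetHistograms() blocks only when the GPU results are needed. The same
// shape in plain C++, with std::async standing in for the OpenCL command queue:
#include <future>
#include <numeric>
#include <vector>

int main() {
  std::vector<long long> dense(1000, 1), sparse(1000, 2);
  // "GPU half": launched first, runs concurrently with the host work below
  auto gpu_part = std::async(std::launch::async, [&] {
    return std::accumulate(dense.begin(), dense.end(), 0LL);
  });
  // "CPU half": sparse features are processed while the device is busy
  long long cpu_part = std::accumulate(sparse.begin(), sparse.end(), 0LL);
  // block only at the point where both halves have to be combined
  long long total = cpu_part + gpu_part.get();
  return total == 3000 ? 0 : 1;
}
// ---------------------------------------------------------------------------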
train_data_->ConstructHistograms(is_sparse_feature_used, + nullptr, smaller_leaf_splits_->num_data_in_leaf(), + smaller_leaf_splits_->LeafIndex(), + ordered_bins_, gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + ptr_smaller_leaf_hist_data); + // wait for GPU to finish, only if GPU is actually used + if (is_gpu_used) { + if (tree_config_->gpu_use_dp) { + // use double precision + WaitAndGetHistograms<HistogramBinEntry>(ptr_smaller_leaf_hist_data, is_feature_used); + } + else { + // use single precision + WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_smaller_leaf_hist_data, is_feature_used); + } + } + + // Compare GPU histogram with CPU histogram, useful for debugging GPU code problems + // #define GPU_DEBUG_COMPARE + #ifdef GPU_DEBUG_COMPARE + for (int i = 0; i < num_dense_feature_groups_; ++i) { + if (!feature_masks_[i]) + continue; + int dense_feature_group_index = dense_feature_group_map_[i]; + size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); + HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; + HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); + HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size]; + data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); + printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size); + std::copy(current_histogram, current_histogram + size, gpu_histogram); + std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry)); + train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( + num_data != num_data_ ? smaller_leaf_splits_->data_indices() : nullptr, + num_data, + num_data != num_data_ ? ordered_gradients_.data() : gradients_, + num_data != num_data_ ?
ordered_hessians_.data() : hessians_, + current_histogram); + CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); + std::copy(gpu_histogram, gpu_histogram + size, current_histogram); + delete [] gpu_histogram; + } + #endif + + if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { + // construct larger leaf + HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; + is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, + larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), + gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data()); + // then construct sparse features on CPU + // We set data_indices to null to avoid rebuilding ordered gradients/hessians + train_data_->ConstructHistograms(is_sparse_feature_used, + nullptr, larger_leaf_splits_->num_data_in_leaf(), + larger_leaf_splits_->LeafIndex(), + ordered_bins_, gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, + ptr_larger_leaf_hist_data); + // wait for GPU to finish, only if GPU is actually used + if (is_gpu_used) { + if (tree_config_->gpu_use_dp) { + // use double precision + WaitAndGetHistograms(ptr_larger_leaf_hist_data, is_feature_used); + } + else { + // use single precision + WaitAndGetHistograms(ptr_larger_leaf_hist_data, is_feature_used); + } + } + } +} + +void GPUTreeLearner::FindBestThresholds() { + SerialTreeLearner::FindBestThresholds(); + +#if GPU_DEBUG >= 3 + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + if (!is_feature_used_[feature_index]) continue; + if (parent_leaf_histogram_array_ != nullptr + && !parent_leaf_histogram_array_[feature_index].is_splittable()) { + smaller_leaf_histogram_array_[feature_index].set_is_splittable(false); + continue; + } + size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; + printf("feature %d smaller leaf:\n", feature_index); + PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; } + printf("feature %d larger leaf:\n", feature_index); + PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - 1, bin_size); + } +#endif +} + +void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { + const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf]; +#if GPU_DEBUG >= 2 + printf("spliting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian); +#endif + SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf); + if (Network::num_machines() == 1) { + // do some sanity check for the GPU algorithm + if (best_split_info.left_count < best_split_info.right_count) { + if ((best_split_info.left_count != smaller_leaf_splits_->num_data_in_leaf()) || + (best_split_info.right_count!= larger_leaf_splits_->num_data_in_leaf())) { + Log::Fatal("Bug in GPU histogram! 
split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); + } + } else { + smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); + larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); + if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || + (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { + Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); + } + } + } +} + +} // namespace LightGBM +#endif // USE_GPU + diff --git a/src/treelearner/gpu_tree_learner.h b/src/treelearner/gpu_tree_learner.h new file mode 100644 index 000000000000..9b7dd9d6112b --- /dev/null +++ b/src/treelearner/gpu_tree_learner.h @@ -0,0 +1,282 @@ +#ifndef LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_ +#define LIGHTGBM_TREELEARNER_GPU_TREE_LEARNER_H_ + +#include +#include +#include +#include +#include +#include "feature_histogram.hpp" +#include "serial_tree_learner.h" +#include "data_partition.hpp" +#include "split_info.hpp" +#include "leaf_splits.hpp" + +#include +#include +#include +#include +#include + +#ifdef USE_GPU + +#define BOOST_COMPUTE_THREAD_SAFE +#define BOOST_COMPUTE_HAVE_THREAD_LOCAL +// Use Boost.Compute on-disk kernel cache +#define BOOST_COMPUTE_USE_OFFLINE_CACHE +#include +#include +#include + + +namespace LightGBM { + +/*! +* \brief GPU-based parallel learning algorithm. +*/ +class GPUTreeLearner: public SerialTreeLearner { +public: + explicit GPUTreeLearner(const TreeConfig* tree_config); + ~GPUTreeLearner(); + void Init(const Dataset* train_data, bool is_constant_hessian) override; + void ResetTrainingData(const Dataset* train_data) override; + Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian) override; + + void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override { + SerialTreeLearner::SetBaggingData(used_indices, num_data); + // determine if we are using bagging before we construct the data partition + // thus we can start data movement to GPU earlier + if (used_indices != nullptr) { + if (num_data != num_data_) { + use_bagging_ = true; + return; + } + } + use_bagging_ = false; + } + +protected: + void BeforeTrain() override; + bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; + void FindBestThresholds() override; + void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; + void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; +private: + /*! \brief 4-byte feature tuple used by GPU kernels */ + struct Feature4 { + union { + unsigned char s[4]; + struct { + unsigned char s0; + unsigned char s1; + unsigned char s2; + unsigned char s3; + }; + }; + }; + + /*! \brief Single precision histogram entiry for GPU */ + struct GPUHistogramBinEntry { + score_t sum_gradients; + score_t sum_hessians; + uint32_t cnt; + }; + + /*! 
+ * \brief Find the best number of workgroups processing one feature for maximizing efficiency + * \param leaf_num_data The number of data examples on the current leaf being processed + * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature + */ + int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data); + + /*! + * \brief Initialize GPU device, context and command queues + * Also compiles the OpenCL kernel + * \param platform_id OpenCL platform ID + * \param device_id OpenCL device ID + */ + void InitGPU(int platform_id, int device_id); + + /*! + * \brief Allocate memory for GPU computation + */ + void AllocateGPUMemory(); + + /*! + * \brief Compile OpenCL GPU source code to kernel binaries + */ + void BuildGPUKernels(); + + /*! + * \brief Setup GPU kernel arguments, preparing for launching + */ + void SetupKernelArguments(); + + /*! + * \brief Compute GPU feature histogram for the current leaf. + * Indices, gradients and hessians have been copied to the device. + * \param leaf_num_data Number of data on current leaf + * \param use_all_features Set to true to not use feature masks, with a faster kernel + */ + void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); + + /*! + * \brief Wait for GPU kernel execution and read histogram + * \param histograms Destination of histogram results from GPU. + * \param is_feature_used A predicate vector for enabling each feature + */ + template + void WaitAndGetHistograms(HistogramBinEntry* histograms, const std::vector& is_feature_used); + + /*! + * \brief Construct GPU histogram asynchronously. + * Interface is similar to Dataset::ConstructHistograms(). + * \param is_feature_used A predicate vector for enabling each feature + * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU. + * Set to nullptr to skip copy to GPU. + * \param num_data Number of data examples to be included in histogram + * \param gradients Array of gradients for all examples. + * \param hessians Array of hessians for all examples. + * \param ordered_gradients Ordered gradients will be generated and copied to GPU when gradients is not nullptr, + * Set gradients to nullptr to skip copy to GPU. + * \param ordered_hessians Ordered hessians will be generated and copied to GPU when hessians is not nullptr, + * Set hessians to nullptr to skip copy to GPU. + * \return true if GPU kernel is launched, false if GPU is not used + */ + bool ConstructGPUHistogramsAsync( + const std::vector& is_feature_used, + const data_size_t* data_indices, data_size_t num_data, + const score_t* gradients, const score_t* hessians, + score_t* ordered_gradients, score_t* ordered_hessians); + + + /*! brief Log2 of max number of workgroups per feature*/ + const int kMaxLogWorkgroupsPerFeature = 10; // 2^10 + /*! brief Max total number of workgroups with preallocated workspace. + * If we use more than this number of workgroups, we have to reallocate subhistograms */ + int preallocd_max_num_wg_ = 1024; + + /*! \brief True if bagging is used */ + bool use_bagging_; + + /*! \brief GPU device object */ + boost::compute::device dev_; + /*! \brief GPU context object */ + boost::compute::context ctx_; + /*! \brief GPU command queue object */ + boost::compute::command_queue queue_; + /*! \brief GPU kernel for 256 bins */ + const char *kernel256_src_ = + #include "ocl/histogram256.cl" + ; + /*! \brief GPU kernel for 64 bins */ + const char *kernel64_src_ = + #include "ocl/histogram64.cl" + ; + /*! 
\brief GPU kernel for 16 bins */ + const char *kernel16_src_ = + #include "ocl/histogram16.cl" + ; + /*! \brief Currently used kernel source */ + std::string kernel_source_; + /*! \brief Currently used kernel name */ + std::string kernel_name_; + + /*! \brief an array of histogram kernels with different numbers of workgroups per feature */ + std::vector histogram_kernels_; + /*! \brief an array of histogram kernels with different numbers of workgroups per feature, with all features enabled to avoid branches */ + std::vector histogram_allfeats_kernels_; + /*! \brief an array of histogram kernels with different numbers of workgroups per feature, and processing the whole dataset */ + std::vector histogram_fulldata_kernels_; + /*! \brief total number of feature-groups */ + int num_feature_groups_; + /*! \brief total number of dense feature-groups, which will be processed on GPU */ + int num_dense_feature_groups_; + /*! \brief On GPU we read one DWORD (4-byte) of features of one example once. + * With bin size > 16, there are 4 features per DWORD. + * With bin size <=16, there are 8 features per DWORD. + * */ + int dword_features_; + /*! \brief total number of dense feature-group tuples on GPU. + * Each feature tuple is 4-byte (4 features if each feature takes a byte) */ + int num_dense_feature4_; + /*! \brief Max number of bins of training data, used to determine + * which GPU kernel to use */ + int max_num_bin_; + /*! \brief Used GPU kernel bin size (16, 64, 256) */ + int device_bin_size_; + /*! \brief Size of histogram bin entry, depending on whether single or double precision is used */ + size_t hist_bin_entry_sz_; + /*! \brief Indices of all dense feature-groups */ + std::vector dense_feature_group_map_; + /*! \brief Indices of all sparse feature-groups */ + std::vector sparse_feature_group_map_; + /*! \brief Multipliers of all dense feature-groups, used for redistributing bins */ + std::vector device_bin_mults_; + /*! \brief GPU memory object holding the training data */ + std::unique_ptr> device_features_; + /*! \brief GPU memory object holding the ordered gradient */ + boost::compute::buffer device_gradients_; + /*! \brief Pinned memory object for ordered gradient */ + boost::compute::buffer pinned_gradients_; + /*! \brief Pointer to pinned memory of ordered gradient */ + void * ptr_pinned_gradients_ = nullptr; + /*! \brief GPU memory object holding the ordered hessian */ + boost::compute::buffer device_hessians_; + /*! \brief Pinned memory object for ordered hessian */ + boost::compute::buffer pinned_hessians_; + /*! \brief Pointer to pinned memory of ordered hessian */ + void * ptr_pinned_hessians_ = nullptr; + /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */ + std::vector> feature_masks_; + /*! \brief GPU memory object holding the feature masks */ + boost::compute::buffer device_feature_masks_; + /*! \brief Pinned memory object for feature masks */ + boost::compute::buffer pinned_feature_masks_; + /*! \brief Pointer to pinned memory of feature masks */ + void * ptr_pinned_feature_masks_ = nullptr; + /*! \brief GPU memory object holding indices of the leaf being processed */ + std::unique_ptr> device_data_indices_; + /*! \brief GPU memory object holding counters for workgroup coordination */ + std::unique_ptr> sync_counters_; + /*! \brief GPU memory object holding temporary sub-histograms per workgroup */ + std::unique_ptr> device_subhistograms_; + /*!
\brief Host memory object for histogram output (GPU will write to Host memory directly) */ + boost::compute::buffer device_histogram_outputs_; + /*! \brief Host memory pointer for histogram outputs */ + void * host_histogram_outputs_; + /*! \brief OpenCL waitlist object for waiting for data transfer before kernel execution */ + boost::compute::wait_list kernel_wait_obj_; + /*! \brief OpenCL waitlist object for reading output histograms after kernel execution */ + boost::compute::wait_list histograms_wait_obj_; + /*! \brief Asynchronous waiting object for copying indices */ + boost::compute::future indices_future_; + /*! \brief Asynchronous waiting object for copying gradients */ + boost::compute::event gradients_future_; + /*! \brief Asynchronous waiting object for copying hessians */ + boost::compute::event hessians_future_; +}; + +} // namespace LightGBM +#else + +// When GPU support is not compiled in, quit with an error message + +namespace LightGBM { + +class GPUTreeLearner: public SerialTreeLearner { +public: + explicit GPUTreeLearner(const TreeConfig* tree_config) : SerialTreeLearner(tree_config) { + Log::Fatal("GPU Tree Learner was not enabled in this build. Recompile with CMake option -DUSE_GPU=1"); + } +}; + +} + +#endif // USE_GPU + +#endif // LightGBM_TREELEARNER_GPU_TREE_LEARNER_H_ + diff --git a/src/treelearner/ocl/histogram16.cl b/src/treelearner/ocl/histogram16.cl new file mode 100644 index 000000000000..dc393f60c440 --- /dev/null +++ b/src/treelearner/ocl/histogram16.cl @@ -0,0 +1,756 @@ +// this file can either be read and passed to an OpenCL compiler directly, +// or included in a C++11 source file as a string literal +#ifndef __OPENCL_VERSION__ +// If we are including this file in C++, +// the entire source file following (except the last #endif) will become +// a raw string literal. The extra ")" is just for mathcing parentheses +// to make the editor happy. The extra ")" and extra endif will be skipped. +// DO NOT add anything between here and the next #ifdef, otherwise you need +// to modify the skip count at the end of this file. 
+R""() +#endif + +#ifndef _HISTOGRAM_16_KERNEL_ +#define _HISTOGRAM_16_KERNEL_ + +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +// Configurable options: +// NUM_BANKS should be a power of 2 +#ifndef NUM_BANKS +#define NUM_BANKS 8 +#endif +// how many bits in thread ID represent the bank = log2(NUM_BANKS) +#ifndef BANK_BITS +#define BANK_BITS 3 +#endif +// use double precision or not +#ifndef USE_DP_FLOAT +#define USE_DP_FLOAT 0 +#endif +// ignore hessian, and use the local memory for hessian as an additional bank for gradient +#ifndef CONST_HESSIAN +#define CONST_HESSIAN 0 +#endif + + +#define LOCAL_SIZE_0 256 +#define NUM_BINS 16 +// if USE_DP_FLOAT is set to 1, we will use double precision for the accumulator +#if USE_DP_FLOAT == 1 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +typedef double acc_type; +typedef ulong acc_int_type; +#define as_acc_type as_double +#define as_acc_int_type as_ulong +#else +typedef float acc_type; +typedef uint acc_int_type; +#define as_acc_type as_float +#define as_acc_int_type as_uint +#endif +// number of features to process in a 4-byte feature tuple +#define DWORD_FEATURES 8 +// number of bits per feature +#define FEATURE_BITS (sizeof(uchar4) * 8 / DWORD_FEATURES) +// bit mask for number of features to process in a 4-byte feature tuple +#define DWORD_FEATURES_MASK (DWORD_FEATURES - 1) +// log2 of number of features to process in a 4-byte feature tuple +#define LOG2_DWORD_FEATURES 3 +// mask for getting the bank ID +#define BANK_MASK (NUM_BANKS - 1) +// 8 features, each has a gradient and a hessian +#define HG_BIN_MULT (NUM_BANKS * DWORD_FEATURES * 2) +// 8 features, each has a counter +#define CNT_BIN_MULT (NUM_BANKS * DWORD_FEATURES) +// local memory size in bytes +#define LOCAL_MEM_SIZE (DWORD_FEATURES * (sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS * NUM_BANKS) + +// unroll the atomic operation for a few times. Takes more code space, +// but compiler can generate better code for faster atomics. 
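// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): atomic_local_add_f() below
// emulates a floating-point atomic add with an integer compare-and-swap loop
// (plus a partially unrolled fast path). The same technique in portable
// host-side C++, with std::atomic standing in for the local-memory atomics:
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <thread>
#include <vector>

static void atomic_add_float(std::atomic<std::uint32_t>& cell, float val) {
  std::uint32_t expected = cell.load();
  for (;;) {
    float cur;
    std::memcpy(&cur, &expected, sizeof cur);       // reinterpret stored bits as float
    float sum = cur + val;
    std::uint32_t desired;
    std::memcpy(&desired, &sum, sizeof desired);
    // success publishes the new sum; failure refreshes 'expected' and retries,
    // which is what the ATOMIC_FADD_SUB* unrolling below does a few times inline
    if (cell.compare_exchange_weak(expected, desired)) return;
  }
}

int main() {
  std::atomic<std::uint32_t> cell(0);               // bit pattern of 0.0f
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t)
    workers.emplace_back([&] { for (int i = 0; i < 1000; ++i) atomic_add_float(cell, 1.0f); });
  for (auto& w : workers) w.join();
  float result;
  std::uint32_t bits = cell.load();
  std::memcpy(&result, &bits, sizeof result);
  std::printf("%f\n", result);                      // 4000.000000, exactly representable
  return 0;
}
// ---------------------------------------------------------------------------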
+#define UNROLL_ATOMIC 1 + +// Options passed by compiler at run time: +// IGNORE_INDICES will be set when the kernel does not +// #define IGNORE_INDICES +// #define POWER_FEATURE_WORKGROUPS 10 + +// use all features and do not use feature mask +#ifndef ENABLE_ALL_FEATURES +#define ENABLE_ALL_FEATURES 1 +#endif + +// detect Nvidia platforms +#ifdef cl_nv_pragma_unroll +#define NVIDIA 1 +#endif + +// use binary patching for AMD GCN 1.2 or newer +#ifndef AMD_USE_DS_ADD_F32 +#define AMD_USE_DS_ADD_F32 0 +#endif + +typedef uint data_size_t; +typedef float score_t; + +#define ATOMIC_FADD_SUB1 { \ + expected.f_val = current.f_val; \ + next.f_val = expected.f_val + val; \ + current.u_val = atom_cmpxchg((volatile __local acc_int_type *)addr, expected.u_val, next.u_val); \ + if (current.u_val == expected.u_val) \ + goto end; \ + } +#define ATOMIC_FADD_SUB2 ATOMIC_FADD_SUB1 \ + ATOMIC_FADD_SUB1 +#define ATOMIC_FADD_SUB4 ATOMIC_FADD_SUB2 \ + ATOMIC_FADD_SUB2 +#define ATOMIC_FADD_SUB8 ATOMIC_FADD_SUB4 \ + ATOMIC_FADD_SUB4 +#define ATOMIC_FADD_SUB16 ATOMIC_FADD_SUB8 \ + ATOMIC_FADD_SUB8 +#define ATOMIC_FADD_SUB32 ATOMIC_FADD_SUB16\ + ATOMIC_FADD_SUB16 +#define ATOMIC_FADD_SUB64 ATOMIC_FADD_SUB32\ + ATOMIC_FADD_SUB32 + + +// atomic add for float number in local memory +inline void atomic_local_add_f(__local acc_type *addr, const float val) +{ + union{ + acc_int_type u_val; + acc_type f_val; + } next, expected, current; +#if (NVIDIA == 1 && USE_DP_FLOAT == 0) + float res = 0; + asm volatile ("atom.shared.add.f32 %0, [%1], %2;" : "=f"(res) : "l"(addr), "f"(val)); +#elif (AMD_USE_DS_ADD_F32 == 1 && USE_DP_FLAT == 0) + // this instruction (DS_AND_U32) will be patched into a DS_ADD_F32 + // we need to hack here because DS_ADD_F32 is not exposed via OpenCL + atom_and((__local acc_int_type *)addr, as_acc_int_type(val)); +#else + current.f_val = *addr; + #if UNROLL_ATOMIC == 1 + // provide a fast path + // then do the complete loop + // this should work on all devices + ATOMIC_FADD_SUB8 + ATOMIC_FADD_SUB4 + #endif + do { + expected.f_val = current.f_val; + next.f_val = expected.f_val + val; + current.u_val = atom_cmpxchg((volatile __local acc_int_type *)addr, expected.u_val, next.u_val); + } while (current.u_val != expected.u_val); + end: + ; +#endif +} + +// this function will be called by histogram16 +// we have one sub-histogram of one feature in registers, and need to read others +void within_kernel_reduction16x8(uchar8 feature_mask, + __global const acc_type* restrict feature4_sub_hist, + const uint skip_id, + acc_type stat_val, uint cnt_val, + const ushort num_sub_hist, + __global acc_type* restrict output_buf, + __local acc_type * restrict local_hist) { + const ushort ltid = get_local_id(0); // range 0 - 255 + const ushort lsize = LOCAL_SIZE_0; + ushort feature_id = ltid & DWORD_FEATURES_MASK; // range 0 - 7 + uchar is_hessian_first = (ltid >> LOG2_DWORD_FEATURES) & 1; // hessian or gradient + ushort bin_id = ltid >> (LOG2_DWORD_FEATURES + 1); // range 0 - 16 + ushort i; + #if POWER_FEATURE_WORKGROUPS != 0 + // if there is only 1 work group, no need to do the reduction + // add all sub-histograms for 4 features + __global const acc_type* restrict p = feature4_sub_hist + ltid; + for (i = 0; i < skip_id; ++i) { + // 256 threads working on 8 features' 16 bins, gradient and hessian + stat_val += *p; + p += NUM_BINS * DWORD_FEATURES * 2; + if (ltid < LOCAL_SIZE_0 / 2) { + cnt_val += as_acc_int_type(*p); + } + p += NUM_BINS * DWORD_FEATURES; + } + // skip the counters we already have + p += 3 * DWORD_FEATURES 
* NUM_BINS; + for (i = i + 1; i < num_sub_hist; ++i) { + stat_val += *p; + p += NUM_BINS * DWORD_FEATURES * 2; + if (ltid < LOCAL_SIZE_0 / 2) { + cnt_val += as_acc_int_type(*p); + } + p += NUM_BINS * DWORD_FEATURES; + } + #endif + // printf("thread %d:feature=%d, bin_id=%d, hessian=%d, stat_val=%f, cnt=%d", ltid, feature_id, bin_id, is_hessian_first, stat_val, cnt_val); + // now overwrite the local_hist for final reduction and output + // reverse the f7...f0 order to match the real order + feature_id = DWORD_FEATURES_MASK - feature_id; + local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + is_hessian_first] = stat_val; + bin_id = ltid >> (LOG2_DWORD_FEATURES); // range 0 - 16, for counter + if (ltid < LOCAL_SIZE_0 / 2) { + local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val); + } + barrier(CLK_LOCAL_MEM_FENCE); + for (i = ltid; i < DWORD_FEATURES * 3 * NUM_BINS; i += lsize) { + output_buf[i] = local_hist[i]; + } +} + + +__attribute__((reqd_work_group_size(LOCAL_SIZE_0, 1, 1))) +#if USE_CONSTANT_BUF == 1 +__kernel void histogram16(__global const uchar4* restrict feature_data_base, + __constant const uchar8* restrict feature_masks __attribute__((max_constant_size(65536))), + const data_size_t feature_size, + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), +#if CONST_HESSIAN == 0 + __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), +#else + const score_t const_hessian, +#endif + __global char* restrict output_buf, + __global volatile int * sync_counters, + __global acc_type* restrict hist_buf_base) { +#else +__kernel void histogram16(__global const uchar4* feature_data_base, + __constant const uchar8* restrict feature_masks __attribute__((max_constant_size(65536))), + const data_size_t feature_size, + __global const data_size_t* data_indices, + const data_size_t num_data, + __global const score_t* ordered_gradients, +#if CONST_HESSIAN == 0 + __global const score_t* ordered_hessians, +#else + const score_t const_hessian, +#endif + __global char* restrict output_buf, + __global volatile int * sync_counters, + __global acc_type* restrict hist_buf_base) { +#endif + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __local float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = get_global_id(0); + const uint gsize = get_global_size(0); + const ushort ltid = get_local_id(0); + const ushort lsize = LOCAL_SIZE_0; // get_local_size(0); + const ushort group_id = get_group_id(0); + + // local memory per workgroup is 12 KB + // clear local memory + __local uint * ptr = (__local uint *) shared_array; + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary + // each bank: 2 * 8 * 16 * size_of(float) = 1 KB + // there are 8 banks (sub-histograms) used by 256 threads total 8 KB + /* memory layout of gh_hist: + ----------------------------------------------------------------------------------------------- + bk0_g_f0_bin0 bk0_g_f1_bin0 bk0_g_f2_bin0 bk0_g_f3_bin0 bk0_g_f4_bin0 bk0_g_f5_bin0 bk0_g_f6_bin0 bk0_g_f7_bin0 + bk0_h_f0_bin0 bk0_h_f1_bin0 bk0_h_f2_bin0 bk0_h_f3_bin0 
bk0_h_f4_bin0 bk0_h_f5_bin0 bk0_h_f6_bin0 bk0_h_f7_bin0 + bk1_g_f0_bin0 bk1_g_f1_bin0 bk1_g_f2_bin0 bk1_g_f3_bin0 bk1_g_f4_bin0 bk1_g_f5_bin0 bk1_g_f6_bin0 bk1_g_f7_bin0 + bk1_h_f0_bin0 bk1_h_f1_bin0 bk1_h_f2_bin0 bk1_h_f3_bin0 bk1_h_f4_bin0 bk1_h_f5_bin0 bk1_h_f6_bin0 bk1_h_f7_bin0 + bk2_g_f0_bin0 bk2_g_f1_bin0 bk2_g_f2_bin0 bk2_g_f3_bin0 bk2_g_f4_bin0 bk2_g_f5_bin0 bk2_g_f6_bin0 bk2_g_f7_bin0 + bk2_h_f0_bin0 bk2_h_f1_bin0 bk2_h_f2_bin0 bk2_h_f3_bin0 bk2_h_f4_bin0 bk2_h_f5_bin0 bk2_h_f6_bin0 bk2_h_f7_bin0 + bk3_g_f0_bin0 bk3_g_f1_bin0 bk3_g_f2_bin0 bk3_g_f3_bin0 bk3_g_f4_bin0 bk3_g_f5_bin0 bk3_g_f6_bin0 bk3_g_f7_bin0 + bk3_h_f0_bin0 bk3_h_f1_bin0 bk3_h_f2_bin0 bk3_h_f3_bin0 bk3_h_f4_bin0 bk3_h_f5_bin0 bk3_h_f6_bin0 bk3_h_f7_bin0 + bk4_g_f0_bin0 bk4_g_f1_bin0 bk4_g_f2_bin0 bk4_g_f3_bin0 bk4_g_f4_bin0 bk4_g_f5_bin0 bk4_g_f6_bin0 bk4_g_f7_bin0 + bk4_h_f0_bin0 bk4_h_f1_bin0 bk4_h_f2_bin0 bk4_h_f3_bin0 bk4_h_f4_bin0 bk4_h_f5_bin0 bk4_h_f6_bin0 bk4_h_f7_bin0 + bk5_g_f0_bin0 bk5_g_f1_bin0 bk5_g_f2_bin0 bk5_g_f3_bin0 bk5_g_f4_bin0 bk5_g_f5_bin0 bk5_g_f6_bin0 bk5_g_f7_bin0 + bk5_h_f0_bin0 bk5_h_f1_bin0 bk5_h_f2_bin0 bk5_h_f3_bin0 bk5_h_f4_bin0 bk5_h_f5_bin0 bk5_h_f6_bin0 bk5_h_f7_bin0 + bk6_g_f0_bin0 bk6_g_f1_bin0 bk6_g_f2_bin0 bk6_g_f3_bin0 bk6_g_f4_bin0 bk6_g_f5_bin0 bk6_g_f6_bin0 bk6_g_f7_bin0 + bk6_h_f0_bin0 bk6_h_f1_bin0 bk6_h_f2_bin0 bk6_h_f3_bin0 bk6_h_f4_bin0 bk6_h_f5_bin0 bk6_h_f6_bin0 bk6_h_f7_bin0 + bk7_g_f0_bin0 bk7_g_f1_bin0 bk7_g_f2_bin0 bk7_g_f3_bin0 bk7_g_f4_bin0 bk7_g_f5_bin0 bk7_g_f6_bin0 bk7_g_f7_bin0 + bk7_h_f0_bin0 bk7_h_f1_bin0 bk7_h_f2_bin0 bk7_h_f3_bin0 bk7_h_f4_bin0 bk7_h_f5_bin0 bk7_h_f6_bin0 bk7_h_f7_bin0 + ... + bk0_g_f0_bin16 bk0_g_f1_bin16 bk0_g_f2_bin16 bk0_g_f3_bin16 bk0_g_f4_bin16 bk0_g_f5_bin16 bk0_g_f6_bin16 bk0_g_f7_bin16 + bk0_h_f0_bin16 bk0_h_f1_bin16 bk0_h_f2_bin16 bk0_h_f3_bin16 bk0_h_f4_bin16 bk0_h_f5_bin16 bk0_h_f6_bin16 bk0_h_f7_bin16 + bk1_g_f0_bin16 bk1_g_f1_bin16 bk1_g_f2_bin16 bk1_g_f3_bin16 bk1_g_f4_bin16 bk1_g_f5_bin16 bk1_g_f6_bin16 bk1_g_f7_bin16 + bk1_h_f0_bin16 bk1_h_f1_bin16 bk1_h_f2_bin16 bk1_h_f3_bin16 bk1_h_f4_bin16 bk1_h_f5_bin16 bk1_h_f6_bin16 bk1_h_f7_bin16 + bk2_g_f0_bin16 bk2_g_f1_bin16 bk2_g_f2_bin16 bk2_g_f3_bin16 bk2_g_f4_bin16 bk2_g_f5_bin16 bk2_g_f6_bin16 bk2_g_f7_bin16 + bk2_h_f0_bin16 bk2_h_f1_bin16 bk2_h_f2_bin16 bk2_h_f3_bin16 bk2_h_f4_bin16 bk2_h_f5_bin16 bk2_h_f6_bin16 bk2_h_f7_bin16 + bk3_g_f0_bin16 bk3_g_f1_bin16 bk3_g_f2_bin16 bk3_g_f3_bin16 bk3_g_f4_bin16 bk3_g_f5_bin16 bk3_g_f6_bin16 bk3_g_f7_bin16 + bk3_h_f0_bin16 bk3_h_f1_bin16 bk3_h_f2_bin16 bk3_h_f3_bin16 bk3_h_f4_bin16 bk3_h_f5_bin16 bk3_h_f6_bin16 bk3_h_f7_bin16 + bk4_g_f0_bin16 bk4_g_f1_bin16 bk4_g_f2_bin16 bk4_g_f3_bin16 bk4_g_f4_bin16 bk4_g_f5_bin16 bk4_g_f6_bin16 bk4_g_f7_bin16 + bk4_h_f0_bin16 bk4_h_f1_bin16 bk4_h_f2_bin16 bk4_h_f3_bin16 bk4_h_f4_bin16 bk4_h_f5_bin16 bk4_h_f6_bin16 bk4_h_f7_bin16 + bk5_g_f0_bin16 bk5_g_f1_bin16 bk5_g_f2_bin16 bk5_g_f3_bin16 bk5_g_f4_bin16 bk5_g_f5_bin16 bk5_g_f6_bin16 bk5_g_f7_bin16 + bk5_h_f0_bin16 bk5_h_f1_bin16 bk5_h_f2_bin16 bk5_h_f3_bin16 bk5_h_f4_bin16 bk5_h_f5_bin16 bk5_h_f6_bin16 bk5_h_f7_bin16 + bk6_g_f0_bin16 bk6_g_f1_bin16 bk6_g_f2_bin16 bk6_g_f3_bin16 bk6_g_f4_bin16 bk6_g_f5_bin16 bk6_g_f6_bin16 bk6_g_f7_bin16 + bk6_h_f0_bin16 bk6_h_f1_bin16 bk6_h_f2_bin16 bk6_h_f3_bin16 bk6_h_f4_bin16 bk6_h_f5_bin16 bk6_h_f6_bin16 bk6_h_f7_bin16 + bk7_g_f0_bin16 bk7_g_f1_bin16 bk7_g_f2_bin16 bk7_g_f3_bin16 bk7_g_f4_bin16 bk7_g_f5_bin16 bk7_g_f6_bin16 bk7_g_f7_bin16 + bk7_h_f0_bin16 bk7_h_f1_bin16 bk7_h_f2_bin16 bk7_h_f3_bin16 
bk7_h_f4_bin16 bk7_h_f5_bin16 bk7_h_f6_bin16 bk7_h_f7_bin16 + ----------------------------------------------------------------------------------------------- + */ + // with this organization, the LDS/shared memory bank is independent of the bin value + // all threads within a quarter-wavefront (half-warp) will not have any bank conflict + + __local acc_type * gh_hist = (__local acc_type *)shared_array; + // counter histogram + // each bank: 8 * 16 * size_of(uint) = 0.5 KB + // there are 8 banks used by 256 threads total 4 KB + /* memory layout in cnt_hist: + ----------------------------------------------- + bk0_c_f0_bin0 bk0_c_f1_bin0 bk0_c_f2_bin0 bk0_c_f3_bin0 bk0_c_f4_bin0 bk0_c_f5_bin0 bk0_c_f6_bin0 bk0_c_f7_bin0 + bk1_c_f0_bin0 bk1_c_f1_bin0 bk1_c_f2_bin0 bk1_c_f3_bin0 bk1_c_f4_bin0 bk1_c_f5_bin0 bk1_c_f6_bin0 bk1_c_f7_bin0 + bk2_c_f0_bin0 bk2_c_f1_bin0 bk2_c_f2_bin0 bk2_c_f3_bin0 bk2_c_f4_bin0 bk2_c_f5_bin0 bk2_c_f6_bin0 bk2_c_f7_bin0 + bk3_c_f0_bin0 bk3_c_f1_bin0 bk3_c_f2_bin0 bk3_c_f3_bin0 bk3_c_f4_bin0 bk3_c_f5_bin0 bk3_c_f6_bin0 bk3_c_f7_bin0 + bk4_c_f0_bin0 bk4_c_f1_bin0 bk4_c_f2_bin0 bk4_c_f3_bin0 bk4_c_f4_bin0 bk4_c_f5_bin0 bk4_c_f6_bin0 bk4_c_f7_bin0 + bk5_c_f0_bin0 bk5_c_f1_bin0 bk5_c_f2_bin0 bk5_c_f3_bin0 bk5_c_f4_bin0 bk5_c_f5_bin0 bk5_c_f6_bin0 bk5_c_f7_bin0 + bk6_c_f0_bin0 bk6_c_f1_bin0 bk6_c_f2_bin0 bk6_c_f3_bin0 bk6_c_f4_bin0 bk6_c_f5_bin0 bk6_c_f6_bin0 bk6_c_f7_bin0 + bk7_c_f0_bin0 bk7_c_f1_bin0 bk7_c_f2_bin0 bk7_c_f3_bin0 bk7_c_f4_bin0 bk7_c_f5_bin0 bk7_c_f6_bin0 bk7_c_f7_bin0 + ... + bk0_c_f0_bin16 bk0_c_f1_bin16 bk0_c_f2_bin16 bk0_c_f3_bin16 bk0_c_f4_bin16 bk0_c_f5_bin16 bk0_c_f6_bin16 bk0_c_f7_bin0 + bk1_c_f0_bin16 bk1_c_f1_bin16 bk1_c_f2_bin16 bk1_c_f3_bin16 bk1_c_f4_bin16 bk1_c_f5_bin16 bk1_c_f6_bin16 bk1_c_f7_bin0 + bk2_c_f0_bin16 bk2_c_f1_bin16 bk2_c_f2_bin16 bk2_c_f3_bin16 bk2_c_f4_bin16 bk2_c_f5_bin16 bk2_c_f6_bin16 bk2_c_f7_bin0 + bk3_c_f0_bin16 bk3_c_f1_bin16 bk3_c_f2_bin16 bk3_c_f3_bin16 bk3_c_f4_bin16 bk3_c_f5_bin16 bk3_c_f6_bin16 bk3_c_f7_bin0 + bk4_c_f0_bin16 bk4_c_f1_bin16 bk4_c_f2_bin16 bk4_c_f3_bin16 bk4_c_f4_bin16 bk4_c_f5_bin16 bk4_c_f6_bin16 bk4_c_f7_bin0 + bk5_c_f0_bin16 bk5_c_f1_bin16 bk5_c_f2_bin16 bk5_c_f3_bin16 bk5_c_f4_bin16 bk5_c_f5_bin16 bk5_c_f6_bin16 bk5_c_f7_bin0 + bk6_c_f0_bin16 bk6_c_f1_bin16 bk6_c_f2_bin16 bk6_c_f3_bin16 bk6_c_f4_bin16 bk6_c_f5_bin16 bk6_c_f6_bin16 bk6_c_f7_bin0 + bk7_c_f0_bin16 bk7_c_f1_bin16 bk7_c_f2_bin16 bk7_c_f3_bin16 bk7_c_f4_bin16 bk7_c_f5_bin16 bk7_c_f6_bin16 bk7_c_f7_bin0 + ----------------------------------------------- + */ + __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS); + + // thread 0, 1, 2, 3, 4, 5, 6, 7 compute histograms for gradients first + // thread 8, 9, 10, 11, 12, 13, 14, 15 compute histograms for hessians first + // etc. 
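    // As a worked illustration of the layout above (assuming, as in the other kernels,
    // HG_BIN_MULT == NUM_BANKS * 2 * DWORD_FEATURES and CNT_BIN_MULT == NUM_BANKS * DWORD_FEATURES),
    // the gradient/hessian slot for a given (bin, bank, feature) is addressed in STAGE 2 as
    //
    //   gh_hist[bin * HG_BIN_MULT                      // jump to this bin's block of banks
    //           + bank * 2 * DWORD_FEATURES            // this thread's sub-histogram (bank)
    //           + is_hessian_first * DWORD_FEATURES    // gradient row (0) or hessian row (1)
    //           + feature]                             // feature 0..7 within the row
    //
    // e.g. with 8 banks: bin 3, bank 2, hessian row, feature 5 -> 3*128 + 2*16 + 8 + 5 = 429.
    // The matching counter slot is cnt_hist[bin * CNT_BIN_MULT + bank * DWORD_FEATURES + feature].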
+ uchar is_hessian_first = (ltid >> LOG2_DWORD_FEATURES) & 1; + // thread 0-15 write result to bank0, 16-31 to bank1, 32-47 to bank2, 48-63 to bank3, etc + ushort bank = (ltid >> (LOG2_DWORD_FEATURES + 1)) & BANK_MASK; + + ushort group_feature = group_id >> POWER_FEATURE_WORKGROUPS; + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + __global const uchar4* feature_data = feature_data_base + group_feature * feature_size; + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << POWER_FEATURE_WORKGROUPS); + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - group_feature * subglobal_size; + // extract feature mask, when a byte is set to 0, that feature is disabled + #if ENABLE_ALL_FEATURES == 1 + // hopefully the compiler will propogate the constants and eliminate all branches + uchar8 feature_mask = (uchar8)(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff); + #else + uchar8 feature_mask = feature_masks[group_feature]; + #endif + // exit if all features are masked + if (!as_ulong(feature_mask)) { + return; + } + + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // 4 features stored in a tuple MSB...(0, 1, 2, 3)...LSB + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar4 feature4; + uchar4 feature4_next; + // offset used to rotate feature4 vector, & 0x7 + ushort offset = (ltid & DWORD_FEATURES_MASK); + #if ENABLE_ALL_FEATURES == 0 + // rotate feature_mask to match the feature order of each thread + feature_mask = as_uchar8(rotate(as_ulong(feature_mask), (ulong)offset*8)); + #endif + // store gradient and hessian + float stat1, stat2; + float stat1_next, stat2_next; + ushort bin, addr, addr2; + data_size_t ind; + data_size_t ind_next; + stat1 = ordered_gradients[subglobal_tid]; + #if CONST_HESSIAN == 0 + stat2 = ordered_hessians[subglobal_tid]; + #endif + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + feature4 = feature_data[ind]; + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer larger + stat1_next = ordered_gradients[i + subglobal_size]; + #if CONST_HESSIAN == 0 + stat2_next = ordered_hessians[i + subglobal_size]; + #endif + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + // start load next feature as early as possible + feature4_next = feature_data[ind_next]; + #else + ind_next = data_indices[i + subglobal_size]; + #endif + #if CONST_HESSIAN == 0 + // swap gradient and hessian for threads 8, 9, 10, 11, 12, 13, 14, 15 + float tmp = stat1; + stat1 = is_hessian_first ? stat2 : stat1; + stat2 = is_hessian_first ? 
tmp : stat2; + // stat1 = select(stat1, stat2, is_hessian_first); + // stat2 = select(stat2, tmp, is_hessian_first); + #endif + + // STAGE 2: accumulate gradient and hessian + offset = (ltid & DWORD_FEATURES_MASK); + // printf("thread %x, %08x -> %08x", ltid, as_uint(feature4), rotate(as_uint(feature4), (uint)(offset * FEATURE_BITS))); + feature4 = as_uchar4(rotate(as_uint(feature4), (uint)(offset * FEATURE_BITS))); + if (feature_mask.s7) { + bin = feature4.s3 >> 4; + addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; + addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6 ,7's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 0, 1, 2, 3, 4, 5, 6, 7's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 + atomic_local_add_f(gh_hist + addr, stat1); + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6, 7's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 0, 1, 2, 3, 4, 5, 6, 7's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, stat2); + #endif + } + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s6) { + bin = feature4.s3 & 0xf; + addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; + addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 1, 2, 3, 4, 5, 6 ,7, 0's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 1, 2, 3, 4, 5, 6, 7, 0's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 + atomic_local_add_f(gh_hist + addr, stat1); + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 1, 2, 3, 4, 5, 6, 7, 0's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 1, 2, 3, 4, 5, 6, 7, 0's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, stat2); + #endif + } + + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s5) { + bin = feature4.s2 >> 4; + addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; + addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 2, 3, 4, 5, 6, 7, 0, 1's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 2, 3, 4, 5, 6, 7, 0, 1's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 + atomic_local_add_f(gh_hist + addr, stat1); + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 2, 3, 4, 5, 6, 7, 0, 1's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 2, 3, 4, 5, 6, 7, 0, 1's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, stat2); + #endif + } + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s4) { + bin = feature4.s2 & 0xf; + addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; + addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 3, 4, 5, 6, 7, 0, 1, 2's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 
8, 9, 10, 11, 12, 13, 14, 15 now process feature 3, 4, 5, 6, 7, 0, 1, 2's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 + atomic_local_add_f(gh_hist + addr, stat1); + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 3, 4, 5, 6, 7, 0, 1, 2's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 3, 4, 5, 6, 7, 0, 1, 2's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, stat2); + #endif + } + + + // prefetch the next iteration variables + // we don't need bondary check because if it is out of boundary, ind_next = 0 + #ifndef IGNORE_INDICES + feature4_next = feature_data[ind_next]; + #endif + + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s3) { + bin = feature4.s1 >> 4; + addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; + addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 4, 5, 6, 7, 0, 1, 2, 3's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 4, 5, 6, 7, 0, 1, 2, 3's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 + atomic_local_add_f(gh_hist + addr, stat1); + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 4, 5, 6, 7, 0, 1, 2, 3's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 4, 5, 6, 7, 0, 1, 2, 3's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, stat2); + #endif + } + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s2) { + bin = feature4.s1 & 0xf; + addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; + addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 5, 6, 7, 0, 1, 2, 3, 4's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 5, 6, 7, 0, 1, 2, 3, 4's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 + atomic_local_add_f(gh_hist + addr, stat1); + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 5, 6, 7, 0, 1, 2, 3, 4's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 5, 6, 7, 0, 1, 2, 3, 4's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, stat2); + #endif + } + + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s1) { + bin = feature4.s0 >> 4; + addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; + addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 6, 7, 0, 1, 2, 3, 4, 5's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 6, 7, 0, 1, 2, 3, 4, 5's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 + atomic_local_add_f(gh_hist + addr, stat1); + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 6, 7, 0, 1, 2, 3, 4, 5's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 6, 7, 0, 1, 2, 3, 4, 5's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, stat2); + #endif + } + offset = (offset + 1) & DWORD_FEATURES_MASK; + if 
(feature_mask.s0) { + bin = feature4.s0 & 0xf; + addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; + addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 7, 0, 1, 2, 3, 4, 5, 6's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 7, 0, 1, 2, 3, 4, 5, 6's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 + atomic_local_add_f(gh_hist + addr, stat1); + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 7, 0, 1, 2, 3, 4, 5, 6's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 + // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 7, 0, 1, 2, 3, 4, 5, 6's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, stat2); + #endif + } + + // STAGE 3: accumulate counter + // there are 8 counters for 8 features + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6, 7's counts for example 0, 1, 2, 3, 4, 5, 6, 7 + offset = (ltid & DWORD_FEATURES_MASK); + if (feature_mask.s7) { + bin = feature4.s3 >> 4; + addr = bin * CNT_BIN_MULT + bank * DWORD_FEATURES + offset; + // printf("thread %x add counter %d feature %d (0)\n", ltid, bin, offset); + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 1, 2, 3, 4, 5, 6, 7, 0's counts for example 0, 1, 2, 3, 4, 5, 6, 7 + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s6) { + bin = feature4.s3 & 0xf; + addr = bin * CNT_BIN_MULT + bank * DWORD_FEATURES + offset; + // printf("thread %x add counter %d feature %d (1)\n", ltid, bin, offset); + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 2, 3, 4, 5, 6, 7, 0, 1's counts for example 0, 1, 2, 3, 4, 5, 6, 7 + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s5) { + bin = feature4.s2 >> 4; + addr = bin * CNT_BIN_MULT + bank * DWORD_FEATURES + offset; + // printf("thread %x add counter %d feature %d (2)\n", ltid, bin, offset); + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 3, 4, 5, 6, 7, 0, 1, 2's counts for example 0, 1, 2, 3, 4, 5, 6, 7 + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s4) { + bin = feature4.s2 & 0xf; + addr = bin * CNT_BIN_MULT + bank * DWORD_FEATURES + offset; + // printf("thread %x add counter %d feature %d (3)\n", ltid, bin, offset); + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 4, 5, 6, 7, 0, 1, 2, 3's counts for example 0, 1, 2, 3, 4, 5, 6, 7 + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s3) { + bin = feature4.s1 >> 4; + addr = bin * CNT_BIN_MULT + bank * DWORD_FEATURES + offset; + // printf("thread %x add counter %d feature %d (4)\n", ltid, bin, offset); + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 5, 6, 7, 0, 1, 2, 3, 4's counts for example 0, 1, 2, 3, 4, 5, 6, 7 + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s2) { + bin = feature4.s1 & 0xf; + addr = bin * CNT_BIN_MULT + bank * DWORD_FEATURES + offset; + // printf("thread %x add counter %d feature %d (5)\n", ltid, bin, offset); + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 6, 7, 0, 1, 2, 3, 4, 5's counts for example 0, 1, 2, 3, 4, 5, 6, 7 + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s1) { + bin = feature4.s0 >> 4; + addr = bin * CNT_BIN_MULT + bank * 
DWORD_FEATURES + offset; + // printf("thread %x add counter %d feature %d (6)\n", ltid, bin, offset); + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 7, 0, 1, 2, 3, 4, 5, 6's counts for example 0, 1, 2, 3, 4, 5, 6, 7 + offset = (offset + 1) & DWORD_FEATURES_MASK; + if (feature_mask.s0) { + bin = feature4.s0 & 0xf; + addr = bin * CNT_BIN_MULT + bank * DWORD_FEATURES + offset; + // printf("thread %x add counter %d feature %d (7)\n", ltid, bin, offset); + atom_inc(cnt_hist + addr); + } + stat1 = stat1_next; + stat2 = stat2_next; + feature4 = feature4_next; + } + barrier(CLK_LOCAL_MEM_FENCE); + + #if ENABLE_ALL_FEATURES == 0 + // restore feature_mask + feature_mask = feature_masks[group_feature]; + #endif + + // now reduce the 4 banks of subhistograms into 1 + acc_type stat_val = 0.0f; + uint cnt_val = 0; + // 256 threads, working on 8 features and 16 bins, 2 stats + // so each thread has an independent feature/bin/stat to work on. + const ushort feature_id = ltid & DWORD_FEATURES_MASK; // bits 0 - 2 of ltid, range 0 - 7 + ushort bin_id = ltid >> (LOG2_DWORD_FEATURES + 1); // bits 3 is is_hessian_first; bits 4 - 7 range 0 - 16 is bin ID + offset = (ltid >> (LOG2_DWORD_FEATURES + 1)) & BANK_MASK; // helps avoid LDS bank conflicts + for (int i = 0; i < NUM_BANKS; ++i) { + ushort bank_id = (i + offset) & BANK_MASK; + stat_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + feature_id]; + } + if (ltid < LOCAL_SIZE_0 / 2) { + // first 128 threads accumulate the 8 * 16 = 128 counter values + bin_id = ltid >> LOG2_DWORD_FEATURES; // bits 3 - 6 range 0 - 16 is bin ID + offset = (ltid >> LOG2_DWORD_FEATURES) & BANK_MASK; // helps avoid LDS bank conflicts + for (int i = 0; i < NUM_BANKS; ++i) { + ushort bank_id = (i + offset) & BANK_MASK; + cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * DWORD_FEATURES + feature_id]; + } + } + + // now thread 0 - 7 holds feature 0 - 7's gradient for bin 0 and counter bin 0 + // now thread 8 - 15 holds feature 0 - 7's hessian for bin 0 and counter bin 1 + // now thread 16- 23 holds feature 0 - 7's gradient for bin 1 and counter bin 2 + // now thread 24- 31 holds feature 0 - 7's hessian for bin 1 and counter bin 3 + // etc, + +#if CONST_HESSIAN == 1 + // Combine the two banks into one, and fill the hessians with counter value * hessian constant + barrier(CLK_LOCAL_MEM_FENCE); + gh_hist[ltid] = stat_val; + if (ltid < LOCAL_SIZE_0 / 2) { + cnt_hist[ltid] = cnt_val; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (is_hessian_first) { + // this is the hessians + // thread 8 - 15 read counters stored by thread 0 - 7 + // thread 24- 31 read counters stored by thread 8 - 15 + // thread 40- 47 read counters stored by thread 16- 23, etc + stat_val = const_hessian * + cnt_hist[((ltid - DWORD_FEATURES) >> (LOG2_DWORD_FEATURES + 1)) * DWORD_FEATURES + (ltid & DWORD_FEATURES_MASK)]; + } + else { + // this is the gradients + // thread 0 - 7 read gradients stored by thread 8 - 15 + // thread 16- 23 read gradients stored by thread 24- 31 + // thread 32- 39 read gradients stored by thread 40- 47, etc + stat_val += gh_hist[ltid + DWORD_FEATURES]; + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + // write to output + // write gradients and hessians histogram for all 4 features + // output data in linear order for further reduction + // output size = 4 (features) * 3 (counters) * 64 (bins) * sizeof(float) + /* memory layout of output: + g_f0_bin0 g_f1_bin0 g_f2_bin0 g_f3_bin0 g_f4_bin0 g_f5_bin0 
g_f6_bin0 g_f7_bin0 + h_f0_bin0 h_f1_bin0 h_f2_bin0 h_f3_bin0 h_f4_bin0 h_f5_bin0 h_f6_bin0 h_f7_bin0 + g_f0_bin1 g_f1_bin1 g_f2_bin1 g_f3_bin1 g_f4_bin1 g_f5_bin1 g_f6_bin1 g_f7_bin1 + h_f0_bin1 h_f1_bin1 h_f2_bin1 h_f3_bin1 h_f4_bin1 h_f5_bin1 h_f6_bin1 h_f7_bin1 + ... + ... + g_f0_bin16 g_f1_bin16 g_f2_bin16 g_f3_bin16 g_f4_bin16 g_f5_bin16 g_f6_bin16 g_f7_bin16 + h_f0_bin16 h_f1_bin16 h_f2_bin16 h_f3_bin16 h_f4_bin16 h_f5_bin16 h_f6_bin16 h_f7_bin16 + c_f0_bin0 c_f1_bin0 c_f2_bin0 c_f3_bin0 c_f4_bin0 c_f5_bin0 c_f6_bin0 c_f7_bin0 + c_f0_bin1 c_f1_bin1 c_f2_bin1 c_f3_bin1 c_f4_bin1 c_f5_bin1 c_f6_bin1 c_f7_bin1 + ... + c_f0_bin16 c_f1_bin16 c_f2_bin16 c_f3_bin16 c_f4_bin16 c_f5_bin16 c_f6_bin16 c_f7_bin16 + */ + // if there is only one workgroup processing this feature4, don't even need to write + uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS); + #if POWER_FEATURE_WORKGROUPS != 0 + __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 3 * NUM_BINS; + // if g_val and h_val are double, they are converted to float here + // write gradients and hessians for 8 features + output[0 * DWORD_FEATURES * NUM_BINS + ltid] = stat_val; + // write counts for 8 features + if (ltid < LOCAL_SIZE_0 / 2) { + output[2 * DWORD_FEATURES * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val); + } + barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + mem_fence(CLK_GLOBAL_MEM_FENCE); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. + // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. + __local uint * counter_val = cnt_hist; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atom_inc(sync_counters + feature4_id); + } + // make sure everyone in this workgroup is here + barrier(CLK_LOCAL_MEM_FENCE); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! 
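    // As a sketch, the cross-workgroup hand-off above boils down to a "last one to arrive
    // does the work" pattern among the 2^POWER_FEATURE_WORKGROUPS workgroups sharing this feature4:
    //
    //   old = atom_inc(&sync_counters[feature4_id]);        // done by ltid == 0 above
    //   if (old == num_workgroups_for_this_feature - 1) {   // everyone else has already
    //       reduce_all_sub_histograms();                    //   written its sub-histogram
    //       sync_counters[feature4_id] = 0;                 // reset for the next use
    //   }                                                   // other workgroups simply exit
    //
    // The barrier/mem_fence (and membar.gl on NVIDIA) above is what makes the other
    // workgroups' sub-histograms visible before the last workgroup starts reading them.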
+ if (*counter_val == (1 << POWER_FEATURE_WORKGROUPS) - 1) { + if (ltid == 0) { + // printf("workgroup %d start reduction!\n", group_id); + // printf("feature_data[0] = %d %d %d %d", feature_data[0].s0, feature_data[0].s1, feature_data[0].s2, feature_data[0].s3); + // clear the sync counter for using it next time + sync_counters[feature4_id] = 0; + } + #else + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + barrier(CLK_LOCAL_MEM_FENCE); + #endif + // locate our feature4's block in output memory + uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS); + __global acc_type const * restrict feature4_subhists = + (__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 3 * NUM_BINS; + // skip reading the data already in local memory + uint skip_id = group_id ^ output_offset; + // locate output histogram location for this feature4 + __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 3 * NUM_BINS; + within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val, cnt_val, + 1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array); + } +} + +// The following line ends the string literal, adds an extra #endif at the end +// the +9 skips extra characters ")", newline, "#endif" and newline at the beginning +// )"" "\n#endif" + 9 +#endif + diff --git a/src/treelearner/ocl/histogram256.cl b/src/treelearner/ocl/histogram256.cl new file mode 100644 index 000000000000..44471849fe6f --- /dev/null +++ b/src/treelearner/ocl/histogram256.cl @@ -0,0 +1,813 @@ +// this file can either be read and passed to an OpenCL compiler directly, +// or included in a C++11 source file as a string literal +#ifndef __OPENCL_VERSION__ +// If we are including this file in C++, +// the entire source file following (except the last #endif) will become +// a raw string literal. The extra ")" is just for mathcing parentheses +// to make the editor happy. The extra ")" and extra endif will be skipped. +// DO NOT add anything between here and the next #ifdef, otherwise you need +// to modify the skip count at the end of this file. +R""() +#endif + +#ifndef _HISTOGRAM_256_KERNEL_ +#define _HISTOGRAM_256_KERNEL_ + +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +// use double precision or not +#ifndef USE_DP_FLOAT +#define USE_DP_FLOAT 0 +#endif +// ignore hessian, and use the local memory for hessian as an additional bank for gradient +#ifndef CONST_HESSIAN +#define CONST_HESSIAN 0 +#endif + +#define LOCAL_SIZE_0 256 +#define NUM_BINS 256 +#if USE_DP_FLOAT == 1 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +typedef double acc_type; +typedef ulong acc_int_type; +#define as_acc_type as_double +#define as_acc_int_type as_ulong +#else +typedef float acc_type; +typedef uint acc_int_type; +#define as_acc_type as_float +#define as_acc_int_type as_uint +#endif +#define LOCAL_MEM_SIZE (4 * (sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS) + +// unroll the atomic operation for a few times. Takes more code space, +// but compiler can generate better code for faster atomics. 
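// For illustration, each ATOMIC_FADD_SUBn macro below is just n pasted copies of a
// single compare-and-swap attempt, so with UNROLL_ATOMIC enabled the fast path of
// atomic_local_add_f is straight-line code of the form
//
//   expected.f_val = current.f_val;
//   next.f_val     = expected.f_val + val;
//   current.u_val  = atom_cmpxchg((volatile __local acc_int_type *)addr,
//                                 expected.u_val, next.u_val);
//   if (current.u_val == expected.u_val) goto end;   // our CAS won; nothing left to do
//
// repeated 8 + 4 + 2 times, followed by the do/while loop as a catch-all under heavy
// contention. The unrolled attempts carry no loop overhead, which is the "better code"
// referred to in the comment above.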
+#define UNROLL_ATOMIC 1 + +// Options passed by compiler at run time: +// IGNORE_INDICES will be set when the kernel does not +// #define IGNORE_INDICES +// #define POWER_FEATURE_WORKGROUPS 10 + +// detect Nvidia platforms +#ifdef cl_nv_pragma_unroll +#define NVIDIA 1 +#endif + +// use all features and do not use feature mask +#ifndef ENABLE_ALL_FEATURES +#define ENABLE_ALL_FEATURES 1 +#endif + +// use binary patching for AMD GCN 1.2 or newer +#ifndef AMD_USE_DS_ADD_F32 +#define AMD_USE_DS_ADD_F32 0 +#endif + +typedef uint data_size_t; +typedef float score_t; + + +#define ATOMIC_FADD_SUB1 { \ + expected.f_val = current.f_val; \ + next.f_val = expected.f_val + val; \ + current.u_val = atom_cmpxchg((volatile __local acc_int_type *)addr, expected.u_val, next.u_val); \ + if (current.u_val == expected.u_val) \ + goto end; \ + } +#define ATOMIC_FADD_SUB2 ATOMIC_FADD_SUB1 \ + ATOMIC_FADD_SUB1 +#define ATOMIC_FADD_SUB4 ATOMIC_FADD_SUB2 \ + ATOMIC_FADD_SUB2 +#define ATOMIC_FADD_SUB8 ATOMIC_FADD_SUB4 \ + ATOMIC_FADD_SUB4 +#define ATOMIC_FADD_SUB16 ATOMIC_FADD_SUB8 \ + ATOMIC_FADD_SUB8 +#define ATOMIC_FADD_SUB32 ATOMIC_FADD_SUB16\ + ATOMIC_FADD_SUB16 +#define ATOMIC_FADD_SUB64 ATOMIC_FADD_SUB32\ + ATOMIC_FADD_SUB32 + + +// atomic add for float number in local memory +inline void atomic_local_add_f(__local acc_type *addr, const float val) +{ + union{ + acc_int_type u_val; + acc_type f_val; + } next, expected, current; +#if (NVIDIA == 1 && USE_DP_FLOAT == 0) + float res = 0; + asm volatile ("atom.shared.add.f32 %0, [%1], %2;" : "=f"(res) : "l"(addr), "f"(val)); +#elif (AMD_USE_DS_ADD_F32 == 1 && USE_DP_FLAT == 0) + // this instruction (DS_AND_U32) will be patched into a DS_ADD_F32 + // we need to hack here because DS_ADD_F32 is not exposed via OpenCL + atom_and((__local acc_int_type *)addr, as_acc_int_type(val)); +#else + current.f_val = *addr; + #if UNROLL_ATOMIC == 1 + // provide a fast path + // then do the complete loop + // this should work on all devices + ATOMIC_FADD_SUB8 + ATOMIC_FADD_SUB4 + ATOMIC_FADD_SUB2 + #endif + do { + expected.f_val = current.f_val; + next.f_val = expected.f_val + val; + current.u_val = atom_cmpxchg((volatile __local acc_int_type *)addr, expected.u_val, next.u_val); + } while (current.u_val != expected.u_val); + end: + ; +#endif +} + +// this function will be called by histogram256 +// we have one sub-histogram of one feature in local memory, and need to read others +void within_kernel_reduction256x4(uchar4 feature_mask, + __global const acc_type* restrict feature4_sub_hist, + const uint skip_id, + const uint old_val_f0_cont_bin0, + const ushort num_sub_hist, + __global acc_type* restrict output_buf, + __local acc_type* restrict local_hist) { + const ushort ltid = get_local_id(0); + const ushort lsize = LOCAL_SIZE_0; + // initialize register counters from our local memory + // TODO: try to avoid bank conflict here + acc_type f0_grad_bin = local_hist[ltid * 8]; + acc_type f1_grad_bin = local_hist[ltid * 8 + 1]; + acc_type f2_grad_bin = local_hist[ltid * 8 + 2]; + acc_type f3_grad_bin = local_hist[ltid * 8 + 3]; + acc_type f0_hess_bin = local_hist[ltid * 8 + 4]; + acc_type f1_hess_bin = local_hist[ltid * 8 + 5]; + acc_type f2_hess_bin = local_hist[ltid * 8 + 6]; + acc_type f3_hess_bin = local_hist[ltid * 8 + 7]; + __local uint* restrict local_cnt = (__local uint *)(local_hist + 4 * 2 * NUM_BINS); + #if POWER_FEATURE_WORKGROUPS != 0 + uint f0_cont_bin = ltid ? 
local_cnt[ltid * 4] : old_val_f0_cont_bin0; + #else + uint f0_cont_bin = local_cnt[ltid * 4]; + #endif + uint f1_cont_bin = local_cnt[ltid * 4 + 1]; + uint f2_cont_bin = local_cnt[ltid * 4 + 2]; + uint f3_cont_bin = local_cnt[ltid * 4 + 3]; + ushort i; + // printf("%d-pre(skip %d): %f %f %f %f %f %f %f %f %d %d %d %d", ltid, skip_id, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin); +#if POWER_FEATURE_WORKGROUPS != 0 + // add all sub-histograms for 4 features + __global const acc_type* restrict p = feature4_sub_hist + ltid; + for (i = 0; i < skip_id; ++i) { + if (feature_mask.s3) { + f0_grad_bin += *p; p += NUM_BINS; + f0_hess_bin += *p; p += NUM_BINS; + f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + else { + p += 3 * NUM_BINS; + } + if (feature_mask.s2) { + f1_grad_bin += *p; p += NUM_BINS; + f1_hess_bin += *p; p += NUM_BINS; + f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + else { + p += 3 * NUM_BINS; + } + if (feature_mask.s1) { + f2_grad_bin += *p; p += NUM_BINS; + f2_hess_bin += *p; p += NUM_BINS; + f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + else { + p += 3 * NUM_BINS; + } + if (feature_mask.s0) { + f3_grad_bin += *p; p += NUM_BINS; + f3_hess_bin += *p; p += NUM_BINS; + f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + else { + p += 3 * NUM_BINS; + } + } + // skip the counters we already have + p += 3 * 4 * NUM_BINS; + for (i = i + 1; i < num_sub_hist; ++i) { + if (feature_mask.s3) { + f0_grad_bin += *p; p += NUM_BINS; + f0_hess_bin += *p; p += NUM_BINS; + f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + else { + p += 3 * NUM_BINS; + } + if (feature_mask.s2) { + f1_grad_bin += *p; p += NUM_BINS; + f1_hess_bin += *p; p += NUM_BINS; + f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + else { + p += 3 * NUM_BINS; + } + if (feature_mask.s1) { + f2_grad_bin += *p; p += NUM_BINS; + f2_hess_bin += *p; p += NUM_BINS; + f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + else { + p += 3 * NUM_BINS; + } + if (feature_mask.s0) { + f3_grad_bin += *p; p += NUM_BINS; + f3_hess_bin += *p; p += NUM_BINS; + f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS; + } + else { + p += 3 * NUM_BINS; + } + } + // printf("%d-aft: %f %f %f %f %f %f %f %f %d %d %d %d", ltid, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin); + #endif + // now overwrite the local_hist for final reduction and output + barrier(CLK_LOCAL_MEM_FENCE); + #if USE_DP_FLOAT == 0 + // reverse the f3...f0 order to match the real order + local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin; + local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin; + local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin); + local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin; + local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin; + local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin); + local_hist[2 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin; + local_hist[2 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin; + local_hist[2 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin); + local_hist[3 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin; + local_hist[3 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin; + local_hist[3 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin); + 
barrier(CLK_LOCAL_MEM_FENCE); + /* + for (ushort i = ltid; i < 4 * 3 * NUM_BINS; i += lsize) { + output_buf[i] = local_hist[i]; + } + */ + i = ltid; + if (feature_mask.s0) { + output_buf[i] = local_hist[i]; + output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; + output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; + } + i += 1 * 3 * NUM_BINS; + if (feature_mask.s1) { + output_buf[i] = local_hist[i]; + output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; + output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; + } + i += 1 * 3 * NUM_BINS; + if (feature_mask.s2) { + output_buf[i] = local_hist[i]; + output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; + output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; + } + i += 1 * 3 * NUM_BINS; + if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) { + output_buf[i] = local_hist[i]; + output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; + output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; + } + #else + // when double precision is used, we need to write twice, because local memory size is not enough + local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin; + local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin; + local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin); + local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin; + local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin; + local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin); + barrier(CLK_LOCAL_MEM_FENCE); + /* + for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) { + output_buf[i] = local_hist[i]; + } + */ + i = ltid; + if (feature_mask.s0) { + output_buf[i] = local_hist[i]; + output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; + output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; + } + i += 1 * 3 * NUM_BINS; + if (feature_mask.s1) { + output_buf[i] = local_hist[i]; + output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; + output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; + } + barrier(CLK_LOCAL_MEM_FENCE); + local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin; + local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin; + local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin); + local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin; + local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin; + local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin); + barrier(CLK_LOCAL_MEM_FENCE); + /* + for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) { + output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i]; + } + */ + i = ltid; + if (feature_mask.s2) { + output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i]; + output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS]; + output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; + } + i += 1 * 3 * NUM_BINS; + if (feature_mask.s3) { + output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i]; + output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS]; + output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS]; + } + #endif +} + +#define printf + +__attribute__((reqd_work_group_size(LOCAL_SIZE_0, 1, 1))) +#if USE_CONSTANT_BUF == 1 +__kernel void histogram256(__global const uchar4* restrict feature_data_base, + __constant const uchar4* restrict feature_masks __attribute__((max_constant_size(65536))), + const data_size_t feature_size, + __constant const data_size_t* restrict data_indices 
__attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), +#if CONST_HESSIAN == 0 + __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), +#else + const score_t const_hessian, +#endif + __global char* restrict output_buf, + __global volatile int * sync_counters, + __global acc_type* restrict hist_buf_base) { +#else +__kernel void histogram256(__global const uchar4* feature_data_base, + __constant const uchar4* restrict feature_masks __attribute__((max_constant_size(65536))), + const data_size_t feature_size, + __global const data_size_t* data_indices, + const data_size_t num_data, + __global const score_t* ordered_gradients, +#if CONST_HESSIAN == 0 + __global const score_t* ordered_hessians, +#else + const score_t const_hessian, +#endif + __global char* restrict output_buf, + __global volatile int * sync_counters, + __global acc_type* restrict hist_buf_base) { +#endif + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __local float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = get_global_id(0); + const uint gsize = get_global_size(0); + const ushort ltid = get_local_id(0); + const ushort lsize = LOCAL_SIZE_0; // get_local_size(0); + const ushort group_id = get_group_id(0); + + // local memory per workgroup is 12 KB + // clear local memory + __local uint * ptr = (__local uint *) shared_array; + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary + // total size: 2 * 4 * 256 * size_of(float) = 8 KB + // organization: each feature/grad/hessian is at a different bank, + // as indepedent of the feature value as possible + __local acc_type * gh_hist = (__local acc_type *)shared_array; + // counter histogram + // total size: 4 * 256 * size_of(uint) = 4 KB + __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS); + + // thread 0, 1, 2, 3 compute histograms for gradients first + // thread 4, 5, 6, 7 compute histograms for hessians first + // etc. 
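    // A brief note on the gradient/hessian swap performed below: threads in odd quads load
    // the hessian into stat1 and the gradient into stat2 (even quads keep the natural order),
    // and addr/addr2 then undo the swap so each statistic still lands in its own half of the
    // 8-entry bin row:
    //
    //   addr  = bin * 8 + is_hessian_first * 4 + offset;   // where stat1 goes
    //   addr2 = addr + 4 - 8 * is_hessian_first;           // the other half, for stat2
    //
    // The likely intent is that concurrent atomic adds from neighbouring threads are spread
    // over both the gradient and hessian halves instead of all hitting the gradient half
    // first, reducing contention on local memory.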
+ uchar is_hessian_first = (ltid >> 2) & 1; + + ushort group_feature = group_id >> POWER_FEATURE_WORKGROUPS; + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + __global const uchar4* feature_data = feature_data_base + group_feature * feature_size; + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << POWER_FEATURE_WORKGROUPS); + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - group_feature * subglobal_size; + // extract feature mask, when a byte is set to 0, that feature is disabled + #if ENABLE_ALL_FEATURES == 1 + // hopefully the compiler will propogate the constants and eliminate all branches + uchar4 feature_mask = (uchar4)(0xff, 0xff, 0xff, 0xff); + #else + uchar4 feature_mask = feature_masks[group_feature]; + #endif + // exit if all features are masked + if (!as_uint(feature_mask)) { + return; + } + + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // 4 features stored in a tuple MSB...(0, 1, 2, 3)...LSB + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar4 feature4; + uchar4 feature4_next; + uchar4 feature4_prev; + // offset used to rotate feature4 vector + ushort offset = (ltid & 0x3); + // store gradient and hessian + float stat1, stat2; + float stat1_next, stat2_next; + ushort bin, addr, addr2; + data_size_t ind; + data_size_t ind_next; + stat1 = ordered_gradients[subglobal_tid]; + #if CONST_HESSIAN == 0 + stat2 = ordered_hessians[subglobal_tid]; + #endif + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + feature4 = feature_data[ind]; + feature4_prev = feature4; + feature4_prev = as_uchar4(rotate(as_uint(feature4_prev), (uint)offset*8)); + #if ENABLE_ALL_FEATURES == 0 + // rotate feature_mask to match the feature order of each thread + feature_mask = as_uchar4(rotate(as_uint(feature_mask), (uint)offset*8)); + #endif + acc_type s3_stat1 = 0.0f, s3_stat2 = 0.0f; + acc_type s2_stat1 = 0.0f, s2_stat2 = 0.0f; + acc_type s1_stat1 = 0.0f, s1_stat2 = 0.0f; + acc_type s0_stat1 = 0.0f, s0_stat2 = 0.0f; + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer larger + stat1_next = ordered_gradients[i + subglobal_size]; + #if CONST_HESSIAN == 0 + stat2_next = ordered_hessians[i + subglobal_size]; + #endif + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + // start load next feature as early as possible + feature4_next = feature_data[ind_next]; + #else + ind_next = data_indices[i + subglobal_size]; + #endif + #if CONST_HESSIAN == 0 + // swap gradient and hessian for threads 4, 5, 6, 7 + float tmp = stat1; + stat1 = is_hessian_first ? stat2 : stat1; + stat2 = is_hessian_first ? 
tmp : stat2; + // stat1 = select(stat1, stat2, is_hessian_first); + // stat2 = select(stat2, tmp, is_hessian_first); + #endif + + // STAGE 2: accumulate gradient and hessian + offset = (ltid & 0x3); + feature4 = as_uchar4(rotate(as_uint(feature4), (uint)offset*8)); + bin = feature4.s3; + if ((bin != feature4_prev.s3) && feature_mask.s3) { + // printf("%3d (%4d): writing s3 %d %d offset %d", ltid, i, bin, feature4_prev.s3, offset); + bin = feature4_prev.s3; + feature4_prev.s3 = feature4.s3; + addr = bin * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + atomic_local_add_f(gh_hist + addr, s3_stat1); + // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's gradients for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's hessians for example 4, 5, 6, 7 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s3_stat2); + #endif + // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's hessians for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's gradients for example 4, 5, 6, 7 + s3_stat1 = stat1; + s3_stat2 = stat2; + } + else { + // printf("%3d (%4d): acc s3 %d", ltid, i, bin); + s3_stat1 += stat1; + s3_stat2 += stat2; + } + + bin = feature4.s2; + offset = (offset + 1) & 0x3; + if ((bin != feature4_prev.s2) && feature_mask.s2) { + // printf("%3d (%4d): writing s2 %d %d feature %d", ltid, i, bin, feature4_prev.s2, offset); + bin = feature4_prev.s2; + feature4_prev.s2 = feature4.s2; + addr = bin * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + atomic_local_add_f(gh_hist + addr, s2_stat1); + // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's gradients for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's hessians for example 4, 5, 6, 7 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s2_stat2); + #endif + // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's hessians for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's gradients for example 4, 5, 6, 7 + s2_stat1 = stat1; + s2_stat2 = stat2; + } + else { + // printf("%3d (%4d): acc s2 %d", ltid, i, bin); + s2_stat1 += stat1; + s2_stat2 += stat2; + } + + + // prefetch the next iteration variables + // we don't need bondary check because if it is out of boundary, ind_next = 0 + #ifndef IGNORE_INDICES + feature4_next = feature_data[ind_next]; + #endif + + bin = feature4.s1; + offset = (offset + 1) & 0x3; + if ((bin != feature4_prev.s1) && feature_mask.s1) { + // printf("%3d (%4d): writing s1 %d %d feature %d", ltid, i, bin, feature4_prev.s1, offset); + bin = feature4_prev.s1; + feature4_prev.s1 = feature4.s1; + addr = bin * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + atomic_local_add_f(gh_hist + addr, s1_stat1); + // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's gradients for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's hessians for example 4, 5, 6, 7 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s1_stat2); + #endif + // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's hessians for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's gradients for example 4, 5, 6, 7 + s1_stat1 = stat1; + s1_stat2 = stat2; + } + else { + // printf("%3d (%4d): acc s1 %d", ltid, i, bin); + s1_stat1 += stat1; + s1_stat2 += stat2; + } + + bin = feature4.s0; + offset = (offset + 1) & 0x3; + if ((bin != feature4_prev.s0) && feature_mask.s0) { + // printf("%3d (%4d): writing s0 %d 
%d feature %d", ltid, i, bin, feature4_prev.s0, offset); + bin = feature4_prev.s0; + feature4_prev.s0 = feature4.s0; + addr = bin * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + atomic_local_add_f(gh_hist + addr, s0_stat1); + // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's gradients for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's hessians for example 4, 5, 6, 7 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s0_stat2); + #endif + // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's hessians for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's gradients for example 4, 5, 6, 7 + s0_stat1 = stat1; + s0_stat2 = stat2; + } + else { + // printf("%3d (%4d): acc s0 %d", ltid, i, bin); + s0_stat1 += stat1; + s0_stat2 += stat2; + } + + // STAGE 3: accumulate counter + // there are 4 counters for 4 features + // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3 + offset = (ltid & 0x3); + if (feature_mask.s3) { + bin = feature4.s3; + addr = bin * 4 + offset; + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's counts for example 0, 1, 2, 3 + offset = (offset + 1) & 0x3; + if (feature_mask.s2) { + bin = feature4.s2; + addr = bin * 4 + offset; + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's counts for example 0, 1, 2, 3 + offset = (offset + 1) & 0x3; + if (feature_mask.s1) { + bin = feature4.s1; + addr = bin * 4 + offset; + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's counts for example 0, 1, 2, 3 + offset = (offset + 1) & 0x3; + if (feature_mask.s0) { + bin = feature4.s0; + addr = bin * 4 + offset; + atom_inc(cnt_hist + addr); + } + stat1 = stat1_next; + stat2 = stat2_next; + feature4 = feature4_next; + } + + bin = feature4_prev.s3; + offset = (ltid & 0x3); + addr = bin * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + atomic_local_add_f(gh_hist + addr, s3_stat1); + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s3_stat2); + #endif + + bin = feature4_prev.s2; + offset = (offset + 1) & 0x3; + addr = bin * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + atomic_local_add_f(gh_hist + addr, s2_stat1); + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s2_stat2); + #endif + + bin = feature4_prev.s1; + offset = (offset + 1) & 0x3; + addr = bin * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + atomic_local_add_f(gh_hist + addr, s1_stat1); + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s1_stat2); + #endif + + bin = feature4_prev.s0; + offset = (offset + 1) & 0x3; + addr = bin * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + atomic_local_add_f(gh_hist + addr, s0_stat1); + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s0_stat2); + #endif + barrier(CLK_LOCAL_MEM_FENCE); + + #if ENABLE_ALL_FEATURES == 0 + // restore feature_mask + feature_mask = feature_masks[group_feature]; + #endif + + #if CONST_HESSIAN == 1 + barrier(CLK_LOCAL_MEM_FENCE); + // make a final reduction + offset = ltid & 0x3; // helps avoid LDS bank conflicts + gh_hist[ltid * 8 + offset] += gh_hist[ltid * 8 + offset + 4]; + gh_hist[ltid * 8 + offset + 4] = const_hessian * cnt_hist[ltid * 4 + offset]; + offset = (offset + 1) & 0x3; + gh_hist[ltid * 8 + offset] += gh_hist[ltid * 8 + offset + 4]; + gh_hist[ltid 
* 8 + offset + 4] = const_hessian * cnt_hist[ltid * 4 + offset]; + offset = (offset + 1) & 0x3; + gh_hist[ltid * 8 + offset] += gh_hist[ltid * 8 + offset + 4]; + gh_hist[ltid * 8 + offset + 4] = const_hessian * cnt_hist[ltid * 4 + offset]; + offset = (offset + 1) & 0x3; + gh_hist[ltid * 8 + offset] += gh_hist[ltid * 8 + offset + 4]; + gh_hist[ltid * 8 + offset + 4] = const_hessian * cnt_hist[ltid * 4 + offset]; + barrier(CLK_LOCAL_MEM_FENCE); + #endif + + // write to output + // write gradients and hessians histogram for all 4 features + /* memory layout in gh_hist (total 2 * 4 * 256 * sizeof(float) = 8 KB): + ----------------------------------------------------------------------------------------------- + g_f0_bin0 g_f1_bin0 g_f2_bin0 g_f3_bin0 h_f0_bin0 h_f1_bin0 h_f2_bin0 h_f3_bin0 + g_f0_bin1 g_f1_bin1 g_f2_bin1 g_f3_bin1 h_f0_bin1 h_f1_bin1 h_f2_bin1 h_f3_bin1 + ... + g_f0_bin255 g_f1_bin255 g_f2_bin255 g_f3_bin255 h_f0_bin255 h_f1_bin255 h_f2_bin255 h_f3_bin255 + ----------------------------------------------------------------------------------------------- + */ + /* memory layout in cnt_hist (total 4 * 256 * sizeof(uint) = 4 KB): + ----------------------------------------------- + c_f0_bin0 c_f1_bin0 c_f2_bin0 c_f3_bin0 + c_f0_bin1 c_f1_bin1 c_f2_bin1 c_f3_bin1 + ... + c_f0_bin255 c_f1_bin255 c_f2_bin255 c_f3_bin255 + ----------------------------------------------- + */ + // output data in linear order for further reduction + // output size = 4 (features) * 3 (counters) * 256 (bins) * sizeof(float) + /* memory layout of output: + -------------------------------------------- + g_f0_bin0 g_f0_bin1 ... g_f0_bin255 \ + h_f0_bin0 h_f0_bin1 ... h_f0_bin255 | + c_f0_bin0 c_f0_bin1 ... c_f0_bin255 | + g_f1_bin0 g_f1_bin1 ... g_f1_bin255 | + h_f1_bin0 h_f1_bin1 ... h_f1_bin255 | + c_f1_bin0 c_f1_bin1 ... c_f1_bin255 |--- 1 sub-histogram block + g_f2_bin0 g_f2_bin1 ... g_f2_bin255 | + h_f2_bin0 h_f2_bin1 ... h_f2_bin255 | + c_f2_bin0 c_f2_bin1 ... c_f2_bin255 | + g_f3_bin0 g_f3_bin1 ... g_f3_bin255 | + h_f3_bin0 h_f3_bin1 ... h_f3_bin255 | + c_f3_bin0 c_f3_bin1 ... c_f3_bin255 / + -------------------------------------------- + */ + uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS); + // if there is only one workgroup processing this feature4, don't even need to write + #if POWER_FEATURE_WORKGROUPS != 0 + __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS; + // write gradients and hessians + __global acc_type * restrict ptr_f = output; + for (ushort j = 0; j < 4; ++j) { + for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) { + // even threads read gradients, odd threads read hessians + // FIXME: 2-way bank conflict + acc_type value = gh_hist[i * 4 + j]; + ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; + } + ptr_f += 3 * NUM_BINS; + } + // write counts + __global acc_int_type * restrict ptr_i = (__global acc_int_type * restrict)(output + 2 * NUM_BINS); + for (ushort j = 0; j < 4; ++j) { + for (ushort i = ltid; i < NUM_BINS; i += lsize) { + // FIXME: 2-way bank conflict + uint value = cnt_hist[i * 4 + j]; + ptr_i[i] = value; + } + ptr_i += 3 * NUM_BINS; + } + barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + mem_fence(CLK_GLOBAL_MEM_FENCE); + // To avoid the cost of an extra reducting kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that process this feature to + // make the final reduction, and other threads will just quit. 
+ // This requires that the results written by other workgroups available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equavalent to CUDA __threadfence(); + // ensure the writes above goes to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that the all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // The is done by using an global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL. + __local uint * counter_val = cnt_hist; + // backup the old value + uint old_val = *counter_val; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atom_inc(sync_counters + feature4_id); + } + // make sure everyone in this workgroup is here + barrier(CLK_LOCAL_MEM_FENCE); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + if (*counter_val == (1 << POWER_FEATURE_WORKGROUPS) - 1) { + if (ltid == 0) { + // printf("workgroup %d: %g %g %g %g %g %g %g %g\n", group_id, gh_hist[0], gh_hist[1], gh_hist[2], gh_hist[3], gh_hist[4], gh_hist[5], gh_hist[6], gh_hist[7]); + // printf("feature_data[0] = %d %d %d %d", feature_data[0].s0, feature_data[0].s1, feature_data[0].s2, feature_data[0].s3); + // clear the sync counter for using it next time + sync_counters[feature4_id] = 0; + } + #else + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + uint old_val; // dummy + #endif + // locate our feature4's block in output memory + uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS); + __global acc_type const * restrict feature4_subhists = + (__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS; + // skip reading the data already in local memory + uint skip_id = group_id ^ output_offset; + // locate output histogram location for this feature4 + __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS; + within_kernel_reduction256x4(feature_mask, feature4_subhists, skip_id, old_val, 1 << POWER_FEATURE_WORKGROUPS, + hist_buf, (__local acc_type *)shared_array); + // if (ltid == 0) + // printf("workgroup %d reduction done, %g %g %g %g %g %g %g %g\n", group_id, hist_buf[0], hist_buf[3*NUM_BINS], hist_buf[2*3*NUM_BINS], hist_buf[3*3*NUM_BINS], hist_buf[1], hist_buf[3*NUM_BINS+1], hist_buf[2*3*NUM_BINS+1], hist_buf[3*3*NUM_BINS+1]); + } +} + +// The following line ends the string literal, adds an extra #endif at the end +// the +9 skips extra characters ")", newline, "#endif" and newline at the beginning +// )"" "\n#endif" + 9 +#endif + diff --git a/src/treelearner/ocl/histogram64.cl b/src/treelearner/ocl/histogram64.cl new file mode 100644 index 000000000000..661f764d4b63 --- /dev/null +++ b/src/treelearner/ocl/histogram64.cl @@ -0,0 +1,724 @@ +// this file can either be read and passed to an OpenCL 
compiler directly, +// or included in a C++11 source file as a string literal +#ifndef __OPENCL_VERSION__ +// If we are including this file in C++, +// the entire source file following (except the last #endif) will become +// a raw string literal. The extra ")" is just for mathcing parentheses +// to make the editor happy. The extra ")" and extra endif will be skipped. +// DO NOT add anything between here and the next #ifdef, otherwise you need +// to modify the skip count at the end of this file. +R""() +#endif + +#ifndef _HISTOGRAM_64_KERNEL_ +#define _HISTOGRAM_64_KERNEL_ + +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +// Configurable options: +// NUM_BANKS should be a power of 2 +#ifndef NUM_BANKS +#define NUM_BANKS 4 +#endif +// how many bits in thread ID represent the bank = log2(NUM_BANKS) +#ifndef BANK_BITS +#define BANK_BITS 2 +#endif +// use double precision or not +#ifndef USE_DP_FLOAT +#define USE_DP_FLOAT 0 +#endif +// ignore hessian, and use the local memory for hessian as an additional bank for gradient +#ifndef CONST_HESSIAN +#define CONST_HESSIAN 0 +#endif + + +#define LOCAL_SIZE_0 256 +#define NUM_BINS 64 +// if USE_DP_FLOAT is set to 1, we will use double precision for the accumulator +#if USE_DP_FLOAT == 1 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +typedef double acc_type; +typedef ulong acc_int_type; +#define as_acc_type as_double +#define as_acc_int_type as_ulong +#else +typedef float acc_type; +typedef uint acc_int_type; +#define as_acc_type as_float +#define as_acc_int_type as_uint +#endif +// mask for getting the bank ID +#define BANK_MASK (NUM_BANKS - 1) +// 4 features, each has a gradient and a hessian +#define HG_BIN_MULT (NUM_BANKS * 4 * 2) +// 4 features, each has a counter +#define CNT_BIN_MULT (NUM_BANKS * 4) +// local memory size in bytes +#define LOCAL_MEM_SIZE (4 * (sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS * NUM_BANKS) + +// unroll the atomic operation for a few times. Takes more code space, +// but compiler can generate better code for faster atomics. 
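// (Same unrolled fast path as in histogram256.cl.) As a quick size check of LOCAL_MEM_SIZE
// above, with the defaults (NUM_BANKS = 4, NUM_BINS = 64) and single precision
// (acc_type == float, so sizeof(acc_type) == 4):
//
//   gradients + hessians : 4 banks * 4 features * 2 * 64 bins * 4 B = 8 KB
//   counters             : 4 banks * 4 features     * 64 bins * 4 B = 4 KB
//   total                : 4 * (4 + 2 * 4) * 64 * 4                 = 12 KB
//
// which matches the "local memory per workgroup is 12 KB" comment inside the kernel below;
// with USE_DP_FLOAT == 1 the same formula gives 20 KB.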
+#define UNROLL_ATOMIC 1 + +// Options passed by compiler at run time: +// IGNORE_INDICES will be set when the kernel does not need to use data indices +// #define IGNORE_INDICES +// #define POWER_FEATURE_WORKGROUPS 10 + +// use all features and do not use feature mask +#ifndef ENABLE_ALL_FEATURES +#define ENABLE_ALL_FEATURES 1 +#endif + +// detect Nvidia platforms +#ifdef cl_nv_pragma_unroll +#define NVIDIA 1 +#endif + +// use binary patching for AMD GCN 1.2 or newer +#ifndef AMD_USE_DS_ADD_F32 +#define AMD_USE_DS_ADD_F32 0 +#endif + +typedef uint data_size_t; +typedef float score_t; + +#define ATOMIC_FADD_SUB1 { \ + expected.f_val = current.f_val; \ + next.f_val = expected.f_val + val; \ + current.u_val = atom_cmpxchg((volatile __local acc_int_type *)addr, expected.u_val, next.u_val); \ + if (current.u_val == expected.u_val) \ + goto end; \ + } +#define ATOMIC_FADD_SUB2 ATOMIC_FADD_SUB1 \ + ATOMIC_FADD_SUB1 +#define ATOMIC_FADD_SUB4 ATOMIC_FADD_SUB2 \ + ATOMIC_FADD_SUB2 +#define ATOMIC_FADD_SUB8 ATOMIC_FADD_SUB4 \ + ATOMIC_FADD_SUB4 +#define ATOMIC_FADD_SUB16 ATOMIC_FADD_SUB8 \ + ATOMIC_FADD_SUB8 +#define ATOMIC_FADD_SUB32 ATOMIC_FADD_SUB16\ + ATOMIC_FADD_SUB16 +#define ATOMIC_FADD_SUB64 ATOMIC_FADD_SUB32\ + ATOMIC_FADD_SUB32 + + +// atomic add for float number in local memory +inline void atomic_local_add_f(__local acc_type *addr, const float val) +{ + union{ + acc_int_type u_val; + acc_type f_val; + } next, expected, current; +#if (NVIDIA == 1 && USE_DP_FLOAT == 0) + float res = 0; + asm volatile ("atom.shared.add.f32 %0, [%1], %2;" : "=f"(res) : "l"(addr), "f"(val)); +#elif (AMD_USE_DS_ADD_F32 == 1 && USE_DP_FLOAT == 0) + // this instruction (DS_AND_U32) will be patched into a DS_ADD_F32 + // we need to hack here because DS_ADD_F32 is not exposed via OpenCL + atom_and((__local acc_int_type *)addr, as_acc_int_type(val)); +#else + current.f_val = *addr; + #if UNROLL_ATOMIC == 1 + // provide a fast path + // then do the complete loop + // this should work on all devices + ATOMIC_FADD_SUB8 + ATOMIC_FADD_SUB4 + ATOMIC_FADD_SUB2 + #endif + do { + expected.f_val = current.f_val; + next.f_val = expected.f_val + val; + current.u_val = atom_cmpxchg((volatile __local acc_int_type *)addr, expected.u_val, next.u_val); + } while (current.u_val != expected.u_val); + end: + ; +#endif +} + +// this function will be called by histogram64 +// we have one sub-histogram of one feature in registers, and need to read others +void within_kernel_reduction64x4(uchar4 feature_mask, + __global const acc_type* restrict feature4_sub_hist, + const uint skip_id, + acc_type g_val, acc_type h_val, uint cnt_val, + const ushort num_sub_hist, + __global acc_type* restrict output_buf, + __local acc_type * restrict local_hist) { + const ushort ltid = get_local_id(0); // range 0 - 255 + const ushort lsize = LOCAL_SIZE_0; + ushort feature_id = ltid & 3; // range 0 - 3 + const ushort bin_id = ltid >> 2; // range 0 - 63 + ushort i; + #if POWER_FEATURE_WORKGROUPS != 0 + // if there is only 1 work group, no need to do the reduction + // add all sub-histograms for 4 features + __global const acc_type* restrict p = feature4_sub_hist + ltid; + for (i = 0; i < skip_id; ++i) { + g_val += *p; p += NUM_BINS * 4; // 256 threads working on 4 features' 64 bins + h_val += *p; p += NUM_BINS * 4; + cnt_val += as_acc_int_type(*p); p += NUM_BINS * 4; + } + // skip the counters we already have + p += 3 * 4 * NUM_BINS; + for (i = i + 1; i < num_sub_hist; ++i) { + g_val += *p; p += NUM_BINS * 4; + h_val += *p; p += NUM_BINS * 4; + cnt_val += as_acc_int_type(*p); p +=
NUM_BINS * 4; + } + #endif + // printf("thread %d: g_val=%f, h_val=%f cnt=%d", ltid, g_val, h_val, cnt_val); + // now overwrite the local_hist for final reduction and output + // reverse the f3...f0 order to match the real order + feature_id = 3 - feature_id; + local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 0] = g_val; + local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 1] = h_val; + local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val); + barrier(CLK_LOCAL_MEM_FENCE); + i = ltid; + if (feature_mask.s0 && i < 1 * 3 * NUM_BINS) { + output_buf[i] = local_hist[i]; + } + i += 1 * 3 * NUM_BINS; + if (feature_mask.s1 && i < 2 * 3 * NUM_BINS) { + output_buf[i] = local_hist[i]; + } + i += 1 * 3 * NUM_BINS; + if (feature_mask.s2 && i < 3 * 3 * NUM_BINS) { + output_buf[i] = local_hist[i]; + } + i += 1 * 3 * NUM_BINS; + if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) { + output_buf[i] = local_hist[i]; + } +} + +#define printf + +__attribute__((reqd_work_group_size(LOCAL_SIZE_0, 1, 1))) +#if USE_CONSTANT_BUF == 1 +__kernel void histogram64(__global const uchar4* restrict feature_data_base, + __constant const uchar4* restrict feature_masks __attribute__((max_constant_size(65536))), + const data_size_t feature_size, + __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), + const data_size_t num_data, + __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), +#if CONST_HESSIAN == 0 + __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))), +#else + const score_t const_hessian, +#endif + __global char* restrict output_buf, + __global volatile int * sync_counters, + __global acc_type* restrict hist_buf_base) { +#else +__kernel void histogram64(__global const uchar4* feature_data_base, + __constant const uchar4* restrict feature_masks __attribute__((max_constant_size(65536))), + const data_size_t feature_size, + __global const data_size_t* data_indices, + const data_size_t num_data, + __global const score_t* ordered_gradients, +#if CONST_HESSIAN == 0 + __global const score_t* ordered_hessians, +#else + const score_t const_hessian, +#endif + __global char* restrict output_buf, + __global volatile int * sync_counters, + __global acc_type* restrict hist_buf_base) { +#endif + // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms + // otherwise a "Misaligned Address" exception may occur + __local float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)]; + const uint gtid = get_global_id(0); + const uint gsize = get_global_size(0); + const ushort ltid = get_local_id(0); + const ushort lsize = LOCAL_SIZE_0; // get_local_size(0); + const ushort group_id = get_group_id(0); + + // local memory per workgroup is 12 KB + // clear local memory + __local uint * ptr = (__local uint *) shared_array; + for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) { + ptr[i] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + // gradient/hessian histograms + // assume this starts at 32 * 4 = 128-byte boundary + // each bank: 2 * 4 * 64 * size_of(float) = 2 KB + // there are 4 banks (sub-histograms) used by 256 threads total 8 KB + /* memory layout of gh_hist: + ----------------------------------------------------------------------------------------------- + bk0_g_f0_bin0 bk0_g_f1_bin0 bk0_g_f2_bin0 bk0_g_f3_bin0 bk0_h_f0_bin0 bk0_h_f1_bin0 bk0_h_f2_bin0 bk0_h_f3_bin0 + bk1_g_f0_bin0 bk1_g_f1_bin0 bk1_g_f2_bin0 bk1_g_f3_bin0 
bk1_h_f0_bin0 bk1_h_f1_bin0 bk1_h_f2_bin0 bk1_h_f3_bin0 + bk2_g_f0_bin0 bk2_g_f1_bin0 bk2_g_f2_bin0 bk2_g_f3_bin0 bk2_h_f0_bin0 bk2_h_f1_bin0 bk2_h_f2_bin0 bk2_h_f3_bin0 + bk3_g_f0_bin0 bk3_g_f1_bin0 bk3_g_f2_bin0 bk3_g_f3_bin0 bk3_h_f0_bin0 bk3_h_f1_bin0 bk3_h_f2_bin0 bk3_h_f3_bin0 + bk0_g_f0_bin1 bk0_g_f1_bin1 bk0_g_f2_bin1 bk0_g_f3_bin1 bk0_h_f0_bin1 bk0_h_f1_bin1 bk0_h_f2_bin1 bk0_h_f3_bin1 + bk1_g_f0_bin1 bk1_g_f1_bin1 bk1_g_f2_bin1 bk1_g_f3_bin1 bk1_h_f0_bin1 bk1_h_f1_bin1 bk1_h_f2_bin1 bk1_h_f3_bin1 + bk2_g_f0_bin1 bk2_g_f1_bin1 bk2_g_f2_bin1 bk2_g_f3_bin1 bk2_h_f0_bin1 bk2_h_f1_bin1 bk2_h_f2_bin1 bk2_h_f3_bin1 + bk3_g_f0_bin1 bk3_g_f1_bin1 bk3_g_f2_bin1 bk3_g_f3_bin1 bk3_h_f0_bin1 bk3_h_f1_bin1 bk3_h_f2_bin1 bk3_h_f3_bin1 + ... + bk0_g_f0_bin64 bk0_g_f1_bin64 bk0_g_f2_bin64 bk0_g_f3_bin64 bk0_h_f0_bin64 bk0_h_f1_bin64 bk0_h_f2_bin64 bk0_h_f3_bin64 + bk1_g_f0_bin64 bk1_g_f1_bin64 bk1_g_f2_bin64 bk1_g_f3_bin64 bk1_h_f0_bin64 bk1_h_f1_bin64 bk1_h_f2_bin64 bk1_h_f3_bin64 + bk2_g_f0_bin64 bk2_g_f1_bin64 bk2_g_f2_bin64 bk2_g_f3_bin64 bk2_h_f0_bin64 bk2_h_f1_bin64 bk2_h_f2_bin64 bk2_h_f3_bin64 + bk3_g_f0_bin64 bk3_g_f1_bin64 bk3_g_f2_bin64 bk3_g_f3_bin64 bk3_h_f0_bin64 bk3_h_f1_bin64 bk3_h_f2_bin64 bk3_h_f3_bin64 + ----------------------------------------------------------------------------------------------- + */ + // with this organization, the LDS/shared memory bank is independent of the bin value + // all threads within a quarter-wavefront (half-warp) will not have any bank conflict + + __local acc_type * gh_hist = (__local acc_type *)shared_array; + // counter histogram + // each bank: 4 * 64 * size_of(uint) = 1 KB + // there are 4 banks used by 256 threads total 4 KB + /* memory layout in cnt_hist: + ----------------------------------------------- + bk0_c_f0_bin0 bk0_c_f1_bin0 bk0_c_f2_bin0 bk0_c_f3_bin0 + bk1_c_f0_bin0 bk1_c_f1_bin0 bk1_c_f2_bin0 bk1_c_f3_bin0 + bk2_c_f0_bin0 bk2_c_f1_bin0 bk2_c_f2_bin0 bk2_c_f3_bin0 + bk3_c_f0_bin0 bk3_c_f1_bin0 bk3_c_f2_bin0 bk3_c_f3_bin0 + bk0_c_f0_bin1 bk0_c_f1_bin1 bk0_c_f2_bin1 bk0_c_f3_bin1 + bk1_c_f0_bin1 bk1_c_f1_bin1 bk1_c_f2_bin1 bk1_c_f3_bin1 + bk2_c_f0_bin1 bk2_c_f1_bin1 bk2_c_f2_bin1 bk2_c_f3_bin1 + bk3_c_f0_bin1 bk3_c_f1_bin1 bk3_c_f2_bin1 bk3_c_f3_bin1 + ... + bk0_c_f0_bin64 bk0_c_f1_bin64 bk0_c_f2_bin64 bk0_c_f3_bin64 + bk1_c_f0_bin64 bk1_c_f1_bin64 bk1_c_f2_bin64 bk1_c_f3_bin64 + bk2_c_f0_bin64 bk2_c_f1_bin64 bk2_c_f2_bin64 bk2_c_f3_bin64 + bk3_c_f0_bin64 bk3_c_f1_bin64 bk3_c_f2_bin64 bk3_c_f3_bin64 + ----------------------------------------------- + */ + __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS); + + // thread 0, 1, 2, 3 compute histograms for gradients first + // thread 4, 5, 6, 7 compute histograms for hessians first + // etc. 
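+ // Worked example of the local addressing used below, assuming the default NUM_BANKS = 4 (so HG_BIN_MULT = 32 and CNT_BIN_MULT = 16): for bank b, bin k and feature lane f (the rotated offset, 0-3), the gradient accumulator is gh_hist[k * 32 + b * 8 + f], the matching hessian accumulator is gh_hist[k * 32 + b * 8 + 4 + f], and the example counter is cnt_hist[k * 16 + b * 4 + f].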
+ uchar is_hessian_first = (ltid >> 2) & 1; + // thread 0-7 write result to bank0, 8-15 to bank1, 16-23 to bank2, 24-31 to bank3 + ushort bank = (ltid >> 3) & BANK_MASK; + + ushort group_feature = group_id >> POWER_FEATURE_WORKGROUPS; + // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant) + // feature_size is the number of examples per feature + __global const uchar4* feature_data = feature_data_base + group_feature * feature_size; + // size of threads that process this feature4 + const uint subglobal_size = lsize * (1 << POWER_FEATURE_WORKGROUPS); + // equavalent thread ID in this subgroup for this feature4 + const uint subglobal_tid = gtid - group_feature * subglobal_size; + // extract feature mask, when a byte is set to 0, that feature is disabled + #if ENABLE_ALL_FEATURES == 1 + // hopefully the compiler will propogate the constants and eliminate all branches + uchar4 feature_mask = (uchar4)(0xff, 0xff, 0xff, 0xff); + #else + uchar4 feature_mask = feature_masks[group_feature]; + #endif + // exit if all features are masked + if (!as_uint(feature_mask)) { + return; + } + + // STAGE 1: read feature data, and gradient and hessian + // first half of the threads read feature data from global memory + // 4 features stored in a tuple MSB...(0, 1, 2, 3)...LSB + // We will prefetch data into the "next" variable at the beginning of each iteration + uchar4 feature4; + uchar4 feature4_next; + uchar4 feature4_prev; + // offset used to rotate feature4 vector + ushort offset = (ltid & 0x3); + // store gradient and hessian + float stat1, stat2; + float stat1_next, stat2_next; + ushort bin, addr, addr2; + data_size_t ind; + data_size_t ind_next; + stat1 = ordered_gradients[subglobal_tid]; + #if CONST_HESSIAN == 0 + stat2 = ordered_hessians[subglobal_tid]; + #endif + #ifdef IGNORE_INDICES + ind = subglobal_tid; + #else + ind = data_indices[subglobal_tid]; + #endif + feature4 = feature_data[ind]; + feature4 = as_uchar4(as_uint(feature4) & 0x3f3f3f3f); + feature4_prev = feature4; + feature4_prev = as_uchar4(rotate(as_uint(feature4_prev), (uint)offset*8)); + #if ENABLE_ALL_FEATURES == 0 + // rotate feature_mask to match the feature order of each thread + feature_mask = as_uchar4(rotate(as_uint(feature_mask), (uint)offset*8)); + #endif + acc_type s3_stat1 = 0.0f, s3_stat2 = 0.0f; + acc_type s2_stat1 = 0.0f, s2_stat2 = 0.0f; + acc_type s1_stat1 = 0.0f, s1_stat2 = 0.0f; + acc_type s0_stat1 = 0.0f, s0_stat2 = 0.0f; + + // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 + for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { + // prefetch the next iteration variables + // we don't need bondary check because we have made the buffer larger + stat1_next = ordered_gradients[i + subglobal_size]; + #if CONST_HESSIAN == 0 + stat2_next = ordered_hessians[i + subglobal_size]; + #endif + #ifdef IGNORE_INDICES + // we need to check to bounds here + ind_next = i + subglobal_size < num_data ? i + subglobal_size : i; + // start load next feature as early as possible + feature4_next = feature_data[ind_next]; + #else + ind_next = data_indices[i + subglobal_size]; + #endif + #if CONST_HESSIAN == 0 + // swap gradient and hessian for threads 4, 5, 6, 7 + float tmp = stat1; + stat1 = is_hessian_first ? stat2 : stat1; + stat2 = is_hessian_first ? 
tmp : stat2; + // stat1 = select(stat1, stat2, is_hessian_first); + // stat2 = select(stat2, tmp, is_hessian_first); + #endif + + // STAGE 2: accumulate gradient and hessian + offset = (ltid & 0x3); + feature4 = as_uchar4(rotate(as_uint(feature4), (uint)offset*8)); + bin = feature4.s3; + if ((bin != feature4_prev.s3) && feature_mask.s3) { + // printf("%3d (%4d): writing s3 %d %d offset %d", ltid, i, bin, feature4_prev.s3, offset); + bin = feature4_prev.s3; + feature4_prev.s3 = feature4.s3; + addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's gradients for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's hessians for example 4, 5, 6, 7 + atomic_local_add_f(gh_hist + addr, s3_stat1); + // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's hessians for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's gradients for example 4, 5, 6, 7 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s3_stat2); + #endif + s3_stat1 = stat1; + s3_stat2 = stat2; + } + else { + // printf("%3d (%4d): acc s3 %d", ltid, i, bin); + s3_stat1 += stat1; + s3_stat2 += stat2; + } + + bin = feature4.s2; + offset = (offset + 1) & 0x3; + if ((bin != feature4_prev.s2) && feature_mask.s2) { + // printf("%3d (%4d): writing s2 %d %d feature %d", ltid, i, bin, feature4_prev.s2, offset); + bin = feature4_prev.s2; + feature4_prev.s2 = feature4.s2; + addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's gradients for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's hessians for example 4, 5, 6, 7 + atomic_local_add_f(gh_hist + addr, s2_stat1); + // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's hessians for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's gradients for example 4, 5, 6, 7 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s2_stat2); + #endif + s2_stat1 = stat1; + s2_stat2 = stat2; + } + else { + // printf("%3d (%4d): acc s2 %d", ltid, i, bin); + s2_stat1 += stat1; + s2_stat2 += stat2; + } + + + // prefetch the next iteration variables + // we don't need bondary check because if it is out of boundary, ind_next = 0 + #ifndef IGNORE_INDICES + feature4_next = feature_data[ind_next]; + #endif + + bin = feature4.s1 & 0x3f; + offset = (offset + 1) & 0x3; + if ((bin != feature4_prev.s1) && feature_mask.s1) { + // printf("%3d (%4d): writing s1 %d %d feature %d", ltid, i, bin, feature4_prev.s1, offset); + bin = feature4_prev.s1; + feature4_prev.s1 = feature4.s1; + addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's gradients for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's hessians for example 4, 5, 6, 7 + atomic_local_add_f(gh_hist + addr, s1_stat1); + // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's hessians for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's gradients for example 4, 5, 6, 7 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s1_stat2); + #endif + s1_stat1 = stat1; + s1_stat2 = stat2; + } + else { + // printf("%3d (%4d): acc s1 %d", ltid, i, bin); + s1_stat1 += stat1; + s1_stat2 += stat2; + } + + bin = feature4.s0; + offset = (offset + 1) & 0x3; + if ((bin != 
feature4_prev.s0) && feature_mask.s0) { + // printf("%3d (%4d): writing s0 %d %d feature %d", ltid, i, bin, feature4_prev.s0, offset); + bin = feature4_prev.s0; + feature4_prev.s0 = feature4.s0; + addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's gradients for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's hessians for example 4, 5, 6, 7 + atomic_local_add_f(gh_hist + addr, s0_stat1); + // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's hessians for example 0, 1, 2, 3 + // thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's gradients for example 4, 5, 6, 7 + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s0_stat2); + #endif + s0_stat1 = stat1; + s0_stat2 = stat2; + } + else { + // printf("%3d (%4d): acc s0 %d", ltid, i, bin); + s0_stat1 += stat1; + s0_stat2 += stat2; + } + + // STAGE 3: accumulate counter + // there are 4 counters for 4 features + // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3 + offset = (ltid & 0x3); + if (feature_mask.s3) { + bin = feature4.s3; + addr = bin * CNT_BIN_MULT + bank * 4 + offset; + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's counts for example 0, 1, 2, 3 + offset = (offset + 1) & 0x3; + if (feature_mask.s2) { + bin = feature4.s2; + addr = bin * CNT_BIN_MULT + bank * 4 + offset; + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's counts for example 0, 1, 2, 3 + offset = (offset + 1) & 0x3; + if (feature_mask.s1) { + bin = feature4.s1; + addr = bin * CNT_BIN_MULT + bank * 4 + offset; + atom_inc(cnt_hist + addr); + } + // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's counts for example 0, 1, 2, 3 + offset = (offset + 1) & 0x3; + if (feature_mask.s0) { + bin = feature4.s0; + addr = bin * CNT_BIN_MULT + bank * 4 + offset; + atom_inc(cnt_hist + addr); + } + stat1 = stat1_next; + stat2 = stat2_next; + feature4 = feature4_next; + feature4 = as_uchar4(as_uint(feature4) & 0x3f3f3f3f); + } + + bin = feature4_prev.s3; + offset = (ltid & 0x3); + addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + atomic_local_add_f(gh_hist + addr, s3_stat1); + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s3_stat2); + #endif + + bin = feature4_prev.s2; + offset = (offset + 1) & 0x3; + addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + atomic_local_add_f(gh_hist + addr, s2_stat1); + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s2_stat2); + #endif + + bin = feature4_prev.s1; + offset = (offset + 1) & 0x3; + addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + atomic_local_add_f(gh_hist + addr, s1_stat1); + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s1_stat2); + #endif + + bin = feature4_prev.s0; + offset = (offset + 1) & 0x3; + addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset; + addr2 = addr + 4 - 8 * is_hessian_first; + atomic_local_add_f(gh_hist + addr, s0_stat1); + #if CONST_HESSIAN == 0 + atomic_local_add_f(gh_hist + addr2, s0_stat2); + #endif + barrier(CLK_LOCAL_MEM_FENCE); + + #if ENABLE_ALL_FEATURES == 0 + // restore feature_mask + feature_mask = feature_masks[group_feature]; + #endif + + // now reduce the 4 banks of subhistograms into 1 + /* memory 
layout of gh_hist: + ----------------------------------------------------------------------------------------------- + bk0_g_f0_bin0 bk0_g_f1_bin0 bk0_g_f2_bin0 bk0_g_f3_bin0 bk0_h_f0_bin0 bk0_h_f1_bin0 bk0_h_f2_bin0 bk0_h_f3_bin0 + bk1_g_f0_bin0 bk1_g_f1_bin0 bk1_g_f2_bin0 bk1_g_f3_bin0 bk1_h_f0_bin0 bk1_h_f1_bin0 bk1_h_f2_bin0 bk1_h_f3_bin0 + bk2_g_f0_bin0 bk2_g_f1_bin0 bk2_g_f2_bin0 bk2_g_f3_bin0 bk2_h_f0_bin0 bk2_h_f1_bin0 bk2_h_f2_bin0 bk2_h_f3_bin0 + bk3_g_f0_bin0 bk3_g_f1_bin0 bk3_g_f2_bin0 bk3_g_f3_bin0 bk3_h_f0_bin0 bk3_h_f1_bin0 bk3_h_f2_bin0 bk3_h_f3_bin0 + bk0_g_f0_bin1 bk0_g_f1_bin1 bk0_g_f2_bin1 bk0_g_f3_bin1 bk0_h_f0_bin1 bk0_h_f1_bin1 bk0_h_f2_bin1 bk0_h_f3_bin1 + bk1_g_f0_bin1 bk1_g_f1_bin1 bk1_g_f2_bin1 bk1_g_f3_bin1 bk1_h_f0_bin1 bk1_h_f1_bin1 bk1_h_f2_bin1 bk1_h_f3_bin1 + bk2_g_f0_bin1 bk2_g_f1_bin1 bk2_g_f2_bin1 bk2_g_f3_bin1 bk2_h_f0_bin1 bk2_h_f1_bin1 bk2_h_f2_bin1 bk2_h_f3_bin1 + bk3_g_f0_bin1 bk3_g_f1_bin1 bk3_g_f2_bin1 bk3_g_f3_bin1 bk3_h_f0_bin1 bk3_h_f1_bin1 bk3_h_f2_bin1 bk3_h_f3_bin1 + ... + bk0_g_f0_bin64 bk0_g_f1_bin64 bk0_g_f2_bin64 bk0_g_f3_bin64 bk0_h_f0_bin64 bk0_h_f1_bin64 bk0_h_f2_bin64 bk0_h_f3_bin64 + bk1_g_f0_bin64 bk1_g_f1_bin64 bk1_g_f2_bin64 bk1_g_f3_bin64 bk1_h_f0_bin64 bk1_h_f1_bin64 bk1_h_f2_bin64 bk1_h_f3_bin64 + bk2_g_f0_bin64 bk2_g_f1_bin64 bk2_g_f2_bin64 bk2_g_f3_bin64 bk2_h_f0_bin64 bk2_h_f1_bin64 bk2_h_f2_bin64 bk2_h_f3_bin64 + bk3_g_f0_bin64 bk3_g_f1_bin64 bk3_g_f2_bin64 bk3_g_f3_bin64 bk3_h_f0_bin64 bk3_h_f1_bin64 bk3_h_f2_bin64 bk3_h_f3_bin64 + ----------------------------------------------------------------------------------------------- + */ + /* memory layout in cnt_hist: + ----------------------------------------------- + bk0_c_f0_bin0 bk0_c_f1_bin0 bk0_c_f2_bin0 bk0_c_f3_bin0 + bk1_c_f0_bin0 bk1_c_f1_bin0 bk1_c_f2_bin0 bk1_c_f3_bin0 + bk2_c_f0_bin0 bk2_c_f1_bin0 bk2_c_f2_bin0 bk2_c_f3_bin0 + bk3_c_f0_bin0 bk3_c_f1_bin0 bk3_c_f2_bin0 bk3_c_f3_bin0 + bk0_c_f0_bin1 bk0_c_f1_bin1 bk0_c_f2_bin1 bk0_c_f3_bin1 + bk1_c_f0_bin1 bk1_c_f1_bin1 bk1_c_f2_bin1 bk1_c_f3_bin1 + bk2_c_f0_bin1 bk2_c_f1_bin1 bk2_c_f2_bin1 bk2_c_f3_bin1 + bk3_c_f0_bin1 bk3_c_f1_bin1 bk3_c_f2_bin1 bk3_c_f3_bin1 + ... + bk0_c_f0_bin64 bk0_c_f1_bin64 bk0_c_f2_bin64 bk0_c_f3_bin64 + bk1_c_f0_bin64 bk1_c_f1_bin64 bk1_c_f2_bin64 bk1_c_f3_bin64 + bk2_c_f0_bin64 bk2_c_f1_bin64 bk2_c_f2_bin64 bk2_c_f3_bin64 + bk3_c_f0_bin64 bk3_c_f1_bin64 bk3_c_f2_bin64 bk3_c_f3_bin64 + ----------------------------------------------- + */ + acc_type g_val = 0.0f; + acc_type h_val = 0.0f; + uint cnt_val = 0; + // 256 threads, working on 4 features and 64 bins, + // so each thread has an independent feature/bin to work on. 
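+ // For example, with feature_id = ltid & 3 and bin_id = ltid >> 2 below, threads 0-3 reduce bin 0 of features 0-3, threads 4-7 reduce bin 1, and so on up to threads 252-255 for bin 63. + // Each thread sums its (feature, bin) cell over all NUM_BANKS banks, starting from a bank offset derived from its thread ID to reduce LDS bank conflicts.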
+ const ushort feature_id = ltid & 3; // range 0 - 3 + const ushort bin_id = ltid >> 2; // range 0 - 63 + offset = (ltid >> 2) & BANK_MASK; // helps avoid LDS bank conflicts + for (int i = 0; i < NUM_BANKS; ++i) { + ushort bank_id = (i + offset) & BANK_MASK; + g_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id]; + h_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id + 4]; + cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * 4 + feature_id]; + } + // now thread 0 - 3 holds feature 0, 1, 2, 3's gradient, hessian and count for bin 0 + // now thread 4 - 7 holds feature 0, 1, 2, 3's gradient, hessian and count for bin 1 + // etc. + + #if CONST_HESSIAN == 1 + g_val += h_val; + h_val = cnt_val * const_hessian; + #endif + // write to output + // write gradients and hessians histogram for all 4 features + // output data in linear order for further reduction + // output size = 4 (features) * 3 (counters) * 64 (bins) * sizeof(float) + /* memory layout of output: + g_f0_bin0 g_f1_bin0 g_f2_bin0 g_f3_bin0 + g_f0_bin1 g_f1_bin1 g_f2_bin1 g_f3_bin1 + ... + g_f0_bin63 g_f1_bin63 g_f2_bin63 g_f3_bin63 + h_f0_bin0 h_f1_bin0 h_f2_bin0 h_f3_bin0 + h_f0_bin1 h_f1_bin1 h_f2_bin1 h_f3_bin1 + ... + h_f0_bin63 h_f1_bin63 h_f2_bin63 h_f3_bin63 + c_f0_bin0 c_f1_bin0 c_f2_bin0 c_f3_bin0 + c_f0_bin1 c_f1_bin1 c_f2_bin1 c_f3_bin1 + ... + c_f0_bin63 c_f1_bin63 c_f2_bin63 c_f3_bin63 + */ + // if there is only one workgroup processing this feature4, don't even need to write + uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS); + #if POWER_FEATURE_WORKGROUPS != 0 + __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS; + // if g_val and h_val are double, they are converted to float here + // write gradients for 4 features + output[0 * 4 * NUM_BINS + ltid] = g_val; + // write hessians for 4 features + output[1 * 4 * NUM_BINS + ltid] = h_val; + // write counts for 4 features + output[2 * 4 * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val); + barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); + mem_fence(CLK_GLOBAL_MEM_FENCE); + // To avoid the cost of an extra reduction kernel, we have to deal with some + // gray area in OpenCL. We want the last work group that processes this feature to + // make the final reduction, and the other workgroups will just quit. + // This requires that the results written by other workgroups are available to the + // last workgroup (memory consistency) + #if NVIDIA == 1 + // this is equivalent to CUDA __threadfence(); + // ensure the writes above go to main memory and other workgroups can see it + asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); + #else + // FIXME: how to do the above on AMD GPUs?? + // GCN ISA says that all writes will bypass L1 cache (write through), + // however when the last thread is reading sub-histogram data we have to + // make sure that no part of data is modified in local L1 cache of other workgroups. + // Otherwise reading can be a problem (atomic operations to get consistency). + // But in our case, the sub-histogram of this workgroup cannot be in the cache + // of another workgroup, so the following trick will work just fine. + #endif + // Now, we want one workgroup to do the final reduction. + // Other workgroups processing the same feature quit. + // This is done by using a global atomic counter. + // On AMD GPUs ideally this should be done in GDS, + // but currently there is no easy way to access it via OpenCL.
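+ // Sketch of the protocol implemented below: thread 0 of every workgroup atomically increments sync_counters[feature4_id] and publishes the returned (pre-increment) value to its workgroup through local memory (reusing the cnt_hist area, which appears to be free at this point). + // The workgroup that reads back 2^POWER_FEATURE_WORKGROUPS - 1 knows it is the last one to finish for this feature4; it resets the counter to 0 for the next kernel launch and performs the in-kernel reduction, while the other workgroups simply return.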
+ __local uint * counter_val = cnt_hist; + if (ltid == 0) { + // all workgroups processing the same feature add this counter + *counter_val = atom_inc(sync_counters + feature4_id); + } + // make sure everyone in this workgroup is here + barrier(CLK_LOCAL_MEM_FENCE); + // everyone in this wrokgroup: if we are the last workgroup, then do reduction! + if (*counter_val == (1 << POWER_FEATURE_WORKGROUPS) - 1) { + if (ltid == 0) { + // printf("workgroup %d start reduction!\n", group_id); + // printf("feature_data[0] = %d %d %d %d", feature_data[0].s0, feature_data[0].s1, feature_data[0].s2, feature_data[0].s3); + // clear the sync counter for using it next time + sync_counters[feature4_id] = 0; + } + #else + // only 1 work group, no need to increase counter + // the reduction will become a simple copy + if (1) { + barrier(CLK_LOCAL_MEM_FENCE); + #endif + // locate our feature4's block in output memory + uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS); + __global acc_type const * restrict feature4_subhists = + (__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS; + // skip reading the data already in local memory + uint skip_id = group_id ^ output_offset; + // locate output histogram location for this feature4 + __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS; + within_kernel_reduction64x4(feature_mask, feature4_subhists, skip_id, g_val, h_val, cnt_val, + 1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array); + } +} + +// The following line ends the string literal, adds an extra #endif at the end +// the +9 skips extra characters ")", newline, "#endif" and newline at the beginning +// )"" "\n#endif" + 9 +#endif + diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 07a8532755bc..812827381400 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -5,6 +5,7 @@ #include #include "serial_tree_learner.h" +#include "gpu_tree_learner.h" #include @@ -18,11 +19,12 @@ namespace LightGBM { * Different machine will find best split on different features, then sync global best split * It is recommonded used when #data is small or #feature is large */ -class FeatureParallelTreeLearner: public SerialTreeLearner { +template +class FeatureParallelTreeLearner: public TREELEARNER_T { public: explicit FeatureParallelTreeLearner(const TreeConfig* tree_config); ~FeatureParallelTreeLearner(); - void Init(const Dataset* train_data) override; + void Init(const Dataset* train_data, bool is_constant_hessian) override; protected: void BeforeTrain() override; @@ -43,11 +45,12 @@ class FeatureParallelTreeLearner: public SerialTreeLearner { * Workers use local data to construct histograms locally, then sync up global histograms. * It is recommonded used when #data is large or #feature is small */ -class DataParallelTreeLearner: public SerialTreeLearner { +template +class DataParallelTreeLearner: public TREELEARNER_T { public: explicit DataParallelTreeLearner(const TreeConfig* tree_config); ~DataParallelTreeLearner(); - void Init(const Dataset* train_data) override; + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetConfig(const TreeConfig* tree_config) override; protected: void BeforeTrain() override; @@ -95,11 +98,12 @@ class DataParallelTreeLearner: public SerialTreeLearner { * Here using voting to reduce features, and only aggregate histograms for selected features. 
* When #data is large and #feature is large, you can use this to have better speed-up */ -class VotingParallelTreeLearner: public SerialTreeLearner { +template +class VotingParallelTreeLearner: public TREELEARNER_T { public: explicit VotingParallelTreeLearner(const TreeConfig* tree_config); ~VotingParallelTreeLearner() { } - void Init(const Dataset* train_data) override; + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetConfig(const TreeConfig* tree_config) override; protected: void BeforeTrain() override; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 6b5220835f42..ffe45340fed8 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -37,10 +37,11 @@ SerialTreeLearner::~SerialTreeLearner() { #endif } -void SerialTreeLearner::Init(const Dataset* train_data) { +void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { train_data_ = train_data; num_data_ = train_data_->num_data(); num_features_ = train_data_->num_features(); + is_constant_hessian_ = is_constant_hessian; int max_cache_size = 0; // Get the max size of pool if (tree_config_->histogram_pool_size <= 0) { diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 6a8b4a895269..eac99ca81cda 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -18,6 +18,11 @@ #include #include #include +#ifdef USE_GPU +// Use 4KBytes aligned allocator for ordered gradients and ordered hessians when GPU is enabled. +// This is necessary to pin the two arrays in memory and make transferring faster. +#include +#endif namespace LightGBM { @@ -30,7 +35,7 @@ class SerialTreeLearner: public TreeLearner { ~SerialTreeLearner(); - void Init(const Dataset* train_data) override; + void Init(const Dataset* train_data, bool is_constant_hessian) override; void ResetTrainingData(const Dataset* train_data) override; @@ -69,7 +74,7 @@ class SerialTreeLearner: public TreeLearner { */ virtual bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf); - void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract); + virtual void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract); /*! * \brief Find best thresholds for all features, using multi-threading. @@ -130,10 +135,17 @@ class SerialTreeLearner: public TreeLearner { /*! \brief stores best thresholds for all feature for larger leaf */ std::unique_ptr larger_leaf_splits_; +#ifdef USE_GPU + /*! \brief gradients of current iteration, ordered for cache optimized, aligned to 4K page */ + std::vector> ordered_gradients_; + /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ + std::vector> ordered_hessians_; +#else /*! \brief gradients of current iteration, ordered for cache optimized */ std::vector ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized */ std::vector ordered_hessians_; +#endif /*! 
\brief Store ordered bin */ std::vector> ordered_bins_; diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index d72f404f6b1f..e5bb96e62971 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -1,19 +1,33 @@ #include #include "serial_tree_learner.h" +#include "gpu_tree_learner.h" #include "parallel_tree_learner.h" namespace LightGBM { -TreeLearner* TreeLearner::CreateTreeLearner(const std::string& type, const TreeConfig* tree_config) { - if (type == std::string("serial")) { - return new SerialTreeLearner(tree_config); - } else if (type == std::string("feature")) { - return new FeatureParallelTreeLearner(tree_config); - } else if (type == std::string("data")) { - return new DataParallelTreeLearner(tree_config); - } else if (type == std::string("voting")) { - return new VotingParallelTreeLearner(tree_config); +TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, const std::string& device_type, const TreeConfig* tree_config) { + if (device_type == std::string("cpu")) { + if (learner_type == std::string("serial")) { + return new SerialTreeLearner(tree_config); + } else if (learner_type == std::string("feature")) { + return new FeatureParallelTreeLearner(tree_config); + } else if (learner_type == std::string("data")) { + return new DataParallelTreeLearner(tree_config); + } else if (learner_type == std::string("voting")) { + return new VotingParallelTreeLearner(tree_config); + } + } + else if (device_type == std::string("gpu")) { + if (learner_type == std::string("serial")) { + return new GPUTreeLearner(tree_config); + } else if (learner_type == std::string("feature")) { + return new FeatureParallelTreeLearner(tree_config); + } else if (learner_type == std::string("data")) { + return new DataParallelTreeLearner(tree_config); + } else if (learner_type == std::string("voting")) { + return new VotingParallelTreeLearner(tree_config); + } } return nullptr; } diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 78066eb8bb5d..4b4fe650fc76 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -9,25 +9,27 @@ namespace LightGBM { -VotingParallelTreeLearner::VotingParallelTreeLearner(const TreeConfig* tree_config) - :SerialTreeLearner(tree_config) { - top_k_ = tree_config_->top_k; +template +VotingParallelTreeLearner::VotingParallelTreeLearner(const TreeConfig* tree_config) + :TREELEARNER_T(tree_config) { + top_k_ = this->tree_config_->top_k; } -void VotingParallelTreeLearner::Init(const Dataset* train_data) { - SerialTreeLearner::Init(train_data); +template +void VotingParallelTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { + TREELEARNER_T::Init(train_data, is_constant_hessian); rank_ = Network::rank(); num_machines_ = Network::num_machines(); // limit top k - if (top_k_ > num_features_) { - top_k_ = num_features_; + if (top_k_ > this->num_features_) { + top_k_ = this->num_features_; } // get max bin int max_bin = 0; - for (int i = 0; i < num_features_; ++i) { - if (max_bin < train_data_->FeatureNumBin(i)) { - max_bin = train_data_->FeatureNumBin(i); + for (int i = 0; i < this->num_features_; ++i) { + if (max_bin < this->train_data_->FeatureNumBin(i)) { + max_bin = this->train_data_->FeatureNumBin(i); } } // calculate buffer size @@ -36,29 +38,29 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) { input_buffer_.resize(buffer_size); 
output_buffer_.resize(buffer_size); - smaller_is_feature_aggregated_.resize(num_features_); - larger_is_feature_aggregated_.resize(num_features_); + smaller_is_feature_aggregated_.resize(this->num_features_); + larger_is_feature_aggregated_.resize(this->num_features_); block_start_.resize(num_machines_); block_len_.resize(num_machines_); - smaller_buffer_read_start_pos_.resize(num_features_); - larger_buffer_read_start_pos_.resize(num_features_); - global_data_count_in_leaf_.resize(tree_config_->num_leaves); + smaller_buffer_read_start_pos_.resize(this->num_features_); + larger_buffer_read_start_pos_.resize(this->num_features_); + global_data_count_in_leaf_.resize(this->tree_config_->num_leaves); - smaller_leaf_splits_global_.reset(new LeafSplits(train_data_->num_data())); - larger_leaf_splits_global_.reset(new LeafSplits(train_data_->num_data())); + smaller_leaf_splits_global_.reset(new LeafSplits(this->train_data_->num_data())); + larger_leaf_splits_global_.reset(new LeafSplits(this->train_data_->num_data())); - local_tree_config_ = *tree_config_; + local_tree_config_ = *this->tree_config_; local_tree_config_.min_data_in_leaf /= num_machines_; local_tree_config_.min_sum_hessian_in_leaf /= num_machines_; - histogram_pool_.ResetConfig(&local_tree_config_); + this->histogram_pool_.ResetConfig(&local_tree_config_); // initialize histograms for global - smaller_leaf_histogram_array_global_.reset(new FeatureHistogram[num_features_]); - larger_leaf_histogram_array_global_.reset(new FeatureHistogram[num_features_]); - auto num_total_bin = train_data_->NumTotalBin(); + smaller_leaf_histogram_array_global_.reset(new FeatureHistogram[this->num_features_]); + larger_leaf_histogram_array_global_.reset(new FeatureHistogram[this->num_features_]); + auto num_total_bin = this->train_data_->NumTotalBin(); smaller_leaf_histogram_data_.resize(num_total_bin); larger_leaf_histogram_data_.resize(num_total_bin); feature_metas_.resize(train_data->num_features()); @@ -70,7 +72,7 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) { } else { feature_metas_[i].bias = 0; } - feature_metas_[i].tree_config = tree_config_; + feature_metas_[i].tree_config = this->tree_config_; } uint64_t offset = 0; for (int j = 0; j < train_data->num_features(); ++j) { @@ -85,25 +87,27 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) { } } -void VotingParallelTreeLearner::ResetConfig(const TreeConfig* tree_config) { - SerialTreeLearner::ResetConfig(tree_config); +template +void VotingParallelTreeLearner::ResetConfig(const TreeConfig* tree_config) { + TREELEARNER_T::ResetConfig(tree_config); - local_tree_config_ = *tree_config_; + local_tree_config_ = *this->tree_config_; local_tree_config_.min_data_in_leaf /= num_machines_; local_tree_config_.min_sum_hessian_in_leaf /= num_machines_; - histogram_pool_.ResetConfig(&local_tree_config_); - global_data_count_in_leaf_.resize(tree_config_->num_leaves); + this->histogram_pool_.ResetConfig(&local_tree_config_); + global_data_count_in_leaf_.resize(this->tree_config_->num_leaves); for (size_t i = 0; i < feature_metas_.size(); ++i) { - feature_metas_[i].tree_config = tree_config_; + feature_metas_[i].tree_config = this->tree_config_; } } -void VotingParallelTreeLearner::BeforeTrain() { - SerialTreeLearner::BeforeTrain(); +template +void VotingParallelTreeLearner::BeforeTrain() { + TREELEARNER_T::BeforeTrain(); // sync global data sumup info - std::tuple data(smaller_leaf_splits_->num_data_in_leaf(), smaller_leaf_splits_->sum_gradients(), 
smaller_leaf_splits_->sum_hessians()); + std::tuple data(this->smaller_leaf_splits_->num_data_in_leaf(), this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians()); int size = sizeof(std::tuple); std::memcpy(input_buffer_.data(), &data, size); @@ -133,20 +137,21 @@ void VotingParallelTreeLearner::BeforeTrain() { global_data_count_in_leaf_[0] = std::get<0>(data); } -bool VotingParallelTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { - if (SerialTreeLearner::BeforeFindBestSplit(tree, left_leaf, right_leaf)) { +template +bool VotingParallelTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { + if (TREELEARNER_T::BeforeFindBestSplit(tree, left_leaf, right_leaf)) { data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf); data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf); if (right_leaf < 0) { return true; } else if (num_data_in_left_child < num_data_in_right_child) { // get local sumup - smaller_leaf_splits_->Init(left_leaf, data_partition_.get(), gradients_, hessians_); - larger_leaf_splits_->Init(right_leaf, data_partition_.get(), gradients_, hessians_); + this->smaller_leaf_splits_->Init(left_leaf, this->data_partition_.get(), this->gradients_, this->hessians_); + this->larger_leaf_splits_->Init(right_leaf, this->data_partition_.get(), this->gradients_, this->hessians_); } else { // get local sumup - smaller_leaf_splits_->Init(right_leaf, data_partition_.get(), gradients_, hessians_); - larger_leaf_splits_->Init(left_leaf, data_partition_.get(), gradients_, hessians_); + this->smaller_leaf_splits_->Init(right_leaf, this->data_partition_.get(), this->gradients_, this->hessians_); + this->larger_leaf_splits_->Init(left_leaf, this->data_partition_.get(), this->gradients_, this->hessians_); } return true; } else { @@ -154,14 +159,15 @@ bool VotingParallelTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_l } } -void VotingParallelTreeLearner::GlobalVoting(int leaf_idx, const std::vector& splits, std::vector* out) { +template +void VotingParallelTreeLearner::GlobalVoting(int leaf_idx, const std::vector& splits, std::vector* out) { out->clear(); if (leaf_idx < 0) { return; } // get mean number on machines score_t mean_num_data = GetGlobalDataCountInLeaf(leaf_idx) / static_cast(num_machines_); - std::vector feature_best_split(num_features_, SplitInfo()); + std::vector feature_best_split(this->num_features_, SplitInfo()); for (auto & split : splits) { int fid = split.feature; if (fid < 0) { @@ -185,8 +191,9 @@ void VotingParallelTreeLearner::GlobalVoting(int leaf_idx, const std::vector& smaller_top_features, const std::vector& larger_top_features) { - for (int i = 0; i < num_features_; ++i) { +template +void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector& smaller_top_features, const std::vector& larger_top_features) { + for (int i = 0; i < this->num_features_; ++i) { smaller_is_feature_aggregated_[i] = false; larger_is_feature_aggregated_[i] = false; } @@ -203,7 +210,7 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector& small while (cur_used_features < cur_total_feature) { // copy smaller leaf histograms first if (smaller_idx < smaller_top_features.size()) { - int inner_feature_index = train_data_->InnerFeatureIndex(smaller_top_features[smaller_idx]); + int inner_feature_index = this->train_data_->InnerFeatureIndex(smaller_top_features[smaller_idx]); ++cur_used_features; // mark local aggregated feature if (i == 
rank_) { @@ -211,9 +218,9 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector& small smaller_buffer_read_start_pos_[inner_feature_index] = static_cast(cur_size); } // copy - std::memcpy(input_buffer_.data() + reduce_scatter_size_, smaller_leaf_histogram_array_[inner_feature_index].RawData(), smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram()); - cur_size += smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram(); - reduce_scatter_size_ += smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram(); + std::memcpy(input_buffer_.data() + reduce_scatter_size_, this->smaller_leaf_histogram_array_[inner_feature_index].RawData(), this->smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram()); + cur_size += this->smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram(); + reduce_scatter_size_ += this->smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram(); ++smaller_idx; } if (cur_used_features >= cur_total_feature) { @@ -221,7 +228,7 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector& small } // then copy larger leaf histograms if (larger_idx < larger_top_features.size()) { - int inner_feature_index = train_data_->InnerFeatureIndex(larger_top_features[larger_idx]); + int inner_feature_index = this->train_data_->InnerFeatureIndex(larger_top_features[larger_idx]); ++cur_used_features; // mark local aggregated feature if (i == rank_) { @@ -229,9 +236,9 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector& small larger_buffer_read_start_pos_[inner_feature_index] = static_cast(cur_size); } // copy - std::memcpy(input_buffer_.data() + reduce_scatter_size_, larger_leaf_histogram_array_[inner_feature_index].RawData(), larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram()); - cur_size += larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram(); - reduce_scatter_size_ += larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram(); + std::memcpy(input_buffer_.data() + reduce_scatter_size_, this->larger_leaf_histogram_array_[inner_feature_index].RawData(), this->larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram()); + cur_size += this->larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram(); + reduce_scatter_size_ += this->larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram(); ++larger_idx; } } @@ -243,60 +250,61 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector& small } } -void VotingParallelTreeLearner::FindBestThresholds() { +template +void VotingParallelTreeLearner::FindBestThresholds() { // use local data to find local best splits - std::vector is_feature_used(num_features_, 0); + std::vector is_feature_used(this->num_features_, 0); #pragma omp parallel for schedule(static) - for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - if (!is_feature_used_[feature_index]) continue; - if (parent_leaf_histogram_array_ != nullptr - && !parent_leaf_histogram_array_[feature_index].is_splittable()) { - smaller_leaf_histogram_array_[feature_index].set_is_splittable(false); + for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { + if (!this->is_feature_used_[feature_index]) continue; + if (this->parent_leaf_histogram_array_ != nullptr + && !this->parent_leaf_histogram_array_[feature_index].is_splittable()) { + this->smaller_leaf_histogram_array_[feature_index].set_is_splittable(false); continue; } is_feature_used[feature_index] = 1; } bool 
use_subtract = true; - if (parent_leaf_histogram_array_ == nullptr) { + if (this->parent_leaf_histogram_array_ == nullptr) { use_subtract = false; } - ConstructHistograms(is_feature_used, use_subtract); + this->ConstructHistograms(is_feature_used, use_subtract); - std::vector smaller_bestsplit_per_features(num_features_); - std::vector larger_bestsplit_per_features(num_features_); + std::vector smaller_bestsplit_per_features(this->num_features_); + std::vector larger_bestsplit_per_features(this->num_features_); OMP_INIT_EX(); // find splits #pragma omp parallel for schedule(static) - for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); if (!is_feature_used[feature_index]) { continue; } - const int real_feature_index = train_data_->RealFeatureIndex(feature_index); - train_data_->FixHistogram(feature_index, - smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), - smaller_leaf_splits_->num_data_in_leaf(), - smaller_leaf_histogram_array_[feature_index].RawData()); - - smaller_leaf_histogram_array_[feature_index].FindBestThreshold( - smaller_leaf_splits_->sum_gradients(), - smaller_leaf_splits_->sum_hessians(), - smaller_leaf_splits_->num_data_in_leaf(), + const int real_feature_index = this->train_data_->RealFeatureIndex(feature_index); + this->train_data_->FixHistogram(feature_index, + this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), + this->smaller_leaf_splits_->num_data_in_leaf(), + this->smaller_leaf_histogram_array_[feature_index].RawData()); + + this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold( + this->smaller_leaf_splits_->sum_gradients(), + this->smaller_leaf_splits_->sum_hessians(), + this->smaller_leaf_splits_->num_data_in_leaf(), &smaller_bestsplit_per_features[feature_index]); smaller_bestsplit_per_features[feature_index].feature = real_feature_index; // only has root leaf - if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; } + if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->LeafIndex() < 0) { continue; } if (use_subtract) { - larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]); + this->larger_leaf_histogram_array_[feature_index].Subtract(this->smaller_leaf_histogram_array_[feature_index]); } else { - train_data_->FixHistogram(feature_index, larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(), - larger_leaf_splits_->num_data_in_leaf(), - larger_leaf_histogram_array_[feature_index].RawData()); + this->train_data_->FixHistogram(feature_index, this->larger_leaf_splits_->sum_gradients(), this->larger_leaf_splits_->sum_hessians(), + this->larger_leaf_splits_->num_data_in_leaf(), + this->larger_leaf_histogram_array_[feature_index].RawData()); } // find best threshold for larger child - larger_leaf_histogram_array_[feature_index].FindBestThreshold( - larger_leaf_splits_->sum_gradients(), - larger_leaf_splits_->sum_hessians(), - larger_leaf_splits_->num_data_in_leaf(), + this->larger_leaf_histogram_array_[feature_index].FindBestThreshold( + this->larger_leaf_splits_->sum_gradients(), + this->larger_leaf_splits_->sum_hessians(), + this->larger_leaf_splits_->num_data_in_leaf(), &larger_bestsplit_per_features[feature_index]); larger_bestsplit_per_features[feature_index].feature = real_feature_index; OMP_LOOP_EX_END(); @@ -332,8 +340,8 @@ void 
VotingParallelTreeLearner::FindBestThresholds() { } // global voting std::vector smaller_top_features, larger_top_features; - GlobalVoting(smaller_leaf_splits_->LeafIndex(), smaller_top_k_splits_global, &smaller_top_features); - GlobalVoting(larger_leaf_splits_->LeafIndex(), larger_top_k_splits_global, &larger_top_features); + GlobalVoting(this->smaller_leaf_splits_->LeafIndex(), smaller_top_k_splits_global, &smaller_top_features); + GlobalVoting(this->larger_leaf_splits_->LeafIndex(), larger_top_k_splits_global, &larger_top_features); // copy local histgrams to buffer CopyLocalHistogram(smaller_top_features, larger_top_features); @@ -341,11 +349,11 @@ void VotingParallelTreeLearner::FindBestThresholds() { Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, block_start_.data(), block_len_.data(), output_buffer_.data(), &HistogramBinEntry::SumReducer); - std::vector smaller_best(num_threads_); - std::vector larger_best(num_threads_); + std::vector smaller_best(this->num_threads_); + std::vector larger_best(this->num_threads_); // find best split from local aggregated histograms #pragma omp parallel for schedule(static) - for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); const int tid = omp_get_thread_num(); if (smaller_is_feature_aggregated_[feature_index]) { @@ -354,7 +362,7 @@ void VotingParallelTreeLearner::FindBestThresholds() { smaller_leaf_histogram_array_global_[feature_index].FromMemory( output_buffer_.data() + smaller_buffer_read_start_pos_[feature_index]); - train_data_->FixHistogram(feature_index, + this->train_data_->FixHistogram(feature_index, smaller_leaf_splits_global_->sum_gradients(), smaller_leaf_splits_global_->sum_hessians(), GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()), smaller_leaf_histogram_array_global_[feature_index].RawData()); @@ -367,7 +375,7 @@ void VotingParallelTreeLearner::FindBestThresholds() { &smaller_split); if (smaller_split.gain > smaller_best[tid].gain) { smaller_best[tid] = smaller_split; - smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index); + smaller_best[tid].feature = this->train_data_->RealFeatureIndex(feature_index); } } @@ -376,7 +384,7 @@ void VotingParallelTreeLearner::FindBestThresholds() { // restore from buffer larger_leaf_histogram_array_global_[feature_index].FromMemory(output_buffer_.data() + larger_buffer_read_start_pos_[feature_index]); - train_data_->FixHistogram(feature_index, + this->train_data_->FixHistogram(feature_index, larger_leaf_splits_global_->sum_gradients(), larger_leaf_splits_global_->sum_hessians(), GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()), larger_leaf_histogram_array_global_[feature_index].RawData()); @@ -389,31 +397,32 @@ void VotingParallelTreeLearner::FindBestThresholds() { &larger_split); if (larger_split.gain > larger_best[tid].gain) { larger_best[tid] = larger_split; - larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index); + larger_best[tid].feature = this->train_data_->RealFeatureIndex(feature_index); } } OMP_LOOP_EX_END(); } OMP_THROW_EX(); auto smaller_best_idx = ArrayArgs::ArgMax(smaller_best); - int leaf = smaller_leaf_splits_->LeafIndex(); - best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx]; + int leaf = this->smaller_leaf_splits_->LeafIndex(); + this->best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx]; - if (larger_leaf_splits_ != nullptr && 
larger_leaf_splits_->LeafIndex() >= 0) { - leaf = larger_leaf_splits_->LeafIndex(); + if (this->larger_leaf_splits_ != nullptr && this->larger_leaf_splits_->LeafIndex() >= 0) { + leaf = this->larger_leaf_splits_->LeafIndex(); auto larger_best_idx = ArrayArgs::ArgMax(larger_best); - best_split_per_leaf_[leaf] = larger_best[larger_best_idx]; + this->best_split_per_leaf_[leaf] = larger_best[larger_best_idx]; } } -void VotingParallelTreeLearner::FindBestSplitsForLeaves() { +template +void VotingParallelTreeLearner::FindBestSplitsForLeaves() { // find local best SplitInfo smaller_best, larger_best; - smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()]; + smaller_best = this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()]; // find local best split for larger leaf - if (larger_leaf_splits_->LeafIndex() >= 0) { - larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()]; + if (this->larger_leaf_splits_->LeafIndex() >= 0) { + larger_best = this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()]; } // sync global best info std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo)); @@ -425,34 +434,38 @@ void VotingParallelTreeLearner::FindBestSplitsForLeaves() { std::memcpy(&larger_best, output_buffer_.data() + sizeof(SplitInfo), sizeof(SplitInfo)); // copy back - best_split_per_leaf_[smaller_leaf_splits_global_->LeafIndex()] = smaller_best; + this->best_split_per_leaf_[smaller_leaf_splits_global_->LeafIndex()] = smaller_best; if (larger_best.feature >= 0 && larger_leaf_splits_global_->LeafIndex() >= 0) { - best_split_per_leaf_[larger_leaf_splits_global_->LeafIndex()] = larger_best; + this->best_split_per_leaf_[larger_leaf_splits_global_->LeafIndex()] = larger_best; } } -void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { - SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf); - const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf]; +template +void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { + TREELEARNER_T::Split(tree, best_Leaf, left_leaf, right_leaf); + const SplitInfo& best_split_info = this->best_split_per_leaf_[best_Leaf]; // set the global number of data for leaves global_data_count_in_leaf_[*left_leaf] = best_split_info.left_count; global_data_count_in_leaf_[*right_leaf] = best_split_info.right_count; // init the global sumup info if (best_split_info.left_count < best_split_info.right_count) { - smaller_leaf_splits_global_->Init(*left_leaf, data_partition_.get(), + smaller_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); - larger_leaf_splits_global_->Init(*right_leaf, data_partition_.get(), + larger_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); } else { - smaller_leaf_splits_global_->Init(*right_leaf, data_partition_.get(), + smaller_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); - larger_leaf_splits_global_->Init(*left_leaf, data_partition_.get(), + larger_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); } } +// instantiate template classes, otherwise linker cannot find the code +template class VotingParallelTreeLearner; +template class 
VotingParallelTreeLearner; } // namespace LightGBM diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index a3c41e56bfb4..241884746096 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -220,10 +220,10 @@ def test_pandas_categorical(self): gbm3.save_model('categorical.model') gbm4 = lgb.Booster(model_file='categorical.model') pred4 = list(gbm4.predict(X_test)) - self.assertListEqual(pred0, pred1) - self.assertListEqual(pred0, pred2) - self.assertListEqual(pred0, pred3) - self.assertListEqual(pred0, pred4) + np.testing.assert_almost_equal(pred0, pred1) + np.testing.assert_almost_equal(pred0, pred2) + np.testing.assert_almost_equal(pred0, pred3) + np.testing.assert_almost_equal(pred0, pred4) print("----------------------------------------------------------------------")