Support both row-wise and col-wise multi-threading #2699

Merged: 33 commits, Feb 2, 2020

Commits
c8883fc
commit
guolinke Jan 20, 2020
281dd32
fix a bug
guolinke Jan 20, 2020
ea718c2
fix bug
guolinke Jan 21, 2020
2ad4af5
reset to track changes
guolinke Jan 30, 2020
748c95a
refine the auto choose logic
guolinke Jan 30, 2020
0340ffd
sort the time stats output
guolinke Jan 30, 2020
d3434c7
fix include
guolinke Jan 30, 2020
8c4ea1a
change multi_val_bin_sparse_threshold
guolinke Jan 30, 2020
6cac288
add cmake
guolinke Jan 30, 2020
afdbf3c
add _mm_malloc and _mm_free for cross platform
guolinke Jan 30, 2020
210ac4b
fix cmake bug
guolinke Jan 30, 2020
ad2865d
timer for split
guolinke Jan 30, 2020
4c4a33b
try to fix cmake
guolinke Jan 30, 2020
2a33dcb
fix tests
guolinke Jan 30, 2020
256e6d9
refactor DataPartition::Split
guolinke Jan 30, 2020
a722b38
Merge remote-tracking branch 'origin/master' into sparse_bin_clean
guolinke Jan 30, 2020
7a59f19
fix test
guolinke Jan 30, 2020
1ac8283
typo
guolinke Jan 30, 2020
5b8de4f
formating
guolinke Jan 30, 2020
106c081
Revert "formating"
guolinke Jan 31, 2020
382e13e
add document
guolinke Jan 31, 2020
dec3d79
[R-package] Added tests on use of force_col_wise and force_row_wise i…
jameslamb Jan 31, 2020
d2fb9b3
naming
guolinke Jan 31, 2020
5db5d74
fix gpu code
guolinke Jan 31, 2020
7fda05a
Update include/LightGBM/bin.h
guolinke Jan 31, 2020
27a7209
Update src/treelearner/ocl/histogram16.cl
guolinke Jan 31, 2020
4623cd4
test: swap compilers for CI
StrikerRUS Jan 31, 2020
38d1e57
fix omp
guolinke Feb 1, 2020
8e27631
not avx2
guolinke Feb 1, 2020
c86a479
no aligned for feature histogram
guolinke Feb 1, 2020
737e9c9
Revert "refactor DataPartition::Split"
guolinke Feb 1, 2020
ce5f66b
slightly refactor data partition
guolinke Feb 1, 2020
a123c47
reduce the memory cost
guolinke Feb 2, 2020
2 changes: 1 addition & 1 deletion R-package/tests/testthat/test_basic.R
@@ -288,7 +288,7 @@ test_that("lgb.train() works with force_col_wise and force_row_wise", {

# check some basic details of the boosters just to be sure force_col_wise
# and force_row_wise are not causing any weird side effects
for (bst in list(bst_row_wise, bst_colwise)){
for (bst in list(bst_row_wise, bst_colwise)) {
expect_equal(bst$current_iter(), nrounds)
parsed_model <- jsonlite::fromJSON(bst$dump_model())
expect_equal(parsed_model$objective, "binary sigmoid:1")
2 changes: 1 addition & 1 deletion docs/Parameters.rst
@@ -196,7 +196,7 @@ Learning Control Parameters

- Recommend ``force_col_wise=true`` when:

- the number of cloumns is large, or the total number of bin is large
- the number of columns is large, or the total number of bin is large

- when ``num_threads`` is large, e.g. ``>20``

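As a usage illustration of the documented recommendation, here is a minimal sketch of supplying the new parameter through LightGBM's public C API (`c_api.h`); the data file name, parameter values, and iteration count are placeholders, not part of this PR:

```cpp
// Minimal sketch, assuming LightGBM's public C API from <LightGBM/c_api.h>.
// "train.txt" and the parameter string are placeholder assumptions.
#include <LightGBM/c_api.h>
#include <cstdio>

int main() {
  DatasetHandle train = nullptr;
  // Force col-wise histogram building, as recommended above for wide
  // datasets or large num_threads.
  const char* params = "objective=binary force_col_wise=true num_threads=24";
  if (LGBM_DatasetCreateFromFile("train.txt", params, nullptr, &train) != 0) {
    std::fprintf(stderr, "failed to load dataset\n");
    return 1;
  }
  BoosterHandle booster = nullptr;
  LGBM_BoosterCreate(train, params, &booster);
  int is_finished = 0;
  for (int i = 0; i < 10 && !is_finished; ++i) {
    LGBM_BoosterUpdateOneIter(booster, &is_finished);  // one boosting round
  }
  LGBM_BoosterFree(booster);
  LGBM_DatasetFree(train);
  return 0;
}
```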
2 changes: 1 addition & 1 deletion include/LightGBM/config.h
@@ -216,7 +216,7 @@ struct Config {

// desc = set ``force_col_wise=true`` will force LightGBM to use col-wise histogram build
// desc = Recommend ``force_col_wise=true`` when:
// descl2 = the number of cloumns is large, or the total number of bin is large
// descl2 = the number of columns is large, or the total number of bin is large
// descl2 = when ``num_threads`` is large, e.g. ``>20``
// descl2 = want to use small ``feature_fraction``, e.g. ``0.5``, to speed-up
// descl2 = want to reduce memory cost
5 changes: 5 additions & 0 deletions src/io/config.cpp
@@ -312,6 +312,11 @@ void Config::CheckParamConflict() {
num_leaves = static_cast<int>(full_num_leaves);
}
}
// force col-wise for gpu
if (device_type == std::string("gpu")) {
force_col_wise = true;
force_row_wise = false;
}
}

std::string Config::ToString() const {
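The hunk above pins the GPU device to the col-wise path. A standalone sketch of that resolution rule, using hypothetical names rather than LightGBM's Config members:

```cpp
// Standalone sketch (hypothetical names) of the conflict-resolution rule:
// when the device only implements col-wise histogram kernels, the col-wise
// flag wins and the row-wise flag is cleared, regardless of user settings.
#include <string>

struct HistFlags {
  bool force_col_wise = false;
  bool force_row_wise = false;
};

void ResolveHistLayout(HistFlags* f, const std::string& device_type) {
  if (device_type == "gpu") {
    f->force_col_wise = true;   // GPU builds histograms per feature group
    f->force_row_wise = false;
  }
}
```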
195 changes: 93 additions & 102 deletions src/io/dataset.cpp
@@ -188,7 +188,7 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMapper>>&
if (!second_round_features.empty()) {
features_in_group.emplace_back();
conflict_marks.emplace_back(total_sample_cnt, false);
bool is_multi_val = false;
bool is_multi_val = is_use_gpu ? true : false;
int conflict_cnt = 0;
for (auto fidx : second_round_features) {
features_in_group.back().push_back(fidx);
Expand Down Expand Up @@ -1211,114 +1211,105 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
int num_used_dense_group = static_cast<int>(used_dense_group.size());
global_timer.Stop("Dataset::Get used group");
global_timer.Start("Dataset::dense_bin_histogram");
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
}
}
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_aligned_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin* KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
0,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
OMP_LOOP_EX_END();
if (num_used_dense_group > 0) {
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
}
}
OMP_THROW_EX();
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_aligned_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
data_ptr);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();

} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_aligned_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin* KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
0,
num_data,
ptr_ordered_grad,
data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_aligned_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
}
OMP_LOOP_EX_END();
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
}
} else {
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_aligned_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin* KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
0,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
OMP_LOOP_EX_END();
OMP_THROW_EX();
}
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_aligned_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin* KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
0,
num_data,
ptr_ordered_grad,
data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_aligned_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_aligned_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
0, num_data, ptr_ordered_grad, data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
}
OMP_LOOP_EX_END();
}
OMP_LOOP_EX_END();
OMP_THROW_EX();
}
OMP_THROW_EX();
}
}
global_timer.Stop("Dataset::dense_bin_histogram");
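For readers skimming the refactor above, the underlying pattern, gather gradients for the leaf's rows into contiguous buffers, then accumulate interleaved grad/hess sums per bin, compresses to a short sketch; all types and names here are simplified stand-ins, not LightGBM's:

```cpp
// Simplified sketch of one dense group's histogram build, keeping the
// interleaved grad/hess layout (GET_GRAD at 2*b, GET_HESS at 2*b+1).
#include <cstdint>
#include <cstring>
#include <vector>

using data_size_t = int32_t;
using hist_t = double;

void ConstructHistogram(const std::vector<uint32_t>& bins,  // per-row bin id
                        const data_size_t* indices,         // rows in this leaf
                        data_size_t cnt, int num_bin,
                        const float* grad, const float* hess,
                        hist_t* out /* 2 * num_bin interleaved entries */) {
  std::memset(out, 0, sizeof(hist_t) * 2 * num_bin);
  std::vector<float> og(cnt), oh(cnt);     // "ordered" gradients/hessians
  for (data_size_t i = 0; i < cnt; ++i) {  // gather step: sequential reads later
    og[i] = grad[indices[i]];
    oh[i] = hess[indices[i]];
  }
  for (data_size_t i = 0; i < cnt; ++i) {  // accumulate per bin
    const uint32_t b = bins[indices[i]];
    out[2 * b] += og[i];       // sum of gradients
    out[2 * b + 1] += oh[i];   // sum of hessians
  }
}
```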
6 changes: 3 additions & 3 deletions src/objective/rank_xendcg_objective.hpp
@@ -73,9 +73,9 @@ class RankXENDCG: public ObjectiveFunction {
// Skip query if sum of labels is 0.
float sum_labels = 0;
for (data_size_t i = 0; i < cnt; ++i) {
sum_labels += phi(label[i], gammas[i]);
sum_labels += static_cast<float>(phi(label[i], gammas[i]));
}
if (sum_labels == 0) {
if (std::fabs(sum_labels) < kEpsilon) {
return;
}

Expand Down Expand Up @@ -111,7 +111,7 @@ class RankXENDCG: public ObjectiveFunction {
}

double phi(const label_t l, double g) const {
return Common::Pow(2, l) - g;
return Common::Pow(2, static_cast<int>(l)) - g;
}

const char* GetName() const override {
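Both numeric fixes in this file isolate to a few lines: an accumulated float should be compared against a tolerance rather than exactly 0, and the integer-exponent pow wants an explicit cast from the float label. A sketch under assumed values (kTol is a placeholder; LightGBM defines its own kEpsilon):

```cpp
// Isolated sketch of the two numeric fixes above; kTol is an assumption.
#include <cmath>

constexpr double kTol = 1e-15;  // assumed tolerance, not LightGBM's kEpsilon

double Phi(float label, double gamma) {
  return std::pow(2.0, static_cast<int>(label)) - gamma;  // 2^l - g
}

// Skip a query whose accumulated label mass is effectively zero.
bool SkipQuery(const float* labels, const double* gammas, int cnt) {
  float sum = 0.0f;
  for (int i = 0; i < cnt; ++i) {
    sum += static_cast<float>(Phi(labels[i], gammas[i]));
  }
  return std::fabs(sum) < kTol;  // was `sum == 0`, fragile for floats
}
```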
51 changes: 29 additions & 22 deletions src/treelearner/gpu_tree_learner.cpp
@@ -76,20 +76,16 @@ void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) {
a.f = GET_GRAD(h1, i);
b.f = GET_GRAD(h2, i);
int32_t ulps = Float_t::ulp_diff(a, b);
if (ulps > 0) {
printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps);
goto err;
}
if (ulps > 0) {
// printf("grad %g != %g (%d ULPs)\n", h1[i].sum_gradients, h2[i].sum_gradients, ulps);
// printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps);
// goto err;
}
a.f = GET_HESS(h1, i);
b.f = GET_HESS(h2, i);
ulps = Float_t::ulp_diff(a, b);
if (ulps > 0) {
// printf("hessian %g != %g (%d ULPs)\n", h1[i].sum_hessians, h2[i].sum_hessians, ulps);
// goto err;
if (std::fabs(a.f - b.f) >= 1e-20) {
printf("hessian %g != %g (%d ULPs)\n", GET_HESS(h1, i), GET_HESS(h2, i), ulps);
goto err;
}
}
return;
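The comparison above relies on a ULP distance between floats. A self-contained sketch of one common way to compute it (an assumed helper, not necessarily LightGBM's Float_t::ulp_diff):

```cpp
// Distance in representable floats between x and y; 0 means bit-identical.
#include <cstdint>
#include <cstdlib>
#include <cstring>

int64_t UlpDiff(float x, float y) {
  int32_t ix, iy;
  std::memcpy(&ix, &x, sizeof ix);  // bit-level copy avoids aliasing UB
  std::memcpy(&iy, &y, sizeof iy);
  // Remap negatives so integer ordering matches float ordering.
  if (ix < 0) ix = INT32_MIN - ix;
  if (iy < 0) iy = INT32_MIN - iy;
  return std::llabs(static_cast<int64_t>(ix) - static_cast<int64_t>(iy));
}
```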
@@ -213,7 +209,6 @@ void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) {
int ind = 0;
for (int j = 0; j < bin_size; ++j) {
double sum_g = 0.0, sum_h = 0.0;
size_t cnt = 0;
for (int k = 0; k < device_bin_mults_[i]; ++k) {
sum_g += GET_GRAD(hist_outputs, i * device_bin_size_+ ind);
sum_h += GET_HESS(hist_outputs, i * device_bin_size_+ ind);
@@ -679,6 +674,9 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
printf("bin size: ");
#endif
for (int i = 0; i < num_feature_groups_; ++i) {
if (train_data_->IsMultiGroup(i)) {
continue;
}
#if GPU_DEBUG >= 1
printf("%d, ", train_data_->FeatureGroupNumBin(i));
#endif
@@ -971,9 +969,8 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
nullptr, nullptr,
nullptr, nullptr);
// then construct sparse features on CPU
// We set data_indices to null to avoid rebuilding ordered gradients/hessians
train_data_->ConstructHistograms(is_sparse_feature_used,
nullptr, smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
multi_val_bin_.get(), is_hist_colwise_,
@@ -1002,16 +999,27 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
hist_t* gpu_histogram = new hist_t[size * 2];
data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf();
printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size);
std::copy(current_histogram, current_histogram + size, gpu_histogram);
std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(hist_t) * 2);
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
num_data != num_data_ ? smaller_leaf_splits_->data_indices() : nullptr,
num_data,
num_data != num_data_ ? ordered_gradients_.data() : gradients_,
num_data != num_data_ ? ordered_hessians_.data() : hessians_,
current_histogram);
std::copy(current_histogram, current_histogram + size * 2, gpu_histogram);
std::memset(current_histogram, 0, size * sizeof(hist_t) * 2);
if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) { continue; }
if (num_data != num_data_) {
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
smaller_leaf_splits_->data_indices(),
0,
num_data,
ordered_gradients_.data(),
ordered_hessians_.data(),
current_histogram);
} else {
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
0,
num_data,
gradients_,
hessians_,
current_histogram);
}
CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index);
std::copy(gpu_histogram, gpu_histogram + size, current_histogram);
std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram);
delete [] gpu_histogram;
}
#endif
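The other behavioral change in this hunk is that the sparse-CPU histogram call now receives the leaf's data_indices instead of nullptr. A toy sketch (hypothetical signature) of why the row subset matters:

```cpp
// Toy sketch: with indices == nullptr the loop visits rows [0, cnt), which
// equals the leaf's data only when the leaf holds the whole dataset;
// otherwise the histogram mixes in rows from other leaves.
#include <cstdint>

using data_size_t = int32_t;

void AccumulateLeaf(const uint32_t* bins, const data_size_t* indices,
                    data_size_t cnt, const float* grad, double* hist) {
  for (data_size_t i = 0; i < cnt; ++i) {
    const data_size_t row = (indices != nullptr) ? indices[i] : i;
    hist[bins[row]] += grad[row];  // wrong rows if indices is dropped
  }
}
```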
@@ -1024,9 +1032,8 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data());
// then construct sparse features on CPU
// We set data_indices to null to avoid rebuilding ordered gradients/hessians
train_data_->ConstructHistograms(is_sparse_feature_used,
nullptr, larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
multi_val_bin_.get(), is_hist_colwise_,