Fix GPU categorical split memory allocation. #9529

Merged: 1 commit, Aug 29, 2023
2 changes: 1 addition & 1 deletion src/common/categorical.h
@@ -52,7 +52,7 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) {
*
* Go to left if it's NOT the matching category, which matches one-hot encoding.
*/
-inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat) {
+inline XGBOOST_DEVICE bool Decision(common::Span<CatBitField::value_type const> cats, float cat) {
KCatBitField const s_cats(cats);
if (XGBOOST_EXPECT(InvalidCat(cat), false)) {
return true;
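The point of the one-line change above: the span's element type is now derived from the bit field's declared storage type instead of hard-coding `uint32_t`, so the two declarations cannot drift apart. A minimal host-side sketch of the pattern, with `std::span` and a hypothetical `KCatBitFieldSketch` standing in for `common::Span` and `common::KCatBitField`:

```cpp
#include <cstddef>
#include <cstdint>
#include <span>  // C++20

// Hypothetical stand-in for the read-only category bit field.
struct KCatBitFieldSketch {
  using value_type = std::uint32_t;
  std::span<value_type const> bits;

  // True if category `i` is in the matching set.
  bool Check(std::size_t i) const { return (bits[i / 32] >> (i % 32)) & 1u; }
};

// Mirrors the patched signature: the element type comes from value_type, so a
// future change to the bit field's storage width propagates automatically.
bool Decision(std::span<KCatBitFieldSketch::value_type const> cats, float cat) {
  KCatBitFieldSketch const s_cats{cats};
  auto c = static_cast<std::size_t>(cat);
  if (c >= cats.size() * 32) return true;  // invalid category: treat as non-matching
  return !s_cats.Check(c);  // go left when NOT the matching category
}
```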
21 changes: 8 additions & 13 deletions src/tree/gpu_hist/evaluate_splits.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020-2022 by XGBoost Contributors
+/**
+ * Copyright 2020-2023, XGBoost Contributors
*/
#include <algorithm> // std::max
#include <vector>
@@ -11,9 +11,7 @@
#include "evaluate_splits.cuh"
#include "expand_entry.cuh"

-namespace xgboost {
-namespace tree {
-
+namespace xgboost::tree {
// With constraints
XGBOOST_DEVICE float LossChangeMissing(const GradientPairInt64 &scan,
const GradientPairInt64 &missing,
@@ -315,11 +313,11 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
common::Span<common::CatBitField::value_type> out,
DeviceSplitCandidate *p_out_split) {
auto &out_split = *p_out_split;
-  out_split.split_cats = common::CatBitField{out};
+  auto out_cats = common::CatBitField{out};

// Simple case for one hot split
if (common::UseOneHot(shared_inputs.FeatureBins(fidx), shared_inputs.param.max_cat_to_onehot)) {
-    out_split.split_cats.Set(common::AsCat(out_split.thresh));
+    out_cats.Set(common::AsCat(out_split.thresh));
return;
}

@@ -339,7 +337,7 @@
assert(partition > 0 && "Invalid partition.");
thrust::for_each(thrust::seq, beg, beg + partition, [&](size_t c) {
auto cat = shared_inputs.feature_values[c - node_offset];
-    out_split.SetCat(cat);
+    out_cats.Set(common::AsCat(cat));
});
}

@@ -444,8 +442,7 @@ void GPUHistEvaluator::EvaluateSplits(

if (split.is_cat) {
SetCategoricalSplit(shared_inputs, d_sorted_idx, fidx, i,
-                          device_cats_accessor.GetNodeCatStorage(input.nidx),
-                          &out_splits[i]);
+                          device_cats_accessor.GetNodeCatStorage(input.nidx), &out_splits[i]);
}

float base_weight =
@@ -477,6 +474,4 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
cudaMemcpyDeviceToHost));
return root_entry;
}

-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
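Taken together, the changes in this file replace the old pattern of stashing a `CatBitField` inside the split candidate with a local view over storage the evaluator owns. A reduced host-side sketch of the new shape (illustrative names throughout; `std::span` in place of `common::Span`):

```cpp
#include <cstddef>
#include <cstdint>
#include <span>

struct SplitCandidateSketch {
  std::int32_t thresh{-1};
  bool is_cat{false};
  // Deliberately no bit-field member: category bits live in per-node storage
  // owned by the evaluator, so the candidate cannot hold a stale view.
};

// Writable bit-field view over caller-provided words; lives only for the call.
struct CatBitsView {
  std::span<std::uint32_t> bits;
  void Set(std::size_t i) { bits[i / 32] |= 1u << (i % 32); }
};

void SetCategoricalSplitSketch(std::span<std::uint32_t> out,
                               std::span<std::int32_t const> matching_cats,
                               SplitCandidateSketch* p_out_split) {
  CatBitsView out_cats{out};  // like `auto out_cats = common::CatBitField{out};`
  for (auto cat : matching_cats) {
    out_cats.Set(static_cast<std::size_t>(cat));
  }
  p_out_split->is_cat = true;  // the candidate records *that* the split is
                               // categorical, not *which* categories match
}
```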
22 changes: 16 additions & 6 deletions src/tree/gpu_hist/evaluate_splits.cuh
@@ -37,8 +37,8 @@ struct EvaluateSplitSharedInputs {
common::Span<const float> feature_values;
common::Span<const float> min_fvalue;
bool is_dense;
-  XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
-  __device__ auto FeatureBins(bst_feature_t fidx) const {
+  [[nodiscard]] XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
+  [[nodiscard]] __device__ std::uint32_t FeatureBins(bst_feature_t fidx) const {
return feature_segments[fidx + 1] - feature_segments[fidx];
}
};
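For context on this hunk: `feature_segments` is a cumulative-offsets array, where entry `f` is the index of feature `f`'s first histogram bin, plus one sentinel entry at the end. That is why `Features()` is `size() - 1` and `FeatureBins()` is a difference of neighbors. A quick illustration with made-up bin counts:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
  // Three features with 4, 2, and 5 histogram bins respectively.
  std::vector<std::uint32_t> feature_segments{0, 4, 6, 11};

  std::size_t n_features = feature_segments.size() - 1;  // Features() == 3
  auto feature_bins = [&](std::size_t fidx) {            // FeatureBins(fidx)
    return feature_segments[fidx + 1] - feature_segments[fidx];
  };

  assert(n_features == 3);
  assert(feature_bins(0) == 4 && feature_bins(1) == 2 && feature_bins(2) == 5);
  return 0;
}
```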
@@ -105,7 +105,7 @@ class GPUHistEvaluator {
}

/**
-   * \brief Get device category storage of nidx for internal calculation.
+   * @brief Get device category storage of nidx for internal calculation.
*/
auto DeviceCatStorage(const std::vector<bst_node_t> &nidx) {
if (!has_categoricals_) return CatAccessor{};
@@ -120,8 +120,8 @@
/**
* \brief Get sorted index storage based on the left node of inputs.
*/
-  auto SortedIdx(int num_nodes, bst_feature_t total_bins) {
-    if(!need_sort_histogram_) return common::Span<bst_feature_t>();
+  auto SortedIdx(int num_nodes, bst_bin_t total_bins) {
+    if (!need_sort_histogram_) return common::Span<bst_feature_t>{};
cat_sorted_idx_.resize(num_nodes * total_bins);
return dh::ToSpan(cat_sorted_idx_);
}
@@ -146,12 +146,22 @@
* \brief Get host category storage for nidx. Different from the internal version, this
* returns strictly 1 node.
*/
-  common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
+  [[nodiscard]] common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
copy_stream_.View().Sync();
auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(
nidx * node_categorical_storage_size_, node_categorical_storage_size_);
return cats_out;
}

+  [[nodiscard]] auto GetDeviceNodeCats(bst_node_t nidx) {
+    copy_stream_.View().Sync();
+    if (has_categoricals_) {
+      CatAccessor accessor = {dh::ToSpan(split_cats_), node_categorical_storage_size_};
+      return common::KCatBitField{accessor.GetNodeCatStorage(nidx)};
+    } else {
+      return common::KCatBitField{};
+    }
+  }
/**
* \brief Add a split to the internal tree evaluator.
*/
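The new `GetDeviceNodeCats` pairs with the existing `GetNodeCatStorage` idea: one flat device buffer holds a fixed number of words per node, and every lookup re-derives a subspan instead of caching one. A host-side sketch of that accessor pattern (hypothetical names; `std::span` for `common::Span`):

```cpp
#include <cstddef>
#include <cstdint>
#include <span>

struct CatAccessorSketch {
  std::span<std::uint32_t> split_cats;  // flat storage for all nodes
  std::size_t node_storage_size;        // words reserved per node

  // Per-node view, computed fresh on every call.
  std::span<std::uint32_t> GetNodeCatStorage(std::size_t nidx) const {
    return split_cats.subspan(nidx * node_storage_size, node_storage_size);
  }
};

// Because callers re-derive the view on each use, growing the underlying
// buffer between uses can never leave a stale span inside a long-lived
// object such as a split candidate.
```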
36 changes: 15 additions & 21 deletions src/tree/updater_gpu_common.cuh
@@ -64,20 +64,13 @@ struct DeviceSplitCandidate {
// split.
bst_cat_t thresh{-1};

-  common::CatBitField split_cats;

[Review thread on this line]

Collaborator: Trying to understand what this PR does. Does the reference split_cats get invalidated as more categories are added? Why is it better to get the categories from the evaluator directly?

Member (author): Correct. It's invalidated when more nodes are added.

Collaborator: So same reason as why you shouldn't carry around references to vector elements? Got it.

Member (author): I think it was introduced during optimization and refactoring in 1.7. But should be fixed now. I will do some more tests myself. The error was only reproducible using a sanitizer.
bool is_cat { false };

GradientPairInt64 left_sum;
GradientPairInt64 right_sum;

XGBOOST_DEVICE DeviceSplitCandidate() {} // NOLINT

-  template <typename T>
-  XGBOOST_DEVICE void SetCat(T c) {
-    this->split_cats.Set(common::AsCat(c));
-    fvalue = std::max(this->fvalue, static_cast<float>(c));
-  }

XGBOOST_DEVICE void Update(float loss_chg_in, DefaultDirection dir_in, float fvalue_in,
int findex_in, GradientPairInt64 left_sum_in,
GradientPairInt64 right_sum_in, bool cat,
@@ -100,22 +93,23 @@
*/
XGBOOST_DEVICE void UpdateCat(float loss_chg_in, DefaultDirection dir_in, bst_cat_t thresh_in,
bst_feature_t findex_in, GradientPairInt64 left_sum_in,
-                             GradientPairInt64 right_sum_in, GPUTrainingParam const& param, const GradientQuantiser& quantiser) {
-    if (loss_chg_in > loss_chg &&
-        quantiser.ToFloatingPoint(left_sum_in).GetHess() >= param.min_child_weight &&
-        quantiser.ToFloatingPoint(right_sum_in).GetHess() >= param.min_child_weight) {
-      loss_chg = loss_chg_in;
-      dir = dir_in;
-      fvalue = std::numeric_limits<float>::quiet_NaN();
-      thresh = thresh_in;
-      is_cat = true;
-      left_sum = left_sum_in;
-      right_sum = right_sum_in;
-      findex = findex_in;
-    }
+                             GradientPairInt64 right_sum_in, GPUTrainingParam const& param,
+                             const GradientQuantiser& quantiser) {
+    if (loss_chg_in > loss_chg &&
+        quantiser.ToFloatingPoint(left_sum_in).GetHess() >= param.min_child_weight &&
+        quantiser.ToFloatingPoint(right_sum_in).GetHess() >= param.min_child_weight) {
+      loss_chg = loss_chg_in;
+      dir = dir_in;
+      fvalue = std::numeric_limits<float>::quiet_NaN();
+      thresh = thresh_in;
+      is_cat = true;
+      left_sum = left_sum_in;
+      right_sum = right_sum_in;
+      findex = findex_in;
+    }
}

-  XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }
+  [[nodiscard]] XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }

friend std::ostream& operator<<(std::ostream& os, DeviceSplitCandidate const& c) {
os << "loss_chg:" << c.loss_chg << ", "
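For readers skimming the reflowed `UpdateCat`: the only logic is a gate that accepts the candidate when it improves the loss and both children keep at least `min_child_weight` hessian. The same predicate as a standalone sketch, with plain floats replacing `GradientQuantiser::ToFloatingPoint(...).GetHess()`:

```cpp
// Sketch of the acceptance test in DeviceSplitCandidate::UpdateCat.
bool AcceptCatSplit(float loss_chg_in, float best_loss_chg,
                    float left_hess, float right_hess, float min_child_weight) {
  return loss_chg_in > best_loss_chg &&
         left_hess >= min_child_weight &&
         right_hess >= min_child_weight;
}
```

When the gate passes, `fvalue` is set to `quiet_NaN()` because a categorical split has no numeric threshold; the selected categories are identified by `thresh` plus the evaluator's bit field instead.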
50 changes: 27 additions & 23 deletions src/tree/updater_gpu_hist.cu
@@ -7,9 +7,9 @@

#include <algorithm>
#include <cmath>
-#include <limits>
-#include <memory>
-#include <utility>
+#include <cstddef>  // for size_t
+#include <memory>   // for unique_ptr, make_unique
+#include <utility>  // for move
#include <vector>

#include "../collective/communicator-inl.cuh"
@@ -216,9 +216,9 @@ struct GPUHistMakerDevice {
void InitFeatureGroupsOnce() {
if (!feature_groups) {
CHECK(page);
-      feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
-                                             dh::MaxSharedMemoryOptin(ctx_->gpu_id),
-                                             sizeof(GradientPairPrecise)));
+      feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
+                                                       dh::MaxSharedMemoryOptin(ctx_->gpu_id),
+                                                       sizeof(GradientPairPrecise));
}
}

@@ -245,10 +245,10 @@
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
dmat->Info().IsColumnSplit(), ctx_->gpu_id);

-    quantiser.reset(new GradientQuantiser(this->gpair));
+    quantiser = std::make_unique<GradientQuantiser>(this->gpair);

row_partitioner.reset(); // Release the device memory first before reallocating
-    row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
+    row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows);

// Init histogram
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
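The `reset(new T(...))` to `std::make_unique` conversions above are behavior-preserving modernization. One detail worth keeping in mind: for `row_partitioner` the explicit `reset()` on the preceding line stays, because plain assignment constructs the new object before destroying the old one, and here the old object's device memory must be released first. A generic before/after sketch (hypothetical `Buffer` type):

```cpp
#include <cstddef>
#include <memory>

struct Buffer {
  explicit Buffer(std::size_t /*n_bytes*/) { /* allocate (device) memory */ }
};

void Rebuild(std::unique_ptr<Buffer>& buf) {
  // Before: a naked `new` handed to reset().
  buf.reset(new Buffer(1 << 20));

  // After: allocation and ownership transfer in a single expression.
  buf = std::make_unique<Buffer>(1 << 20);

  // If old and new must never coexist (e.g. scarce device memory),
  // release the old one explicitly first, as the XGBoost code does:
  buf.reset();
  buf = std::make_unique<Buffer>(1 << 20);
}
```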
@@ -295,7 +295,7 @@
dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
// Store the feature set ptrs so they dont go out of scope before the kernel is called
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;
-    for (size_t i = 0; i < candidates.size(); i++) {
+    for (std::size_t i = 0; i < candidates.size(); i++) {
auto candidate = candidates.at(i);
int left_nidx = tree[candidate.nid].LeftChild();
int right_nidx = tree[candidate.nid].RightChild();
@@ -328,14 +328,13 @@
d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));

-    this->evaluator_.EvaluateSplits(nidx, max_active_features,
-                                    dh::ToSpan(d_node_inputs), shared_inputs,
-                                    dh::ToSpan(entries));
+    this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
+                                    shared_inputs, dh::ToSpan(entries));
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
cudaMemcpyDeviceToHost));
dh::DefaultStream().Sync();
}
}
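A note on the tail of this hunk: `cudaMemcpyAsync` into the pinned `pinned_candidates_out` buffer returns before the copy completes, so the `dh::DefaultStream().Sync()` is what makes the results safe to read on the host. The same ordering in bare CUDA runtime calls (a sketch; error checking omitted):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  float* d_buf = nullptr;
  float* h_pinned = nullptr;
  cudaMalloc(&d_buf, 4 * sizeof(float));
  cudaHostAlloc(&h_pinned, 4 * sizeof(float), cudaHostAllocDefault);
  cudaMemsetAsync(d_buf, 0, 4 * sizeof(float));

  // Enqueued on the default stream; may still be in flight when this returns.
  cudaMemcpyAsync(h_pinned, d_buf, 4 * sizeof(float), cudaMemcpyDeviceToHost);

  // The host must not read h_pinned before this point; this is the role
  // dh::DefaultStream().Sync() plays in EvaluateSplits.
  cudaStreamSynchronize(0);

  std::printf("%f\n", static_cast<double>(h_pinned[0]));
  cudaFreeHost(h_pinned);
  cudaFree(d_buf);
  return 0;
}
```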

void BuildHist(int nidx) {
auto d_node_hist = hist.GetNodeHistogram(nidx);
@@ -367,31 +366,37 @@
struct NodeSplitData {
RegTree::Node split_node;
FeatureType split_type;
-    common::CatBitField node_cats;
+    common::KCatBitField node_cats;
};

-  void UpdatePosition(const std::vector<GPUExpandEntry>& candidates, RegTree* p_tree) {
-    if (candidates.empty()) return;
-    std::vector<int> nidx(candidates.size());
-    std::vector<int> left_nidx(candidates.size());
-    std::vector<int> right_nidx(candidates.size());
+  void UpdatePosition(std::vector<GPUExpandEntry> const& candidates, RegTree* p_tree) {
+    if (candidates.empty()) {
+      return;
+    }
+
+    std::vector<bst_node_t> nidx(candidates.size());
+    std::vector<bst_node_t> left_nidx(candidates.size());
+    std::vector<bst_node_t> right_nidx(candidates.size());
std::vector<NodeSplitData> split_data(candidates.size());

for (size_t i = 0; i < candidates.size(); i++) {
-      auto& e = candidates[i];
+      auto const& e = candidates[i];
RegTree::Node split_node = (*p_tree)[e.nid];
auto split_type = p_tree->NodeSplitType(e.nid);
nidx.at(i) = e.nid;
left_nidx.at(i) = split_node.LeftChild();
right_nidx.at(i) = split_node.RightChild();
-      split_data.at(i) = NodeSplitData{split_node, split_type, e.split.split_cats};
+      split_data.at(i) = NodeSplitData{split_node, split_type, evaluator_.GetDeviceNodeCats(e.nid)};

CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
}

auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
row_partitioner->UpdatePositionBatch(
nidx, left_nidx, right_nidx, split_data,
[=] __device__(bst_uint ridx, const NodeSplitData& data) {
// given a row index, returns the node id it belongs to
-      bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
+      float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
// Missing value
bool go_left = true;
if (isnan(cut_value)) {
@@ -621,7 +626,6 @@
CHECK(common::CheckNAN(candidate.split.fvalue));
std::vector<common::CatBitField::value_type> split_cats;

-    CHECK_GT(candidate.split.split_cats.Bits().size(), 0);
auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
auto n_bins_feature = page->Cuts().FeatureBins(candidate.split.findex);
split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature), 0);
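Finally, the consumer of `NodeSplitData::node_cats` is the row-partition lambda shown earlier: for a categorical split a row goes left when its category is not in the node's matching set, mirroring `Decision()` in categorical.h. A simplified host-side sketch under those assumptions (hypothetical types; the real code also handles dense/sparse lookup, and conventions such as the numerical comparison direction are simplified here):

```cpp
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <span>

enum class FeatureType : std::uint8_t { kNumerical, kCategorical };

struct NodeSplitDataSketch {
  float split_value;                         // threshold for numerical splits
  bool default_left;                         // direction for missing values
  FeatureType split_type;
  std::span<std::uint32_t const> node_cats;  // matching categories, as bits
};

bool GoLeft(float fvalue, NodeSplitDataSketch const& d) {
  if (std::isnan(fvalue)) return d.default_left;  // missing value
  if (d.split_type == FeatureType::kCategorical) {
    auto c = static_cast<std::size_t>(fvalue);
    if (c / 32 >= d.node_cats.size()) return true;  // invalid category: go left
    bool matches = (d.node_cats[c / 32] >> (c % 32)) & 1u;
    return !matches;  // left when NOT the matching category (one-hot rule)
  }
  return fvalue < d.split_value;  // numerical: left if below the threshold
}
```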