-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support vertical federated learning #8932
Changes from 3 commits
d4eb488
858bded
e1f67f2
81235a5
1e0c070
6ac73e3
5f8c7e2
21d1519
919dadb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -440,7 +440,7 @@ class LearnerConfiguration : public Learner { | |
info.Validate(Ctx()->gpu_id); | ||
// We estimate it from input data. | ||
linalg::Tensor<float, 1> base_score; | ||
UsePtr(obj_)->InitEstimation(info, &base_score); | ||
InitEstimation(info, &base_score); | ||
CHECK_EQ(base_score.Size(), 1); | ||
mparam_.base_score = base_score(0); | ||
CHECK(!std::isnan(mparam_.base_score)); | ||
|
@@ -857,6 +857,25 @@ class LearnerConfiguration : public Learner { | |
mparam_.num_target = n_targets; | ||
} | ||
} | ||
|
||
void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) { | ||
Reviewer: What happens if we just calculate the gradient using individual workers? Is the gradient still the same? If so, we can just let them calculate it themselves. Author: Since we don't have labels on the non-zero-rank workers, they won't be able to calculate the gradient. |
||
// Special handling for vertical federated learning. | ||
if (collective::IsFederated() && info.data_split_mode == DataSplitMode::kCol) { | ||
// We assume labels are only available on worker 0, so the estimation is calculated there | ||
// and added to other workers. | ||
if (collective::GetRank() == 0) { | ||
UsePtr(obj_)->InitEstimation(info, base_score); | ||
collective::Broadcast(base_score->Data()->HostPointer(), | ||
sizeof(bst_float) * base_score->Size(), 0); | ||
} else { | ||
base_score->Reshape(1); | ||
collective::Broadcast(base_score->Data()->HostPointer(), | ||
sizeof(bst_float) * base_score->Size(), 0); | ||
} | ||
} else { | ||
UsePtr(obj_)->InitEstimation(info, base_score); | ||
} | ||
} | ||
}; | ||
|
||
std::string const LearnerConfiguration::kEvalMetric {"eval_metric"}; // NOLINT | ||
|
@@ -1303,7 +1322,7 @@ class LearnerImpl : public LearnerIO { | |
monitor_.Stop("PredictRaw"); | ||
|
||
monitor_.Start("GetGradient"); | ||
obj_->GetGradient(predt.predictions, train->Info(), iter, &gpair_); | ||
GetGradient(predt.predictions, train->Info(), iter, &gpair_); | ||
monitor_.Stop("GetGradient"); | ||
TrainingObserver::Instance().Observe(gpair_, "Gradients"); | ||
|
||
|
@@ -1482,6 +1501,28 @@ class LearnerImpl : public LearnerIO { | |
} | ||
|
||
private: | ||
void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info, int iteration, | ||
HostDeviceVector<GradientPair>* out_gpair) { | ||
// Special handling for vertical federated learning. | ||
if (collective::IsFederated() && info.data_split_mode == DataSplitMode::kCol) { | ||
// We assume labels are only available on worker 0, so the gradients are calculated there | ||
// and broadcast to other workers. | ||
if (collective::GetRank() == 0) { | ||
obj_->GetGradient(preds, info, iteration, out_gpair); | ||
collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair), | ||
0); | ||
} else { | ||
CHECK_EQ(info.labels.Size(), 0) | ||
Reviewer: I think it would be difficult for users to specify their own worker rank once we put xgboost in an automated pipeline. Looking at your NVFLARE example, the rank is not assigned by the user. Author: I think we can check whether the label size is 0 here to determine who needs to calculate the gradient. But in general we need stable ranks for the trained model to be useful for inference. That's more of an NVFLARE requirement; I'll ask them. Reviewer: Is there any way to automatically agree on who should be the one to own the labels? Maybe it's easier to have a fully automated pipeline if everyone has equal access to the labels? Just curious from a user's perspective. Author: Sometimes (most times?) it's not possible for all the parties to have access to the labels. For example, a hospital may have the diagnosis results of a patient, while labs only have access to blood work, DNA tests, etc. I think the best way to guarantee the ordering for now is to always launch the workers in the same sequence. Since federated learning is usually done by a single admin, this is a reasonable solution. I'll ask the NVFLARE team to see if they can add some new features to better support this. |
||
<< "In vertical federated learning, labels should only be on the first worker"; | ||
out_gpair->Resize(preds.Size()); | ||
collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair), | ||
0); | ||
} | ||
} else { | ||
obj_->GetGradient(preds, info, iteration, out_gpair); | ||
} | ||
} | ||
|
||
/*! \brief random number transformation seed. */ | ||
static int32_t constexpr kRandSeedMagic = 127; | ||
// gradient pairs | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,7 +21,8 @@ | |
namespace xgboost { | ||
namespace tree { | ||
namespace cpu_impl { | ||
void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair, | ||
void FitStump(Context const* ctx, MetaInfo const& info, | ||
linalg::TensorView<GradientPair const, 2> gpair, | ||
linalg::VectorView<float> out) { | ||
auto n_targets = out.Size(); | ||
CHECK_EQ(n_targets, gpair.Shape(1)); | ||
|
@@ -43,8 +44,12 @@ void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpai | |
} | ||
} | ||
CHECK(h_sum.CContiguous()); | ||
collective::Allreduce<collective::Operation::kSum>( | ||
reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2); | ||
|
||
// In vertical federated learning, only worker 0 needs to call this, no need to do an allreduce. | ||
Reviewer: Maybe we can simply run it for all workers to remove the condition? We have a [remainder of comment lost in page extraction]. Author: The issue is in [remainder of comment lost in page extraction]. |
||
if (!collective::IsFederated() || info.data_split_mode != DataSplitMode::kCol) { | ||
collective::Allreduce<collective::Operation::kSum>( | ||
reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2); | ||
} | ||
|
||
for (std::size_t i = 0; i < h_sum.Size(); ++i) { | ||
out(i) = static_cast<float>(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess())); | ||
|
@@ -64,15 +69,15 @@ inline void FitStump(Context const*, linalg::TensorView<GradientPair const, 2>, | |
#endif // !defined(XGBOOST_USE_CUDA) | ||
} // namespace cuda_impl | ||
|
||
void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair, | ||
void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair, | ||
bst_target_t n_targets, linalg::Vector<float>* out) { | ||
out->SetDevice(ctx->gpu_id); | ||
out->Reshape(n_targets); | ||
auto n_samples = gpair.Size() / n_targets; | ||
|
||
gpair.SetDevice(ctx->gpu_id); | ||
auto gpair_t = linalg::MakeTensorView(ctx, &gpair, n_samples, n_targets); | ||
ctx->IsCPU() ? cpu_impl::FitStump(ctx, gpair_t, out->HostView()) | ||
ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView()) | ||
: cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id)); | ||
} | ||
} // namespace tree | ||
|
Reviewer: This potentially pulls data from the device to the host.
Author: Agreed, but it's not much different from some of the other methods there.