[pyspark] Sort workers by task ID.
The PySpark interface uses the partition ID as the task ID. Unlike the ordering of workers,
the ordering of partitions should be deterministic.
trivialfis committed Apr 28, 2024
1 parent: f355418 · commit: 06a3f59
Showing 1 changed file with 1 addition and 1 deletion.
python-package/xgboost/spark/utils.py (1 addition, 1 deletion)

@@ -55,7 +55,7 @@ def _start_tracker(context: BarrierTaskContext, n_workers: int) -> Dict[str, Any]:
     """Start Rabit tracker with n_workers"""
     env: Dict[str, Any] = {"DMLC_NUM_WORKER": n_workers}
     host = _get_host_ip(context)
-    rabit_context = RabitTracker(host_ip=host, n_workers=n_workers)
+    rabit_context = RabitTracker(host_ip=host, n_workers=n_workers, sortby="task")
     env.update(rabit_context.worker_envs())
     rabit_context.start(n_workers)
     thread = Thread(target=rabit_context.join)
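
For context, a minimal standalone sketch of the tracker startup that the patched _start_tracker performs. The RabitTracker keyword arguments and the worker_envs/start/join calls mirror the diff above; the xgboost.tracker import path and the free-standing start_tracker function name are assumptions for illustration, not part of this commit.

# Sketch of the patched tracker startup: rank workers by the task ID they
# report instead of by the order in which they connect to the tracker.
from threading import Thread
from typing import Any, Dict

from xgboost.tracker import RabitTracker  # assumed import path


def start_tracker(host_ip: str, n_workers: int) -> Dict[str, Any]:
    """Start a Rabit tracker for n_workers and return its worker env vars."""
    env: Dict[str, Any] = {"DMLC_NUM_WORKER": n_workers}
    # sortby="task" assigns worker ranks by task ID.  In xgboost.spark the
    # task ID is the Spark partition ID, which is deterministic across runs,
    # unlike the order in which executors happen to reach the tracker.
    tracker = RabitTracker(host_ip=host_ip, n_workers=n_workers, sortby="task")
    env.update(tracker.worker_envs())
    tracker.start(n_workers)
    # Wait for the tracker to finish in a background thread so the caller
    # can hand the env vars to the workers immediately.
    Thread(target=tracker.join, daemon=True).start()
    return env

With deterministic ranks, a given partition of the training data is always handled by the same worker rank, which makes distributed runs reproducible regardless of executor connection order.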
