diff --git a/python-package/xgboost/tracker.py b/python-package/xgboost/tracker.py index 142a70fc50b3..606c63791c6d 100644 --- a/python-package/xgboost/tracker.py +++ b/python-package/xgboost/tracker.py @@ -137,15 +137,9 @@ def assign_rank( return self._get_remote(wait_conn, nnset) def _get_remote( - self, wait_conn: Dict[int, "WorkerEntry"], nnset: Set[int] + self, wait_conn: Dict[int, "WorkerEntry"], badset: Set[int] ) -> List[int]: while True: - ngood = self.sock.recvint() - goodset = set() - for _ in range(ngood): - goodset.add(self.sock.recvint()) - assert goodset.issubset(nnset) - badset = nnset - goodset conset = [] for r in badset: if r in wait_conn: @@ -343,7 +337,7 @@ def accept_workers(self, n_workers: int) -> None: shutdown[s.rank] = s logging.debug("Received %s signal from %d", s.cmd, s.rank) continue - assert s.cmd in ("start", "recover") + assert s.cmd == "start" # lazily initialize the workers if tree_map is None: assert s.cmd == "start" diff --git a/rabit/src/allreduce_base.cc b/rabit/src/allreduce_base.cc index e123b52d899d..ac08ac12a2cf 100644 --- a/rabit/src/allreduce_base.cc +++ b/rabit/src/allreduce_base.cc @@ -318,21 +318,10 @@ bool AllreduceBase::ReConnectLinks(const char *cmd) { // get number of to connect and number of to accept nodes from tracker int num_conn, num_accept, num_error = 1; do { - // send over good links - std::vector good_link; for (auto & all_link : all_links) { - if (!all_link.sock.BadSocket()) { - good_link.push_back(static_cast(all_link.rank)); - } else { - if (!all_link.sock.IsClosed()) all_link.sock.Close(); - } + all_link.sock.Close(); } - int ngood = static_cast(good_link.size()); // tracker construct goodset - Assert(tracker.SendAll(&ngood, sizeof(ngood)) == sizeof(ngood), "ReConnectLink failure 5"); - for (int &i : good_link) { - Assert(tracker.SendAll(&i, sizeof(i)) == sizeof(i), "ReConnectLink failure 6"); - } Assert(tracker.RecvAll(&num_conn, sizeof(num_conn)) == sizeof(num_conn), "ReConnectLink failure 7"); Assert(tracker.RecvAll(&num_accept, sizeof(num_accept)) == sizeof(num_accept),