diff --git a/src/v/cluster/scheduling/leader_balancer.cc b/src/v/cluster/scheduling/leader_balancer.cc
index 78a3ce2732e1..08e90a1e6662 100644
--- a/src/v/cluster/scheduling/leader_balancer.cc
+++ b/src/v/cluster/scheduling/leader_balancer.cc
@@ -134,6 +134,10 @@ void leader_balancer::on_leadership_change(
     // Update in flight state
     if (auto it = _in_flight_changes.find(group);
         it != _in_flight_changes.end()) {
+        vlog(
+          clusterlog.trace,
+          "transfer of group {} finished, removing from in-flight set",
+          group);
         _in_flight_changes.erase(it);
         check_unregister_leadership_change_notification();
 
@@ -511,6 +515,7 @@ ss::future<ss::stop_iteration> leader_balancer::balance() {
         co_return ss::stop_iteration::yes;
     }
 
+    size_t num_dispatched = 0;
     for (size_t i = 0; i < allowed_change_cnt; i++) {
         if (should_stop_balance()) {
             co_return ss::stop_iteration::yes;
@@ -519,11 +524,14 @@ ss::future<ss::stop_iteration> leader_balancer::balance() {
         auto transfer = strategy->find_movement(muted_groups());
         if (!transfer) {
             vlog(
-              clusterlog.debug,
-              "No leadership balance improvements found with total delta {}, "
-              "number of muted groups {}",
+              clusterlog.info,
+              "Leadership balancer tick: no further improvements found, "
+              "total error: {:.4}, number of muted groups: {}, "
+              "number in flight: {}, dispatched in this tick: {}",
               strategy->error(),
-              _muted.size());
+              _muted.size(),
+              _in_flight_changes.size(),
+              num_dispatched);
             if (!_timer.armed()) {
                 _timer.arm(_idle_timeout());
             }
@@ -531,6 +539,16 @@ ss::future<ss::stop_iteration> leader_balancer::balance() {
             co_return ss::stop_iteration::yes;
         }
 
+        vlog(
+          clusterlog.trace,
+          "dispatching transfer of group {}: {} -> {}, "
+          "current num_dispatched: {}, in_flight: {}",
+          transfer->group,
+          transfer->from,
+          transfer->to,
+          num_dispatched,
+          _in_flight_changes.size());
+
         _in_flight_changes[transfer->group] = {
           *transfer, clock_type::now() + _mute_timeout()};
         check_register_leadership_change_notification();
@@ -539,10 +557,12 @@ ss::future<ss::stop_iteration> leader_balancer::balance() {
         if (!success) {
             vlog(
               clusterlog.info,
-              "Error transferring leadership group {} from {} to {}",
+              "Error transferring leadership group {} from {} to {} "
+              "(already dispatched in this tick: {})",
               transfer->group,
               transfer->from,
-              transfer->to);
+              transfer->to,
+              num_dispatched);
 
             _in_flight_changes.erase(transfer->group);
             check_unregister_leadership_change_notification();
@@ -562,6 +582,7 @@ ss::future<ss::stop_iteration> leader_balancer::balance() {
 
         } else {
             _probe.leader_transfer_succeeded();
+            num_dispatched += 1;
             strategy->apply_movement(*transfer);
         }
 
@@ -900,13 +921,6 @@ leader_balancer::index_type leader_balancer::build_index(
 }
 
 ss::future<bool> leader_balancer::do_transfer(reassignment transfer) {
-    vlog(
-      clusterlog.debug,
-      "Transferring leadership for group {} from {} to {}",
-      transfer.group,
-      transfer.from,
-      transfer.to);
-
     if (transfer.from.node_id == _raft0->self().id()) {
         co_return co_await do_transfer_local(transfer);
     } else {
@@ -1033,10 +1047,6 @@ ss::future<bool> leader_balancer::do_transfer_remote(reassignment transfer) {
           res.error().message());
         co_return false;
     } else if (res.value().data.success) {
-        vlog(
-          clusterlog.trace,
-          "Leadership transfer of group {} succeeded",
-          transfer.group);
         co_return true;
     } else {
         vlog(
diff --git a/tests/rptest/tests/maintenance_test.py b/tests/rptest/tests/maintenance_test.py
index 7548c745e0c2..9993c8a2540a 100644
--- a/tests/rptest/tests/maintenance_test.py
+++ b/tests/rptest/tests/maintenance_test.py
@@ -29,9 +29,9 @@ def __init__(self, *args, **kwargs):
         # Vary partition count relative to num_cpus. This is to ensure that
         # leadership is moved back to a node that exits maintenance.
         num_cpus = self.redpanda.get_node_cpu_count()
-        self.topics = (TopicSpec(partition_count=num_cpus * 3,
+        self.topics = (TopicSpec(partition_count=num_cpus * 5,
                                  replication_factor=3),
-                       TopicSpec(partition_count=num_cpus * 3,
+                       TopicSpec(partition_count=num_cpus * 10,
                                  replication_factor=3))
         self.admin = Admin(self.redpanda)
         self.rpk = RpkTool(self.redpanda)
@@ -131,8 +131,12 @@ def _enable_maintenance(self, node):
         """
        self.logger.debug(
            f"Checking that node {node.name} has a leadership role")
+        # In case the node is unlucky and doesn't get any leaders "naturally",
+        # we have to wait for the leadership balancer to do its job. It needs at
+        # least 1 minute to unmute just-restarted nodes and perform another tick,
+        # so wait longer than leader_balancer_idle_timeout (2 minutes) to be sure.
        wait_until(lambda: self._has_leadership_role(node),
-                   timeout_sec=60,
+                   timeout_sec=150,
                   backoff_sec=10)
        self.logger.debug(
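
Note (not part of the patch): a minimal plain-C++ sketch of the dispatch-counting pattern this change introduces in leader_balancer::balance(). The reassignment struct, dispatch callback, and in_flight counter below are simplified stand-ins, not the real balancer code, which uses Seastar coroutines, _in_flight_changes, and strategy->find_movement(). The point illustrated: num_dispatched advances only after a successful transfer, and its value is reported both in the failure log line and in the end-of-tick summary.

#include <cstdio>
#include <optional>
#include <vector>

// Simplified stand-in for the balancer's reassignment type.
struct reassignment { int group, from, to; };

// Mirrors the patch's bookkeeping for one balancer tick.
void balance_tick(const std::vector<std::optional<reassignment>>& planned,
                  bool (*dispatch)(const reassignment&)) {
    size_t num_dispatched = 0; // counter introduced by the patch
    size_t in_flight = 0;      // stands in for _in_flight_changes.size()
    for (const auto& transfer : planned) {
        if (!transfer) {
            // End-of-tick summary now reports in-flight and dispatched counts.
            std::printf("no further improvements, in flight: %zu, "
                        "dispatched in this tick: %zu\n",
                        in_flight, num_dispatched);
            return;
        }
        std::printf("dispatching transfer of group %d: %d -> %d\n",
                    transfer->group, transfer->from, transfer->to);
        ++in_flight; // recorded as in-flight before dispatching
        if (!dispatch(*transfer)) {
            // Failure log also carries how many transfers already went out.
            std::printf("error transferring group %d "
                        "(already dispatched in this tick: %zu)\n",
                        transfer->group, num_dispatched);
            --in_flight; // failed transfers leave the in-flight set
        } else {
            ++num_dispatched; // counted only on success
        }
    }
}

int main() {
    auto always_ok = [](const reassignment&) { return true; };
    balance_tick({reassignment{1, 0, 2}, reassignment{7, 2, 1}, std::nullopt},
                 always_ok);
}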