From b5378f98c7bf701202f00f682929b7e8cee2045a Mon Sep 17 00:00:00 2001
From: Alexey Zatelepin
Date: Wed, 10 Jul 2024 12:17:19 +0200
Subject: [PATCH 1/3] c/leader_balancer: improve logging

(cherry picked from commit e53abfd7135e38cfba8f9934c9b565160bb4fe15)
---
 src/v/cluster/scheduling/leader_balancer.cc | 44 +++++++++++++--------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/src/v/cluster/scheduling/leader_balancer.cc b/src/v/cluster/scheduling/leader_balancer.cc
index 78a3ce2732e1..08e90a1e6662 100644
--- a/src/v/cluster/scheduling/leader_balancer.cc
+++ b/src/v/cluster/scheduling/leader_balancer.cc
@@ -134,6 +134,10 @@ void leader_balancer::on_leadership_change(
     // Update in flight state
     if (auto it = _in_flight_changes.find(group);
         it != _in_flight_changes.end()) {
+        vlog(
+          clusterlog.trace,
+          "transfer of group {} finished, removing from in-flight set",
+          group);
         _in_flight_changes.erase(it);
         check_unregister_leadership_change_notification();
@@ -511,6 +515,7 @@ ss::future<ss::stop_iteration> leader_balancer::balance() {
         co_return ss::stop_iteration::yes;
     }
 
+    size_t num_dispatched = 0;
     for (size_t i = 0; i < allowed_change_cnt; i++) {
         if (should_stop_balance()) {
             co_return ss::stop_iteration::yes;
@@ -519,11 +524,14 @@
         auto transfer = strategy->find_movement(muted_groups());
         if (!transfer) {
             vlog(
-              clusterlog.debug,
-              "No leadership balance improvements found with total delta {}, "
-              "number of muted groups {}",
+              clusterlog.info,
+              "Leadership balancer tick: no further improvements found, "
+              "total error: {:.4}, number of muted groups: {}, "
+              "number in flight: {}, dispatched in this tick: {}",
              strategy->error(),
-              _muted.size());
+              _muted.size(),
+              _in_flight_changes.size(),
+              num_dispatched);
             if (!_timer.armed()) {
                 _timer.arm(_idle_timeout());
             }
@@ -531,6 +539,16 @@
             co_return ss::stop_iteration::yes;
         }
 
+        vlog(
+          clusterlog.trace,
+          "dispatching transfer of group {}: {} -> {}, "
+          "current num_dispatched: {}, in_flight: {}",
+          transfer->group,
+          transfer->from,
+          transfer->to,
+          num_dispatched,
+          _in_flight_changes.size());
+
         _in_flight_changes[transfer->group] = {
           *transfer, clock_type::now() + _mute_timeout()};
         check_register_leadership_change_notification();
@@ -539,10 +557,12 @@
         if (!success) {
             vlog(
               clusterlog.info,
-              "Error transferring leadership group {} from {} to {}",
+              "Error transferring leadership group {} from {} to {} "
+              "(already dispatched in this tick: {})",
               transfer->group,
               transfer->from,
-              transfer->to);
+              transfer->to,
+              num_dispatched);
 
             _in_flight_changes.erase(transfer->group);
             check_unregister_leadership_change_notification();
@@ -562,6 +582,7 @@
         } else {
             _probe.leader_transfer_succeeded();
+            num_dispatched += 1;
             strategy->apply_movement(*transfer);
         }
@@ -900,13 +921,6 @@ leader_balancer::index_type leader_balancer::build_index(
 }
 
 ss::future<bool> leader_balancer::do_transfer(reassignment transfer) {
-    vlog(
-      clusterlog.debug,
-      "Transferring leadership for group {} from {} to {}",
-      transfer.group,
-      transfer.from,
-      transfer.to);
-
     if (transfer.from.node_id == _raft0->self().id()) {
         co_return co_await do_transfer_local(transfer);
     } else {
@@ -1033,10 +1047,6 @@ ss::future<bool> leader_balancer::do_transfer_remote(reassignment transfer) {
           res.error().message());
         co_return false;
     } else if (res.value().data.success) {
-        vlog(
-          clusterlog.trace,
-          "Leadership transfer of group {} succeeded",
-          transfer.group);
         co_return true;
     } else {
         vlog(

From 5beb25602bf5a1b7b3386cac39f52f4ff1f8c0b6 Mon Sep 17 00:00:00 2001
From: Alexey Zatelepin
Date: Tue, 16 Jul 2024 21:49:38 +0200
Subject: [PATCH 2/3] tests/maintenance_test: increase the number of partitions

Previously when this test was running in CI there were not a lot of
partitions (12) and the leadership balancer could mute them all after
several ticks (it mutes the partitions that it moves). After that it
cannot take any further action for some time and the node that comes
out of the maintenance mode won't acquire any new leaders.

Increase the number of partitions in the test to avoid that.

Fixes: https://github.com/redpanda-data/redpanda/issues/13650

(cherry picked from commit a6456189adb1e1343be2fd7bdb99bb6995b829d9)
---
 tests/rptest/tests/maintenance_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/rptest/tests/maintenance_test.py b/tests/rptest/tests/maintenance_test.py
index 7548c745e0c2..4d115320499a 100644
--- a/tests/rptest/tests/maintenance_test.py
+++ b/tests/rptest/tests/maintenance_test.py
@@ -29,9 +29,9 @@ def __init__(self, *args, **kwargs):
         # Vary partition count relative to num_cpus. This is to ensure that
         # leadership is moved back to a node that exits maintenance.
         num_cpus = self.redpanda.get_node_cpu_count()
-        self.topics = (TopicSpec(partition_count=num_cpus * 3,
+        self.topics = (TopicSpec(partition_count=num_cpus * 5,
                                  replication_factor=3),
-                       TopicSpec(partition_count=num_cpus * 3,
+                       TopicSpec(partition_count=num_cpus * 10,
                                  replication_factor=3))
         self.admin = Admin(self.redpanda)
         self.rpk = RpkTool(self.redpanda)

From fed355e43dfa72ebb807798bf49d12afb252e8fb Mon Sep 17 00:00:00 2001
From: Alexey Zatelepin
Date: Tue, 16 Jul 2024 21:59:01 +0200
Subject: [PATCH 3/3] tests/maintenance_test: increase _has_leadership_role timeout

Previously we waited for 50 seconds in total to ensure that the node
has some leaders. If it hadn't acquired any leaders "naturally", this
was not enough because the leadership balancer needs at least 1 minute
to unmute just restarted nodes and perform another tick.

(cherry picked from commit 2c6aaf73da1e2a094bcd0bd8995d2709df82b32c)
---
 tests/rptest/tests/maintenance_test.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/rptest/tests/maintenance_test.py b/tests/rptest/tests/maintenance_test.py
index 4d115320499a..9993c8a2540a 100644
--- a/tests/rptest/tests/maintenance_test.py
+++ b/tests/rptest/tests/maintenance_test.py
@@ -131,8 +131,12 @@ def _enable_maintenance(self, node):
         """
         self.logger.debug(
             f"Checking that node {node.name} has a leadership role")
+        # In case the node is unlucky and doesn't get any leaders "naturally",
+        # we have to wait for the leadership balancer to do its job. We have to wait
+        # at least 1 minute for it to unmute just restarted nodes and perform another
+        # tick. Wait more than leader_balancer_idle_timeout (2 minutes) just to be sure.
         wait_until(lambda: self._has_leadership_role(node),
-                   timeout_sec=60,
+                   timeout_sec=150,
                    backoff_sec=10)
         self.logger.debug(
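
The timing argument in the last two commit messages can be restated as a small
self-contained sketch. This is illustrative only: wait_until_sketch is a
simplified stand-in for the ducktape wait_until helper used by the test (not its
real implementation), node_has_leadership is a hypothetical condition, and the
constants are the figures quoted in the commit messages above (one minute to
unmute a restarted node, a two-minute leader_balancer_idle_timeout), not values
read from the Redpanda source tree.

import time


def wait_until_sketch(condition, timeout_sec, backoff_sec):
    """Poll `condition` every backoff_sec seconds until it returns True or
    timeout_sec elapses. Simplified stand-in for ducktape's wait_until."""
    deadline = time.time() + timeout_sec
    while True:
        if condition():
            return
        if time.time() + backoff_sec > deadline:
            raise TimeoutError(f"condition not met within {timeout_sec}s")
        time.sleep(backoff_sec)


# Figures quoted in the commit messages above (assumptions for illustration):
unmute_sec = 60         # balancer needs at least 1 minute to unmute a restarted node
idle_timeout_sec = 120  # leader_balancer_idle_timeout: up to 2 minutes between ticks

old_timeout_sec = 60    # could expire before the balancer had unmuted the node and ticked
new_timeout_sec = 150   # outlasts a full idle interval, so at least one tick can act

# Hypothetical usage mirroring the updated test (node_has_leadership is made up here):
# wait_until_sketch(lambda: node_has_leadership(node),
#                   timeout_sec=new_timeout_sec, backoff_sec=10)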