Skip to content

Commit

Permalink
tests: add PartitionBalancerTest.test_recovery_mode_rebalance_finish
Browse files Browse the repository at this point in the history
Test that rebalancing after node addition eventually achieves balanced
distribution, even though some nodes were temporarily in recovery mode.
  • Loading branch information
ztlpn committed May 15, 2024
1 parent d54e6b6 commit 588ca47
Showing 1 changed file with 64 additions and 0 deletions.
64 changes: 64 additions & 0 deletions tests/rptest/tests/partition_balancer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1142,3 +1142,67 @@ def test_transfer_controller_leadership(self):
target_id=transfer_to_idx)

self.wait_until_ready()

@skip_debug_mode
@cluster(num_nodes=7, log_allow_list=CHAOS_LOG_ALLOW_LIST)
def test_recovery_mode_rebalance_finish(self):
    """
    Check that the rebalance triggered by adding nodes runs to
    completion even though a subset of the nodes (the seed nodes)
    was in recovery mode for part of that time.
    """

    # Bring up only 3 of the 5 nodes and put some partitions on them.
    self.start_redpanda(num_nodes=5,
                        num_started_nodes=3,
                        new_bootstrap=True)
    self.topic = TopicSpec(partition_count=50)
    self.client().create_topic(self.topic)

    self.start_producer(1)
    self.start_consumer(1)
    self.await_startup()

    # Bounce the seed nodes so that they come back in recovery mode.
    seeds = self.redpanda.nodes[:3]
    recovery_cfg = {"recovery_mode_enabled": True}
    self.redpanda.restart_nodes(seeds,
                                auto_assign_node_id=True,
                                omit_seeds_on_idx_one=False,
                                override_cfg_params=recovery_cfg)

    # Start the two remaining nodes. The balancer has to run on one of
    # them, since the seed nodes are in recovery mode and cannot host it.
    joiners = self.redpanda.nodes[3:]
    for joiner in joiners:
        self.redpanda.start_node(joiner,
                                 auto_assign_node_id=True,
                                 omit_seeds_on_idx_one=False)
    self.redpanda.wait_for_membership(first_start=False)

    admin = Admin(self.redpanda)

    # Move controller leadership onto the first joiner node.
    new_controller_id = self.redpanda.node_id(joiners[0])
    admin.transfer_leadership_to(namespace='redpanda',
                                 topic='controller',
                                 partition=0,
                                 target_id=new_controller_id)

    def balancer_stalled(status):
        return status["status"] == "stalled"

    # Not every partition can be moved yet, so the balancer is expected
    # to report a stalled status.
    self.wait_until_status(balancer_stalled)

    # Bring the seed nodes back in normal (non-recovery) mode.
    self.redpanda.restart_nodes(seeds, auto_assign_node_id=True)
    self.redpanda.wait_for_membership(first_start=False)

    self.wait_until_ready()

    # Collect the per-node partition count and require each one to be
    # within 5% of the mean.
    partition_counts = []
    for node in self.redpanda.nodes:
        partition_counts.append(len(admin.get_partitions(node=node)))
    self.logger.info(f"partition counts: {partition_counts}")

    avg = sum(partition_counts) / len(partition_counts)
    balanced = all(abs(c - avg) / avg < 0.05 for c in partition_counts)
    assert balanced, "partition counts not balanced"

    self.run_validation(consumer_timeout_sec=CONSUMER_TIMEOUT)

0 comments on commit 588ca47

Please sign in to comment.