From 1c236e470a569f7ac91753c0233fff2832023ea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Ma=C5=9Blanka?= Date: Thu, 27 Jun 2024 11:38:26 +0000 Subject: [PATCH 1/2] tests: fixed querying new leader node in leadership transfer test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test was using an API expecting the node ordinal to query for the node that was elected as a leader. The query was based on the node id. This led to the validation being invalid. Fixes: #19953 Signed-off-by: Michał Maślanka (cherry picked from commit b455334d54d693434837cef041b9460fcfae46dc) --- tests/rptest/tests/raft_availability_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/rptest/tests/raft_availability_test.py b/tests/rptest/tests/raft_availability_test.py index db1b43457ddc..5695378a2f0d 100644 --- a/tests/rptest/tests/raft_availability_test.py +++ b/tests/rptest/tests/raft_availability_test.py @@ -387,10 +387,12 @@ def test_leadership_transfer(self): leader_id=initial_leader_id) new_leader_id, _ = self._wait_for_leader( lambda l: l is not None and l != initial_leader_id) + new_leader_node = self.redpanda.get_node_by_id(new_leader_id) + assert new_leader_node is not None self.logger.info( - f"New leader is {new_leader_id} {self.redpanda.get_node(new_leader_id).account.hostname}" + f"New leader is {new_leader_id} {new_leader_node.account.hostname}" ) - time.sleep(ELECTION_TIMEOUT) + for [id, metric_check] in metric_checks.items(): # the metric should be updated only on the node that was elected as a leader if id == new_leader_id: From c7cbb6819aeb7968c02684f5f85ad92402cf44fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Ma=C5=9Blanka?= Date: Thu, 27 Jun 2024 12:44:13 +0000 Subject: [PATCH 2/2] tests: use get_node_by_id in raft availability test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Maślanka (cherry picked
from commit 205b9fab03755855f192ccca791ee461328bd6a7) --- tests/rptest/tests/raft_availability_test.py | 31 ++++++++++---------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tests/rptest/tests/raft_availability_test.py b/tests/rptest/tests/raft_availability_test.py index 5695378a2f0d..31a150fc16d6 100644 --- a/tests/rptest/tests/raft_availability_test.py +++ b/tests/rptest/tests/raft_availability_test.py @@ -187,7 +187,7 @@ def test_one_node_down(self): replication=3, timeout_s=ELECTION_TIMEOUT * 2) - leader_node = self.redpanda.get_node(initial_leader_id) + leader_node = self.redpanda.get_node_by_id(initial_leader_id) self.logger.info( f"Initial leader {initial_leader_id} {leader_node.account.hostname}" ) @@ -195,11 +195,12 @@ def test_one_node_down(self): # Priority mechanism should reliably select next replica in list expect_new_leader_id = replicas[1] - expect_new_leader_node = self.redpanda.get_node(expect_new_leader_id) + expect_new_leader_node = self.redpanda.get_node_by_id( + expect_new_leader_id) observer_node_id = (set(replicas) - {expect_new_leader_id, initial_leader_id}).pop() - observer_node = self.redpanda.get_node(observer_node_id) + observer_node = self.redpanda.get_node_by_id(observer_node_id) self.logger.info( f"Tracking stats on observer node {observer_node_id} {observer_node.account.hostname}" ) @@ -276,9 +277,9 @@ def test_two_nodes_down(self): self.ping_pong().ping_pong() - leader_node = self.redpanda.get_node(initial_leader_id) + leader_node = self.redpanda.get_node_by_id(initial_leader_id) other_node_id = (set(replicas) - {initial_leader_id}).pop() - other_node = self.redpanda.get_node(other_node_id) + other_node = self.redpanda.get_node_by_id(other_node_id) self.logger.info( f"Stopping {initial_leader_id} ({leader_node.account.hostname}) and {other_node_id} ({other_node.account.hostname})" @@ -290,7 +291,7 @@ def test_two_nodes_down(self): self._expect_unavailable() # Bring back one node (not the original leader) - 
self.redpanda.start_node(self.redpanda.get_node(other_node_id)) + self.redpanda.start_node(self.redpanda.get_node_by_id(other_node_id)) hosts = [ n.account.hostname for n in self.redpanda.nodes @@ -327,7 +328,7 @@ def test_leader_restart(self): the original leader stopped. """ initial_leader_id, replicas = self._wait_for_leader() - initial_leader_node = self.redpanda.get_node(initial_leader_id) + initial_leader_node = self.redpanda.get_node_by_id(initial_leader_id) self.logger.info( f"Stopping initial leader {initial_leader_id} {initial_leader_node.account.hostname}" @@ -337,7 +338,7 @@ def test_leader_restart(self): new_leader_id, _ = self._wait_for_leader( lambda l: l is not None and l != initial_leader_id) self.logger.info( - f"New leader is {new_leader_id} {self.redpanda.get_node(new_leader_id).account.hostname}" + f"New leader is {new_leader_id} {self.redpanda.get_node_by_id(new_leader_id).account.hostname}" ) self.logger.info( @@ -369,7 +370,7 @@ def test_leadership_transfer(self): continue serving requests. 
""" initial_leader_id, replicas = self._wait_for_leader() - initial_leader_node = self.redpanda.get_node(initial_leader_id) + initial_leader_node = self.redpanda.get_node_by_id(initial_leader_id) metric_checks = {} for n in self.redpanda.nodes: @@ -461,10 +462,10 @@ def test_leader_transfers_recovery(self, acks): initial_leader_id = leader_node_id for n in range(0, transfer_count): target_idx = (initial_leader_id + n) % len(self.redpanda.nodes) - target_node_id = target_idx + 1 + target_node_by_id_id = target_idx + 1 self._transfer_leadership(admin, "kafka", self.topic, - target_node_id) + target_node_by_id_id) # Wait til we can see producer progressing, to avoid a situation where # we do leadership transfers so quickly that we stall the producer @@ -499,7 +500,7 @@ def test_follower_isolation(self): self._expect_available() - leader_node = self.redpanda.get_node(initial_leader_id) + leader_node = self.redpanda.get_node_by_id(initial_leader_id) self.logger.info( f"Initial leader {initial_leader_id} {leader_node.account.hostname}" ) @@ -525,7 +526,7 @@ def test_follower_isolation(self): # isolate one of the followers fi.inject_failure( FailureSpec(FailureSpec.FAILURE_ISOLATE, - self.redpanda.get_node(follower))) + self.redpanda.get_node_by_id(follower))) # expect messages to be produced and consumed without a timeout connection = self.ping_pong() @@ -547,7 +548,7 @@ def test_id_allocator_leader_isolation(self): replication=3) initial_leader_id = admin.get_partition_leader( namespace='kafka_internal', topic='id_allocator', partition=0) - leader_node = self.redpanda.get_node(initial_leader_id) + leader_node = self.redpanda.get_node_by_id(initial_leader_id) self.logger.info( f"kafka_internal/id_allocator/0 leader: {initial_leader_id}, node: {leader_node.account.hostname}" ) @@ -558,7 +559,7 @@ def test_id_allocator_leader_isolation(self): # isolate id_allocator fi.inject_failure( FailureSpec(FailureSpec.FAILURE_ISOLATE, - self.redpanda.get_node(initial_leader_id))) + 
self.redpanda.get_node_by_id(initial_leader_id))) # expect messages to be produced and consumed without a timeout connection = self.ping_pong()