From f17ed3467703754fb1800af1e65d5bbf7b179951 Mon Sep 17 00:00:00 2001 From: ALeksandr BYkov Date: Fri, 19 Jan 2024 19:15:22 +0700 Subject: [PATCH] fix(decommissionstreamerr): Set valid decommission nodetool timeout The timeout for nodetool decommission was incorrectly set to 180 sec in the DecommissionStreamErr nemesis, but the decommission process can run much longer. If the decommission is supposed to be aborted after a log message that is expected at the end of the decommission process, the nodetool decommission command is terminated by the timeout too early, and the subsequent nemesis logic fails because the status of the decommissioning node is still UL, since the decommission itself continues to run on the node. Set valid waiting timeouts for the commands and ParallelObject so that the decommission is aborted correctly. --- sdcm/nemesis.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/sdcm/nemesis.py b/sdcm/nemesis.py index f5bcbdd4a3..bf97c104af 100644 --- a/sdcm/nemesis.py +++ b/sdcm/nemesis.py @@ -3779,17 +3779,20 @@ def decommission_post_action(): self.unset_current_running_nemesis(new_node) return new_node - trigger = partial( - self.target_node.run_nodetool, sub_cmd="decommission", timeout=180, warning_event_on_exception=(Exception,), - retry=0, - ) - terminate_pattern = self.target_node.raft.get_random_log_message(operation=TopologyOperations.DECOMMISSION, seed=self.tester.params.get("nemesis_seed")) self.log.debug("Reboot node after log message: '%s'", terminate_pattern.log_message) + nodetool_decommission_timeout = terminate_pattern.timeout + 600 + log_follower = self.target_node.follow_system_log(patterns=[terminate_pattern.log_message]) + trigger = partial(self.target_node.run_nodetool, + sub_cmd="decommission", + timeout=nodetool_decommission_timeout, + warning_event_on_exception=(Exception,), + retry=0) + watcher = partial( self._call_disrupt_func_after_expression_logged, log_follower=log_follower, @@ -3797,13 +3800,13 @@ def decommission_post_action(): 
disrupt_func_kwargs={"target_node": self.target_node, "hard": True, "verify_ssh": True}, delay=0 ) - + full_operations_timeout = nodetool_decommission_timeout + 600 with contextlib.ExitStack() as stack: for expected_start_failed_context in self.target_node.raft.get_severity_change_filters_scylla_start_failed( terminate_pattern.timeout): stack.enter_context(expected_start_failed_context) with ignore_stream_mutation_fragments_errors(): - ParallelObject(objects=[trigger, watcher], timeout=terminate_pattern.timeout).call_objects() + ParallelObject(objects=[trigger, watcher], timeout=full_operations_timeout).call_objects() if new_node := decommission_post_action(): new_node.wait_node_fully_start() new_node.run_nodetool("rebuild", retry=0)