fix(decommissionstreamerr): Set valid decommission nodetool timeout

The timeout for nodetool decommission was incorrectly set to 180 seconds in the DecommissionStreamErr nemesis, but the decommission process can run much longer. If the decommission is supposed to be aborted after a log message that is expected near the end of the decommission process, the nodetool decommission command is terminated by the timeout too early, and the subsequent nemesis logic fails because the status of the decommissioning node is UL, since the decommission itself continues to run on the node. Set valid waiting timeouts for the commands and the ParallelObject so that the decommission is aborted correctly.
juliayakovlev · Jan 25, 2024 · f17ed34 · f17ed34
1 parent 722f51a
commit f17ed34
Showing 1 changed file with 10 additions and 7 deletions.
diff --git a/sdcm/nemesis.py b/sdcm/nemesis.py
@@ -3779,31 +3779,34 @@ def decommission_post_action():
             self.unset_current_running_nemesis(new_node)
             return new_node
 
-        trigger = partial(
-            self.target_node.run_nodetool, sub_cmd="decommission", timeout=180, warning_event_on_exception=(Exception,),
-            retry=0,
-        )
-
         terminate_pattern = self.target_node.raft.get_random_log_message(operation=TopologyOperations.DECOMMISSION,
                                                                          seed=self.tester.params.get("nemesis_seed"))
         self.log.debug("Reboot node after log message: '%s'", terminate_pattern.log_message)
 
+        nodetool_decommission_timeout = terminate_pattern.timeout + 600
+
         log_follower = self.target_node.follow_system_log(patterns=[terminate_pattern.log_message])
 
+        trigger = partial(self.target_node.run_nodetool,
+                          sub_cmd="decommission",
+                          timeout=nodetool_decommission_timeout,
+                          warning_event_on_exception=(Exception,),
+                          retry=0)
+
         watcher = partial(
             self._call_disrupt_func_after_expression_logged,
             log_follower=log_follower,
             disrupt_func=self.reboot_node,
             disrupt_func_kwargs={"target_node": self.target_node, "hard": True, "verify_ssh": True},
             delay=0
         )
-
+        full_operations_timeout = nodetool_decommission_timeout + 600
         with contextlib.ExitStack() as stack:
             for expected_start_failed_context in self.target_node.raft.get_severity_change_filters_scylla_start_failed(
                     terminate_pattern.timeout):
                 stack.enter_context(expected_start_failed_context)
             with ignore_stream_mutation_fragments_errors():
-                ParallelObject(objects=[trigger, watcher], timeout=terminate_pattern.timeout).call_objects()
+                ParallelObject(objects=[trigger, watcher], timeout=full_operations_timeout).call_objects()
             if new_node := decommission_post_action():
                 new_node.wait_node_fully_start()
                 new_node.run_nodetool("rebuild", retry=0)