From 6f48e1012c9ad253c3a0ac3c87e8c6341bc0772a Mon Sep 17 00:00:00 2001 From: Valerii Ponomarov Date: Mon, 22 Jan 2024 15:41:58 +0200 Subject: [PATCH] fix(nemesis): fix 'disable_binary_gossip_execute_major_compaction' Check the gossip status and CQL workability at the end of the 'disrupt_disable_binary_gossip_execute_major_compaction' nemesis instead of looking for the 'gate closed' message in DB logs. Fixes: #6819 --- sdcm/cluster.py | 3 ++- sdcm/nemesis.py | 20 +++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/sdcm/cluster.py b/sdcm/cluster.py index 51e02f9346..1e5ae9b0ba 100644 --- a/sdcm/cluster.py +++ b/sdcm/cluster.py @@ -2747,7 +2747,7 @@ def _gen_cqlsh_cmd(self, command, keyspace, timeout, connect_timeout): return f'{cqlsh_cmd} {options} -e {command} {host}' def run_cqlsh(self, cmd, keyspace=None, timeout=120, verbose=True, split=False, connect_timeout=60, - num_retry_on_failure=1): + num_retry_on_failure=1, retry_interval=3): """Runs CQL command using cqlsh utility""" cmd = self._gen_cqlsh_cmd(command=cmd, keyspace=keyspace, timeout=timeout, connect_timeout=connect_timeout) @@ -2760,6 +2760,7 @@ def run_cqlsh(self, cmd, keyspace=None, timeout=120, verbose=True, split=False, num_retry_on_failure -= 1 if not num_retry_on_failure: raise + time.sleep(retry_interval) # stdout of cqlsh example: # pk diff --git a/sdcm/nemesis.py b/sdcm/nemesis.py index d715606a01..f5bcbdd4a3 100644 --- a/sdcm/nemesis.py +++ b/sdcm/nemesis.py @@ -4929,8 +4929,6 @@ def disrupt_bootstrap_streaming_error(self): self.cluster.decommission(new_node, timeout=7200) def disrupt_disable_binary_gossip_execute_major_compaction(self): - def are_gate_closed_messages_raised(log_reader): - return bool(list(log_reader)) with nodetool_context(node=self.target_node, start_command="disablebinary", end_command="enablebinary"): self.target_node.run_nodetool("statusbinary") self.target_node.run_nodetool("status") @@ -4943,15 +4941,15 @@ def 
are_gate_closed_messages_raised(log_reader): self.target_node.run_nodetool("statusgossip") self.target_node.run_nodetool("statusbinary") time.sleep(30) - gate_closed_log_reader = self.target_node.follow_system_log(patterns=['gate closed']) - gate_closed_appearing = bool(wait_for(func=are_gate_closed_messages_raised, - log_reader=gate_closed_log_reader, - timeout=100, - step=5, - text="Waiting for 'gate closed' exceptions", - throw_exc=False)) - assert not gate_closed_appearing, \ - "After re-enabling binary and gossip, 'gate closed' messages continue to appear" + try: + self.cluster.wait_for_nodes_up_and_normal(nodes=[self.target_node]) + self.target_node.run_cqlsh( + "SELECT * FROM system_schema.keyspaces;", num_retry_on_failure=20, retry_interval=3) + except Exception: # pylint: disable=broad-except + # NOTE: restart the target node because it was the remedy for the problems with CQL workability + self.log.warning("'%s' node will be restarted to make the CQL work again", self.target_node) + self.target_node.restart_scylla_server() + raise def disrupt_method_wrapper(method, is_exclusive=False): # pylint: disable=too-many-statements