fix(nemesis): make disrupt_mgmt_restore not delete keyspace
The restore procedure is built on uploading sstables.
When you drop the keyspace after restoring a backup, the 'DROP KEYSPACE' is
committed with a higher timestamp than the records in the backup's sstables.
As a result, Scylla considers those records older than the tombstone and
ignores them on a second restore.

This is expected behavior on both the `s-m` and the `scylla` side.
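To make the timestamp reasoning concrete, here is a minimal standalone
sketch of the last-write-wins comparison (the timestamp values are
hypothetical and purely illustrative, not taken from this commit):

    # A hypothetical illustration, not part of this commit: Scylla resolves
    # conflicting writes by "latest timestamp wins" (microseconds since epoch).
    WRITE_TS_IN_BACKUP = 1_700_000_000_000_000  # writetime carried by the sstables inside the backup
    DROP_KEYSPACE_TS = 1_700_000_600_000_000    # tombstone written later, when the keyspace is dropped

    # A second restore re-uploads the same sstables, so the restored rows
    # keep their original, older timestamps:
    second_restore_ts = WRITE_TS_IN_BACKUP

    # Last write wins: the DROP KEYSPACE tombstone is newer, so the rows
    # from the second restore stay shadowed.
    rows_visible = second_restore_ts > DROP_KEYSPACE_TS
    print(f"rows visible after second restore: {rows_visible}")  # -> False

This is why the cleanup that dropped the restored keyspace in a `finally`
block is removed in the diff below.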
Dmitry Kropachev authored and fruch committed Jan 11, 2024
1 parent 990f3ed commit 086c12d
Showing 1 changed file with 41 additions and 50 deletions.
91 changes: 41 additions & 50 deletions sdcm/nemesis.py
@@ -2847,57 +2847,48 @@ def execute_data_validation_thread(command_template, keyspace_name, number_of_rows):
         if self.cluster.params.get('cluster_backend') != 'aws':
             raise UnsupportedNemesis("The restore test only supports AWS at the moment")
 
-        try:
-            mgr_cluster = self.cluster.get_cluster_manager()
-            cluster_backend = self.cluster.params.get('cluster_backend')
-            persistent_manager_snapshots_dict = get_persistent_snapshots()
-            target_bucket = persistent_manager_snapshots_dict[cluster_backend]["bucket"]
-            chosen_snapshot_tag, chosen_snapshot_info = (
-                choose_snapshot(persistent_manager_snapshots_dict[cluster_backend]))
-
-            self.log.info("Restoring the keyspace %s", chosen_snapshot_info["keyspace_name"])
-            location_list = [f"{self.cluster.params.get('backup_bucket_backend')}:{target_bucket}"]
-            test_keyspaces = self.cluster.get_test_keyspaces()
-            if chosen_snapshot_info["keyspace_name"] not in test_keyspaces:
-                self.log.info("Restoring the schema of the keyspace '%s'",
-                              chosen_snapshot_info["keyspace_name"])
-                restore_task = mgr_cluster.create_restore_task(restore_schema=True, location_list=location_list,
-                                                               snapshot_tag=chosen_snapshot_tag)
-
-                restore_task.wait_and_get_final_status(step=10,
-                                                       timeout=6*60)  # giving 6 minutes to restore the schema
-                assert restore_task.status == TaskStatus.DONE, \
-                    f'Schema restoration of {chosen_snapshot_tag} has failed!'
-                self.cluster.restart_scylla()  # After schema restoration, you should restart the nodes
-
-            restore_task = mgr_cluster.create_restore_task(restore_data=True,
-                                                           location_list=location_list,
-                                                           snapshot_tag=chosen_snapshot_tag)
-            restore_task.wait_and_get_final_status(step=30, timeout=chosen_snapshot_info["expected_timeout"])
-            assert restore_task.status == TaskStatus.DONE, f'Data restoration of {chosen_snapshot_tag} has failed!'
-
-            manager_version = mgr_cluster.sctool.parsed_client_version
-            if manager_version < LooseVersion("3.2"):
-                mgr_task = mgr_cluster.create_repair_task()
-                task_final_status = mgr_task.wait_and_get_final_status(timeout=chosen_snapshot_info["expected_timeout"])
-                assert task_final_status == TaskStatus.DONE, 'Task: {} final status is: {}.'.format(
-                    mgr_task.id, str(mgr_task.status))
-
-            confirmation_stress_template = (
-                persistent_manager_snapshots_dict)[cluster_backend]["confirmation_stress_template"]
-            stress_queue = execute_data_validation_thread(command_template=confirmation_stress_template,
-                                                          keyspace_name=chosen_snapshot_info["keyspace_name"],
-                                                          number_of_rows=chosen_snapshot_info["number_of_rows"])
-            for stress in stress_queue:
-                self.tester.verify_stress_thread(cs_thread_pool=stress)
-        finally:
-            if chosen_snapshot_info:
-                if "keyspace_name" in chosen_snapshot_info.keys():
-                    keyspace = chosen_snapshot_info["keyspace_name"]
-                    InfoEvent(message=f'Removing test {keyspace=}', severity=Severity.WARNING)
-                    with self.cluster.cql_connection_patient(self.target_node, connect_timeout=600) as session:
-                        session.execute(SimpleStatement(
-                            f'DROP KEYSPACE IF EXISTS "{keyspace}"'), timeout=300)
+        mgr_cluster = self.cluster.get_cluster_manager()
+        cluster_backend = self.cluster.params.get('cluster_backend')
+        persistent_manager_snapshots_dict = get_persistent_snapshots()
+        target_bucket = persistent_manager_snapshots_dict[cluster_backend]["bucket"]
+        chosen_snapshot_tag, chosen_snapshot_info = (
+            choose_snapshot(persistent_manager_snapshots_dict[cluster_backend]))
+
+        self.log.info("Restoring the keyspace %s", chosen_snapshot_info["keyspace_name"])
+        location_list = [f"{self.cluster.params.get('backup_bucket_backend')}:{target_bucket}"]
+        test_keyspaces = self.cluster.get_test_keyspaces()
+        if chosen_snapshot_info["keyspace_name"] not in test_keyspaces:
+            self.log.info("Restoring the schema of the keyspace '%s'",
+                          chosen_snapshot_info["keyspace_name"])
+            restore_task = mgr_cluster.create_restore_task(restore_schema=True, location_list=location_list,
+                                                           snapshot_tag=chosen_snapshot_tag)
+
+            restore_task.wait_and_get_final_status(step=10,
+                                                   timeout=6*60)  # giving 6 minutes to restore the schema
+            assert restore_task.status == TaskStatus.DONE, \
+                f'Schema restoration of {chosen_snapshot_tag} has failed!'
+            self.cluster.restart_scylla()  # After schema restoration, you should restart the nodes
+
+        restore_task = mgr_cluster.create_restore_task(restore_data=True,
+                                                       location_list=location_list,
+                                                       snapshot_tag=chosen_snapshot_tag)
+        restore_task.wait_and_get_final_status(step=30, timeout=chosen_snapshot_info["expected_timeout"])
+        assert restore_task.status == TaskStatus.DONE, f'Data restoration of {chosen_snapshot_tag} has failed!'
+
+        manager_version = mgr_cluster.sctool.parsed_client_version
+        if manager_version < LooseVersion("3.2"):
+            mgr_task = mgr_cluster.create_repair_task()
+            task_final_status = mgr_task.wait_and_get_final_status(timeout=chosen_snapshot_info["expected_timeout"])
+            assert task_final_status == TaskStatus.DONE, 'Task: {} final status is: {}.'.format(
+                mgr_task.id, str(mgr_task.status))
+
+        confirmation_stress_template = (
+            persistent_manager_snapshots_dict)[cluster_backend]["confirmation_stress_template"]
+        stress_queue = execute_data_validation_thread(command_template=confirmation_stress_template,
+                                                      keyspace_name=chosen_snapshot_info["keyspace_name"],
+                                                      number_of_rows=chosen_snapshot_info["number_of_rows"])
+        for stress in stress_queue:
+            self.tester.verify_stress_thread(cs_thread_pool=stress)
 
     def _delete_existing_backups(self, mgr_cluster):
         deleted_tasks = []
