Skip to content

Commit

Permalink
test(upgrade): add upgrade test for checking latency diff using latte
Browse files Browse the repository at this point in the history
Ref: scylladb/scylla-enterprise#4294
Ref: scylladb/qa-tasks#1742
(cherry picked from commit 1201241)
  • Loading branch information
vponomaryov committed Sep 23, 2024
1 parent 86608a0 commit c2ad3f8
Show file tree
Hide file tree
Showing 3 changed files with 175 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!groovy

// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
// Loads the 'sct' shared pipeline library at the 'snapshot' ref from the same SCM
// as this Jenkinsfile, making 'rollingUpgradePipeline' available below.
def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)

// NOTE: case for covering scylla-enterprise#4294 bug
rollingUpgradePipeline(
    // Run on Google Compute Engine in the us-east1 region.
    backend: 'gce',
    region: 'us-east1',
    // Empty base_versions: let the pipeline pick the upgrade-from versions.
    base_versions: '',
    linux_distro: 'ubuntu-jammy',
    // Test entry point defined in upgrade_test.py of scylla-cluster-tests.
    test_name: 'upgrade_test.UpgradeTest.test_cluster_upgrade_latency_regression',
    test_config: "test-cases/upgrades/rolling-upgrade-latency-regression.yaml",
)
72 changes: 72 additions & 0 deletions test-cases/upgrades/rolling-upgrade-latency-regression.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Overall SCT test timeout, in minutes.
test_duration: 360

# Cluster topology: 3 DB nodes, 1 loader (runs latte), 1 monitoring node.
n_db_nodes: 3
n_loaders: 1
n_monitor_nodes: 1

# Initial data-population command, run once before any upgrade steps.
prepare_write_cmd:
  # NOTE: --duration in these commands is number of rows that will be written.
  #       Time gets specified with 's', 'm' or 'h' letters.
  - >-
    latte run --tag latte-prepare --duration 115272000 --request-timeout 60 --retry-interval '2s,10s'
    --sampling 5s --threads 30 --connections 3 --concurrency 180 --rate 180100 -P offset=0
    --function custom -P codes="\"T13F1\"" -P row_count=115272000
    scylla-qa-internal/custom_d1/workload2/latte/custom_d1_workload2.rn
# Baseline read workload; '--generate-report' produces the latency report that
# gets compared against the post-upgrade run.
stress_before_upgrade:
  - >-
    latte run --tag latte-before --duration 115272000 --request-timeout 30 --retry-interval '500ms,5s'
    --sampling 5s --threads 30 --connections 3 --concurrency 180 --rate 160100 -P offset=0
    --function custom -P row_count=115272000 --warmup 3202000
    -P print_applied_func_names=2 -P codes="\"T13F3\"" --generate-report
    scylla-qa-internal/custom_d1/workload2/latte/custom_d1_workload2.rn
# Background read workload kept running across the whole rolling upgrade
# (lower rate than the before/after runs; no report is generated for it).
stress_during_entire_upgrade:
  - >-
    latte run --tag latte-during --duration 115272000 --request-timeout 30 --retry-interval '500ms,5s'
    --sampling 5s --threads 30 --connections 3 --concurrency 180 --rate 100100 -P offset=0
    --function custom -P row_count=115272000
    -P print_applied_func_names=2 -P codes="\"T13F3\""
    scylla-qa-internal/custom_d1/workload2/latte/custom_d1_workload2.rn
# Post-upgrade read workload; mirrors 'stress_before_upgrade' (same rate and
# '--generate-report') so the two latency reports are directly comparable.
stress_after_cluster_upgrade:
  - >-
    latte run --tag latte-after --duration 115272000 --request-timeout 30 --retry-interval '500ms,5s'
    --sampling 5s --threads 30 --connections 3 --concurrency 180 --rate 160100 -P offset=0
    --function custom -P row_count=115272000 --warmup 3202000
    -P print_applied_func_names=2 -P codes="\"T13F3\"" --generate-report
    scylla-qa-internal/custom_d1/workload2/latte/custom_d1_workload2.rn
scylla_linux_distro: 'ubuntu-jammy'
gce_image_db: 'https://www.googleapis.com/compute/v1/projects/ubuntu-os-cloud/global/images/family/ubuntu-2204-lts'
gce_instance_type_db: 'n2-highmem-32'
gce_instance_type_loader: 'e2-highcpu-32'

prepare_wait_no_compactions_timeout: 60 # (minutes)
num_nodes_to_rollback: 1 # max is "n_db_nodes -1"
upgrade_sstables: true

# No nemesis: background faults would skew the latency comparison.
nemesis_class_name: 'NoOpMonkey'
nemesis_during_prepare: false
use_mgmt: false

user_prefix: 'rolling-upgrade-ltncy-rgrssn'

server_encrypt: true
authenticator: 'PasswordAuthenticator'
authenticator_user: 'cassandra'
authenticator_password: 'cassandra'
use_legacy_cluster_init: false
internode_compression: 'all'

# NOTE: number of local SSDs which can be attached to the 'n2-highmem-32' instance type
#       must be divisible by 4 (platform requirement).
gce_n_local_ssd_disk_db: 4
# NOTE: each local SSD on GCE has 375GB, so PD size must match 'ssd-num'*'ssd-size' formula.
gce_pd_ssd_disk_size_db: 1500
gce_setup_hybrid_raid: true

use_preinstalled_scylla: false

# Scylla config overrides matching the customer-like 'custom_d1' workload.
scylla_d_overrides_files: [
    'scylla-qa-internal/custom_d1/workload1/scylla.d/cpuset.conf',
    'scylla-qa-internal/custom_d1/workload1/scylla.d/io.conf',
    'scylla-qa-internal/custom_d1/workload1/scylla.d/io_properties.yaml',
]
90 changes: 89 additions & 1 deletion upgrade_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -952,7 +952,7 @@ def _run_stress_workload(self, workload_name: str, wait_for_finish: bool = False
for thread_pool in workload_thread_pools:
self.verify_stress_thread(thread_pool)
else:
InfoEvent(message='Sleeping for 60s to let cassandra-stress start before the next steps...').publish()
InfoEvent(message='Sleeping for 60s to let the stress command(s) start before the next steps...').publish()
time.sleep(60)
return workload_thread_pools

Expand Down Expand Up @@ -1029,6 +1029,94 @@ def test_generic_cluster_upgrade(self):

self._run_stress_workload("stress_after_cluster_upgrade", wait_for_finish=True)

def test_cluster_upgrade_latency_regression(self):
    """Check latency regression after a ScyllaDB cluster upgrade using latte stress commands.

    Number of 'before' and 'after' commands must match. Their latency values will be compared.
    - Write initial latte data (prepare_write_cmd)
    - Wait for end of compactions
    - Read latte data generating report file (stress_before_upgrade)
    - Run a read latte stress (stress_during_entire_upgrade) not waiting for its end
    - Upgrade the DB cluster
      * self.run_raft_topology_upgrade_procedure()
    - Wait for the end of the stress command (stress_during_entire_upgrade)
    - Wait for end of compactions
    - Read latte data (stress_after_cluster_upgrade) generating report file
    - Compare latte results and fail if 99th percentile latency got worse by more than 20%
    """

    InfoEvent(message="Step1 - Populate DB data").publish()
    # Prepare keyspace and tables for truncate test
    self.fill_db_data_for_truncate_test(insert_rows=NUMBER_OF_ROWS_FOR_TRUNCATE_TEST)
    self.run_prepare_write_cmd()

    InfoEvent(message="Step2 - Run 'read' command before upgrade").publish()
    # Single counter shared by all upgrade/rollback steps so step numbers are unique.
    step = itertools_count(start=1)
    stress_before_upgrade_thread_pools = self._run_stress_workload(
        "stress_before_upgrade", wait_for_finish=False)
    # NOTE: get_stress_results() blocks until each pool finishes, so the baseline
    #       read completes before the during-upgrade stress is started.
    stress_before_upgrade_results = [
        self.get_stress_results(thread_pool) for thread_pool in stress_before_upgrade_thread_pools]
    stress_during_entire_upgrade_thread_pools = self._run_stress_workload(
        "stress_during_entire_upgrade", wait_for_finish=False)

    InfoEvent(message="Step3 - Upgrade cluster to '%s' version" % self.params.get('new_version')).publish()

    InfoEvent(message="Upgrade part of nodes before roll-back").publish()
    nodes_to_upgrade = self.shuffle_nodes_and_alternate_dcs(list(self.db_cluster.nodes))
    upgraded_nodes = []
    for node_to_upgrade in nodes_to_upgrade[:self.params.get('num_nodes_to_rollback')]:
        self._start_and_wait_for_node_upgrade(node_to_upgrade, step=next(step))
        upgraded_nodes.append(node_to_upgrade)

    # Rollback all nodes that were upgraded (not necessarily in the same order)
    random.shuffle(upgraded_nodes)
    InfoEvent(message="Roll-back following nodes: %s" % ", ".join(
        node.name for node in upgraded_nodes)).publish()
    for node in upgraded_nodes:
        self._start_and_wait_for_node_rollback(node, step=next(step))

    InfoEvent(message="Upgrade all nodes").publish()
    for node_to_upgrade in nodes_to_upgrade:
        self._start_and_wait_for_node_upgrade(node_to_upgrade, step=next(step))
    InfoEvent(message="All nodes were upgraded successfully").publish()

    InfoEvent(message="Step4 - Run raft topology upgrade procedure").publish()
    self.run_raft_topology_upgrade_procedure()

    InfoEvent(message="Step5 - Wait for stress_during_entire_upgrade to finish").publish()
    for stress_during_entire_upgrade_thread_pool in stress_during_entire_upgrade_thread_pools:
        self.verify_stress_thread(stress_during_entire_upgrade_thread_pool)
    # Let compactions settle so they don't skew the post-upgrade latency numbers.
    self.wait_no_compactions_running(n=240, sleep_time=30)

    InfoEvent(message="Step6 - run 'stress_after_cluster_upgrade' stress command(s)").publish()
    time.sleep(60)
    stress_after_upgrade_thread_pools = self._run_stress_workload(
        "stress_after_cluster_upgrade", wait_for_finish=False)
    stress_after_upgrade_results = [
        self.get_stress_results(thread_pool) for thread_pool in stress_after_upgrade_thread_pools]

    self.log.info(
        "Going to compare following READ stress results:\nbefore upgrade: %s\nafter upgrade: %s",
        stress_before_upgrade_results, stress_after_upgrade_results)
    assert stress_before_upgrade_results, "No 'before upgrade' stress results were collected"
    assert len(stress_before_upgrade_results) == len(stress_after_upgrade_results), (
        "Number of 'before' (%d) and 'after' (%d) stress result sets must match" % (
            len(stress_before_upgrade_results), len(stress_after_upgrade_results)))
    for before_pool_results, after_pool_results in zip(
            stress_before_upgrade_results, stress_after_upgrade_results):
        assert before_pool_results, "Empty 'before upgrade' stress result set"
        assert after_pool_results, "Empty 'after upgrade' stress result set"
        # Explicit length check: mismatched inner result counts must fail loudly,
        # not get silently truncated by zip().
        assert len(before_pool_results) == len(after_pool_results), (
            "Per-pool result counts differ: before=%d, after=%d" % (
                len(before_pool_results), len(after_pool_results)))
        for before_result, after_result in zip(before_pool_results, after_pool_results):
            assert 'latency 99th percentile' in before_result
            latency_before = float(before_result['latency 99th percentile'])
            assert latency_before > 0
            assert 'latency 99th percentile' in after_result
            latency_after = float(after_result['latency 99th percentile'])
            assert latency_after > 0
            # Fail if the post-upgrade p99 latency regressed by 20% or more.
            assert latency_after / latency_before < 1.2, (
                "p99 latency regression: before=%s, after=%s" % (latency_before, latency_after))

def test_kubernetes_scylla_upgrade(self):
"""
Run a set of different cql queries against various types/tables before
Expand Down

0 comments on commit c2ad3f8

Please sign in to comment.