test(perf): add mv write test that increases latency of regular reads #8724

Merged 1 commit on Sep 24, 2024
@@ -0,0 +1,12 @@
#!groovy

// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)

perfRegressionParallelPipeline(
backend: "aws",
test_name: "performance_regression_test.PerformanceRegressionMaterializedViewLatencyTest",
test_config: """["test-cases/performance/perf-regression-latency-mv-read-concurrency.yaml"]""",
sub_tests: ["test_read_mv_latency"],
email_recipients: 'wojciech.mitros@scylladb.com,artsiom.mishuta@scylladb.com,piodul@scylladb.com'
Contributor:
only those 3 names? better use a group.

Contributor (author):
The test is just a reproducer of this problem and should not be used in regular runs.

)
69 changes: 69 additions & 0 deletions performance_regression_test.py
@@ -30,6 +30,7 @@
from sdcm.sct_events.system import HWPerforanceEvent, InfoEvent
from sdcm.utils.decorators import log_run_info, latency_calculator_decorator
from sdcm.utils.csrangehistogram import CSHistogramTagTypes
from sdcm.utils.nemesis_utils.indexes import wait_for_view_to_be_built

KB = 1024

@@ -891,3 +892,71 @@ def test_latency_write_with_upgrade(self):
def test_latency_mixed_with_upgrade(self):
self._prepare_latency_with_upgrade()
self.run_workload_and_upgrade(stress_cmd=self.params.get('stress_cmd_m'))


class PerformanceRegressionMaterializedViewLatencyTest(PerformanceRegressionTest):
Contributor:
would be good to add a doc string describing the purpose of this test, e.g. measuring the impact of creating an MV on read latency, and describing the specifics of the "special" workload.

Contributor (author):
added doc string

"""
the idea is to reproduce the hardest scenario for MV
based on internal doc "Consistency problems in materialized views"
modifying a column that is a regular column in the base table,
but in the materialized view is one of the primary key columns.
Other types of materialized view updates are easier to handle,
once we figure out how to do the hardest case correctly, all of the other cases will be solved as well.
currently this problem is not solved.
The test is just reproducer of this problem and should not be used in regular runs
test steps:
1 - 3 node cluster with 2 tables
2 - do special prepare CMD for table 1, and use table 2 as for latency PERF TEST (prepare_write_cmd)
3 - start read workload for table 2 - measure latency for table 2 (10min) (stress_cmd_r)
4 - do a special rewrite workload for table 1 to measure latency for table 2 (while changing for table 1 applying )(stress_cmd_no_mv)
5 - create MV, and wait for MV to sync - measure latency for table 2 (while MV is syncing )
6- do special rewrite workload for table 1 again - measure latency for table 2 (while changing for table 1 applying ) (stress_cmd_mv)
"""

def test_read_mv_latency(self):
self.run_fstrim_on_all_db_nodes()
self.preload_data() # prepare_write_cmd
self.wait_no_compactions_running()
self.run_fstrim_on_all_db_nodes()

self.create_test_stats(sub_type="read", append_sub_test_to_name=False, test_index="mv-overloading-latency-read")
self.run_stress_thread(stress_cmd=self.params.get('stress_cmd_r'), stress_num=1,
                       stats_aggregate_cmds=False)

Contributor:
you don't need the result of this command, do you? it is just a background workload?
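For context, an inference from the code rather than from the thread: the queue returned by run_stress_thread is deliberately discarded here, since stress_cmd_r only supplies the steady background reads whose latency the decorated steps below measure, and the thread is stopped at the end by self.loaders.kill_stress_thread(). If the read results were needed, the queue would be kept, roughly:

# hypothetical variant, shown only for contrast with the code above:
read_queue = self.run_stress_thread(stress_cmd=self.params.get('stress_cmd_r'),
                                    stress_num=1, stats_aggregate_cmds=False)
results = self.get_stress_results(queue=read_queue, store_results=False)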

self.steady_state_read_workload_latency()  # stress_cmd_r
self.do_rewrite_workload()  # stress_cmd_no_mv + stress_cmd_r
self.wait_mv_sync()  # stress_cmd_r
self.do_rewrite_workload_with_mv()  # stress_cmd_mv + stress_cmd_r
self.loaders.kill_stress_thread()
self.check_latency_during_ops()
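A note on the flow above: each step wrapped with @latency_calculator_decorator becomes its own measurement window against the background read workload, and check_latency_during_ops() then evaluates the latency recorded per window. The real decorator lives in sdcm.utils.decorators and also collects HDR histogram data; below is a much-simplified sketch of the pattern, where everything beyond the standard library (in particular the latency_windows store) is assumed for illustration:

import time
from functools import wraps

def latency_window(func):
    """Record start/end timestamps of a test step so that read latency
    for that window can be looked up later (simplified illustration)."""
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        start = time.time()
        try:
            return func(self, *args, **kwargs)
        finally:
            # hypothetical per-test store of measurement windows
            self.latency_windows[func.__name__] = (start, time.time())
    return wrapper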

@latency_calculator_decorator
def steady_state_read_workload_latency(self):
InfoEvent(message='start_read_workload_latency begin').publish()
time.sleep(15*60)
InfoEvent(message='start_read_workload_latency ended').publish()

@latency_calculator_decorator
def do_rewrite_workload(self):
base_cmd = self.params.get('stress_cmd_no_mv')
stress_queue = self.run_stress_thread(stress_cmd=base_cmd, stress_num=1, stats_aggregate_cmds=False)
results = self.get_stress_results(queue=stress_queue, store_results=False)
self.display_results(results, test_name='do_rewrite_workload')

@latency_calculator_decorator
def wait_mv_sync(self):
node1 = self.db_cluster.nodes[0]
node1.run_cqlsh(
"CREATE TABLE IF NOT EXISTS scylla_bench.test (pk bigint,ck bigint,v blob,PRIMARY KEY(pk, ck)) WITH compression = { }")
Comment on lines +952 to +953

Contributor:
Do you need this command? This table should already have been created during prepare.

node1.run_cqlsh("CREATE MATERIALIZED VIEW IF NOT EXISTS scylla_bench.view_test AS SELECT * FROM scylla_bench.test where v IS NOT NULL AND ck IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (v, pk, ck)")
wait_for_view_to_be_built(node1, 'scylla_bench', 'view_test', timeout=1000)
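For reference, wait_for_view_to_be_built (imported at the top of this file) blocks until the view build finishes. Scylla exposes per-host build progress in the system_distributed.view_build_status table, so an equivalent hand-rolled check would look roughly like this (a sketch, not the helper's actual implementation):

# the view is fully built once every host reports status = 'SUCCESS'
node1.run_cqlsh("SELECT host_id, status FROM system_distributed.view_build_status "
                "WHERE keyspace_name='scylla_bench' AND view_name='view_test'")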

@latency_calculator_decorator
def do_rewrite_workload_with_mv(self):
base_cmd = self.params.get('stress_cmd_mv')
stress_queue = self.run_stress_thread(stress_cmd=base_cmd, stress_num=1, stats_aggregate_cmds=False)
results = self.get_stress_results(queue=stress_queue, store_results=False)
self.display_results(results, test_name='do_rewrite_workload_with_mv')
31 changes: 31 additions & 0 deletions test-cases/performance/perf-regression-latency-mv-read-concurrency.yaml
@@ -0,0 +1,31 @@
test_duration: 680
prepare_write_cmd: ["cassandra-stress write no-warmup cl=ALL n=100000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=1000 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=1..2000000",
"scylla-bench -workload=sequential -mode=write -replication-factor=2 -partition-count=10 -partition-offset=0 -clustering-row-count=1000000 -clustering-row-size=uniform:100..5120 -concurrency=1000 -rows-per-request=10 -timeout=30s -connection-count 1000 -consistency-level=all",
"scylla-bench -workload=sequential -mode=write -replication-factor=2 -partition-count=10 -partition-offset=10 -clustering-row-count=1000000 -clustering-row-size=uniform:100..5120 -concurrency=1000 -rows-per-request=10 -timeout=30s -connection-count 1000 -consistency-level=all",
"scylla-bench -workload=sequential -mode=write -replication-factor=2 -partition-count=10 -partition-offset=20 -clustering-row-count=1000000 -clustering-row-size=uniform:100..5120 -concurrency=1000 -rows-per-request=10 -timeout=30s -connection-count 1000 -consistency-level=all"]

stress_cmd_r: "cassandra-stress read cl=ALL duration=600m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate 'threads=10 throttle=100/s' -col 'size=FIXED(128) n=FIXED(8)' -pop 'dist=gauss(1..100000,50000,50000)' "
Contributor:
the latency decorator verifies HDR reports from this command; does 100/s throttling make any sense? The cluster is not loaded much.

Contributor (author):
the cluster actually is overloaded during this test (but not by this command). The latency for this command during the do_rewrite_workload_with_mv step is 224412.04, or the I/O load even failed.

stress_cmd_no_mv: "scylla-bench -workload=uniform -mode=write -replication-factor=2 -partition-count=30 -clustering-row-count=1000000 -clustering-row-size=uniform:100..5120 -concurrency=500 -max-rate=16000 -rows-per-request=1 -timeout=30s -connection-count 500 -consistency-level=one -iterations=0 -duration=15m"
stress_cmd_mv: "scylla-bench -workload=uniform -mode=write -replication-factor=2 -partition-count=30 -clustering-row-count=1000000 -clustering-row-size=uniform:100..5120 -concurrency=500 -max-rate=4000 -rows-per-request=1 -timeout=30s -connection-count 500 -consistency-level=one -iterations=0 -duration=15m"
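Worth noting, as an inference from the numbers above: stress_cmd_mv is throttled to a quarter of stress_cmd_no_mv's rate (max-rate=4000 vs. 16000). Once the view exists, every write that changes `v` triggers a read-before-write on the base replica plus a delete and an insert on the view, so the same nominal client rate produces several times the internal load.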

n_db_nodes: 3
n_loaders: 2
n_monitor_nodes: 1

instance_type_loader: 'c6i.2xlarge'
instance_type_monitor: 't3.large'
instance_type_db: 'i4i.2xlarge'

user_prefix: 'perf-latency-mv-overloaded'
space_node_threshold: 644245094
ami_id_db_scylla_desc: 'VERSION_DESC'

round_robin: true
append_scylla_args: '--blocked-reactor-notify-ms 5 --abort-on-lsa-bad-alloc 1 --abort-on-seastar-bad-alloc --abort-on-internal-error 1 --abort-on-ebadf 1'
backtrace_decoding: false
print_kernel_callstack: true

store_perf_results: true
use_prepared_loaders: true
use_hdr_cs_histogram: true
custom_es_index: 'mv-overloading-latency-read'