Skip to content

Commit

Permalink
test(upgrade): add upgrade test for checking latency diff using latte
Browse files Browse the repository at this point in the history
Ref: scylladb/scylla-enterprise#4294
Ref: scylladb/qa-tasks#1742
(cherry picked from commit 1201241)
  • Loading branch information
vponomaryov committed Sep 23, 2024
1 parent 86608a0 commit c2ad3f8
Show file tree
Hide file tree
Showing 3 changed files with 175 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!groovy

// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
// Loads the 'sct' shared pipeline library at the 'snapshot' ref from the same SCM
// as this Jenkinsfile, making 'rollingUpgradePipeline' available below.
def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)

// NOTE: case for covering scylla-enterprise#4294 bug
rollingUpgradePipeline(
    // Run on Google Compute Engine in the us-east1 region.
    backend: 'gce',
    region: 'us-east1',
    // Empty base_versions: let the pipeline pick the upgrade-from versions.
    base_versions: '',
    linux_distro: 'ubuntu-jammy',
    // Test entry point defined in upgrade_test.py of scylla-cluster-tests.
    test_name: 'upgrade_test.UpgradeTest.test_cluster_upgrade_latency_regression',
    test_config: "test-cases/upgrades/rolling-upgrade-latency-regression.yaml",
)
72 changes: 72 additions & 0 deletions test-cases/upgrades/rolling-upgrade-latency-regression.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Overall SCT test timeout, in minutes.
test_duration: 360

# Cluster topology: 3 DB nodes, 1 loader (runs latte), 1 monitoring node.
n_db_nodes: 3
n_loaders: 1
n_monitor_nodes: 1

# Initial data-population command, run once before any upgrade steps.
prepare_write_cmd:
  # NOTE: --duration in these commands is number of rows that will be written.
  #       Time gets specified with 's', 'm' or 'h' letters.
  - >-
    latte run --tag latte-prepare --duration 115272000 --request-timeout 60 --retry-interval '2s,10s'
    --sampling 5s --threads 30 --connections 3 --concurrency 180 --rate 180100 -P offset=0
    --function custom -P codes="\"T13F1\"" -P row_count=115272000
    scylla-qa-internal/custom_d1/workload2/latte/custom_d1_workload2.rn
# Baseline read workload; '--generate-report' produces the latency report that
# gets compared against the post-upgrade run.
stress_before_upgrade:
  - >-
    latte run --tag latte-before --duration 115272000 --request-timeout 30 --retry-interval '500ms,5s'
    --sampling 5s --threads 30 --connections 3 --concurrency 180 --rate 160100 -P offset=0
    --function custom -P row_count=115272000 --warmup 3202000
    -P print_applied_func_names=2 -P codes="\"T13F3\"" --generate-report
    scylla-qa-internal/custom_d1/workload2/latte/custom_d1_workload2.rn
# Background read workload kept running across the whole rolling upgrade
# (lower rate than the before/after runs; no report is generated for it).
stress_during_entire_upgrade:
  - >-
    latte run --tag latte-during --duration 115272000 --request-timeout 30 --retry-interval '500ms,5s'
    --sampling 5s --threads 30 --connections 3 --concurrency 180 --rate 100100 -P offset=0
    --function custom -P row_count=115272000
    -P print_applied_func_names=2 -P codes="\"T13F3\""
    scylla-qa-internal/custom_d1/workload2/latte/custom_d1_workload2.rn
# Post-upgrade read workload; mirrors 'stress_before_upgrade' (same rate and
# '--generate-report') so the two latency reports are directly comparable.
stress_after_cluster_upgrade:
  - >-
    latte run --tag latte-after --duration 115272000 --request-timeout 30 --retry-interval '500ms,5s'
    --sampling 5s --threads 30 --connections 3 --concurrency 180 --rate 160100 -P offset=0
    --function custom -P row_count=115272000 --warmup 3202000
    -P print_applied_func_names=2 -P codes="\"T13F3\"" --generate-report
    scylla-qa-internal/custom_d1/workload2/latte/custom_d1_workload2.rn
scylla_linux_distro: 'ubuntu-jammy'
gce_image_db: 'https://www.googleapis.com/compute/v1/projects/ubuntu-os-cloud/global/images/family/ubuntu-2204-lts'
gce_instance_type_db: 'n2-highmem-32'
gce_instance_type_loader: 'e2-highcpu-32'

prepare_wait_no_compactions_timeout: 60 # (minutes)
num_nodes_to_rollback: 1 # max is "n_db_nodes -1"
upgrade_sstables: true

# No nemesis: background faults would skew the latency comparison.
nemesis_class_name: 'NoOpMonkey'
nemesis_during_prepare: false
use_mgmt: false

user_prefix: 'rolling-upgrade-ltncy-rgrssn'

server_encrypt: true
authenticator: 'PasswordAuthenticator'
authenticator_user: 'cassandra'
authenticator_password: 'cassandra'
use_legacy_cluster_init: false
internode_compression: 'all'

# NOTE: number of local SSDs which can be attached to the 'n2-highmem-32' instance type
#       must be divisible by 4 (platform requirement).
gce_n_local_ssd_disk_db: 4
# NOTE: each local SSD on GCE has 375GB, so PD size must match 'ssd-num'*'ssd-size' formula.
gce_pd_ssd_disk_size_db: 1500
gce_setup_hybrid_raid: true

use_preinstalled_scylla: false

# Scylla config overrides matching the customer-like 'custom_d1' workload.
scylla_d_overrides_files: [
    'scylla-qa-internal/custom_d1/workload1/scylla.d/cpuset.conf',
    'scylla-qa-internal/custom_d1/workload1/scylla.d/io.conf',
    'scylla-qa-internal/custom_d1/workload1/scylla.d/io_properties.yaml',
]
90 changes: 89 additions & 1 deletion upgrade_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -952,7 +952,7 @@ def _run_stress_workload(self, workload_name: str, wait_for_finish: bool = False
for thread_pool in workload_thread_pools:
self.verify_stress_thread(thread_pool)
else:
InfoEvent(message='Sleeping for 60s to let cassandra-stress start before the next steps...').publish()
InfoEvent(message='Sleeping for 60s to let the stress command(s) start before the next steps...').publish()
time.sleep(60)
return workload_thread_pools

Expand Down Expand Up @@ -1029,6 +1029,94 @@ def test_generic_cluster_upgrade(self):

self._run_stress_workload("stress_after_cluster_upgrade", wait_for_finish=True)

def test_cluster_upgrade_latency_regression(self):
    """Check latency regression after a ScyllaDB cluster upgrade using latte stress commands.

    Number of 'before' and 'after' commands must match. Their latency values will be compared.
    - Write initial latte data (prepare_write_cmd)
    - Wait for end of compactions
    - Read latte data generating report file (stress_before_upgrade)
    - Run a read latte stress (stress_during_entire_upgrade) not waiting for its end
    - Upgrade the DB cluster
      * self.run_raft_topology_upgrade_procedure()
    - Wait for the end of the stress command (stress_during_entire_upgrade)
    - Wait for end of compactions
    - Read latte data (stress_after_cluster_upgrade) generating report file
    - Compare latte results and fail if 99th percentile latency got worse by more than 20%
    """

    InfoEvent(message="Step1 - Populate DB data").publish()
    # Prepare keyspace and tables for truncate test
    self.fill_db_data_for_truncate_test(insert_rows=NUMBER_OF_ROWS_FOR_TRUNCATE_TEST)
    self.run_prepare_write_cmd()

    InfoEvent(message="Step2 - Run 'read' command before upgrade").publish()
    # Single counter shared by all upgrade/rollback steps so step numbers are unique.
    step = itertools_count(start=1)
    stress_before_upgrade_thread_pools = self._run_stress_workload(
        "stress_before_upgrade", wait_for_finish=False)
    # NOTE: get_stress_results() blocks until each pool finishes, so the baseline
    #       read completes before the during-upgrade stress is started.
    stress_before_upgrade_results = [
        self.get_stress_results(thread_pool) for thread_pool in stress_before_upgrade_thread_pools]
    stress_during_entire_upgrade_thread_pools = self._run_stress_workload(
        "stress_during_entire_upgrade", wait_for_finish=False)

    InfoEvent(message="Step3 - Upgrade cluster to '%s' version" % self.params.get('new_version')).publish()

    InfoEvent(message="Upgrade part of nodes before roll-back").publish()
    nodes_to_upgrade = self.shuffle_nodes_and_alternate_dcs(list(self.db_cluster.nodes))
    upgraded_nodes = []
    for node_to_upgrade in nodes_to_upgrade[:self.params.get('num_nodes_to_rollback')]:
        self._start_and_wait_for_node_upgrade(node_to_upgrade, step=next(step))
        upgraded_nodes.append(node_to_upgrade)

    # Rollback all nodes that were upgraded (not necessarily in the same order)
    random.shuffle(upgraded_nodes)
    InfoEvent(message="Roll-back following nodes: %s" % ", ".join(
        node.name for node in upgraded_nodes)).publish()
    for node in upgraded_nodes:
        self._start_and_wait_for_node_rollback(node, step=next(step))

    InfoEvent(message="Upgrade all nodes").publish()
    for node_to_upgrade in nodes_to_upgrade:
        self._start_and_wait_for_node_upgrade(node_to_upgrade, step=next(step))
    InfoEvent(message="All nodes were upgraded successfully").publish()

    InfoEvent(message="Step4 - Run raft topology upgrade procedure").publish()
    self.run_raft_topology_upgrade_procedure()

    InfoEvent(message="Step5 - Wait for stress_during_entire_upgrade to finish").publish()
    for stress_during_entire_upgrade_thread_pool in stress_during_entire_upgrade_thread_pools:
        self.verify_stress_thread(stress_during_entire_upgrade_thread_pool)
    # Let compactions settle so they don't skew the post-upgrade latency numbers.
    self.wait_no_compactions_running(n=240, sleep_time=30)

    InfoEvent(message="Step6 - run 'stress_after_cluster_upgrade' stress command(s)").publish()
    time.sleep(60)
    stress_after_upgrade_thread_pools = self._run_stress_workload(
        "stress_after_cluster_upgrade", wait_for_finish=False)
    stress_after_upgrade_results = [
        self.get_stress_results(thread_pool) for thread_pool in stress_after_upgrade_thread_pools]

    self.log.info(
        "Going to compare following READ stress results:\nbefore upgrade: %s\nafter upgrade: %s",
        stress_before_upgrade_results, stress_after_upgrade_results)
    assert stress_before_upgrade_results, "No 'before upgrade' stress results were collected"
    assert len(stress_before_upgrade_results) == len(stress_after_upgrade_results), (
        "Number of 'before' (%d) and 'after' (%d) stress result sets must match" % (
            len(stress_before_upgrade_results), len(stress_after_upgrade_results)))
    for before_pool_results, after_pool_results in zip(
            stress_before_upgrade_results, stress_after_upgrade_results):
        assert before_pool_results, "Empty 'before upgrade' stress result set"
        assert after_pool_results, "Empty 'after upgrade' stress result set"
        # Explicit length check: mismatched inner result counts must fail loudly,
        # not get silently truncated by zip().
        assert len(before_pool_results) == len(after_pool_results), (
            "Per-pool result counts differ: before=%d, after=%d" % (
                len(before_pool_results), len(after_pool_results)))
        for before_result, after_result in zip(before_pool_results, after_pool_results):
            assert 'latency 99th percentile' in before_result
            latency_before = float(before_result['latency 99th percentile'])
            assert latency_before > 0
            assert 'latency 99th percentile' in after_result
            latency_after = float(after_result['latency 99th percentile'])
            assert latency_after > 0
            # Fail if the post-upgrade p99 latency regressed by 20% or more.
            assert latency_after / latency_before < 1.2, (
                "p99 latency regression: before=%s, after=%s" % (latency_before, latency_after))

def test_kubernetes_scylla_upgrade(self):
"""
Run a set of different cql queries against various types/tables before
Expand Down

0 comments on commit c2ad3f8

Please sign in to comment.