diff --git a/scenarios/README.md b/scenarios/README.md deleted file mode 100644 index a7f2655..0000000 --- a/scenarios/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# Scenario-based testing for CockroachDB - -This directory contains example configurations to test CockroachDB -operational scenarios. diff --git a/scenarios/deco_short_uprepl/conf.cfg b/scenarios/deco_short_uprepl/conf.cfg deleted file mode 100644 index 9c4ca44..0000000 --- a/scenarios/deco_short_uprepl/conf.cfg +++ /dev/null @@ -1,18 +0,0 @@ -# define what a "client" is -include workers/workload_kv.cfg -# define what a "nemesis" is -include nemeses/decommission.cfg - -# define 4 nodes, 2 clients, 1 nemesis -include servers.cfg -include workers.cfg -include nemesis_n1.cfg - -# simplest scenario: initialize, then activate the nemesis for a short time -include scripts/short.cfg - -# verify "good behavior" -include workers/check_stable_behavior_kv.cfg - -# ensure that initialization also waits for upreplication -include wait_upreplication.cfg diff --git a/scenarios/include/nemeses/decommission.cfg b/scenarios/include/nemeses/decommission.cfg deleted file mode 100644 index 7ab65ed..0000000 --- a/scenarios/include/nemeses/decommission.cfg +++ /dev/null @@ -1,6 +0,0 @@ -title decommission nemesis -role nemesis_decommission - :stop $COCKROACH node decommission $node_id --insecure --url $(cat ../$node/pgurl) --wait all >deco.log 2>&1 - :restart $COCKROACH node recommission $node_id --insecure --url $(cat ../$node/pgurl) >reco.log 2>&1 -end -parameter nemesis defaults to nemesis_decommission diff --git a/scenarios/include/nemeses/kill.cfg b/scenarios/include/nemeses/kill.cfg deleted file mode 100644 index a57888f..0000000 --- a/scenarios/include/nemeses/kill.cfg +++ /dev/null @@ -1,6 +0,0 @@ -title kill nemesis -role nemesis_kill - :stop kill -9 $(cat ../$node/pid) - :restart ../$node/actions/start.sh -end -parameter nemesis defaults to nemesis_kill diff --git a/scenarios/include/nemeses/quit.cfg b/scenarios/include/nemeses/quit.cfg deleted file mode 100644 index c1e9884..0000000 --- a/scenarios/include/nemeses/quit.cfg +++ /dev/null @@ -1,6 +0,0 @@ -title quit nemesis -role nemesis_quit - :stop $COCKROACH quit --url $(cat ../$node/pgurl) - :restart ../$node/actions/start.sh -end -parameter nemesis defaults to nemesis_quit diff --git a/scenarios/include/nemeses/roachprod_kill.cfg b/scenarios/include/nemeses/roachprod_kill.cfg deleted file mode 100644 index ef5e3cc..0000000 --- a/scenarios/include/nemeses/roachprod_kill.cfg +++ /dev/null @@ -1,8 +0,0 @@ -title roachprod kill/restart nemesis -role nemesis_rp_kill - :stop HOME=$ROACHPROD_HOME/.. \ - $ROACHPROD stop $(cat ../$node/roachprod_target) >>kill.log 2>&1 - :restart HOME=$ROACHPROD_HOME/.. TMPDIR=/tmp \ - $ROACHPROD start $(cat ../$node/roachprod_target) >>restart.log 2>&1 -end -parameter nemesis defaults to nemesis_rp_kill diff --git a/scenarios/include/nemesis_n1.cfg b/scenarios/include/nemesis_n1.cfg deleted file mode 100644 index 7d7f8bc..0000000 --- a/scenarios/include/nemesis_n1.cfg +++ /dev/null @@ -1,14 +0,0 @@ -title pain on n1 - -cast - badguy plays ~nemesis~ with node=n1 p=26257 h=8080 node_id=1 -end - -script - scene F entails for badguy: stop - scene F mood starts orange - scene F mood ends red - scene R entails for badguy: restart - scene R mood starts yellow - scene R mood ends clear -end diff --git a/scenarios/include/scripts/cycle.cfg b/scenarios/include/scripts/cycle.cfg deleted file mode 100644 index 972de6a..0000000 --- a/scenarios/include/scripts/cycle.cfg +++ /dev/null @@ -1,5 +0,0 @@ -script - # let acts 1 to 3 alone, these are used to initialize the server and - # calibrate expected values. - storyline . . . ... FR ...... -end diff --git a/scenarios/include/scripts/long.cfg b/scenarios/include/scripts/long.cfg deleted file mode 100644 index 0c276ac..0000000 --- a/scenarios/include/scripts/long.cfg +++ /dev/null @@ -1,5 +0,0 @@ -script - # let acts 1 to 3 alone, these are used to initialize the server and - # calibrate expected values. - storyline . . . ... F ............... R ........ -end diff --git a/scenarios/include/scripts/short.cfg b/scenarios/include/scripts/short.cfg deleted file mode 100644 index 3a245a6..0000000 --- a/scenarios/include/scripts/short.cfg +++ /dev/null @@ -1,5 +0,0 @@ -script - # let acts 1 to 3 alone, these are used to initialize the server and - # calibrate expected values. - storyline . . . F ... R ... -end diff --git a/scenarios/include/servers.cfg b/scenarios/include/servers.cfg deleted file mode 100644 index a46fa26..0000000 --- a/scenarios/include/servers.cfg +++ /dev/null @@ -1,18 +0,0 @@ -parameter cluster_size defaults to 4 -title ~cluster_size~ nodes - -parameter serverconf defaults to local -include servers/cockroach_~serverconf~.cfg - -cast - n* play ~cluster_size~ nodes \ - with nodei=$(($i+1)) \ - pbase=$i \ - peers=localhost:$((26257+($i+~cluster_size~-1)%~cluster_size~)),localhost:$((26257+($i+~cluster_size~+1)%~cluster_size~)) -end - -script - scene s entails for every node: start - scene i entails for n1: init - storyline si... -end diff --git a/scenarios/include/servers/cockroach_local.cfg b/scenarios/include/servers/cockroach_local.cfg deleted file mode 100644 index 6e18f63..0000000 --- a/scenarios/include/servers/cockroach_local.cfg +++ /dev/null @@ -1,17 +0,0 @@ -role node - :start export p=$((26257+$pbase)) h=$((8080+$pbase)); \ - export TMPDIR=/tmp; \ - echo "postgresql://root@localhost:$p/?sslmode=disable">pgurl; \ - $COCKROACH start \\ - --join=$peers --listen-addr=localhost:$p --http-addr=localhost:$h --insecure \\ - --background --pid-file=pid - :init $COCKROACH init --insecure --url $(cat pgurl) - :reset kill $(cat pid) && sleep 1 && kill -9 $(cat pid) || true; rm -rf cockroach-data - cleanup if test -e pid; then kill $(cat pid) && sleep 1 && kill -9 $(cat pid) || true; fi - spotlight tail -F cockroach-data/logs/cockroach.log | \\ - stdbuf -oL grep '^[WEF]' | \\ - stdbuf -oL grep -v 'server is not accepting clients' - signal warning event at ^W(?P)\s+\d+\s+\S+\s+(?P.*)$ - signal error event at ^E(?P)\s+\d+\s+\S+\s+(?P.*)$ - signal fatal event at ^F(?P)\s+\d+\s+\S+\s+(?P.*)$ -end diff --git a/scenarios/include/servers/cockroach_roachprod.cfg b/scenarios/include/servers/cockroach_roachprod.cfg deleted file mode 100644 index db82a4d..0000000 --- a/scenarios/include/servers/cockroach_roachprod.cfg +++ /dev/null @@ -1,17 +0,0 @@ -role node - :start export HOME=$ROACHPROD_HOME/.. TMPDIR=/tmp; \\ - $ROACHPROD pgurl $CLUSTER:$nodei --external | sed "s/'//g" >pgurl && \\ - echo $CLUSTER:$nodei >roachprod_target && \\ - $ROACHPROD start $CLUSTER:$nodei - :init true - :reset HOME=$ROACHPROD_HOME/.. $ROACHPROD wipe $CLUSTER:$nodei - cleanup export HOME=$ROACHPROD_HOME/..; \ - $ROACHPROD stop $CLUSTER:$nodei; \ - $ROACHPROD get $CLUSTER:$nodei logs || true - spotlight HOME=$ROACHPROD_HOME/.. $ROACHPROD run $CLUSTER:$nodei "tail -F logs/cockroach.log" | \\ - stdbuf -oL grep '^[WEF]' | \\ - stdbuf -oL grep -v 'server is not accepting clients' - signal warning event at ^W(?P)\s+\d+\s+\S+\s+(?P.*)$ - signal error event at ^E(?P)\s+\d+\s+\S+\s+(?P.*)$ - signal fatal event at ^F(?P)\s+\d+\s+\S+\s+(?P.*)$ -end diff --git a/scenarios/include/wait_upreplication.cfg b/scenarios/include/wait_upreplication.cfg deleted file mode 100644 index 82211bb..0000000 --- a/scenarios/include/wait_upreplication.cfg +++ /dev/null @@ -1,14 +0,0 @@ -role wait_upreplication - :wait $COCKROACH sql --url $(cat ../$node/pgurl) \\ - -e "select if((select min(array_length(replicas,1)) from crdb_internal.ranges)>=3, 0, if(pg_sleep(.5), crdb_internal.force_retry('1h'), 0))"; sleep 2 -end - -cast - waiter plays wait_upreplication with node=n2 -end - -script - scene U entails for waiter: wait - # Insert a wait as first step in act 2. - edit s/^([^ ]*) /$1 U/ -end diff --git a/scenarios/include/workers.cfg b/scenarios/include/workers.cfg deleted file mode 100644 index 51837d4..0000000 --- a/scenarios/include/workers.cfg +++ /dev/null @@ -1,19 +0,0 @@ -parameter nworkers defaults to 2 -title ~nworkers~ workers - -cast - w* play ~nworkers~ workers \ - with node=n$(($i+1)) pprof=$((33330+$i)) from=$((~cluster_size~+$i+1)) -end - -script - scene p entails for every worker: prepare - scene w entails for every worker: start - scene w mood ends blue - # after scene w where the worker starts, we are in ramp-up mode (blue) until scene c. - scene c mood ends clear - # let the first act alone, we assume the first act is used to prepare the server. - # act 2 is used to prepare and start the workload. - # act 3 is used for the ramp-up period. - storyline . pw ...c -end diff --git a/scenarios/include/workers/check_stable_behavior_kv.cfg b/scenarios/include/workers/check_stable_behavior_kv.cfg deleted file mode 100644 index 5ba8d85..0000000 --- a/scenarios/include/workers/check_stable_behavior_kv.cfg +++ /dev/null @@ -1,36 +0,0 @@ -attention blue period is considered ramp-up, used to calibrate expectations - -audience - ref audits only while mood == 'blue' - ref collects tp_ref_values as top 5 [w2 throughput] - ref computes tp_ref as avg(tp_ref_values) - ref watches tp_ref - ref collects lat99_ref_values as bottom 5 [w2 lat99] - ref computes lat99_ref as avg(lat99_ref_values) - ref watches lat99_ref - # we want to see at least one measurement. No measurements - # indicates either the workload did not start properly, or - # the signal regexp is not working. - ref expects eventually: count(tp_ref_values) > 0 && count(lat99_ref_values) > 0 - ref only helps - - throughput_ok audits only while mood != 'blue' - throughput_ok expects always: [w2 throughput] >= (tp_ref / 2) - throughput_ok watches w2 throughput - throughput_ok watches tp_ref - throughput_ok measures client ops/s - - latency_ok audits only while mood != 'blue' - latency_ok expects always: [w2 lat99] <= (3 * lat99_ref) - latency_ok watches w2 lat99 - latency_ok watches lat99_ref - latency_ok measures client latency (ms) - - tp_node1 watches w1 throughput - tp_node1 measures client ops/s - - server_events watches every node warning - server_events watches every node error - server_events watches every node fatal - server_events measures occurrences (y value is source index) -end diff --git a/scenarios/include/workers/workload_kv.cfg b/scenarios/include/workers/workload_kv.cfg deleted file mode 100644 index 91c727c..0000000 --- a/scenarios/include/workers/workload_kv.cfg +++ /dev/null @@ -1,20 +0,0 @@ -role worker - :prepare $COCKROACH workload init kv \\ - --drop --db db$node --pprofport $pprof \\ - $(cat ../$node/pgurl) - :start $COCKROACH workload run kv --db db$node --pprofport $pprof \\ - --concurrency 1 --max-rate 200 --cycle-length 10 \\ - --display-every .25s --display-format=incremental-json --tolerate-errors \\ - $(cat ../$node/pgurl) \\ - & echo $!>wpid - :stop kill $(cat wpid) && sleep 1 && kill -9 $(cat wpid) || true; rm -f start.log - cleanup if test -e wpid; then kill $(cat wpid) && sleep 1 && kill -9 $(cat wpid) || true; fi; touch start.log - # Note: we need to "touch" the file prior to looking because - # otherwise "tail" may wait too long to detect the first output. - spotlight touch start.log; tail -F start.log - signal errors delta at .*"time":"(?P)".*"errs":(?P[^,]*),.* - signal throughput scalar at .*"time":"(?P)".*"avgt":(?P[^,]*),.* - signal lat50 scalar at .*"time":"(?P)".*"p50l":(?P[^,]*),.* - signal lat95 scalar at .*"time":"(?P)".*"p95l":(?P[^,]*),.* - signal lat99 scalar at .*"time":"(?P)".*"p99l":(?P[^,]*),.* -end diff --git a/scenarios/include/workers/workload_kv_roachprod.cfg b/scenarios/include/workers/workload_kv_roachprod.cfg deleted file mode 100644 index 44bb1c7..0000000 --- a/scenarios/include/workers/workload_kv_roachprod.cfg +++ /dev/null @@ -1,22 +0,0 @@ -role worker - :prepare export HOME=$ROACHPROD_HOME/..; \\ - $ROACHPROD run $CLUSTER:$from \\ - "./cockroach workload init kv --drop --db db$node $(cat ../$node/pgurl)" - :start export HOME=$ROACHPROD_HOME/..; \\ - $ROACHPROD run $CLUSTER:$from \\ - "./cockroach workload run kv --db db$node \\ - --concurrency 1 --max-rate 200 --cycle-length 10 \\ - --display-every .25s --display-format=incremental-json --tolerate-errors \\ - $(cat ../$node/pgurl)" \\ - & echo $!>wpid - :stop kill $(cat wpid) && sleep 1 && kill -9 $(cat wpid) || true; rm -f start.log - cleanup if test -e wpid; then kill $(cat wpid) && sleep 1 && kill -9 $(cat wpid) || true; fi; touch start.log - # Note: we need to "touch" the file prior to looking because - # otherwise "tail" may wait too long to detect the first output. - spotlight touch start.log; tail -F start.log - signal errors delta at .*"time":"(?P)".*"errs":(?P[^,]*),.* - signal throughput scalar at .*"time":"(?P)".*"avgt":(?P[^,]*),.* - signal lat50 scalar at .*"time":"(?P)".*"p50l":(?P[^,]*),.* - signal lat95 scalar at .*"time":"(?P)".*"p95l":(?P[^,]*),.* - signal lat99 scalar at .*"time":"(?P)".*"p99l":(?P[^,]*),.* -end diff --git a/scenarios/quit_short/conf.cfg b/scenarios/quit_short/conf.cfg deleted file mode 100644 index 8b6a790..0000000 --- a/scenarios/quit_short/conf.cfg +++ /dev/null @@ -1,15 +0,0 @@ -# define what a "client" is -include workers/workload_kv.cfg -# define what a "nemesis" is -include nemeses/quit.cfg - -# define 3 nodes, 2 clients, 1 nemesis -include servers.cfg -include workers.cfg -include nemesis_n1.cfg - -# simplest scenario: initialize, then activate the nemesis for a short time -include scripts/short.cfg - -# verify "good behavior" -include workers/check_stable_behavior_kv.cfg diff --git a/scenarios/quit_short/latency_spike_after_quit.cfg b/scenarios/quit_short/latency_spike_after_quit.cfg deleted file mode 100644 index 6dc1400..0000000 --- a/scenarios/quit_short/latency_spike_after_quit.cfg +++ /dev/null @@ -1,17 +0,0 @@ -include conf.cfg - -audience - quit_lat_spike audits only while mood == 'red' - quit_lat_spike expects like latency_ok - quit_lat_spike only helps -end - -script - repeat from F - repeat time 5m -end - -interpretation - ignore disappointment - foul upon quit_lat_spike disappointment -end diff --git a/scenarios/quit_short/latency_spike_after_restart.cfg b/scenarios/quit_short/latency_spike_after_restart.cfg deleted file mode 100644 index 397792a..0000000 --- a/scenarios/quit_short/latency_spike_after_restart.cfg +++ /dev/null @@ -1,20 +0,0 @@ -include conf.cfg - -audience - tp_delta collects prev2 as last 2 [kv2 throughput] - tp_delta computes tp_up_ramp as first(prev2) == 0 && last(prev2) > 0 - - restart_lat_spike audits only while mood == 'yellow' || mood == 'clear' - restart_lat_spike expects always: tp_up_ramp || [kv2 lat99] <= (3 * lat99_ref) - restart_lat_spike only helps -end - -script - repeat from F - repeat time 5m -end - -interpretation - ignore disappointment - foul upon restart_lat_spike disappointment -end diff --git a/scenarios/quit_short/latency_spike_during_quit.cfg b/scenarios/quit_short/latency_spike_during_quit.cfg deleted file mode 100644 index 7f1fb61..0000000 --- a/scenarios/quit_short/latency_spike_during_quit.cfg +++ /dev/null @@ -1,17 +0,0 @@ -include conf.cfg - -audience - quit_lat_spike audits only while mood == 'orange' - quit_lat_spike expects like latency_ok - quit_lat_spike only helps -end - -script - repeat from F - repeat time 5m -end - -interpretation - ignore disappointment - foul upon quit_lat_spike disappointment -end diff --git a/scenarios/quit_short/tp_down_after_quit.cfg b/scenarios/quit_short/tp_down_after_quit.cfg deleted file mode 100644 index d0a3a26..0000000 --- a/scenarios/quit_short/tp_down_after_quit.cfg +++ /dev/null @@ -1,21 +0,0 @@ -include conf.cfg - -audience - tp_down audits only while mood == 'red' - tp_down expects like throughput_ok - tp_down only helps -end - -script - scene k entails for every client: stop - scene r entails for every node: reset - scene r mood starts clear - edit s/$/ kFr / - repeat from ^ - repeat time 5m -end - -interpretation - ignore disappointment - foul upon tp_down disappointment -end diff --git a/scenarios/quit_short_uprepl/conf.cfg b/scenarios/quit_short_uprepl/conf.cfg deleted file mode 100644 index e972d4e..0000000 --- a/scenarios/quit_short_uprepl/conf.cfg +++ /dev/null @@ -1,18 +0,0 @@ -# define what a "client" is -include workers/workload_kv.cfg -# define what a "nemesis" is -include nemeses/quit.cfg - -# define 4 nodes, 2 clients, 1 nemesis -include servers.cfg -include workers.cfg -include nemesis_n1.cfg - -# simplest scenario: initialize, then activate the nemesis for a short time -include scripts/short.cfg - -# verify "good behavior" -include workers/check_stable_behavior_kv.cfg - -# ensure that initialization also waits for upreplication -include wait_upreplication.cfg diff --git a/scenarios/quit_short_uprepl/latency_spike_after_quit.cfg b/scenarios/quit_short_uprepl/latency_spike_after_quit.cfg deleted file mode 100644 index 6dc1400..0000000 --- a/scenarios/quit_short_uprepl/latency_spike_after_quit.cfg +++ /dev/null @@ -1,17 +0,0 @@ -include conf.cfg - -audience - quit_lat_spike audits only while mood == 'red' - quit_lat_spike expects like latency_ok - quit_lat_spike only helps -end - -script - repeat from F - repeat time 5m -end - -interpretation - ignore disappointment - foul upon quit_lat_spike disappointment -end diff --git a/scenarios/quit_short_uprepl/latency_spike_after_restart.cfg b/scenarios/quit_short_uprepl/latency_spike_after_restart.cfg deleted file mode 100644 index 397792a..0000000 --- a/scenarios/quit_short_uprepl/latency_spike_after_restart.cfg +++ /dev/null @@ -1,20 +0,0 @@ -include conf.cfg - -audience - tp_delta collects prev2 as last 2 [kv2 throughput] - tp_delta computes tp_up_ramp as first(prev2) == 0 && last(prev2) > 0 - - restart_lat_spike audits only while mood == 'yellow' || mood == 'clear' - restart_lat_spike expects always: tp_up_ramp || [kv2 lat99] <= (3 * lat99_ref) - restart_lat_spike only helps -end - -script - repeat from F - repeat time 5m -end - -interpretation - ignore disappointment - foul upon restart_lat_spike disappointment -end diff --git a/scenarios/quit_short_uprepl/latency_spike_during_quit.cfg b/scenarios/quit_short_uprepl/latency_spike_during_quit.cfg deleted file mode 100644 index 7f1fb61..0000000 --- a/scenarios/quit_short_uprepl/latency_spike_during_quit.cfg +++ /dev/null @@ -1,17 +0,0 @@ -include conf.cfg - -audience - quit_lat_spike audits only while mood == 'orange' - quit_lat_spike expects like latency_ok - quit_lat_spike only helps -end - -script - repeat from F - repeat time 5m -end - -interpretation - ignore disappointment - foul upon quit_lat_spike disappointment -end diff --git a/scenarios/quit_short_uprepl/tp_down_after_quit.cfg b/scenarios/quit_short_uprepl/tp_down_after_quit.cfg deleted file mode 100644 index 60ab67a..0000000 --- a/scenarios/quit_short_uprepl/tp_down_after_quit.cfg +++ /dev/null @@ -1,21 +0,0 @@ -include conf.cfg - -audience - tp_down audits only while mood == 'red' - tp_down expects like throughput_ok - tp_down only helps -end - -script - scene k entails for every client: stop - scene r entails for every node: reset - scene r mood starts clear - edit s/$/ kFr / - repeat from ^ - repeat time 2m -end - -interpretation - ignore disappointment - foul upon tp_down disappointment -end diff --git a/scenarios/rp_deco_short_uprepl/conf.cfg b/scenarios/rp_deco_short_uprepl/conf.cfg deleted file mode 100644 index 882d0b6..0000000 --- a/scenarios/rp_deco_short_uprepl/conf.cfg +++ /dev/null @@ -1,19 +0,0 @@ -# define what a "client" is -include workers/workload_kv_roachprod.cfg -# define what a "nemesis" is -include nemeses/decommission.cfg - -# define 3 nodes, 2 clients, 1 nemesis -parameter serverconf defaults to roachprod -include servers.cfg -include workers.cfg -include nemesis_n1.cfg - -# simplest scenario: initialize, then activate the nemesis for a short time -include scripts/short.cfg - -# verify "good behavior" -include workers/check_stable_behavior_kv.cfg - -# ensure that initialization also waits for upreplication -include wait_upreplication.cfg diff --git a/scenarios/rp_quit_short/conf.cfg b/scenarios/rp_quit_short/conf.cfg deleted file mode 100644 index 1ea9160..0000000 --- a/scenarios/rp_quit_short/conf.cfg +++ /dev/null @@ -1,19 +0,0 @@ -# define what a "client" is -include workers/workload_kv_roachprod.cfg -# define what a "nemesis" is -include nemeses/quit.cfg - -# define 3 nodes, 2 clients, 1 nemesis -parameter serverconf defaults to roachprod -include servers.cfg -include workers.cfg -include nemesis_n1.cfg - -# simplest scenario: initialize, then activate the nemesis for a short time -include scripts/short.cfg - -# verify "good behavior" -include workers/check_stable_behavior_kv.cfg - -# ensure that initialization also waits for upreplication -include wait_upreplication.cfg