Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CAD-2907 workbench: block time budgeting support #2888

Merged
merged 20 commits into from
Jun 30, 2021
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
ec452b8
CAD-2907 workbench: profile name parametrisation fix
deepfire May 20, 2021
5436dea
CAD-2907 workbench: 10-* profiles, for a 10-node cluster
deepfire May 20, 2021
f9df3ab
CAD-2839 workbench: a bit less unnecessary deps on local tools
deepfire May 25, 2021
165e981
CAD-2907 workbench: save genesis.json
deepfire Jun 1, 2021
f92856a
CAD-2907 workbench block-propagation analysis: fast re-iteration mode
deepfire May 26, 2021
27ad44d
CAD-2907 locli: drop the Seq
deepfire May 26, 2021
a59d62b
CAD-2907 locli: block event CDFs
deepfire May 26, 2021
781a8d3
CAD-2907 locli: more generic rendering of summaries
deepfire Jun 1, 2021
c709cb3
CAD-2907 locli: reorganise modules a bit, as analyses expose concern…
deepfire Jun 2, 2021
7a8139c
CAD-2907 locli: switch MachTimeline to generalised distribution rend…
deepfire Jun 2, 2021
7e5ef34
CAD-2907 locli: completely replace rendering with the data-driven me…
deepfire Jun 3, 2021
0172888
CAD-2907 locli: block propagation timeline
deepfire Jun 3, 2021
1b23407
CAD-2907 locli: more thorough error processing, to accommodate anomal…
deepfire Jun 4, 2021
4de060b
CAD-2907 locli: filter out unsatisfactory observations
deepfire Jun 24, 2021
ab1e929
CAD-2907 bench: add the latency probe script
deepfire Jun 24, 2021
05ba0fd
CAD-2907 workbench: ensure sponge@moreutils is available
deepfire Jun 28, 2021
840cc55
CAD-2907 workbench: adopt legacy AWS run format
deepfire Jun 25, 2021
cb0e36f
CAD-2907 workbench: support side-loaded cardano-ops run directory
deepfire Jun 28, 2021
76638ea
CAD-2907 workbench: mass-parallel performance timeline analysis
deepfire Jun 28, 2021
092dd27
CAD-2907 locli: appease hlint
deepfire Jun 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ profiles:
profile-names:
@./nix/workbench/wb profile-names

CLUSTER_PROFILE = default-alzo
CLUSTER_PROFILE ?= default-alzo
CLUSTER_ARGS_EXTRA ?=

cluster-shell:
Expand Down
42 changes: 42 additions & 0 deletions bench/script/probe.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env bash
# Probe pairwise network latency between the hosts of a nixops deployment.
# Reads last-meta.json (legacy cardano-ops run metadata) to build a
# hostname -> public-IP map, then runs `ping` from every host to every
# host in parallel, emitting one compact JSON record per (source, target)
# pair on each host's stdout.

# Build { "<hostname>": "<public_ip>", ... } from the run metadata.
# NOTE(review): assumes last-meta.json carries a .public_ip object whose
# values have .hostname and .public_ip fields -- confirm against the
# cardano-ops meta.json format.
hostmap=$(jq '
.public_ip
| values
| map({ "\(.hostname)": .public_ip})
| add' last-meta.json)

echo 'obtaining latency matrix for the hostmap:'
jq . -C <<<$hostmap

# Everything below is ONE double-quoted script argument, executed on every
# deployment host by `nixops ssh-for-each`.  Two quoting levels are in play:
#   - unescaped expansions ($hostmap, ${hostmap@Q}) happen locally, NOW;
#     ${hostmap@Q} splices the hostmap JSON in as a shell-quoted literal,
#     so the remote side receives it verbatim as data;
#   - backslash-escaped ones (\$self, \$host, \$ip, \$(...)) survive into
#     the remote script and expand there.
# Remotely: each host pings every hostmap entry (21 adaptive probes),
# strips ping's summary lines down to bare numbers with grep/sed, and
# lets jq --slurp fold them into a single JSON object per target.
nixops ssh-for-each --parallel -- "
self=\$(hostname)

function probe() {
local host=\$1 ip=\$2

ping -qAc21 \$ip |
grep 'rtt\|transmitted' |
sed 's_, \|/_\n_g' |
sed 's_ packets transmitted\| received\| packet loss\|time \|rtt min\|avg\|max\|mdev = \|ipg\|ewma \| ms\|ms\|%__g' |
grep -v '^$' |
jq '{ source: \"'\$self'\"
, target: \"'\$host'\"

, sent: .[0]
, received: .[1]
, percents_lost: .[2]
, duration_ms: .[3]
, ipg: .[8]
, ewma: .[9]

, rtt: { min: .[4], avg: .[5], max: .[6], mdev: .[7] }
}' --slurp --compact-output
}

hostmap=${hostmap@Q}

for host in \$(jq 'keys | .[]' <<<\$hostmap --raw-output)
do ip=\$(jq '.[\$host]' --arg host \$host <<<\$hostmap --raw-output)
probe \$host \$ip\ &
done"
2 changes: 1 addition & 1 deletion nix/supervisord-cluster/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ let
{ port, ... }: cfg: recursiveUpdate cfg
({
AlonzoGenesisFile = "../genesis/alonzo-genesis.json";
ShelleyGenesisFile = "../genesis/genesis.json";
ShelleyGenesisFile = "../genesis.json";
ByronGenesisFile = "../genesis/byron/genesis.json";
} // optionalAttrs enableEKG {
hasEKG = port + supervisord.portShiftEkg;
Expand Down
178 changes: 125 additions & 53 deletions nix/workbench/analyse.sh
Original file line number Diff line number Diff line change
@@ -1,105 +1,177 @@
usage_analyse() {
usage "analyse" "Analyse cluster runs" <<EOF
block-propagation RUN-NAME
Block propagation analysis for the entire cluster
Block propagation analysis for the entire cluster.

machine-timeline RUN-NAME MACH-NAME
Produce a general performance timeline for MACH-NAME

Options of 'analyse' command:

--reanalyse Skip the preparatory steps and launch 'locli' directly
--time Time the 'locli' executable runs
EOF
}

analyse() {
local skip_preparation= time= dump_logobjects=
while test $# -gt 0
do case "$1" in
--reanalyse | --re ) skip_preparation='true';;
--dump-logobjects ) dump_logobjects='true';;
--time ) time='eval time';;
* ) break;; esac; shift; done

local op=${1:-$(usage_analyse)}; shift

case "$op" in
block-propagation | bp )
local usage="USAGE: wb analyse $op [RUN-NAME=current]"

local name=${1:-current}
local dir=$(run get "$name")
local adir=$dir/analysis
if test -z "$dir"
then fail "malformed run: $name"; fi

msg "analysing run $(jq .meta.tag "$dir"/meta.json --raw-output)"
mkdir -p "$adir"

## 0. subset what we care about into the keyfile
local keyfile=$adir/substring-keys
locli analyse substring-keys | grep -v 'Temporary modify' > "$keyfile"
cat >>"$keyfile" <<EOF
TraceForgedBlock
AddedToCurrentChain
TraceChainSyncServerReadBlocked.AddBlock
TraceChainSyncServerRead.AddBlock
TraceBlockFetchServerSendBlock
TraceDownloadedHeader
CompletedBlockFetch
EOF
## 1. enumerate logs, filter by keyfile & consolidate
local logdirs=("$dir"/node-*/)

msg "filtering logs in: $dir/node-* "
local jq_args=(
--sort-keys
--compact-output
$(wb backend lostream-fixup-jqargs "$dir")
' delpaths([["app"],["env"],["loc"],["msg"],["ns"],["sev"]])
'"$(wb backend lostream-fixup-jqexpr)"
)
for d in "${logdirs[@]}"
do ## TODO: supervisor-specific logfile layout
grep -hFf "$keyfile" $(ls "$d"/stdout* | tac) | jq "${jq_args[@]}" > \
"$adir"/logs-$(basename "$d").flt.json &
done
wait

msg "log sizes: (files: $(ls "$adir"/*.flt.json | wc -l), lines: $(cat "$adir"/*.flt.json | wc -l))"
## 1. enumerate logs, filter by keyfile & consolidate
local logdirs=("$dir"/node-*/ "$dir"/analysis/node-*/)

if test -z "$skip_preparation" -o -z "$(ls "$adir"/logs-node-*.flt.json 2>/dev/null)"
then
msg "filtering logs in: $dir/node-* "
local jq_args=(
--sort-keys
--compact-output
$(wb backend lostream-fixup-jqargs "$dir")
' delpaths([["app"],["env"],["loc"],["msg"],["ns"],["sev"]])
'"$(wb backend lostream-fixup-jqexpr)"
)
for d in "${logdirs[@]}"
do local logfiles="$(ls "$d"/stdout* 2>/dev/null | tac) $(ls "$d"/node-*.json 2>/dev/null)"
if test -z "$logfiles"
then msg "no logs in $d, skipping.."; fi
grep -hFf "$keyfile" $logfiles |
jq "${jq_args[@]}" --arg dirHostname "$(basename "$d")" \
> "$adir"/logs-$(basename "$d").flt.json &
done
wait
fi

msg "log sizes: (files: $(ls "$adir"/*.flt.json 2>/dev/null | wc -l), lines: $(cat "$adir"/*.flt.json | wc -l))"

msg "analysing.."
local locli_args=(
--genesis "$dir"/genesis/genesis.json
--genesis "$dir"/genesis.json
--run-metafile "$dir"/meta.json
## ->
# --logobjects-json "$adir"/logs-cluster.logobjects.json
--analysis-json "$adir"/block-event-stream.json
--timeline-pretty "$adir"/block-propagation.txt
--analysis-json "$adir"/block-propagation.json
)
if test -n "$dump_logobjects"; then
locli_args+=(--logobjects-json "$adir"/logs-cluster.logobjects.json); fi

locli 'analyse' 'block-propagation' \
${time} locli 'analyse' 'block-propagation' \
"${locli_args[@]}" "$adir"/*.flt.json;;

grep-filtered-logs | grep | g )
local usage="USAGE: wb analyse $op BLOCK [MACHSPEC=*] [RUN-NAME=current]"
local expr=$1
local mach=${2:-*}
local name=${3:-current}
local dir=$(run get "$name")
local adir=$dir/analysis

grep -h "$expr" "$adir"/logs-$mach.flt.json;;

list-blocks | blocks | bs )
local usage="USAGE: wb analyse $op [RUN-NAME=current]"
local name=${1:-current}
local dir=$(run get "$name")
local adir=$dir/analysis

fgrep -h "TraceForgedBlock" "$adir"/*.flt.json |
jq '{ at: .at, host: .host } * .data | del(.peer) | del(.slot)' -c |
sort | uniq;;

block-propagation-block | bpb )
local usage="USAGE: wb analyse $op BLOCK [RUN-NAME=current]"
local block=$1
local name=${2:-current}
local dir=$(run get "$name")
local adir=$dir/analysis

grep -h "$block" "$adir"/*.flt.json |
grep 'AddBlock\|TraceForgedBlock\|AddedToCurrentChain' |
jq '{ at: .at, host: .host } * .data | del(.peer) | del(.slot)' -c |
sort --stable | uniq;;

machine-timeline | machine | mt )
local usage="USAGE: wb analyse $op [RUN-NAME=current] [MACH-NAME=node-1]"
local name=${1:-current}
local mach=${2:-node-1}
local dir=$(run get "$name")
local adir=$dir/analysis

msg "analysing run $(jq .meta.tag "$dir"/meta.json --raw-output)"
mkdir -p "$adir"

## 0. subset what we care about into the keyfile
local keyfile=$adir/substring-keys
locli analyse substring-keys | grep -v 'Temporary modify' > "$keyfile"

## 1. enumerate logs, filter by keyfile & consolidate
local logs=("$dir"/$mach/stdout) consolidated="$adir"/logs-$mach.json
grep -hFf "$keyfile" "${logs[@]}" > "$consolidated"
msg "analysing logs of: $mach (lines: $(wc -l "$consolidated"))"

local locli_args=(
--genesis "$dir"/genesis/genesis.json
--run-metafile "$dir"/meta.json
## ->
--logobjects-json "$adir"/logs-$mach.logobjects.json
--slotstats-json "$adir"/logs-$mach.slotstats.json
--timeline-pretty "$adir"/logs-$mach.timeline.txt
--stats-csv "$adir"/logs-$mach.stats.csv
--analysis-json "$adir"/logs-$mach.analysis.json
# --timeline-csv "$adir"/logs-$mach.timeline.csv
# --cpu-spans-histogram-png "$adir"/logs-"$mach".cpu85-span-lens.png
# --derived-vectors-0-csv "$adir"/logs-$mach".derived.1.csv
# --derived-vectors-1-csv "$adir"/logs-$mach.derived.1.csv
)
if test "$mach" = 'all'
then local machs=($(wb run list-hosts $name))
else local machs=($mach); fi

local num_jobs="\j"
local num_threads=$(grep processor /proc/cpuinfo | wc -l)
for mach in ${machs[*]}
do ## Limit parallelism:
sleep 0.5s
while ((${num_jobs@P} >= num_threads - 4))
do wait -n; sleep 0.$(((RANDOM % 5) + 1))s; done
(
## 1. enumerate logs, filter by keyfile & consolidate
local logs=($(ls "$dir"/$mach/stdout* 2>/dev/null | tac) $(ls "$dir"/$mach/node-*.json 2>/dev/null) $(ls "$dir"/analysis/$mach/node-*.json 2>/dev/null)) consolidated="$adir"/logs-$mach.json

if test -z "${logs[*]}"
then msg "no logs for $mach in run $name"; continue; fi

if test -z "$skip_preparation" -o -z "$(ls "$adir"/logs-$mach.json 2>/dev/null)"
then grep -hFf "$keyfile" "${logs[@]}" > "$consolidated"; fi

msg "analysing logs of: $mach (lines: $(wc -l "$consolidated"))"
local locli_args=(
--genesis "$dir"/genesis.json
--run-metafile "$dir"/meta.json
## ->
--timeline-pretty "$adir"/logs-$mach.timeline.txt
--stats-csv "$adir"/logs-$mach.stats.csv
--analysis-json "$adir"/logs-$mach.analysis.json
# --slotstats-json "$adir"/logs-$mach.slotstats.json
# --timeline-csv "$adir"/logs-$mach.timeline.csv
# --cpu-spans-histogram-png "$adir"/logs-"$mach".cpu85-span-lens.png
# --derived-vectors-0-csv "$adir"/logs-$mach".derived.1.csv
# --derived-vectors-1-csv "$adir"/logs-$mach.derived.1.csv
)
if test -n "$dump_logobjects"; then
locli_args+=(--logobjects-json "$adir"/logs-$mach.logobjects.json); fi

${time} locli 'analyse' 'machine-timeline' \
"${locli_args[@]}" "$consolidated"
) &
done

locli 'analyse' 'machine-timeline' \
"${locli_args[@]}" "$consolidated";;
wait
msg "analysis machine-timeline: All done.";;

* ) usage_analyse;; esac
}
3 changes: 2 additions & 1 deletion nix/workbench/backend.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ case "${op}" in
record-extended-env-config ) $WORKBENCH_BACKEND "$@";;
describe-run ) $WORKBENCH_BACKEND "$@";;
pre-run-hook ) $WORKBENCH_BACKEND "$@";;
start-run ) $WORKBENCH_BACKEND "$@";;
start-run ) cp "$2"/genesis/genesis.json "$2"/genesis.json
$WORKBENCH_BACKEND "$@";;
lostream-fixup-jqargs ) $WORKBENCH_BACKEND "$@";;
lostream-fixup-jqexpr ) $WORKBENCH_BACKEND "$@";;

Expand Down
9 changes: 5 additions & 4 deletions nix/workbench/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ let
runWorkbenchJqOnly =
name: command:
runCommand name {} ''
${workbench' [jq]}/bin/wb ${command} > $out
${workbench' [jq moreutils]}/bin/wb ${command} > $out
'';

runJq =
Expand Down Expand Up @@ -133,7 +133,8 @@ let
function workbench-prebuild-executables() {
${optionalString useCabalRun
''
git log -n1 --alternate-refs --pretty=format:"%Cblue%h %Cred%cr %Cgreen%D %Cblue%s%Creset"
git log -n1 --alternate-refs --pretty=format:"%Cred%cr %Cblue%h %Cgreen%D %Cblue%s%Creset" --color | sed "s/^/$(git diff --exit-code --quiet && echo ' ' || echo 'local changes + ')/"
echo
echo -n "workbench: prebuilding executables (because of useCabalRun):"
for exe in cardano-cli cardano-node cardano-topology locli
do echo -n " $exe"
Expand Down Expand Up @@ -171,7 +172,7 @@ let
with envArgs; rec {
inherit cardanoLib stateDir;

JSON = runWorkbench "environment.json"
JSON = runWorkbenchJqOnly "environment.json"
''env compute-config \
--cache-dir "${cacheDir}" \
--base-port ${toString basePort} \
Expand All @@ -194,7 +195,7 @@ let
profiles = genAttrs profile-names mkProfile;

profilesJSON =
runWorkbench "all-profiles.json" "profiles generate-all";
runWorkbenchJqOnly "all-profiles.json" "profiles generate-all";
};

initialiseProfileRunDirShellScript =
Expand Down
9 changes: 6 additions & 3 deletions nix/workbench/locli/locli.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,17 @@ library
Cardano.Analysis.Profile
Cardano.Analysis.TopHandler

Cardano.Unlog.BlockProp
Cardano.Analysis.BlockProp
Cardano.Analysis.Driver
Cardano.Analysis.MachTimeline

Cardano.Unlog.Commands
Cardano.Unlog.LogObject
Cardano.Unlog.Parsers
Cardano.Unlog.Render
Cardano.Unlog.Resources
Cardano.Unlog.Run
Cardano.Unlog.SlotStats
Cardano.Unlog.Summary
Cardano.Unlog.Timeline

other-modules: Paths_locli

Expand All @@ -54,6 +56,7 @@ library
, process
, scientific
, split
, statistics
, template-haskell
, text
, text-short
Expand Down
Loading