Skip to content

Commit

Permalink
Merge #2888
Browse files Browse the repository at this point in the history
2888: CAD-2907 workbench: block time budgeting support r=deepfire a=deepfire

Workbench & `locli` improvements, primarily aimed at block time budgeting analysis support

- assorted fixes & cleanups
- `10-*` profiles, for a local 10-node cluster
- `wb reanalyse *` re-analysis invocation fast path
- locli: drop `Seq`, reverting to lists
- block propagation event CDFs
- generic, data-driven rendering of summaries and timelines
- module reorganisation
- block propagation timeline
- thorough log anomaly processing: accumulation and attributed reporting
- block propagation event filtering
- cluster latency probe script
- support processing of legacy AWS benchmark runs
- support side-loaded alternate benchmark run roots, for direct on-AWS processing
- parallel processing for the machine performance timeline analysis

Co-authored-by: Kosyrev Serge <serge.kosyrev@iohk.io>
  • Loading branch information
iohk-bors[bot] and deepfire committed Jun 30, 2021
2 parents 2cbb08f + 092dd27 commit dfea540
Show file tree
Hide file tree
Showing 26 changed files with 2,116 additions and 606 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ profiles:
profile-names:
@./nix/workbench/wb profile-names

CLUSTER_PROFILE = default-alzo
CLUSTER_PROFILE ?= default-alzo
CLUSTER_ARGS_EXTRA ?=

cluster-shell:
Expand Down
42 changes: 42 additions & 0 deletions bench/script/probe.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env bash
## Probe pairwise network latency across a deployed benchmark cluster.
##
## Reads last-meta.json (assumed to contain a .public_ip table whose
## entries carry .hostname and .public_ip -- TODO confirm against the
## deployment metadata schema), builds a hostname->IP map, then runs a
## ping probe from every node towards every node via
## 'nixops ssh-for-each', emitting one compact JSON object per probe.

hostmap=$(jq '
.public_ip
| values
| map({ "\(.hostname)": .public_ip})
| add' last-meta.json)

echo 'obtaining latency matrix for the hostmap:'
jq . -C <<<$hostmap

## Everything below runs on each remote node.  Note the escaping scheme:
## \$ defers expansion to the remote shell, while ${hostmap@Q} expands
## locally into a safely shell-quoted literal for the remote assignment.
nixops ssh-for-each --parallel -- "
self=\$(hostname)
# probe HOST IP: ping IP 21 times, strip the ping summary down to bare
# numbers, and emit them as one JSON record tagged with the source and
# target hostnames.  Field indices follow the iputils summary order:
# sent received loss time rtt-min avg max mdev ipg ewma -- TODO confirm
# against the deployed iputils version.
function probe() {
        local host=\$1 ip=\$2
        ping -qAc21 \$ip |
        grep 'rtt\|transmitted' |
        sed 's_, \|/_\n_g' |
        sed 's_ packets transmitted\| received\| packet loss\|time \|rtt min\|avg\|max\|mdev = \|ipg\|ewma \| ms\|ms\|%__g' |
        grep -v '^$' |
        jq '{ source: \"'\$self'\"
            , target: \"'\$host'\"
            , sent:          .[0]
            , received:      .[1]
            , percents_lost: .[2]
            , duration_ms:   .[3]
            , ipg:           .[8]
            , ewma:          .[9]
            , rtt: { min: .[4], avg: .[5], max: .[6], mdev: .[7] }
            }' --slurp --compact-output
}
hostmap=${hostmap@Q}
for host in \$(jq 'keys | .[]' <<<\$hostmap --raw-output)
do ip=\$(jq '.[\$host]' --arg host \$host <<<\$hostmap --raw-output)
   probe \$host \$ip &
done
wait"
2 changes: 1 addition & 1 deletion nix/supervisord-cluster/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ let
{ port, ... }: cfg: recursiveUpdate cfg
({
AlonzoGenesisFile = "../genesis/alonzo-genesis.json";
ShelleyGenesisFile = "../genesis/genesis.json";
ShelleyGenesisFile = "../genesis.json";
ByronGenesisFile = "../genesis/byron/genesis.json";
} // optionalAttrs enableEKG {
hasEKG = port + supervisord.portShiftEkg;
Expand Down
178 changes: 125 additions & 53 deletions nix/workbench/analyse.sh
Original file line number Diff line number Diff line change
@@ -1,105 +1,177 @@
usage_analyse() {
usage "analyse" "Analyse cluster runs" <<EOF
block-propagation RUN-NAME
Block propagation analysis for the entire cluster
Block propagation analysis for the entire cluster.
machine-timeline RUN-NAME MACH-NAME
Produce a general performance timeline for MACH-NAME
Options of 'analyse' command:
--reanalyse Skip the preparatory steps and launch 'locli' directly
--time Time the 'locli' executable runs
EOF
}

analyse() {
local skip_preparation= time= dump_logobjects=
while test $# -gt 0
do case "$1" in
--reanalyse | --re ) skip_preparation='true';;
--dump-logobjects ) dump_logobjects='true';;
--time ) time='eval time';;
* ) break;; esac; shift; done

local op=${1:-$(usage_analyse)}; shift

case "$op" in
block-propagation | bp )
local usage="USAGE: wb analyse $op [RUN-NAME=current]"

local name=${1:-current}
local dir=$(run get "$name")
local adir=$dir/analysis
if test -z "$dir"
then fail "malformed run: $name"; fi

msg "analysing run $(jq .meta.tag "$dir"/meta.json --raw-output)"
mkdir -p "$adir"

## 0. subset what we care about into the keyfile
local keyfile=$adir/substring-keys
locli analyse substring-keys | grep -v 'Temporary modify' > "$keyfile"
cat >>"$keyfile" <<EOF
TraceForgedBlock
AddedToCurrentChain
TraceChainSyncServerReadBlocked.AddBlock
TraceChainSyncServerRead.AddBlock
TraceBlockFetchServerSendBlock
TraceDownloadedHeader
CompletedBlockFetch
EOF
## 1. enumerate logs, filter by keyfile & consolidate
local logdirs=("$dir"/node-*/)

msg "filtering logs in: $dir/node-* "
local jq_args=(
--sort-keys
--compact-output
$(wb backend lostream-fixup-jqargs "$dir")
' delpaths([["app"],["env"],["loc"],["msg"],["ns"],["sev"]])
'"$(wb backend lostream-fixup-jqexpr)"
)
for d in "${logdirs[@]}"
do ## TODO: supervisor-specific logfile layout
grep -hFf "$keyfile" $(ls "$d"/stdout* | tac) | jq "${jq_args[@]}" > \
"$adir"/logs-$(basename "$d").flt.json &
done
wait

msg "log sizes: (files: $(ls "$adir"/*.flt.json | wc -l), lines: $(cat "$adir"/*.flt.json | wc -l))"
## 1. enumerate logs, filter by keyfile & consolidate
local logdirs=("$dir"/node-*/ "$dir"/analysis/node-*/)

if test -z "$skip_preparation" -o -z "$(ls "$adir"/logs-node-*.flt.json 2>/dev/null)"
then
msg "filtering logs in: $dir/node-* "
local jq_args=(
--sort-keys
--compact-output
$(wb backend lostream-fixup-jqargs "$dir")
' delpaths([["app"],["env"],["loc"],["msg"],["ns"],["sev"]])
'"$(wb backend lostream-fixup-jqexpr)"
)
for d in "${logdirs[@]}"
do local logfiles="$(ls "$d"/stdout* 2>/dev/null | tac) $(ls "$d"/node-*.json 2>/dev/null)"
if test -z "$logfiles"
then msg "no logs in $d, skipping.."; fi
grep -hFf "$keyfile" $logfiles |
jq "${jq_args[@]}" --arg dirHostname "$(basename "$d")" \
> "$adir"/logs-$(basename "$d").flt.json &
done
wait
fi

msg "log sizes: (files: $(ls "$adir"/*.flt.json 2>/dev/null | wc -l), lines: $(cat "$adir"/*.flt.json | wc -l))"

msg "analysing.."
local locli_args=(
--genesis "$dir"/genesis/genesis.json
--genesis "$dir"/genesis.json
--run-metafile "$dir"/meta.json
## ->
# --logobjects-json "$adir"/logs-cluster.logobjects.json
--analysis-json "$adir"/block-event-stream.json
--timeline-pretty "$adir"/block-propagation.txt
--analysis-json "$adir"/block-propagation.json
)
if test -n "$dump_logobjects"; then
locli_args+=(--logobjects-json "$adir"/logs-cluster.logobjects.json); fi

locli 'analyse' 'block-propagation' \
${time} locli 'analyse' 'block-propagation' \
"${locli_args[@]}" "$adir"/*.flt.json;;

grep-filtered-logs | grep | g )
local usage="USAGE: wb analyse $op BLOCK [MACHSPEC=*] [RUN-NAME=current]"
local expr=$1
local mach=${2:-*}
local name=${3:-current}
local dir=$(run get "$name")
local adir=$dir/analysis

grep -h "$expr" "$adir"/logs-$mach.flt.json;;

list-blocks | blocks | bs )
local usage="USAGE: wb analyse $op [RUN-NAME=current]"
local name=${1:-current}
local dir=$(run get "$name")
local adir=$dir/analysis

fgrep -h "TraceForgedBlock" "$adir"/*.flt.json |
jq '{ at: .at, host: .host } * .data | del(.peer) | del(.slot)' -c |
sort | uniq;;

block-propagation-block | bpb )
local usage="USAGE: wb analyse $op BLOCK [RUN-NAME=current]"
local block=$1
local name=${2:-current}
local dir=$(run get "$name")
local adir=$dir/analysis

grep -h "$block" "$adir"/*.flt.json |
grep 'AddBlock\|TraceForgedBlock\|AddedToCurrentChain' |
jq '{ at: .at, host: .host } * .data | del(.peer) | del(.slot)' -c |
sort --stable | uniq;;

machine-timeline | machine | mt )
local usage="USAGE: wb analyse $op [RUN-NAME=current] [MACH-NAME=node-1]"
local name=${1:-current}
local mach=${2:-node-1}
local dir=$(run get "$name")
local adir=$dir/analysis

msg "analysing run $(jq .meta.tag "$dir"/meta.json --raw-output)"
mkdir -p "$adir"

## 0. subset what we care about into the keyfile
local keyfile=$adir/substring-keys
locli analyse substring-keys | grep -v 'Temporary modify' > "$keyfile"

## 1. enumerate logs, filter by keyfile & consolidate
local logs=("$dir"/$mach/stdout) consolidated="$adir"/logs-$mach.json
grep -hFf "$keyfile" "${logs[@]}" > "$consolidated"
msg "analysing logs of: $mach (lines: $(wc -l "$consolidated"))"

local locli_args=(
--genesis "$dir"/genesis/genesis.json
--run-metafile "$dir"/meta.json
## ->
--logobjects-json "$adir"/logs-$mach.logobjects.json
--slotstats-json "$adir"/logs-$mach.slotstats.json
--timeline-pretty "$adir"/logs-$mach.timeline.txt
--stats-csv "$adir"/logs-$mach.stats.csv
--analysis-json "$adir"/logs-$mach.analysis.json
# --timeline-csv "$adir"/logs-$mach.timeline.csv
# --cpu-spans-histogram-png "$adir"/logs-"$mach".cpu85-span-lens.png
# --derived-vectors-0-csv "$adir"/logs-$mach".derived.1.csv
# --derived-vectors-1-csv "$adir"/logs-$mach.derived.1.csv
)
if test "$mach" = 'all'
then local machs=($(wb run list-hosts $name))
else local machs=($mach); fi

local num_jobs="\j"
local num_threads=$(grep processor /proc/cpuinfo | wc -l)
for mach in ${machs[*]}
do ## Limit parallelism:
sleep 0.5s
while ((${num_jobs@P} >= num_threads - 4))
do wait -n; sleep 0.$(((RANDOM % 5) + 1))s; done
(
## 1. enumerate logs, filter by keyfile & consolidate
local logs=($(ls "$dir"/$mach/stdout* 2>/dev/null | tac) $(ls "$dir"/$mach/node-*.json 2>/dev/null) $(ls "$dir"/analysis/$mach/node-*.json 2>/dev/null)) consolidated="$adir"/logs-$mach.json

if test -z "${logs[*]}"
then msg "no logs for $mach in run $name"; continue; fi

if test -z "$skip_preparation" -o -z "$(ls "$adir"/logs-$mach.json 2>/dev/null)"
then grep -hFf "$keyfile" "${logs[@]}" > "$consolidated"; fi

msg "analysing logs of: $mach (lines: $(wc -l "$consolidated"))"
local locli_args=(
--genesis "$dir"/genesis.json
--run-metafile "$dir"/meta.json
## ->
--timeline-pretty "$adir"/logs-$mach.timeline.txt
--stats-csv "$adir"/logs-$mach.stats.csv
--analysis-json "$adir"/logs-$mach.analysis.json
# --slotstats-json "$adir"/logs-$mach.slotstats.json
# --timeline-csv "$adir"/logs-$mach.timeline.csv
# --cpu-spans-histogram-png "$adir"/logs-"$mach".cpu85-span-lens.png
# --derived-vectors-0-csv "$adir"/logs-$mach".derived.1.csv
# --derived-vectors-1-csv "$adir"/logs-$mach.derived.1.csv
)
if test -n "$dump_logobjects"; then
locli_args+=(--logobjects-json "$adir"/logs-$mach.logobjects.json); fi

${time} locli 'analyse' 'machine-timeline' \
"${locli_args[@]}" "$consolidated"
) &
done

locli 'analyse' 'machine-timeline' \
"${locli_args[@]}" "$consolidated";;
wait
msg "analysis machine-timeline: All done.";;

* ) usage_analyse;; esac
}
3 changes: 2 additions & 1 deletion nix/workbench/backend.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ case "${op}" in
record-extended-env-config ) $WORKBENCH_BACKEND "$@";;
describe-run ) $WORKBENCH_BACKEND "$@";;
pre-run-hook ) $WORKBENCH_BACKEND "$@";;
start-run ) $WORKBENCH_BACKEND "$@";;
start-run ) cp "$2"/genesis/genesis.json "$2"/genesis.json
$WORKBENCH_BACKEND "$@";;
lostream-fixup-jqargs ) $WORKBENCH_BACKEND "$@";;
lostream-fixup-jqexpr ) $WORKBENCH_BACKEND "$@";;

Expand Down
9 changes: 5 additions & 4 deletions nix/workbench/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ let
runWorkbenchJqOnly =
name: command:
runCommand name {} ''
${workbench' [jq]}/bin/wb ${command} > $out
${workbench' [jq moreutils]}/bin/wb ${command} > $out
'';

runJq =
Expand Down Expand Up @@ -133,7 +133,8 @@ let
function workbench-prebuild-executables() {
${optionalString useCabalRun
''
git log -n1 --alternate-refs --pretty=format:"%Cblue%h %Cred%cr %Cgreen%D %Cblue%s%Creset"
git log -n1 --alternate-refs --pretty=format:"%Cred%cr %Cblue%h %Cgreen%D %Cblue%s%Creset" --color | sed "s/^/$(git diff --exit-code --quiet && echo ' ' || echo 'local changes + ')/"
echo
echo -n "workbench: prebuilding executables (because of useCabalRun):"
for exe in cardano-cli cardano-node cardano-topology locli
do echo -n " $exe"
Expand Down Expand Up @@ -171,7 +172,7 @@ let
with envArgs; rec {
inherit cardanoLib stateDir;

JSON = runWorkbench "environment.json"
JSON = runWorkbenchJqOnly "environment.json"
''env compute-config \
--cache-dir "${cacheDir}" \
--base-port ${toString basePort} \
Expand All @@ -194,7 +195,7 @@ let
profiles = genAttrs profile-names mkProfile;

profilesJSON =
runWorkbench "all-profiles.json" "profiles generate-all";
runWorkbenchJqOnly "all-profiles.json" "profiles generate-all";
};

initialiseProfileRunDirShellScript =
Expand Down
9 changes: 6 additions & 3 deletions nix/workbench/locli/locli.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,17 @@ library
Cardano.Analysis.Profile
Cardano.Analysis.TopHandler

Cardano.Unlog.BlockProp
Cardano.Analysis.BlockProp
Cardano.Analysis.Driver
Cardano.Analysis.MachTimeline

Cardano.Unlog.Commands
Cardano.Unlog.LogObject
Cardano.Unlog.Parsers
Cardano.Unlog.Render
Cardano.Unlog.Resources
Cardano.Unlog.Run
Cardano.Unlog.SlotStats
Cardano.Unlog.Summary
Cardano.Unlog.Timeline

other-modules: Paths_locli

Expand All @@ -54,6 +56,7 @@ library
, process
, scientific
, split
, statistics
, template-haskell
, text
, text-short
Expand Down
Loading

0 comments on commit dfea540

Please sign in to comment.