Skip to content

Commit

Permalink
roachtest: create "perf" artifacts, write histograms there, collect them
Browse files Browse the repository at this point in the history
Before the roachtest rewrite in #30977 we used to collect the logs directory
from remote hosts unconditionally. We also used to write stats.json histogram
files to the logs directory. This combination enabled the collection of
histogram files for roachperf. Since the new roachtest only copies logs
to the test runner in the case of failure, we have not gotten any perf
artifacts since the change.

This log policy is sane for logs and other debug artifacts like heap profiles
that exist there but has been a bit of a bummer as we've missed out on the
dopamine rush due to seeing charts go up and to the right in bespoke D3 glory.
The truth is that we weren't all that excited about the previous behavior with
regards to perf data either. Sometimes tests would fail and their lousy results
would mess with our pretty pictures
(https://github.com/cockroachdb/roachperf/issues/37).

This PR introduces a new concept, perf artifacts, which live in a different
directory, the perfArtifactsDir ("perf") and has the opposite fetching policy
from the logs; the "perf" directory is copied from each remove host into the
test's artifactsDir only upon successful completion of the test.

One downside of this as implemented is that it can spam the test's log (but
fortunately not roachtest's logs anymore, <3 @andreimatei). #38890 attempts to
address this somewhat. It shouldn't been too big of a problem because it only
happens on tests that already passed.

This change also necessitates a small change to roachperf to look in this new
"perf" directory (and respect the rewrite's test artifact directory structure).

Fixes #38871.

Release note: None
  • Loading branch information
ajwerner committed Jul 16, 2019
1 parent 02d52a6 commit 3cf2306
Show file tree
Hide file tree
Showing 11 changed files with 32 additions and 13 deletions.
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/indexes.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ func registerNIndexes(r *testRegistry, secondaryIndexes int) {
payload := " --payload=256"
concurrency := ifLocal("", " --concurrency="+strconv.Itoa(nodes*32))
duration := " --duration=" + ifLocal("10s", "30m")
runCmd := fmt.Sprintf("./workload run indexes --histograms=logs/stats.json"+
runCmd := fmt.Sprintf("./workload run indexes --histograms="+perfArtifactsDir+"/stats.json"+
payload+concurrency+duration+" {pgurl%s}", gatewayNodes)
c.Run(ctx, loadNode, runCmd)
return nil
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/interleavedpartitioned.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func registerInterleaved(r *testRegistry) {
c.Run(ctx, cockroachEast.randNode(), cmdInit)

duration := " --duration " + ifLocal("10s", "10m")
histograms := " --histograms logs/stats.json"
histograms := " --histograms=" + perfArtifactsDir + "/stats.json"

createCmd := func(locality string, cockroachNodes nodeListOption) string {
return fmt.Sprintf(
Expand Down
7 changes: 3 additions & 4 deletions pkg/cmd/roachtest/kv.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ func registerKV(r *testRegistry) {
splits := " --splits=1000"
duration := " --duration=" + ifLocal("10s", "10m")
readPercent := fmt.Sprintf(" --read-percent=%d", opts.readPercent)

histograms := "--histograms=" + perfArtifactsDir + "/stats.json"
var batchSize string
if opts.batchSize > 0 {
batchSize = fmt.Sprintf(" --batch=%d", opts.batchSize)
Expand All @@ -64,9 +64,8 @@ func registerKV(r *testRegistry) {
splits = "" // no splits
sequential = " --sequential"
}

cmd := fmt.Sprintf("./workload run kv --init --histograms=logs/stats.json"+
concurrency+splits+duration+readPercent+batchSize+blockSize+sequential+
cmd := fmt.Sprintf("./workload run kv --init"+
histograms+concurrency+splits+duration+readPercent+batchSize+blockSize+sequential+
" {pgurl:1-%d}", nodes)
c.Run(ctx, c.Node(nodes+1), cmd)
return nil
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/ledger.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ func registerLedger(r *testRegistry) {
concurrency := ifLocal("", " --concurrency="+fmt.Sprint(nodes*32))
duration := " --duration=" + ifLocal("10s", "30m")

cmd := fmt.Sprintf("./workload run ledger --init --histograms=logs/stats.json"+
cmd := fmt.Sprintf("./workload run ledger --init --histograms="+perfArtifactsDir+"/stats.json"+
concurrency+duration+" {pgurl%s}", gatewayNodes)
c.Run(ctx, loadNode, cmd)
return nil
Expand Down
4 changes: 3 additions & 1 deletion pkg/cmd/roachtest/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,9 @@ func runNetworkTPCC(ctx context.Context, t *test, origC *cluster, nodes int) {
}

cmd := fmt.Sprintf(
"./workload run tpcc --warehouses=%d --wait=false --histograms=logs/stats.json --duration=%s {pgurl:2-%d}",
"./workload run tpcc --warehouses=%d --wait=false"+
" --histograms="+perfArtifactsDir+"/stats.json"+
" --duration=%s {pgurl:2-%d}",
warehouses, duration, c.spec.NodeCount-1)
return c.RunL(ctx, tpccL, workerNode, cmd)
})
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func runQueue(ctx context.Context, t *test, c *cluster) {
init = " --init"
}
cmd := fmt.Sprintf(
"./workload run queue --histograms=logs/stats.json"+
"./workload run queue --histograms="+perfArtifactsDir+"/stats.json"+
init+
concurrency+
duration+
Expand Down
5 changes: 5 additions & 0 deletions pkg/cmd/roachtest/test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ type testSpec struct {
Run func(ctx context.Context, t *test, c *cluster)
}

// perfArtifactsDir is the directory on cluster nodes in which perf artifacts
// reside. Upon success this directory is copied into test artifactsDir from
// each node in the cluster.
const perfArtifactsDir = "perf"

// matchOrSkip returns true if the filter matches the test. If the filter does
// not match the test because the tag filter does not match, the test is
// matched, but marked as skipped.
Expand Down
13 changes: 13 additions & 0 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,19 @@ func (r *testRunner) runWorker(
if err != nil {
return err
}
} else {
// Upon success fetch the perf artifacts from the remote hosts.
// If there's an error, oh well, don't do anything rash like fail the test
// which already passed.
//
// TODO(ajwerner): this Get on all nodes has the unfortunate side effect
// of logging an error in the test runner log for all of the nodes which
// do not have perf artifacts. Ideally we'd have the test tell us which
// nodes have the artifacts, or we'd go check explicitly, or we'd find a
// way for Get to not complain if a file does not exist.
if err := c.Get(ctx, l, perfArtifactsDir, t.artifactsDir+"/"+perfArtifactsDir); err != nil {
l.PrintfCtx(ctx, "failed to get perf artifacts: %v", err)
}
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/cmd/roachtest/tpcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ func runTPCC(ctx context.Context, t *test, c *cluster, opts tpccOptions) {
m.Go(func(ctx context.Context) error {
t.WorkerStatus("running tpcc")
cmd := fmt.Sprintf(
"./workload run tpcc --warehouses=%d --histograms=logs/stats.json "+
"./workload run tpcc --warehouses=%d --histograms="+perfArtifactsDir+"/stats.json "+
opts.Extra+" --ramp=%s --duration=%s {pgurl:1-%d}",
opts.Warehouses, rampDuration, opts.Duration, c.spec.NodeCount-1)
c.Run(ctx, workloadNode, cmd)
Expand Down Expand Up @@ -781,7 +781,7 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
}

t.Status(fmt.Sprintf("running benchmark, warehouses=%d", warehouses))
histogramsPath := fmt.Sprintf("logs/warehouses=%d/stats.json", activeWarehouses)
histogramsPath := fmt.Sprintf("%s/warehouses=%d/stats.json", perfArtifactsDir, activeWarehouses)
cmd := fmt.Sprintf("./workload run tpcc --warehouses=%d --active-warehouses=%d "+
"--tolerate-errors --ramp=%s --duration=%s%s {pgurl%s} "+
"--histograms=%s",
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/tpchbench.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ func runTPCHBench(ctx context.Context, t *test, c *cluster, b tpchBenchSpec) {
cmd := fmt.Sprintf(
"./workload run querybench --db=tpch --concurrency=1 --query-file=%s "+
"--num-runs=%d --max-ops=%d --vectorized=%t {pgurl%s} "+
"--histograms=logs/stats.json --histograms-max-latency=%s",
"--histograms="+perfArtifactsDir+"/stats.json --histograms-max-latency=%s",
filename,
b.numRunsPerQuery,
maxOps,
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/ycsb.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func registerYCSB(r *testRegistry) {
duration := " --duration=" + ifLocal("10s", "10m")
cmd := fmt.Sprintf(
"./workload run ycsb --init --record-count=1000000 --splits=100"+
" --workload=%s --concurrency=64 --histograms=logs/stats.json"+
" --workload=%s --concurrency=64 --histograms="+perfArtifactsDir+"/stats.json"+
ramp+duration+" {pgurl:1-%d}",
wl, nodes)
c.Run(ctx, c.Node(nodes+1), cmd)
Expand Down

0 comments on commit 3cf2306

Please sign in to comment.