Skip to content

Commit

Permalink
roachtest: use structured errors
Browse files Browse the repository at this point in the history
`roachtest` predates `cockroachdb/errors` and as a result so far hasn't
capitalized many improvements. Today, roachtest errors are often noisy.

This commit adopts `cockroachdb/errors` and reaps some of the immediate
rewards, while making related improvements that would also have been
possible without the adoption.

The main user-facing change is that error messages in the output are now
a lot more concise; these would previously - sometimes - include the
entire stack trace. Now they contain only the topmost stack record of
the innermost error:

```
(test_test.go:129).func3: first error
(test_test.go:130).func3: second error
```

The full error continues to be logged, but we write it to files in
the artifacts, where they can be inspected just in case. This now
happens unconditionally for all errors, whereas the old code only
logged the stacks if the error was reported in a certain way.

Internally, the reorganization has also considerably simplified
roachtest. Stack frame offset tracking has been removed, since
`cockroachdb/errors` already handles it. Custom rendering code
was similarly significantly trimmed down.

In making this change, I opted to always have `roachtest` create
loggers backed by real files. This was previously elided in tests,
but this would have caused extra conditionals. It's better to
standardize on the way in which `roachtest` is always used in
actual invocations.

Looking ahead, structured errors open a few avenues:

- can assign the owner based on the type of failure. For example,
  cockroachdb#87106 wants consistency check failures to always go to the KV
  team, regardless of which test's cluster was being checked.

  Failures during an IMPORT/RESTORE (common across all tests) could be
  routed to the Disaster Recovery team by default (assuming we provide a
  wrapper around these operations that all tests use and which does this
  wrapping).
- Similarly, SSH failures can be special cased via a marking error
  and can be directed away from the owning team, which can't do
  anything about them anyway (cockroachdb#82398).
- We can conceivably start grouping failure modes by "error signature".
  That is, errors which have a "comparable" chain of errors (e.g. same
  types, and within formatted errors the same format string). Issues
  could then be reused only for compatible error signatures.

Release note: None
  • Loading branch information
tbg committed Oct 4, 2022
1 parent 22b953e commit c9a4c73
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 153 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/test/test_interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ type Test interface {
VersionsBinaryOverride() map[string]string
Skip(args ...interface{})
Skipf(format string, args ...interface{})
Error(args ...interface{})
Errorf(string, ...interface{})
FailNow()
Fatal(args ...interface{})
Expand Down
213 changes: 88 additions & 125 deletions pkg/cmd/roachtest/test_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,10 @@
package main

import (
"bytes"
"context"
"fmt"
"io"
// For the debug http handlers.
_ "net/http/pprof"
"runtime"
"os"
"strings"
"time"

Expand All @@ -27,6 +24,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/version"
"github.com/cockroachdb/errors"
"github.com/petermattis/goid"
)

Expand Down Expand Up @@ -71,23 +69,33 @@ type testImpl struct {

mu struct {
syncutil.RWMutex
done bool
failed bool
timeout bool // if failed == true, this indicates whether the test timed out
// cancel, if set, is called from the t.Fatal() family of functions when the
// test is being marked as failed (i.e. when the failed field above is also
// set). This is used to cancel the context passed to t.spec.Run(), so async
// test goroutines can be notified.
done bool

cancel func()
failLoc struct {
file string
line int
}
failureMsg string

// Errors are all the errors passed to `addFailure`, in order of
// these calls.
//
// NB: the first failure is not always the relevant one due to:
// https://github.com/cockroachdb/cockroach/issues/44436
errors []error
// If len(errors)>0, this indicates whether the test timed out
// cancel, if set, is called from the t.Fatal() family of functions when the
// test is being marked as failed (i.e. when the failed field above is also
// set). This is used to cancel the context passed to t.spec.Run(), so async
// test goroutines can be notified.

// status is a map from goroutine id to status set by that goroutine. A
// special goroutine is indicated by runnerID; that one provides the test's
// "main status".
status map[int64]testStatus

// TODO(test-eng): this should just be an in-mem (ring) buffer attached to
// `t.L()`.
output []byte
}
// Map from version to path to the cockroach binary to be used when
Expand Down Expand Up @@ -246,6 +254,23 @@ func (t *testImpl) Skipf(format string, args ...interface{}) {
panic(errTestFatal)
}

// This creates an error from the first arg, and adds each subsequent arg
// as error detail
func argsToErr(depth int, args ...interface{}) error {
// NB: we'd probably not allow multiple arguments here and we'd want
// the one remaining arg to be an `error`, but we are trying to be
// compatible with `(*testing.T).Fatal`.
var err error
for _, arg := range args {
if err == nil {
err = errors.NewWithDepthf(depth+1, "%v", arg)
continue
}
err = errors.WithDetailf(err, "%v", arg)
}
return err
}

// Fatal marks the test as failed, prints the args to t.L(), and calls
// panic(errTestFatal). It can be called multiple times.
//
Expand All @@ -255,162 +280,100 @@ func (t *testImpl) Skipf(format string, args ...interface{}) {
// ATTENTION: Since this calls panic(errTestFatal), it should only be called
// from a test's closure. The test runner itself should never call this.
func (t *testImpl) Fatal(args ...interface{}) {
t.markFailedInner("" /* format */, args...)
t.addFailure(argsToErr(1, args...))
panic(errTestFatal)
}

// Fatalf is like Fatal, but takes a format string.
func (t *testImpl) Fatalf(format string, args ...interface{}) {
t.markFailedInner(format, args...)
t.addFailure(errors.NewWithDepthf(1, format, args...))
panic(errTestFatal)
}

// FailNow implements the TestingT interface.
func (t *testImpl) FailNow() {
t.Fatal()
t.addFailure(errors.NewWithDepthf(1, "FailNow called"))
panic(errTestFatal)
}

// Errorf implements the TestingT interface.
func (t *testImpl) Errorf(format string, args ...interface{}) {
t.markFailedInner(format, args...)
func (t *testImpl) Error(args ...interface{}) {
t.addFailure(argsToErr(1, args...))
}

func (t *testImpl) markFailedInner(format string, args ...interface{}) {
// Skip two frames: our own and the caller.
if format != "" {
t.printfAndFail(2 /* skip */, format, args...)
} else {
t.printAndFail(2 /* skip */, args...)
}
// Errorf implements the TestingT interface.
func (t *testImpl) Errorf(format string, args ...interface{}) {
t.addFailure(errors.NewWithDepthf(1, format, args...))
}

func (t *testImpl) printAndFail(skip int, args ...interface{}) {
var msg string
if len(args) == 1 {
// If we were passed only an error, then format it with "%+v" in order to
// get any stack traces.
if err, ok := args[0].(error); ok {
msg = fmt.Sprintf("%+v", err)
func formatFailure(b *strings.Builder, errs ...error) {
for i, err := range errs {
if i > 0 {
fmt.Fprintln(b)
}
file, line, fn, ok := errors.GetOneLineSource(err)
if !ok {
file, line, fn = "<unknown>", 0, "unknown"
}
fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, err)
}
if msg == "" {
msg = fmt.Sprint(args...)
}
t.failWithMsg(t.decorate(skip+1, msg))
}

func (t *testImpl) printfAndFail(skip int, format string, args ...interface{}) {
if format == "" {
panic(fmt.Sprintf("invalid empty format. args: %s", args))
}
t.failWithMsg(t.decorate(skip+1, fmt.Sprintf(format, args...)))
}

func (t *testImpl) failWithMsg(msg string) {
func (t *testImpl) addFailure(reportErr error) {
t.mu.Lock()
defer t.mu.Unlock()

prefix := ""
if t.mu.failed {
prefix = "[not the first failure] "
// NB: the first failure is not always the relevant one due to:
// https://github.com/cockroachdb/cockroach/issues/44436
//
// So we chain all failures together in the order in which we see
// them.
msg = "\n" + msg
t.mu.errors = append(t.mu.errors, reportErr)

var b strings.Builder
formatFailure(&b, reportErr)
msg := b.String()

t.L().Printf("test failure #%d: %s", len(t.mu.errors), msg)
// Also dump the verbose error (incl. all stack traces) to a log file, in case
// we need it. The stacks are sometimes helpful, but we don't want them in the
// main log as they are highly verbose.
{
cl, err := t.L().ChildLogger(
fmt.Sprintf("failure_%d", len(t.mu.errors)),
logger.QuietStderr, logger.QuietStdout,
)
if err == nil {
// We don't actually log through this logger since it adds an unrelated
// file:line caller (namely ours). The error already has stack traces
// so it's better to write only it to the file to avoid confusion.
path := cl.File.Name()
cl.Close() // we just wanted the filename
_ = os.WriteFile(path, []byte(fmt.Sprintf("%+v", reportErr)), 0644)
}
}
t.L().Printf("%stest failure: %s", prefix, msg)

t.mu.failed = true
t.mu.failureMsg += msg
t.mu.output = append(t.mu.output, msg...)
t.mu.output = append(t.mu.output, '\n')
if t.mu.cancel != nil {
t.mu.cancel()
}
}

// Args:
// skip: The number of stack frames to exclude from the result. 0 means that
//
// the caller will be the first frame identified. 1 means the caller's caller
// will be the first, etc.
func (t *testImpl) decorate(skip int, s string) string {
// Skip two extra frames to account for this function and runtime.Callers
// itself.
var pc [50]uintptr
n := runtime.Callers(2+skip, pc[:])
if n == 0 {
panic("zero callers found")
}

buf := new(bytes.Buffer)
frames := runtime.CallersFrames(pc[:n])
sep := "\t"
runnerFound := false
for {
if runnerFound {
break
}

frame, more := frames.Next()
if !more {
break
}
if frame.Function == t.runner {
runnerFound = true

// Handle the special case of the runner function being the caller of
// t.Fatal(). In that case, that's the line to be used for issue creation.
if t.mu.failLoc.file == "" {
t.mu.failLoc.file = frame.File
t.mu.failLoc.line = frame.Line
}
}
if !t.mu.failed && !runnerFound {
// Keep track of the highest stack frame that is lower than the t.runner
// stack frame. This is used to determine the author of that line of code
// and issue assignment.
t.mu.failLoc.file = frame.File
t.mu.failLoc.line = frame.Line
}
file := frame.File
if index := strings.LastIndexByte(file, '/'); index >= 0 {
file = file[index+1:]
}
fmt.Fprintf(buf, "%s%s:%d", sep, file, frame.Line)
sep = ","
}
buf.WriteString(": ")

lines := strings.Split(s, "\n")
if l := len(lines); l > 1 && lines[l-1] == "" {
lines = lines[:l-1]
}
for i, line := range lines {
if i > 0 {
buf.WriteString("\n\t\t")
}
buf.WriteString(line)
}
buf.WriteByte('\n')
return buf.String()
}

func (t *testImpl) duration() time.Duration {
return t.end.Sub(t.start)
}

func (t *testImpl) Failed() bool {
t.mu.RLock()
defer t.mu.RUnlock()
return t.mu.failed
return t.failedRLocked()
}

func (t *testImpl) failedRLocked() bool {
return len(t.mu.errors) > 0
}

func (t *testImpl) FailureMsg() string {
t.mu.RLock()
defer t.mu.RUnlock()
return t.mu.failureMsg
var b strings.Builder
formatFailure(&b, t.mu.errors...)
return b.String()
}

func (t *testImpl) ArtifactsDir() string {
Expand Down
Loading

0 comments on commit c9a4c73

Please sign in to comment.