roachtest: use structured errors

`roachtest` predates `cockroachdb/errors` and as a result so far hasn't capitalized many improvements. Today, roachtest errors are often noisy. This commit adopts `cockroachdb/errors` and reaps some of the immediate rewards, while making related improvements that would also have been possible without the adoption. The main user-facing change is that error messages in the output are now a lot more concise; these would previously - sometimes - include the entire stack trace. Now they contain only the topmost stack record of the innermost error: ``` (test_test.go:129).func3: first error (test_test.go:130).func3: second error ``` The full error continues to be logged, but we write it to files in the artifacts, where they can be inspected just in case. This now happens unconditionally for all errors, whereas the old code only logged the stacks if the error was reported in a certain way. Internally, the reorganization has also considerably simplified roachtest. Stack frame offset tracking has been removed, since `cockroachdb/errors` already handles it. Custom rendering code was similarly significantly trimmed down. In making this change, I opted to always have `roachtest` create loggers backed by real files. This was previously elided in tests, but this would have caused extra conditionals. It's better to standardize on the way in which `roachtest` is always used in actual invocations. Looking ahead, structured errors open a few avenues: - can assign the owner based on the type of failure. For example, cockroachdb#87106 wants consistency check failures to always go to the KV team, regardless of which test's cluster was being checked. Failures during an IMPORT/RESTORE (common across all tests) could be routed to the Disaster Recovery team by default (assuming we provide a wrapper around these operations that all tests use and which does this wrapping). - Similarly, SSH failures can be special cased via a marking error and can be directed away from the owning team, which can't do anything about them anyway (cockroachdb#82398). - We can conceivably start grouping failure modes by "error signature". That is, errors which have a "comparable" chain of errors (e.g. same types, and within formatted errors the same format string). Issues could then be reused only for compatible error signatures. Release note: None
tbg · Oct 4, 2022 · c9a4c73 · c9a4c73
1 parent 22b953e
commit c9a4c73
Show file tree

Hide file tree

Showing 6 changed files with 141 additions and 153 deletions.
diff --git a/pkg/cmd/roachtest/test/test_interface.go b/pkg/cmd/roachtest/test/test_interface.go
@@ -37,6 +37,7 @@ type Test interface {
 	VersionsBinaryOverride() map[string]string
 	Skip(args ...interface{})
 	Skipf(format string, args ...interface{})
+	Error(args ...interface{})
 	Errorf(string, ...interface{})
 	FailNow()
 	Fatal(args ...interface{})

diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go
@@ -11,13 +11,10 @@
 package main
 
 import (
-	"bytes"
 	"context"
 	"fmt"
 	"io"
-	// For the debug http handlers.
-	_ "net/http/pprof"
-	"runtime"
+	"os"
 	"strings"
 	"time"
 
@@ -27,6 +24,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/cockroachdb/cockroach/pkg/util/version"
+	"github.com/cockroachdb/errors"
 	"github.com/petermattis/goid"
 )
 
@@ -71,23 +69,33 @@ type testImpl struct {
 
 	mu struct {
 		syncutil.RWMutex
-		done    bool
-		failed  bool
-		timeout bool // if failed == true, this indicates whether the test timed out
-		// cancel, if set, is called from the t.Fatal() family of functions when the
-		// test is being marked as failed (i.e. when the failed field above is also
-		// set). This is used to cancel the context passed to t.spec.Run(), so async
-		// test goroutines can be notified.
+		done bool
+
 		cancel  func()
 		failLoc struct {
 			file string
 			line int
 		}
-		failureMsg string
+
+		// Errors are all the errors passed to `addFailure`, in order of
+		// these calls.
+		//
+		// NB: the first failure is not always the relevant one due to:
+		// https://github.com/cockroachdb/cockroach/issues/44436
+		errors []error
+		// If len(errors)>0, this indicates whether the test timed out
+		// cancel, if set, is called from the t.Fatal() family of functions when the
+		// test is being marked as failed (i.e. when the failed field above is also
+		// set). This is used to cancel the context passed to t.spec.Run(), so async
+		// test goroutines can be notified.
+
 		// status is a map from goroutine id to status set by that goroutine. A
 		// special goroutine is indicated by runnerID; that one provides the test's
 		// "main status".
 		status map[int64]testStatus
+
+		// TODO(test-eng): this should just be an in-mem (ring) buffer attached to
+		// `t.L()`.
 		output []byte
 	}
 	// Map from version to path to the cockroach binary to be used when
@@ -246,6 +254,23 @@ func (t *testImpl) Skipf(format string, args ...interface{}) {
 	panic(errTestFatal)
 }
 
+// This creates an error from the first arg, and adds each subsequent arg
+// as error detail
+func argsToErr(depth int, args ...interface{}) error {
+	// NB: we'd probably not allow multiple arguments here and we'd want
+	// the one remaining arg to be an `error`, but we are trying to be
+	// compatible with `(*testing.T).Fatal`.
+	var err error
+	for _, arg := range args {
+		if err == nil {
+			err = errors.NewWithDepthf(depth+1, "%v", arg)
+			continue
+		}
+		err = errors.WithDetailf(err, "%v", arg)
+	}
+	return err
+}
+
 // Fatal marks the test as failed, prints the args to t.L(), and calls
 // panic(errTestFatal). It can be called multiple times.
 //
@@ -255,162 +280,100 @@ func (t *testImpl) Skipf(format string, args ...interface{}) {
 // ATTENTION: Since this calls panic(errTestFatal), it should only be called
 // from a test's closure. The test runner itself should never call this.
 func (t *testImpl) Fatal(args ...interface{}) {
-	t.markFailedInner("" /* format */, args...)
+	t.addFailure(argsToErr(1, args...))
 	panic(errTestFatal)
 }
 
 // Fatalf is like Fatal, but takes a format string.
 func (t *testImpl) Fatalf(format string, args ...interface{}) {
-	t.markFailedInner(format, args...)
+	t.addFailure(errors.NewWithDepthf(1, format, args...))
 	panic(errTestFatal)
 }
 
 // FailNow implements the TestingT interface.
 func (t *testImpl) FailNow() {
-	t.Fatal()
+	t.addFailure(errors.NewWithDepthf(1, "FailNow called"))
+	panic(errTestFatal)
 }
 
-// Errorf implements the TestingT interface.
-func (t *testImpl) Errorf(format string, args ...interface{}) {
-	t.markFailedInner(format, args...)
+func (t *testImpl) Error(args ...interface{}) {
+	t.addFailure(argsToErr(1, args...))
 }
 
-func (t *testImpl) markFailedInner(format string, args ...interface{}) {
-	// Skip two frames: our own and the caller.
-	if format != "" {
-		t.printfAndFail(2 /* skip */, format, args...)
-	} else {
-		t.printAndFail(2 /* skip */, args...)
-	}
+// Errorf implements the TestingT interface.
+func (t *testImpl) Errorf(format string, args ...interface{}) {
+	t.addFailure(errors.NewWithDepthf(1, format, args...))
 }
 
-func (t *testImpl) printAndFail(skip int, args ...interface{}) {
-	var msg string
-	if len(args) == 1 {
-		// If we were passed only an error, then format it with "%+v" in order to
-		// get any stack traces.
-		if err, ok := args[0].(error); ok {
-			msg = fmt.Sprintf("%+v", err)
+func formatFailure(b *strings.Builder, errs ...error) {
+	for i, err := range errs {
+		if i > 0 {
+			fmt.Fprintln(b)
 		}
+		file, line, fn, ok := errors.GetOneLineSource(err)
+		if !ok {
+			file, line, fn = "<unknown>", 0, "unknown"
+		}
+		fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, err)
 	}
-	if msg == "" {
-		msg = fmt.Sprint(args...)
-	}
-	t.failWithMsg(t.decorate(skip+1, msg))
-}
-
-func (t *testImpl) printfAndFail(skip int, format string, args ...interface{}) {
-	if format == "" {
-		panic(fmt.Sprintf("invalid empty format. args: %s", args))
-	}
-	t.failWithMsg(t.decorate(skip+1, fmt.Sprintf(format, args...)))
 }
 
-func (t *testImpl) failWithMsg(msg string) {
+func (t *testImpl) addFailure(reportErr error) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 
-	prefix := ""
-	if t.mu.failed {
-		prefix = "[not the first failure] "
-		// NB: the first failure is not always the relevant one due to:
-		// https://github.com/cockroachdb/cockroach/issues/44436
-		//
-		// So we chain all failures together in the order in which we see
-		// them.
-		msg = "\n" + msg
+	t.mu.errors = append(t.mu.errors, reportErr)
+
+	var b strings.Builder
+	formatFailure(&b, reportErr)
+	msg := b.String()
+
+	t.L().Printf("test failure #%d: %s", len(t.mu.errors), msg)
+	// Also dump the verbose error (incl. all stack traces) to a log file, in case
+	// we need it. The stacks are sometimes helpful, but we don't want them in the
+	// main log as they are highly verbose.
+	{
+		cl, err := t.L().ChildLogger(
+			fmt.Sprintf("failure_%d", len(t.mu.errors)),
+			logger.QuietStderr, logger.QuietStdout,
+		)
+		if err == nil {
+			// We don't actually log through this logger since it adds an unrelated
+			// file:line caller (namely ours). The error already has stack traces
+			// so it's better to write only it to the file to avoid confusion.
+			path := cl.File.Name()
+			cl.Close() // we just wanted the filename
+			_ = os.WriteFile(path, []byte(fmt.Sprintf("%+v", reportErr)), 0644)
+		}
 	}
-	t.L().Printf("%stest failure: %s", prefix, msg)
 
-	t.mu.failed = true
-	t.mu.failureMsg += msg
 	t.mu.output = append(t.mu.output, msg...)
+	t.mu.output = append(t.mu.output, '\n')
 	if t.mu.cancel != nil {
 		t.mu.cancel()
 	}
 }
 
-// Args:
-// skip: The number of stack frames to exclude from the result. 0 means that
-//
-//	the caller will be the first frame identified. 1 means the caller's caller
-//	will be the first, etc.
-func (t *testImpl) decorate(skip int, s string) string {
-	// Skip two extra frames to account for this function and runtime.Callers
-	// itself.
-	var pc [50]uintptr
-	n := runtime.Callers(2+skip, pc[:])
-	if n == 0 {
-		panic("zero callers found")
-	}
-
-	buf := new(bytes.Buffer)
-	frames := runtime.CallersFrames(pc[:n])
-	sep := "\t"
-	runnerFound := false
-	for {
-		if runnerFound {
-			break
-		}
-
-		frame, more := frames.Next()
-		if !more {
-			break
-		}
-		if frame.Function == t.runner {
-			runnerFound = true
-
-			// Handle the special case of the runner function being the caller of
-			// t.Fatal(). In that case, that's the line to be used for issue creation.
-			if t.mu.failLoc.file == "" {
-				t.mu.failLoc.file = frame.File
-				t.mu.failLoc.line = frame.Line
-			}
-		}
-		if !t.mu.failed && !runnerFound {
-			// Keep track of the highest stack frame that is lower than the t.runner
-			// stack frame. This is used to determine the author of that line of code
-			// and issue assignment.
-			t.mu.failLoc.file = frame.File
-			t.mu.failLoc.line = frame.Line
-		}
-		file := frame.File
-		if index := strings.LastIndexByte(file, '/'); index >= 0 {
-			file = file[index+1:]
-		}
-		fmt.Fprintf(buf, "%s%s:%d", sep, file, frame.Line)
-		sep = ","
-	}
-	buf.WriteString(": ")
-
-	lines := strings.Split(s, "\n")
-	if l := len(lines); l > 1 && lines[l-1] == "" {
-		lines = lines[:l-1]
-	}
-	for i, line := range lines {
-		if i > 0 {
-			buf.WriteString("\n\t\t")
-		}
-		buf.WriteString(line)
-	}
-	buf.WriteByte('\n')
-	return buf.String()
-}
-
 func (t *testImpl) duration() time.Duration {
 	return t.end.Sub(t.start)
 }
 
 func (t *testImpl) Failed() bool {
 	t.mu.RLock()
 	defer t.mu.RUnlock()
-	return t.mu.failed
+	return t.failedRLocked()
+}
+
+func (t *testImpl) failedRLocked() bool {
+	return len(t.mu.errors) > 0
 }
 
 func (t *testImpl) FailureMsg() string {
 	t.mu.RLock()
 	defer t.mu.RUnlock()
-	return t.mu.failureMsg
+	var b strings.Builder
+	formatFailure(&b, t.mu.errors...)
+	return b.String()
 }
 
 func (t *testImpl) ArtifactsDir() string {