From c9a4c73df14e54edd4421dd90030e2e2cb3a1e17 Mon Sep 17 00:00:00 2001
From: Tobias Grieger <tobias.b.grieger@gmail.com>
Date: Tue, 30 Aug 2022 14:15:11 +0200
Subject: [PATCH] roachtest: use structured errors

`roachtest` predates `cockroachdb/errors` and as a result so far hasn't
capitalized many improvements. Today, roachtest errors are often noisy.

This commit adopts `cockroachdb/errors` and reaps some of the immediate
rewards, while making related improvements that would also have been
possible without the adoption.

The main user-facing change is that error messages in the output are now
a lot more concise; these would previously - sometimes - include the
entire stack trace. Now they contain only the topmost stack record of
the innermost error:

```
(test_test.go:129).func3: first error
(test_test.go:130).func3: second error
```

The full error continues to be logged, but we write it to files in
the artifacts, where they can be inspected just in case. This now
happens unconditionally for all errors, whereas the old code only
logged the stacks if the error was reported in a certain way.

Internally, the reorganization has also considerably simplified
roachtest. Stack frame offset tracking has been removed, since
`cockroachdb/errors` already handles it. Custom rendering code
was similarly significantly trimmed down.

In making this change, I opted to always have `roachtest` create
loggers backed by real files. This was previously elided in tests,
but this would have caused extra conditionals. It's better to
standardize on the way in which `roachtest` is always used in
actual invocations.

Looking ahead, structured errors open a few avenues:

- can assign the owner based on the type of failure. For example,
  #87106 wants consistency check failures to always go to the KV
  team, regardless of which test's cluster was being checked.

  Failures during an IMPORT/RESTORE (common across all tests) could be
  routed to the Disaster Recovery team by default (assuming we provide a
  wrapper around these operations that all tests use and which does this
  wrapping).
- Similarly, SSH failures can be special cased via a marking error
  and can be directed away from the owning team, which can't do
  anything about them anyway (#82398).
- We can conceivably start grouping failure modes by "error signature".
  That is, errors which have a "comparable" chain of errors (e.g. same
  types, and within formatted errors the same format string). Issues
  could then be reused only for compatible error signatures.

Release note: None
---
 pkg/cmd/roachtest/test/test_interface.go      |   1 +
 pkg/cmd/roachtest/test_impl.go                | 213 ++++++++----------
 pkg/cmd/roachtest/test_runner.go              |  49 ++--
 pkg/cmd/roachtest/test_test.go                |  10 +-
 pkg/testutils/lint/passes/fmtsafe/fmtsafe.go  |   3 +-
 .../lint/passes/fmtsafe/functions.go          |  18 +-
 6 files changed, 141 insertions(+), 153 deletions(-)

diff --git a/pkg/cmd/roachtest/test/test_interface.go b/pkg/cmd/roachtest/test/test_interface.go
index 6ccbc94adbb9..7da9b1c030d5 100644
--- a/pkg/cmd/roachtest/test/test_interface.go
+++ b/pkg/cmd/roachtest/test/test_interface.go
@@ -37,6 +37,7 @@ type Test interface {
 	VersionsBinaryOverride() map[string]string
 	Skip(args ...interface{})
 	Skipf(format string, args ...interface{})
+	Error(args ...interface{})
 	Errorf(string, ...interface{})
 	FailNow()
 	Fatal(args ...interface{})
diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go
index d3725a639a54..49d18125c021 100644
--- a/pkg/cmd/roachtest/test_impl.go
+++ b/pkg/cmd/roachtest/test_impl.go
@@ -11,13 +11,10 @@
 package main
 
 import (
-	"bytes"
 	"context"
 	"fmt"
 	"io"
-	// For the debug http handlers.
-	_ "net/http/pprof"
-	"runtime"
+	"os"
 	"strings"
 	"time"
 
@@ -27,6 +24,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/cockroachdb/cockroach/pkg/util/version"
+	"github.com/cockroachdb/errors"
 	"github.com/petermattis/goid"
 )
 
@@ -71,23 +69,33 @@ type testImpl struct {
 
 	mu struct {
 		syncutil.RWMutex
-		done    bool
-		failed  bool
-		timeout bool // if failed == true, this indicates whether the test timed out
-		// cancel, if set, is called from the t.Fatal() family of functions when the
-		// test is being marked as failed (i.e. when the failed field above is also
-		// set). This is used to cancel the context passed to t.spec.Run(), so async
-		// test goroutines can be notified.
+		done bool
+
 		cancel  func()
 		failLoc struct {
 			file string
 			line int
 		}
-		failureMsg string
+
+		// Errors are all the errors passed to `addFailure`, in order of
+		// these calls.
+		//
+		// NB: the first failure is not always the relevant one due to:
+		// https://github.com/cockroachdb/cockroach/issues/44436
+		errors []error
+		// If len(errors)>0, this indicates whether the test timed out
+		// cancel, if set, is called from the t.Fatal() family of functions when the
+		// test is being marked as failed (i.e. when the failed field above is also
+		// set). This is used to cancel the context passed to t.spec.Run(), so async
+		// test goroutines can be notified.
+
 		// status is a map from goroutine id to status set by that goroutine. A
 		// special goroutine is indicated by runnerID; that one provides the test's
 		// "main status".
 		status map[int64]testStatus
+
+		// TODO(test-eng): this should just be an in-mem (ring) buffer attached to
+		// `t.L()`.
 		output []byte
 	}
 	// Map from version to path to the cockroach binary to be used when
@@ -246,6 +254,23 @@ func (t *testImpl) Skipf(format string, args ...interface{}) {
 	panic(errTestFatal)
 }
 
+// This creates an error from the first arg, and adds each subsequent arg
+// as error detail
+func argsToErr(depth int, args ...interface{}) error {
+	// NB: we'd probably not allow multiple arguments here and we'd want
+	// the one remaining arg to be an `error`, but we are trying to be
+	// compatible with `(*testing.T).Fatal`.
+	var err error
+	for _, arg := range args {
+		if err == nil {
+			err = errors.NewWithDepthf(depth+1, "%v", arg)
+			continue
+		}
+		err = errors.WithDetailf(err, "%v", arg)
+	}
+	return err
+}
+
 // Fatal marks the test as failed, prints the args to t.L(), and calls
 // panic(errTestFatal). It can be called multiple times.
 //
@@ -255,148 +280,80 @@ func (t *testImpl) Skipf(format string, args ...interface{}) {
 // ATTENTION: Since this calls panic(errTestFatal), it should only be called
 // from a test's closure. The test runner itself should never call this.
 func (t *testImpl) Fatal(args ...interface{}) {
-	t.markFailedInner("" /* format */, args...)
+	t.addFailure(argsToErr(1, args...))
 	panic(errTestFatal)
 }
 
 // Fatalf is like Fatal, but takes a format string.
 func (t *testImpl) Fatalf(format string, args ...interface{}) {
-	t.markFailedInner(format, args...)
+	t.addFailure(errors.NewWithDepthf(1, format, args...))
 	panic(errTestFatal)
 }
 
 // FailNow implements the TestingT interface.
 func (t *testImpl) FailNow() {
-	t.Fatal()
+	t.addFailure(errors.NewWithDepthf(1, "FailNow called"))
+	panic(errTestFatal)
 }
 
-// Errorf implements the TestingT interface.
-func (t *testImpl) Errorf(format string, args ...interface{}) {
-	t.markFailedInner(format, args...)
+func (t *testImpl) Error(args ...interface{}) {
+	t.addFailure(argsToErr(1, args...))
 }
 
-func (t *testImpl) markFailedInner(format string, args ...interface{}) {
-	// Skip two frames: our own and the caller.
-	if format != "" {
-		t.printfAndFail(2 /* skip */, format, args...)
-	} else {
-		t.printAndFail(2 /* skip */, args...)
-	}
+// Errorf implements the TestingT interface.
+func (t *testImpl) Errorf(format string, args ...interface{}) {
+	t.addFailure(errors.NewWithDepthf(1, format, args...))
 }
 
-func (t *testImpl) printAndFail(skip int, args ...interface{}) {
-	var msg string
-	if len(args) == 1 {
-		// If we were passed only an error, then format it with "%+v" in order to
-		// get any stack traces.
-		if err, ok := args[0].(error); ok {
-			msg = fmt.Sprintf("%+v", err)
+func formatFailure(b *strings.Builder, errs ...error) {
+	for i, err := range errs {
+		if i > 0 {
+			fmt.Fprintln(b)
 		}
+		file, line, fn, ok := errors.GetOneLineSource(err)
+		if !ok {
+			file, line, fn = "<unknown>", 0, "unknown"
+		}
+		fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, err)
 	}
-	if msg == "" {
-		msg = fmt.Sprint(args...)
-	}
-	t.failWithMsg(t.decorate(skip+1, msg))
-}
-
-func (t *testImpl) printfAndFail(skip int, format string, args ...interface{}) {
-	if format == "" {
-		panic(fmt.Sprintf("invalid empty format. args: %s", args))
-	}
-	t.failWithMsg(t.decorate(skip+1, fmt.Sprintf(format, args...)))
 }
 
-func (t *testImpl) failWithMsg(msg string) {
+func (t *testImpl) addFailure(reportErr error) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 
-	prefix := ""
-	if t.mu.failed {
-		prefix = "[not the first failure] "
-		// NB: the first failure is not always the relevant one due to:
-		// https://github.com/cockroachdb/cockroach/issues/44436
-		//
-		// So we chain all failures together in the order in which we see
-		// them.
-		msg = "\n" + msg
+	t.mu.errors = append(t.mu.errors, reportErr)
+
+	var b strings.Builder
+	formatFailure(&b, reportErr)
+	msg := b.String()
+
+	t.L().Printf("test failure #%d: %s", len(t.mu.errors), msg)
+	// Also dump the verbose error (incl. all stack traces) to a log file, in case
+	// we need it. The stacks are sometimes helpful, but we don't want them in the
+	// main log as they are highly verbose.
+	{
+		cl, err := t.L().ChildLogger(
+			fmt.Sprintf("failure_%d", len(t.mu.errors)),
+			logger.QuietStderr, logger.QuietStdout,
+		)
+		if err == nil {
+			// We don't actually log through this logger since it adds an unrelated
+			// file:line caller (namely ours). The error already has stack traces
+			// so it's better to write only it to the file to avoid confusion.
+			path := cl.File.Name()
+			cl.Close() // we just wanted the filename
+			_ = os.WriteFile(path, []byte(fmt.Sprintf("%+v", reportErr)), 0644)
+		}
 	}
-	t.L().Printf("%stest failure: %s", prefix, msg)
 
-	t.mu.failed = true
-	t.mu.failureMsg += msg
 	t.mu.output = append(t.mu.output, msg...)
+	t.mu.output = append(t.mu.output, '\n')
 	if t.mu.cancel != nil {
 		t.mu.cancel()
 	}
 }
 
-// Args:
-// skip: The number of stack frames to exclude from the result. 0 means that
-//
-//	the caller will be the first frame identified. 1 means the caller's caller
-//	will be the first, etc.
-func (t *testImpl) decorate(skip int, s string) string {
-	// Skip two extra frames to account for this function and runtime.Callers
-	// itself.
-	var pc [50]uintptr
-	n := runtime.Callers(2+skip, pc[:])
-	if n == 0 {
-		panic("zero callers found")
-	}
-
-	buf := new(bytes.Buffer)
-	frames := runtime.CallersFrames(pc[:n])
-	sep := "\t"
-	runnerFound := false
-	for {
-		if runnerFound {
-			break
-		}
-
-		frame, more := frames.Next()
-		if !more {
-			break
-		}
-		if frame.Function == t.runner {
-			runnerFound = true
-
-			// Handle the special case of the runner function being the caller of
-			// t.Fatal(). In that case, that's the line to be used for issue creation.
-			if t.mu.failLoc.file == "" {
-				t.mu.failLoc.file = frame.File
-				t.mu.failLoc.line = frame.Line
-			}
-		}
-		if !t.mu.failed && !runnerFound {
-			// Keep track of the highest stack frame that is lower than the t.runner
-			// stack frame. This is used to determine the author of that line of code
-			// and issue assignment.
-			t.mu.failLoc.file = frame.File
-			t.mu.failLoc.line = frame.Line
-		}
-		file := frame.File
-		if index := strings.LastIndexByte(file, '/'); index >= 0 {
-			file = file[index+1:]
-		}
-		fmt.Fprintf(buf, "%s%s:%d", sep, file, frame.Line)
-		sep = ","
-	}
-	buf.WriteString(": ")
-
-	lines := strings.Split(s, "\n")
-	if l := len(lines); l > 1 && lines[l-1] == "" {
-		lines = lines[:l-1]
-	}
-	for i, line := range lines {
-		if i > 0 {
-			buf.WriteString("\n\t\t")
-		}
-		buf.WriteString(line)
-	}
-	buf.WriteByte('\n')
-	return buf.String()
-}
-
 func (t *testImpl) duration() time.Duration {
 	return t.end.Sub(t.start)
 }
@@ -404,13 +361,19 @@ func (t *testImpl) duration() time.Duration {
 func (t *testImpl) Failed() bool {
 	t.mu.RLock()
 	defer t.mu.RUnlock()
-	return t.mu.failed
+	return t.failedRLocked()
+}
+
+func (t *testImpl) failedRLocked() bool {
+	return len(t.mu.errors) > 0
 }
 
 func (t *testImpl) FailureMsg() string {
 	t.mu.RLock()
 	defer t.mu.RUnlock()
-	return t.mu.failureMsg
+	var b strings.Builder
+	formatFailure(&b, t.mu.errors...)
+	return b.String()
 }
 
 func (t *testImpl) ArtifactsDir() string {
diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
index 6bb2b33f280c..2e325dd98924 100644
--- a/pkg/cmd/roachtest/test_runner.go
+++ b/pkg/cmd/roachtest/test_runner.go
@@ -576,22 +576,23 @@ func (r *testRunner) runWorker(
 					testToRun.spec.Name, clusterCreateErr)
 			}
 		}
-		// Prepare the test's logger.
-		logPath := ""
-		var artifactsDir string
-		var artifactsSpec string
-		if artifactsRootDir != "" {
-			escapedTestName := teamCityNameEscape(testToRun.spec.Name)
-			runSuffix := "run_" + strconv.Itoa(testToRun.runNum)
-
-			artifactsDir = filepath.Join(filepath.Join(artifactsRootDir, escapedTestName), runSuffix)
-			logPath = filepath.Join(artifactsDir, "test.log")
-
-			// Map artifacts/TestFoo/run_?/** => TestFoo/run_?/**, i.e. collect the artifacts
-			// for this test exactly as they are laid out on disk (when the time
-			// comes).
-			artifactsSpec = fmt.Sprintf("%s/%s/** => %s/%s", filepath.Join(literalArtifactsDir, escapedTestName), runSuffix, escapedTestName, runSuffix)
+		// Prepare the test's logger. Always set this up with real files, using a
+		// temp dir if necessary. This simplifies testing.
+		if artifactsRootDir == "" {
+			artifactsRootDir, _ = os.MkdirTemp("", "roachtest-logger")
 		}
+
+		escapedTestName := teamCityNameEscape(testToRun.spec.Name)
+		runSuffix := "run_" + strconv.Itoa(testToRun.runNum)
+
+		artifactsDir := filepath.Join(filepath.Join(artifactsRootDir, escapedTestName), runSuffix)
+		logPath := filepath.Join(artifactsDir, "test.log")
+
+		// Map artifacts/TestFoo/run_?/** => TestFoo/run_?/**, i.e. collect the artifacts
+		// for this test exactly as they are laid out on disk (when the time
+		// comes).
+		artifactsSpec := fmt.Sprintf("%s/%s/** => %s/%s", filepath.Join(literalArtifactsDir, escapedTestName), runSuffix, escapedTestName, runSuffix)
+
 		testL, err := logger.RootLogger(logPath, teeOpt)
 		if err != nil {
 			return err
@@ -621,7 +622,7 @@ func (r *testRunner) runWorker(
 			oldName := t.spec.Name
 			oldOwner := t.spec.Owner
 			// Generate failure reason and mark the test failed to preclude fetching (cluster) artifacts.
-			t.printAndFail(0, clusterCreateErr)
+			t.addFailure(clusterCreateErr)
 			issueOutput := "test %s was skipped due to %s"
 			issueOutput = fmt.Sprintf(issueOutput, oldName, t.FailureMsg())
 			// N.B. issue title is of the form "roachtest: ${t.spec.Name} failed" (see UnitTestFormatter).
@@ -668,7 +669,7 @@ func (r *testRunner) runWorker(
 			shout(ctx, l, stdout, "test returned error: %s: %s", t.Name(), err)
 			// Mark the test as failed if it isn't already.
 			if !t.Failed() {
-				t.printAndFail(0 /* skip */, err)
+				t.addFailure(err)
 			}
 		} else {
 			msg := "test passed: %s (run %d)"
@@ -805,10 +806,10 @@ func (r *testRunner) runTest(
 		// during the post-flight checks; the test itself runs on a different
 		// goroutine and has similar code to terminate errTestFatal.
 		if err := recover(); err != nil && err != errTestFatal {
-			t.mu.Lock()
-			t.mu.failed = true
-			t.mu.output = append(t.mu.output, t.decorate(0 /* skip */, fmt.Sprint(err))...)
-			t.mu.Unlock()
+			if _, ok := err.(error); !ok {
+				err = errors.Newf("%v", err)
+			}
+			t.addFailure(err.(error))
 		}
 
 		t.mu.Lock()
@@ -817,9 +818,7 @@ func (r *testRunner) runTest(
 
 		durationStr := fmt.Sprintf("%.2fs", t.duration().Seconds())
 		if t.Failed() {
-			t.mu.Lock()
-			output := fmt.Sprintf("test artifacts and logs in: %s\n", t.ArtifactsDir()) + string(t.mu.output)
-			t.mu.Unlock()
+			output := fmt.Sprintf("test artifacts and logs in: %s\n%s", t.ArtifactsDir(), t.FailureMsg())
 
 			if teamCity {
 				shout(ctx, l, stdout, "##teamcity[testFailed name='%s' details='%s' flowId='%s']",
@@ -920,6 +919,8 @@ func (r *testRunner) runTest(
 			// produced by t.Fatal*().
 			if r := recover(); r != nil && r != errTestFatal {
 				// NB: we're careful to avoid t.Fatalf here, which re-panics.
+				// Note that the error will be logged to a file, and the stack will
+				// contain the source of the panic.
 				t.Errorf("test panicked: %v", r)
 			}
 		}()
diff --git a/pkg/cmd/roachtest/test_test.go b/pkg/cmd/roachtest/test_test.go
index 0ed208df8c98..b61659325479 100644
--- a/pkg/cmd/roachtest/test_test.go
+++ b/pkg/cmd/roachtest/test_test.go
@@ -15,6 +15,7 @@ import (
 	"context"
 	"io"
 	"math/rand"
+	"path/filepath"
 	"regexp"
 	"sort"
 	"strings"
@@ -184,6 +185,7 @@ func TestRunnerRun(t *testing.T) {
 			if exp := c.expOut; exp != "" && !strings.Contains(out, exp) {
 				t.Fatalf("'%s' not found in output:\n%s", exp, out)
 			}
+			t.Log(out)
 		})
 	}
 }
@@ -253,7 +255,13 @@ func setupRunnerTest(t *testing.T, r testRegistryImpl, testFilters []string) *ru
 	var stdout syncedBuffer
 	var stderr syncedBuffer
 	lopt := loggingOpt{
-		l:            nilLogger(),
+		l: func() *logger.Logger {
+			l, err := logger.RootLogger(filepath.Join(t.TempDir(), "test.log"), logger.NoTee)
+			if err != nil {
+				panic(err)
+			}
+			return l
+		}(),
 		tee:          logger.NoTee,
 		stdout:       &stdout,
 		stderr:       &stderr,
diff --git a/pkg/testutils/lint/passes/fmtsafe/fmtsafe.go b/pkg/testutils/lint/passes/fmtsafe/fmtsafe.go
index f55046ac48ee..f7f4029f3695 100644
--- a/pkg/testutils/lint/passes/fmtsafe/fmtsafe.go
+++ b/pkg/testutils/lint/passes/fmtsafe/fmtsafe.go
@@ -251,7 +251,8 @@ func checkCallExpr(pass *analysis.Pass, enclosingFnName string, call *ast.CallEx
 
 // Tip is exported for use in tests.
 var Tip = `
-Tip: use YourFuncf("descriptive prefix %%s", ...) or list new formatting wrappers in pkg/testutils/lint/passes/fmtsafe/functions.go.`
+Tip: use YourFuncf("descriptive prefix %%s", ...) or list new formatting wrappers in pkg/testutils/lint/passes/fmtsafe/functions.go.
+N.B. additional entry is required functions in the main package. See functions.go#requireConstFmt`
 
 func hasNoLintComment(pass *analysis.Pass, call *ast.CallExpr, idx int) bool {
 	fPos, f := findContainingFile(pass, call)
diff --git a/pkg/testutils/lint/passes/fmtsafe/functions.go b/pkg/testutils/lint/passes/fmtsafe/functions.go
index 8b3fbb104f59..5bd4fe3ff1b0 100644
--- a/pkg/testutils/lint/passes/fmtsafe/functions.go
+++ b/pkg/testutils/lint/passes/fmtsafe/functions.go
@@ -32,8 +32,14 @@ var requireConstMsg = map[string]bool{
 	"(*github.com/cockroachdb/cockroach/pkg/sql.optPlanningCtx).log": true,
 }
 
-// requireConstFmt records functions for which the string arg
-// before the final ellipsis must be a constant string.
+/*
+requireConstFmt records functions for which the string arg
+before the final ellipsis must be a constant string.
+
+Definitions surrounded in parentheses are functions attached to a struct.
+For functions defined in the main package, a *second* entry is required
+in the form (main.yourStruct).yourFuncF
+*/
 var requireConstFmt = map[string]bool{
 	// Logging things.
 	"log.Printf":           true,
@@ -85,6 +91,14 @@ var requireConstFmt = map[string]bool{
 	"(*github.com/cockroachdb/cockroach/pkg/util/grpcutil.grpcLogger).Errorf":   true,
 	"(*github.com/cockroachdb/cockroach/pkg/util/grpcutil.grpcLogger).Fatalf":   true,
 
+	// Both of these signatures need to be included for the linter to not flag
+	// roachtest testImpl.Errorf since it is in the main package
+	"(*main.testImpl).Errorf": true,
+	"(*github.com/cockroachdb/cockroach/pkg/cmd/roachtest.testImpl).Errorf": true,
+
+	"(*main.testImpl).Fatalf": true,
+	"(*github.com/cockroachdb/cockroach/pkg/cmd/roachtest.testImpl).Fatalf": true,
+
 	"(*github.com/cockroachdb/cockroach/pkg/kv/kvserver.raftLogger).Debugf":   true,
 	"(*github.com/cockroachdb/cockroach/pkg/kv/kvserver.raftLogger).Infof":    true,
 	"(*github.com/cockroachdb/cockroach/pkg/kv/kvserver.raftLogger).Warningf": true,