roachtest: after test, fail (or complain) if any CRDB nodes are dead
Do what's in the title. This will help us ensure that no test leaves
dead nodes behind (maybe some tests do that today, but that will be easy
enough to fix). It gives us confidence that when a test passes, none of
the nodes died. Additionally, for tests that fail, it will always be
obvious when a node died, because this is now printed as part of the
test output, including what gets posted into a GitHub issue.

Example:

I changed the test below so that it would start nodes 1-3, immediately
stop node 1, and return, resulting in the following test failure:

```
--- FAIL: tpcc/nodes=3/w=max (7.24s)
	cluster.go:956,context.go:90,cluster.go:942,test.go:1160,test.go:1219: dead node detection: 4: skipped
		1: dead
		2: 83220
		3: 83229
		Error:  1: dead
```

I similarly verified that a short TPCC test would succeed without such
modifications. I also checked the chaos runner code -- it makes sure to
have all nodes restarted by the time it exits, so we should be good
on that front as well.
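
To make the new contract concrete, here is a hypothetical sketch (not part
of this commit) of what a test that deliberately kills a node now has to
do: restart the node (or wipe it) before returning. The test name and body
are invented; `c.Start`, `c.Stop`, and `c.Node` are the harness helpers
visible in the diffs below.

```go
// Sketch of a roachtest that takes a node down on purpose. The dead-node
// check runs after the test returns, so the node must be restarted (or
// its data dir wiped) before then.
func runStopsANode(ctx context.Context, t *test, c *cluster) {
	c.Start(ctx, t, c.All())   // start cockroach on every node
	c.Stop(ctx, c.Node(1))     // deliberately take node 1 down
	// ... exercise the cluster while node 1 is dead ...
	c.Start(ctx, t, c.Node(1)) // restart it so the harness sees no dead nodes
}
```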

Release note: None
tbg committed Mar 28, 2019
1 parent a53a22f commit 875e7ff
Showing 6 changed files with 70 additions and 21 deletions.
pkg/cmd/roachtest/chaos.go: 2 additions & 0 deletions

```diff
@@ -94,6 +94,8 @@ func (ch *Chaos) Runner(c *cluster, m *monitor) func(context.Context) error {
 
 		select {
 		case <-ch.Stopper:
+			// NB: the roachtest harness checks that at the end of the test,
+			// all nodes that have data also have a running process.
			l.Printf("restarting %v (chaos is done)\n", target)
			c.Start(ctx, c.t.(*test), target)
			return nil
```
pkg/cmd/roachtest/cli.go: 3 additions & 0 deletions

```diff
@@ -116,4 +116,7 @@ func runCLINodeStatus(ctx context.Context, t *test, c *cluster) {
 		"true true",
 		"false false",
 	})
+
+	// Start node again to satisfy roachtest.
+	c.Start(ctx, t, c.Node(3))
 }
```
pkg/cmd/roachtest/cluster.go: 26 additions & 0 deletions

```diff
@@ -930,6 +930,32 @@ func (c *cluster) FetchDebugZip(ctx context.Context) error {
 	})
 }
 
+// FailOnDeadNodes fails the test if nodes that have a populated data dir are
+// found to be not running. It prints both to t.l and the test output.
+func (c *cluster) FailOnDeadNodes(ctx context.Context, t *test) {
+	if c.nodes == 0 {
+		// No nodes can happen during unit tests and implies nothing to do.
+		return
+	}
+
+	// Don't hang forever.
+	_ = contextutil.RunWithTimeout(ctx, "detect dead nodes", time.Minute, func(ctx context.Context) error {
+		_, err := execCmdWithBuffer(
+			ctx, t.l, roachprod, "monitor", c.name, "--oneshot", "--ignore-empty-nodes",
+		)
+		// If there's an error, it means either that the monitor command failed
+		// completely, or that it found a dead node worth complaining about.
+		if err != nil {
+			if ctx.Err() != nil {
+				// Don't fail if we timed out.
+				return nil
+			}
+			t.Fatalf("dead node detection: %s", err)
+		}
+		return nil
+	})
+}
+
 // FetchDmesg grabs the dmesg logs if possible. This requires being able to run
 // `sudo dmesg` on the remote nodes.
 func (c *cluster) FetchDmesg(ctx context.Context) error {
```
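
For context (not part of the commit message): the check shells out to
`roachprod monitor <cluster> --oneshot --ignore-empty-nodes`, as the diff
above shows. `--oneshot` makes the monitor report once and exit rather than
watch continuously, and `--ignore-empty-nodes` is why nodes without a
populated data dir show up as "skipped" in the example output, and why the
stop-and-wipe pattern in `rapid_restart.go` below also satisfies the check.
Below is a minimal standalone sketch of the same probe using only the
standard library; the cluster name `"local"` is a placeholder.

```go
package main

import (
	"context"
	"log"
	"os/exec"
	"time"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	// Single probe of the cluster; a non-zero exit means a dead node (or a
	// monitor failure), matching how FailOnDeadNodes interprets the error.
	out, err := exec.CommandContext(ctx,
		"roachprod", "monitor", "local", "--oneshot", "--ignore-empty-nodes",
	).CombinedOutput()
	if err != nil {
		if ctx.Err() != nil {
			// Like the harness, don't treat a timeout of the check itself
			// as a failure.
			log.Printf("dead node detection timed out: %v", ctx.Err())
			return
		}
		log.Fatalf("dead node detection: %s\n%s", err, out)
	}
	log.Printf("all nodes alive:\n%s", out)
}
```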
pkg/cmd/roachtest/gossip.go: 5 additions & 0 deletions

```diff
@@ -430,6 +430,11 @@ SELECT count(replicas)
 	}
 
 	g.checkConnectedAndFunctional(ctx, t, c)
+
+	// Stop our special snowflake process which won't be recognized by the test
+	// harness, and start it again on the regular.
+	c.Stop(ctx, c.Node(1))
+	c.Start(ctx, t, c.Node(1))
 }
 
 func runCheckLocalityIPAddress(ctx context.Context, t *test, c *cluster) {
```
pkg/cmd/roachtest/rapid_restart.go: 6 additions & 0 deletions

```diff
@@ -111,4 +111,10 @@ func runRapidRestart(ctx context.Context, t *test, c *cluster) {
 
 		t.l.Printf("%d OK\n", j)
 	}
+
+	// Clean up for the test harness. Usually we want to leave nodes running so
+	// that consistency checks can be run, but in this case there's not much
+	// there in the first place anyway.
+	c.Stop(ctx, nodes)
+	c.Wipe(ctx, nodes)
 }
```
pkg/cmd/roachtest/test.go: 28 additions & 21 deletions

```diff
@@ -792,9 +792,12 @@ func (t *test) printAndFail(skip int, args ...interface{}) {
 }
 
 func (t *test) printfAndFail(skip int, format string, args ...interface{}) {
+	msg := t.decorate(skip+1, fmt.Sprintf(format, args...))
+	t.l.Printf("test failure: " + msg)
+
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	t.mu.output = append(t.mu.output, t.decorate(skip+1, fmt.Sprintf(format, args...))...)
+	t.mu.output = append(t.mu.output, msg...)
 	t.mu.failed = true
 	if t.mu.cancel != nil {
 		t.mu.cancel()
@@ -937,6 +940,7 @@ func (r *registry) runAsync(
 		l, err := rootLogger(logPath, teeOpt)
 		FatalIfErr(t, err)
 		t.l = l
+		out := io.MultiWriter(r.out, t.l.file)
 
 		if teamCity {
 			fmt.Printf("##teamcity[testStarted name='%s' flowId='%s']\n", t.Name(), t.Name())
@@ -949,7 +953,7 @@
 			if len(details) > 0 {
 				detail = fmt.Sprintf(" [%s]", strings.Join(details, ","))
 			}
-			fmt.Fprintf(r.out, "=== RUN %s%s\n", t.Name(), detail)
+			fmt.Fprintf(out, "=== RUN %s%s\n", t.Name(), detail)
 		}
 		r.status.Lock()
 		r.status.running[t] = struct{}{}
@@ -1000,7 +1004,7 @@
 				)
 			}
 
-			fmt.Fprintf(r.out, "--- FAIL: %s (%s)\n%s", t.Name(), dstr, output)
+			fmt.Fprintf(out, "--- FAIL: %s (%s)\n%s", t.Name(), dstr, output)
 			if postIssues && issues.CanPost() && t.spec.Run != nil {
 				authorEmail := getAuthorEmail(failLoc.file, failLoc.line)
 				branch := "<unknown branch>"
@@ -1013,21 +1017,21 @@
 					"roachtest", t.Name(), "The test failed on "+branch+":\n"+string(output), authorEmail,
 					[]string{"O-roachtest"},
 				); err != nil {
-					fmt.Fprintf(r.out, "failed to post issue: %s\n", err)
+					fmt.Fprintf(out, "failed to post issue: %s\n", err)
 				}
 			}
 		} else if t.spec.Skip == "" {
-			fmt.Fprintf(r.out, "--- PASS: %s (%s)\n", t.Name(), dstr)
+			fmt.Fprintf(out, "--- PASS: %s (%s)\n", t.Name(), dstr)
 			// If `##teamcity[testFailed ...]` is not present before `##teamCity[testFinished ...]`,
 			// TeamCity regards the test as successful.
 		} else {
 			if teamCity {
 				fmt.Fprintf(r.out, "##teamcity[testIgnored name='%s' message='%s']\n",
 					t.Name(), teamCityEscape(t.spec.Skip))
 			}
-			fmt.Fprintf(r.out, "--- SKIP: %s (%s)\n\t%s\n", t.Name(), dstr, t.spec.Skip)
+			fmt.Fprintf(out, "--- SKIP: %s (%s)\n\t%s\n", t.Name(), dstr, t.spec.Skip)
 			if t.spec.SkipDetails != "" {
-				fmt.Fprintf(r.out, "Details: %s\n", t.spec.SkipDetails)
+				fmt.Fprintf(out, "Details: %s\n", t.spec.SkipDetails)
 			}
 		}
 
@@ -1155,7 +1159,19 @@
 
 		// No subtests, so this is a leaf test.
 
-		timeout := time.Hour
+		timeout := c.expiration.Add(-10 * time.Minute).Sub(timeutil.Now())
+		if timeout <= 0 {
+			t.spec.Skip = fmt.Sprintf("cluster expired (%s)", timeout)
+			return
+		}
+
+		if t.spec.Timeout > 0 && timeout > t.spec.Timeout {
+			timeout = t.spec.Timeout
+		}
+
+		done := make(chan struct{})
+		defer close(done) // closed only after we've grabbed the debug info below
+
 		defer func() {
 			if t.Failed() {
 				if err := c.FetchDebugZip(ctx); err != nil {
@@ -1176,19 +1192,10 @@
 				c.l.Printf("failed to download logs: %s", err)
 			}
 		}()
-
-		timeout = c.expiration.Add(-10 * time.Minute).Sub(timeutil.Now())
-		if timeout <= 0 {
-			t.spec.Skip = fmt.Sprintf("cluster expired (%s)", timeout)
-			return
-		}
-
-		if t.spec.Timeout > 0 && timeout > t.spec.Timeout {
-			timeout = t.spec.Timeout
-		}
-
-		done := make(chan struct{})
-		defer close(done)
+		// Detect dead nodes in an inner defer. Note that this will call t.Fatal
+		// when appropriate, which will cause the closure above to enter the
+		// t.Failed() branch.
+		defer c.FailOnDeadNodes(ctx, t)
 
 		runCtx, cancel := context.WithCancel(ctx)
 		t.mu.Lock()
```
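
A note on the ordering above (a reading of the diff, not commit text): Go
runs deferred functions last-in-first-out, so `defer c.FailOnDeadNodes(ctx, t)`,
registered after the artifact-collecting closure, runs before it. If the
check calls `t.Fatal`, the test is already marked failed by the time the
outer defer runs, which is what lets it enter the `t.Failed()` branch and
fetch the debug zip. A tiny runnable demonstration of that ordering:

```go
package main

import "fmt"

func main() {
	// Registered first, runs last: by then the test is already marked failed.
	defer fmt.Println("3: fetch debug zip / logs (t.Failed() is now true)")
	// Registered last, runs first: the dead-node check.
	defer fmt.Println("2: dead node detection (may call t.Fatal)")
	fmt.Println("1: test body returns")
}
```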
