roachtest: after test, fail (or complain) if any CRDB nodes are dead
Do what's in the title. This will help us ensure that no test leaves
dead nodes behind (maybe some tests do that today, but that will be easy
enough to fix). It gives us confidence that when a test passes, none of
the nodes died. Additionally, for tests that fail, it will always be
obvious when a node died, because this is now printed as part of the
test output, including what gets posted into a GitHub issue.

Example:

I changed the test below so that it would start nodes 1-3, immediately
stop node 1, and return, resulting in the following test failure:

```
--- FAIL: tpcc/nodes=3/w=max (7.24s)
	cluster.go:956,context.go:90,cluster.go:942,test.go:1160,test.go:1219: dead node detection: 4: skipped
		1: dead
		2: 83220
		3: 83229
		Error:  1: dead
```

I similarly verified that a short TPCC test would succeed without such
modifications. I also checked the chaos runner code -- it makes sure to
have all nodes restarted by the time it exits, so we should be good
on that front as well.
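
To make the new contract concrete, here is a hypothetical sketch (not part
of this commit) of what a test that deliberately kills a node now has to
do: restart the node (or wipe it) before returning. The test name and body
are invented; `c.Start`, `c.Stop`, and `c.Node` are the harness helpers
visible in the diffs below.

```go
// Sketch of a roachtest that takes a node down on purpose. The dead-node
// check runs after the test returns, so the node must be restarted (or
// its data dir wiped) before then.
func runStopsANode(ctx context.Context, t *test, c *cluster) {
	c.Start(ctx, t, c.All())   // start cockroach on every node
	c.Stop(ctx, c.Node(1))     // deliberately take node 1 down
	// ... exercise the cluster while node 1 is dead ...
	c.Start(ctx, t, c.Node(1)) // restart it so the harness sees no dead nodes
}
```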

Release note: None
tbg committed Mar 28, 2019
1 parent a53a22f commit 875e7ff
Showing 6 changed files with 70 additions and 21 deletions.
pkg/cmd/roachtest/chaos.go: 2 additions & 0 deletions

```diff
@@ -94,6 +94,8 @@ func (ch *Chaos) Runner(c *cluster, m *monitor) func(context.Context) error {
 
 		select {
 		case <-ch.Stopper:
+			// NB: the roachtest harness checks that at the end of the test,
+			// all nodes that have data also have a running process.
			l.Printf("restarting %v (chaos is done)\n", target)
			c.Start(ctx, c.t.(*test), target)
			return nil
```
pkg/cmd/roachtest/cli.go: 3 additions & 0 deletions

```diff
@@ -116,4 +116,7 @@ func runCLINodeStatus(ctx context.Context, t *test, c *cluster) {
 		"true true",
 		"false false",
 	})
+
+	// Start node again to satisfy roachtest.
+	c.Start(ctx, t, c.Node(3))
 }
```
pkg/cmd/roachtest/cluster.go: 26 additions & 0 deletions

```diff
@@ -930,6 +930,32 @@ func (c *cluster) FetchDebugZip(ctx context.Context) error {
 	})
 }
 
+// FailOnDeadNodes fails the test if nodes that have a populated data dir are
+// found to be not running. It prints both to t.l and the test output.
+func (c *cluster) FailOnDeadNodes(ctx context.Context, t *test) {
+	if c.nodes == 0 {
+		// No nodes can happen during unit tests and implies nothing to do.
+		return
+	}
+
+	// Don't hang forever.
+	_ = contextutil.RunWithTimeout(ctx, "detect dead nodes", time.Minute, func(ctx context.Context) error {
+		_, err := execCmdWithBuffer(
+			ctx, t.l, roachprod, "monitor", c.name, "--oneshot", "--ignore-empty-nodes",
+		)
+		// If there's an error, it means either that the monitor command failed
+		// completely, or that it found a dead node worth complaining about.
+		if err != nil {
+			if ctx.Err() != nil {
+				// Don't fail if we timed out.
+				return nil
+			}
+			t.Fatalf("dead node detection: %s", err)
+		}
+		return nil
+	})
+}
+
 // FetchDmesg grabs the dmesg logs if possible. This requires being able to run
 // `sudo dmesg` on the remote nodes.
 func (c *cluster) FetchDmesg(ctx context.Context) error {
```
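
For context (not part of the commit message): the check shells out to
`roachprod monitor <cluster> --oneshot --ignore-empty-nodes`, as the diff
above shows. `--oneshot` makes the monitor report once and exit rather than
watch continuously, and `--ignore-empty-nodes` is why nodes without a
populated data dir show up as "skipped" in the example output, and why the
stop-and-wipe pattern in `rapid_restart.go` below also satisfies the check.
Below is a minimal standalone sketch of the same probe using only the
standard library; the cluster name `"local"` is a placeholder.

```go
package main

import (
	"context"
	"log"
	"os/exec"
	"time"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	// Single probe of the cluster; a non-zero exit means a dead node (or a
	// monitor failure), matching how FailOnDeadNodes interprets the error.
	out, err := exec.CommandContext(ctx,
		"roachprod", "monitor", "local", "--oneshot", "--ignore-empty-nodes",
	).CombinedOutput()
	if err != nil {
		if ctx.Err() != nil {
			// Like the harness, don't treat a timeout of the check itself
			// as a failure.
			log.Printf("dead node detection timed out: %v", ctx.Err())
			return
		}
		log.Fatalf("dead node detection: %s\n%s", err, out)
	}
	log.Printf("all nodes alive:\n%s", out)
}
```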
pkg/cmd/roachtest/gossip.go: 5 additions & 0 deletions

```diff
@@ -430,6 +430,11 @@ SELECT count(replicas)
 	}
 
 	g.checkConnectedAndFunctional(ctx, t, c)
+
+	// Stop our special snowflake process which won't be recognized by the test
+	// harness, and start it again on the regular.
+	c.Stop(ctx, c.Node(1))
+	c.Start(ctx, t, c.Node(1))
 }
 
 func runCheckLocalityIPAddress(ctx context.Context, t *test, c *cluster) {
```
pkg/cmd/roachtest/rapid_restart.go: 6 additions & 0 deletions

```diff
@@ -111,4 +111,10 @@ func runRapidRestart(ctx context.Context, t *test, c *cluster) {
 
 		t.l.Printf("%d OK\n", j)
 	}
+
+	// Clean up for the test harness. Usually we want to leave nodes running so
+	// that consistency checks can be run, but in this case there's not much
+	// there in the first place anyway.
+	c.Stop(ctx, nodes)
+	c.Wipe(ctx, nodes)
 }
```
pkg/cmd/roachtest/test.go: 28 additions & 21 deletions

```diff
@@ -792,9 +792,12 @@ func (t *test) printAndFail(skip int, args ...interface{}) {
 }
 
 func (t *test) printfAndFail(skip int, format string, args ...interface{}) {
+	msg := t.decorate(skip+1, fmt.Sprintf(format, args...))
+	t.l.Printf("test failure: " + msg)
+
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	t.mu.output = append(t.mu.output, t.decorate(skip+1, fmt.Sprintf(format, args...))...)
+	t.mu.output = append(t.mu.output, msg...)
 	t.mu.failed = true
 	if t.mu.cancel != nil {
 		t.mu.cancel()
@@ -937,6 +940,7 @@ func (r *registry) runAsync(
 		l, err := rootLogger(logPath, teeOpt)
 		FatalIfErr(t, err)
 		t.l = l
+		out := io.MultiWriter(r.out, t.l.file)
 
 		if teamCity {
 			fmt.Printf("##teamcity[testStarted name='%s' flowId='%s']\n", t.Name(), t.Name())
@@ -949,7 +953,7 @@
 			if len(details) > 0 {
 				detail = fmt.Sprintf(" [%s]", strings.Join(details, ","))
 			}
-			fmt.Fprintf(r.out, "=== RUN %s%s\n", t.Name(), detail)
+			fmt.Fprintf(out, "=== RUN %s%s\n", t.Name(), detail)
 		}
 		r.status.Lock()
 		r.status.running[t] = struct{}{}
@@ -1000,7 +1004,7 @@
 				)
 			}
 
-			fmt.Fprintf(r.out, "--- FAIL: %s (%s)\n%s", t.Name(), dstr, output)
+			fmt.Fprintf(out, "--- FAIL: %s (%s)\n%s", t.Name(), dstr, output)
 			if postIssues && issues.CanPost() && t.spec.Run != nil {
 				authorEmail := getAuthorEmail(failLoc.file, failLoc.line)
 				branch := "<unknown branch>"
@@ -1013,21 +1017,21 @@
 					"roachtest", t.Name(), "The test failed on "+branch+":\n"+string(output), authorEmail,
 					[]string{"O-roachtest"},
 				); err != nil {
-					fmt.Fprintf(r.out, "failed to post issue: %s\n", err)
+					fmt.Fprintf(out, "failed to post issue: %s\n", err)
 				}
 			}
 		} else if t.spec.Skip == "" {
-			fmt.Fprintf(r.out, "--- PASS: %s (%s)\n", t.Name(), dstr)
+			fmt.Fprintf(out, "--- PASS: %s (%s)\n", t.Name(), dstr)
 			// If `##teamcity[testFailed ...]` is not present before `##teamCity[testFinished ...]`,
 			// TeamCity regards the test as successful.
 		} else {
 			if teamCity {
 				fmt.Fprintf(r.out, "##teamcity[testIgnored name='%s' message='%s']\n",
 					t.Name(), teamCityEscape(t.spec.Skip))
 			}
-			fmt.Fprintf(r.out, "--- SKIP: %s (%s)\n\t%s\n", t.Name(), dstr, t.spec.Skip)
+			fmt.Fprintf(out, "--- SKIP: %s (%s)\n\t%s\n", t.Name(), dstr, t.spec.Skip)
 			if t.spec.SkipDetails != "" {
-				fmt.Fprintf(r.out, "Details: %s\n", t.spec.SkipDetails)
+				fmt.Fprintf(out, "Details: %s\n", t.spec.SkipDetails)
 			}
 		}
 
@@ -1155,7 +1159,19 @@
 
 		// No subtests, so this is a leaf test.
 
-		timeout := time.Hour
+		timeout := c.expiration.Add(-10 * time.Minute).Sub(timeutil.Now())
+		if timeout <= 0 {
+			t.spec.Skip = fmt.Sprintf("cluster expired (%s)", timeout)
+			return
+		}
+
+		if t.spec.Timeout > 0 && timeout > t.spec.Timeout {
+			timeout = t.spec.Timeout
+		}
+
+		done := make(chan struct{})
+		defer close(done) // closed only after we've grabbed the debug info below
+
 		defer func() {
 			if t.Failed() {
 				if err := c.FetchDebugZip(ctx); err != nil {
@@ -1176,19 +1192,10 @@
 				c.l.Printf("failed to download logs: %s", err)
 			}
 		}()
-
-		timeout = c.expiration.Add(-10 * time.Minute).Sub(timeutil.Now())
-		if timeout <= 0 {
-			t.spec.Skip = fmt.Sprintf("cluster expired (%s)", timeout)
-			return
-		}
-
-		if t.spec.Timeout > 0 && timeout > t.spec.Timeout {
-			timeout = t.spec.Timeout
-		}
-
-		done := make(chan struct{})
-		defer close(done)
+		// Detect dead nodes in an inner defer. Note that this will call t.Fatal
+		// when appropriate, which will cause the closure above to enter the
+		// t.Failed() branch.
+		defer c.FailOnDeadNodes(ctx, t)
 
 		runCtx, cancel := context.WithCancel(ctx)
 		t.mu.Lock()
```
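
A note on the ordering above (a reading of the diff, not commit text): Go
runs deferred functions last-in-first-out, so `defer c.FailOnDeadNodes(ctx, t)`,
registered after the artifact-collecting closure, runs before it. If the
check calls `t.Fatal`, the test is already marked failed by the time the
outer defer runs, which is what lets it enter the `t.Failed()` branch and
fetch the debug zip. A tiny runnable demonstration of that ordering:

```go
package main

import "fmt"

func main() {
	// Registered first, runs last: by then the test is already marked failed.
	defer fmt.Println("3: fetch debug zip / logs (t.Failed() is now true)")
	// Registered last, runs first: the dead-node check.
	defer fmt.Println("2: dead node detection (may call t.Fatal)")
	fmt.Println("1: test body returns")
}
```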
