roachtest: don't reuse clusters after test failure
We've had a case where a cluster got messed up somehow and then a bunch
of tests that tried to reuse it failed. This patch employs a big hammer:
we no longer reuse a cluster after a test failure, whether or not that
failure was cluster-related.

Release note: None
andreimatei committed Jul 8, 2019
1 parent 74da75d commit 9d7c33d
Showing 2 changed files with 27 additions and 16 deletions.
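
Distilled, the new policy looks like the sketch below: after every test, a
failure (for any reason) means the cluster is either saved for debugging or
destroyed, but never handed to the next test. This is a minimal, self-contained
illustration; the type and helper names are stand-ins, not roachtest's real
API (see the actual diff that follows).

    package main

    import "fmt"

    // cluster stands in for roachtest's cluster handle; save and destroy are
    // illustrative stand-ins for the real c.Save and c.Destroy calls below.
    type cluster struct{ name string }

    func (c *cluster) save()    { fmt.Printf("saving cluster %s for debugging\n", c.name) }
    func (c *cluster) destroy() { fmt.Printf("destroying cluster %s\n", c.name) }

    // afterTest models the big hammer: any failure, cluster-related or not,
    // means the cluster is not reused. With --debug it is kept around for
    // inspection; otherwise it is destroyed. Either way the worker continues
    // with a nil cluster, forcing the next test onto a fresh one.
    func afterTest(c *cluster, failed, debug bool) *cluster {
        if !failed {
            return c // test passed: the cluster remains eligible for reuse
        }
        if debug {
            c.save()
        } else {
            c.destroy()
        }
        return nil
    }

    func main() {
        c := &cluster{name: "teamcity-1234"} // hypothetical cluster name
        fmt.Println(afterTest(c, true /* failed */, false /* debug */)) // prints <nil>
    }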
pkg/cmd/roachtest/cluster.go (2 additions, 1 deletion)

@@ -1462,7 +1462,8 @@ func (c *cluster) doDestroy(ctx context.Context, l *logger) <-chan struct{} {
 	l.PrintfCtx(ctx, "destroying cluster %s...", c)
 	c.status("destroying cluster")
 	// We use a non-cancelable context for running this command. Once we got
-	// here, the cluster cannot be destroyed again, so we really want this command to succeed.
+	// here, the cluster cannot be destroyed again, so we really want this
+	// command to succeed.
 	if err := execCmd(context.Background(), l, roachprod, "destroy", c.name); err != nil {
 		l.ErrorfCtx(ctx, "error destroying cluster %s: %s", c, err)
 	} else {
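The comment in this hunk points at a general Go pattern worth spelling out:
once teardown starts, it must not be aborted just because the caller's ctx is
already canceled, so the command runs under context.Background(). Below is a
generic sketch of that pattern, not roachtest code; the command is a
placeholder rather than the real roachprod invocation.

    package main

    import (
        "context"
        "fmt"
        "os/exec"
    )

    // teardown deliberately ignores the caller's ctx (which may already be
    // canceled) and runs the cleanup command under a fresh background
    // context, mirroring the doDestroy change above. "echo" is a stand-in
    // for the real cleanup command.
    func teardown(ctx context.Context, name string) error {
        cmd := exec.CommandContext(context.Background(), "echo", "destroying", name)
        out, err := cmd.CombinedOutput()
        fmt.Print(string(out))
        return err
    }

    func main() {
        ctx, cancel := context.WithCancel(context.Background())
        cancel() // the caller's context is already canceled...
        if err := teardown(ctx, "mycluster"); err != nil {
            fmt.Println("teardown failed:", err)
        } // ...yet the cleanup command still runs to completion.
    }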
pkg/cmd/roachtest/test_runner.go (25 additions, 15 deletions)

@@ -466,29 +466,37 @@ func (r *testRunner) runWorker(
 			}
 			l.PrintfCtx(ctx, msg)
 		}
-		// If a test failed and debug was set, we bail.
-		if (err != nil || t.Failed()) && debug {
-			failureMsg := fmt.Sprintf("%s (%d) - ", testToRun.spec.Name, testToRun.runNum)
-			if err != nil {
-				failureMsg += err.Error()
+		if err != nil || t.Failed() {
+			failureMsg := fmt.Sprintf("%s (%d) - ", testToRun.spec.Name, testToRun.runNum)
+			if err != nil {
+				failureMsg += err.Error()
+			} else {
+				failureMsg += t.FailureMsg()
+			}
+
+			if debug {
+				// Save the cluster for future debugging.
+				c.Save(ctx, failureMsg, l)
+
+				// Continue with a fresh cluster.
+				c = nil
 			} else {
-				failureMsg += t.FailureMsg()
+				// On any test failure or error, we destroy the cluster. We could be
+				// more selective, but this sounds safer.
+				l.PrintfCtx(ctx, "destroying cluster %s because: %s", c, failureMsg)
+				c.Destroy(ctx, closeLogger, l)
+				c = nil
 			}
-			// Save the cluster for future debugging.
-			c.Save(ctx, failureMsg, l)
 
 			if err != nil {
 				return err
 			}
-
-			// Continue with a fresh cluster.
-			c = nil
 		}
 	}
 }
 
-// An error is returned in exceptional situations. The cluster cannot be reused
-// if an error is returned.
+// An error is returned if the test is still running (on another goroutine) when
+// this returns. This happens when the test doesn't respond to cancellation.
 // Returns true if the test is considered to have passed, false otherwise.
 //
 // Args:
@@ -629,8 +637,8 @@ func (r *testRunner) runTest(
 		l.PrintfCtx(ctx, "cluster needs to survive until %s, but has expiration: %s. Extending.",
 			minExp, c.expiration)
 		if err := c.Extend(ctx, extend, l); err != nil {
-			t.printfAndFail(0 /* skip */, "failed to extend cluster")
-			return false, err
+			t.printfAndFail(0 /* skip */, "failed to extend cluster: %s", err)
+			return false, nil
 		}
 	}
 
@@ -692,6 +700,8 @@ func (r *testRunner) runTest(
 			msg := "test timed out and afterwards failed to respond to cancelation"
 			t.l.PrintfCtx(ctx, msg)
 			collectClusterLogsOnce.Do(func() { r.collectClusterLogs(ctx, c, t.l) })
+			// We return an error here because the test goroutine is still running, so
+			// we want to alert the caller of this unusual situation.
 			return false, fmt.Errorf(msg)
 		}
 	}
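Taken together with the doc-comment change above, runTest's (bool, error)
return now has a sharper contract: an error is reserved for the exceptional
case where the test goroutine ignored cancellation and is still running, while
ordinary failures (including the failed-Extend case, which now returns false,
nil instead of false, err) are recorded on the test itself. Below is a hedged
sketch of how a caller distinguishes the cases; the names are illustrative,
not the real runWorker code.

    package main

    import (
        "errors"
        "fmt"
    )

    // interpret models the caller's view of runTest's results after this
    // change (illustrative names, not roachtest's API).
    func interpret(pass bool, err error) string {
        switch {
        case err != nil:
            // Exceptional: the test goroutine is still running, so nothing
            // tied to it can be trusted or reused.
            return "runner error: " + err.Error()
        case !pass:
            // Ordinary failure, e.g. a cluster that failed to extend; the
            // failure belongs to the test, and per the runWorker change the
            // cluster is saved (--debug) or destroyed, never reused.
            return "test failed"
        default:
            return "test passed; cluster eligible for reuse"
        }
    }

    func main() {
        fmt.Println(interpret(true, nil))
        fmt.Println(interpret(false, nil))
        fmt.Println(interpret(false, errors.New("test unresponsive to cancelation")))
    }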
