cockroachdb · craig · Jul 9, 2019 · Jul 8, 2019
@@ -1462,7 +1462,8 @@ func (c *cluster) doDestroy(ctx context.Context, l *logger) <-chan struct{} {
 			l.PrintfCtx(ctx, "destroying cluster %s...", c)
 			c.status("destroying cluster")
 			// We use a non-cancelable context for running this command. Once we got
-			// here, the cluster cannot be destroyed again, so we really want this command to succeed.
+			// here, the cluster cannot be destroyed again, so we really want this
+			// command to succeed.
 			if err := execCmd(context.Background(), l, roachprod, "destroy", c.name); err != nil {
 				l.ErrorfCtx(ctx, "error destroying cluster %s: %s", c, err)
 			} else {

@@ -466,29 +466,37 @@ func (r *testRunner) runWorker(
 			}
 			l.PrintfCtx(ctx, msg)
 		}
-		// If a test failed and debug was set, we bail.
-		if (err != nil || t.Failed()) && debug {
+		if err != nil || t.Failed() {
 			failureMsg := fmt.Sprintf("%s (%d) - ", testToRun.spec.Name, testToRun.runNum)
 			if err != nil {
 				failureMsg += err.Error()
 			} else {
 				failureMsg += t.FailureMsg()
 			}
-			// Save the cluster for future debugging.
-			c.Save(ctx, failureMsg, l)
+
+			if debug {
+				// Save the cluster for future debugging.
+				c.Save(ctx, failureMsg, l)
+
+				// Continue with a fresh cluster.
+				c = nil
+			} else {
+				// On any test failure or error, we destroy the cluster. We could be
+				// more selective, but this sounds safer.
+				l.PrintfCtx(ctx, "destroying cluster %s because: %s", c, failureMsg)
+				c.Destroy(ctx, closeLogger, l)
+				c = nil
+			}
 
 			if err != nil {
 				return err
 			}
-
-			// Continue with a fresh cluster.
-			c = nil
 		}
 	}
 }
 
-// An error is returned in exceptional situations. The cluster cannot be reused
-// if an error is returned.
+// An error is returned if the test is still running (on another goroutine) when
+// this returns. This happens when the test doesn't respond to cancellation.
 // Returns true if the test is considered to have passed, false otherwise.
 //
 // Args:
@@ -629,8 +637,8 @@ func (r *testRunner) runTest(
 		l.PrintfCtx(ctx, "cluster needs to survive until %s, but has expiration: %s. Extending.",
 			minExp, c.expiration)
 		if err := c.Extend(ctx, extend, l); err != nil {
-			t.printfAndFail(0 /* skip */, "failed to extend cluster")
-			return false, err
+			t.printfAndFail(0 /* skip */, "failed to extend cluster: %s", err)
+			return false, nil
 		}
 	}
 
@@ -692,6 +700,8 @@ func (r *testRunner) runTest(
 			msg := "test timed out and afterwards failed to respond to cancelation"
 			t.l.PrintfCtx(ctx, msg)
 			collectClusterLogsOnce.Do(func() { r.collectClusterLogs(ctx, c, t.l) })
+			// We return an error here because the test goroutine is still running, so
+			// we want to alert the caller of this unusual situation.
 			return false, fmt.Errorf(msg)
 		}
 	}