Skip to content

Commit

Permalink
Merge #38740
Browse files Browse the repository at this point in the history
38740: roachtest: collect cluster logs after timed out tests r=andreimatei a=andreimatei

Before this patch, a timed out test that wasn't reacting to its ctx
cancelation was not attempting to collect node logs.

Release note: None

Co-authored-by: Andrei Matei <andrei@cockroachlabs.com>
  • Loading branch information
craig[bot] and andreimatei committed Jul 8, 2019
2 parents 6299bd4 + 506025d commit 74da75d
Showing 1 changed file with 33 additions and 25 deletions.
58 changes: 33 additions & 25 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ type clusterAllocatorFn func(
//
// Errors are returned in exceptional circumstances, like when a cluster failed
// to be created or when a test timed out and failed to react to its
// cancelation. Upon return, an attempt is performed to destroy the cluster used
// cancellation. Upon return, an attempt is performed to destroy the cluster used
// by this worker. If an error is returned, we might have "leaked" cpu quota
// because the cluster destruction might have failed but we've still released
// the quota. Also, we might have "leaked" a test goroutine (in the test
Expand Down Expand Up @@ -646,6 +646,7 @@ func (r *testRunner) runTest(
// timeout.
success := false
done := make(chan struct{})
var collectClusterLogsOnce sync.Once
go func() {
defer close(done) // closed only after we've grabbed the debug info below

Expand All @@ -662,29 +663,7 @@ func (r *testRunner) runTest(
success = true
return
}

if err := c.FetchDebugZip(ctx); err != nil {
t.l.Printf("failed to download logs: %s", err)
}
if err := c.FetchDmesg(ctx); err != nil {
c.l.Printf("failed to fetch dmesg: %s", err)
}
if err := c.FetchJournalctl(ctx); err != nil {
c.l.Printf("failed to fetch journalctl: %s", err)
}
if err := c.FetchCores(ctx); err != nil {
c.l.Printf("failed to fetch cores: %s", err)
}
if err := c.CopyRoachprodState(ctx); err != nil {
c.l.Printf("failed to copy roachprod state: %s", err)
}
// NB: fetch the logs even when we have a debug zip because
// debug zip can't ever get the logs for down nodes.
// We only save artifacts for failed tests in CI, so this
// duplication is acceptable.
if err := c.FetchLogs(ctx); err != nil {
c.l.Printf("failed to download logs: %s", err)
}
collectClusterLogsOnce.Do(func() { r.collectClusterLogs(ctx, c, t.l) })
}()

// This is the call to actually run the test.
Expand All @@ -710,12 +689,41 @@ func (r *testRunner) runTest(
panic("expected success=false after a timeout")
}
case <-time.After(5 * time.Minute):
return false, fmt.Errorf("test timed out and afterwards failed to respond to cancelation")
msg := "test timed out and afterwards failed to respond to cancelation"
t.l.PrintfCtx(ctx, msg)
collectClusterLogsOnce.Do(func() { r.collectClusterLogs(ctx, c, t.l) })
return false, fmt.Errorf(msg)
}
}
return success, nil
}

func (r *testRunner) collectClusterLogs(ctx context.Context, c *cluster, l *logger) {
l.PrintfCtx(ctx, "collecting cluster logs")
if err := c.FetchDebugZip(ctx); err != nil {
l.Printf("failed to download logs: %s", err)
}
if err := c.FetchDmesg(ctx); err != nil {
l.Printf("failed to fetch dmesg: %s", err)
}
if err := c.FetchJournalctl(ctx); err != nil {
l.Printf("failed to fetch journalctl: %s", err)
}
if err := c.FetchCores(ctx); err != nil {
l.Printf("failed to fetch cores: %s", err)
}
if err := c.CopyRoachprodState(ctx); err != nil {
l.Printf("failed to copy roachprod state: %s", err)
}
// NB: fetch the logs even when we have a debug zip because
// debug zip can't ever get the logs for down nodes.
// We only save artifacts for failed tests in CI, so this
// duplication is acceptable.
if err := c.FetchLogs(ctx); err != nil {
l.Printf("failed to download logs: %s", err)
}
}

func callerName() string {
// Make room for the skip PC.
var pc [2]uintptr
Expand Down

0 comments on commit 74da75d

Please sign in to comment.