Skip to content

Commit

Permalink
Add remaining pprof profiles to nomad operator debug (#10748)
Browse files Browse the repository at this point in the history
* Add remaining pprof profiles to debug dump
* Refactor pprof profile capture
* Add WaitForFilesUntil and WaitForResultUntil utility functions
* Add CHANGELOG entry
  • Loading branch information
davemay99 committed Jun 21, 2021
1 parent fa1be62 commit b430baf
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 47 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

IMPROVEMENTS:
* cli: Added `-monitor` flag to `deployment status` command and automatically monitor deployments from `job run` command. [[GH-10661](https://github.com/hashicorp/nomad/pull/10661)]
* cli: Added remainder of available pprof profiles to `nomad operator debug` capture. [[GH-10748](https://github.com/hashicorp/nomad/issues/10748)]
* consul/connect: Validate Connect service upstream address uniqueness within task group [[GH-7833](https://github.com/hashicorp/nomad/issues/7833)]
* deps: Update gopsutil for multisocket cpuinfo detection performance fix [[GH-10761](https://github.com/hashicorp/nomad/pull/10790)]
* docker: Tasks using `network.mode = "bridge"` that don't set their `network_mode` will receive a `/etc/hosts` file that includes the pause container's hostname and any `extra_hosts`. [[GH-10766](https://github.com/hashicorp/nomad/issues/10766)]
Expand Down
82 changes: 45 additions & 37 deletions command/operator_debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -654,52 +654,60 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
}
}

bs, err = client.Agent().Trace(opts, nil)
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof trace.prof, err: %v", path, err))
} else {
err := c.writeBytes(path, "trace.prof", bs)
if err != nil {
c.Ui.Error(err.Error())
}
// goroutine debug type 1 = legacy text format for human readable output
opts.Debug = 1
c.savePprofProfile(path, "goroutine", opts, client)

// goroutine debug type 2 = goroutine stacks in panic format
opts.Debug = 2
c.savePprofProfile(path, "goroutine", opts, client)

// Reset to pprof binary format
opts.Debug = 0

c.savePprofProfile(path, "goroutine", opts, client) // Stack traces of all current goroutines
c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program
c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations
c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads

// This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable
// c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives

// This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable
// c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes
}

// savePprofProfile retrieves a pprof profile and writes to disk
func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opts api.PprofOptions, client *api.Client) {
fileName := fmt.Sprintf("%s.prof", profile)
if opts.Debug > 0 {
fileName = fmt.Sprintf("%s-debug%d.txt", profile, opts.Debug)
}

bs, err = client.Agent().Lookup("goroutine", opts, nil)
bs, err := retrievePprofProfile(profile, opts, client)
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine.prof, err: %v", path, err))
} else {
err := c.writeBytes(path, "goroutine.prof", bs)
if err != nil {
c.Ui.Error(err.Error())
}
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %s", path, fileName, err.Error()))
}

// Gather goroutine text output - debug type 1
// debug type 1 writes the legacy text format for human readable output
opts.Debug = 1
bs, err = client.Agent().Lookup("goroutine", opts, nil)
err = c.writeBytes(path, fileName, bs)
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug1.txt, err: %v", path, err))
} else {
err := c.writeBytes(path, "goroutine-debug1.txt", bs)
if err != nil {
c.Ui.Error(err.Error())
}
c.Ui.Error(fmt.Sprintf("%s: Failed to write file %s, err: %s", path, fileName, err.Error()))
}
}

// Gather goroutine text output - debug type 2
// When printing the "goroutine" profile, debug=2 means to print the goroutine
// stacks in the same form that a Go program uses when dying due to an unrecovered panic.
opts.Debug = 2
bs, err = client.Agent().Lookup("goroutine", opts, nil)
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug2.txt, err: %v", path, err))
} else {
err := c.writeBytes(path, "goroutine-debug2.txt", bs)
if err != nil {
c.Ui.Error(err.Error())
}
// retrievePprofProfile fetches a single pprof profile from the agent
// addressed by opts. The CPU and execution-trace profiles have dedicated
// API endpoints and are special-cased by name; every other profile goes
// through the generic Lookup endpoint.
func retrievePprofProfile(profile string, opts api.PprofOptions, client *api.Client) (bs []byte, err error) {
	agent := client.Agent()
	if profile == "cpuprofile" {
		return agent.CPUProfile(opts, nil)
	}
	if profile == "trace" {
		return agent.Trace(opts, nil)
	}
	return agent.Lookup(profile, opts, nil)
}

// collectPeriodic runs for duration, capturing the cluster state every interval. It flushes and stops
Expand Down
7 changes: 6 additions & 1 deletion command/operator_debug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,11 +365,16 @@ func TestDebug_CapturedFiles(t *testing.T) {

// Monitor files are only created when selected
filepath.Join(path, "server", "leader", "monitor.log"),

// Pprof profiles
filepath.Join(path, "server", "leader", "profile.prof"),
filepath.Join(path, "server", "leader", "trace.prof"),
filepath.Join(path, "server", "leader", "goroutine.prof"),
filepath.Join(path, "server", "leader", "goroutine-debug1.txt"),
filepath.Join(path, "server", "leader", "goroutine-debug2.txt"),
filepath.Join(path, "server", "leader", "heap.prof"),
filepath.Join(path, "server", "leader", "allocs.prof"),
filepath.Join(path, "server", "leader", "threadcreate.prof"),

// Multiple snapshots are collected, 00 is always created
filepath.Join(path, "nomad", "0000", "jobs.json"),
Expand All @@ -382,7 +387,7 @@ func TestDebug_CapturedFiles(t *testing.T) {
filepath.Join(path, "nomad", "0001", "metrics.json"),
}

testutil.WaitForFiles(t, serverFiles)
testutil.WaitForFilesUntil(t, serverFiles, 2*time.Minute)
}

func TestDebug_ExistingOutput(t *testing.T) {
Expand Down
46 changes: 37 additions & 9 deletions testutil/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (

"github.com/hashicorp/nomad/nomad/structs"
"github.com/kr/pretty"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

Expand All @@ -35,6 +34,23 @@ func WaitForResultRetries(retries int64, test testFn, error errorFn) {
}
}

// WaitForResultUntil polls test until it reports success or the given
// duration has elapsed. On timeout, errorFunc receives the error returned
// by the most recent test invocation (nil if test never ran).
func WaitForResultUntil(until time.Duration, test testFn, errorFunc errorFn) {
	deadline := time.Now().Add(until)
	var lastErr error
	for time.Now().Before(deadline) {
		ok, err := test()
		if ok {
			return
		}
		lastErr = err
		// Back off for an arbitrary fraction of the total wait so we
		// poll roughly 30 times over the full duration.
		time.Sleep(until / 30)
	}
	errorFunc(lastErr)
}

// AssertUntil asserts the test function passes throughout the given duration.
// Otherwise error is called on failure.
func AssertUntil(until time.Duration, test testFn, error errorFn) {
Expand Down Expand Up @@ -217,16 +233,28 @@ func WaitForRunning(t testing.TB, rpc rpcFn, job *structs.Job) []*structs.AllocL

// WaitForFiles blocks until all the files in the slice are present
func WaitForFiles(t testing.TB, files []string) {
assert := assert.New(t)
WaitForResult(func() (bool, error) {
for _, f := range files {
exists := assert.FileExists(f)
if !exists {
return false, fmt.Errorf("expected file to exist %s", f)
}
}
return true, nil
return FilesExist(files), nil
}, func(err error) {
t.Fatalf("missing expected files: %v", err)
})
}

// WaitForFilesUntil blocks until duration or all the files in the slice are present
func WaitForFilesUntil(t testing.TB, files []string, until time.Duration) {
WaitForResultUntil(until, func() (bool, error) {
return FilesExist(files), nil
}, func(err error) {
t.Fatalf("missing expected files: %v", err)
})
}

// FilesExist verifies all files in the slice are present
func FilesExist(files []string) bool {
for _, f := range files {
if _, err := os.Stat(f); os.IsNotExist(err) {
return false
}
}
return true
}
42 changes: 42 additions & 0 deletions testutil/wait_test.go
Original file line number Diff line number Diff line change
@@ -1 +1,43 @@
package testutil

import (
"fmt"
"os"
"path/filepath"
"testing"
"time"

"github.com/stretchr/testify/require"
)

func TestWait_WaitForFilesUntil(t *testing.T) {

var files []string
for i := 1; i < 10; i++ {
filename := fmt.Sprintf("test%d.txt", i)
filepath := filepath.Join(os.TempDir(), filename)
files = append(files, filepath)

defer os.Remove(filepath)
}

go func() {
for _, filepath := range files {
t.Logf("Creating file %s...", filepath)
fh, err := os.Create(filepath)
fh.Close()

require.NoError(t, err, "error creating test file")
require.FileExists(t, filepath)

time.Sleep(250 * time.Millisecond)
}
}()

duration := 5 * time.Second
t.Log("Waiting 5 seconds for files...")
WaitForFilesUntil(t, files, duration)

t.Log("done")

}

0 comments on commit b430baf

Please sign in to comment.