Skip to content

Commit

Permalink
Add remaining pprof profiles to nomad operator debug (#10748)
Browse files Browse the repository at this point in the history
* Add remaining pprof profiles to debug dump
* Refactor pprof profile capture
* Add WaitForFilesUntil and WaitForResultUntil utility functions
* Add CHANGELOG entry
  • Loading branch information
davemay99 committed Jun 21, 2021
1 parent fa1be62 commit b430baf
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 47 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

IMPROVEMENTS:
* cli: Added `-monitor` flag to `deployment status` command and automatically monitor deployments from `job run` command. [[GH-10661](https://github.com/hashicorp/nomad/pull/10661)]
* cli: Added remainder of available pprof profiles to `nomad operator debug` capture. [[GH-10748](https://github.com/hashicorp/nomad/issues/10748)]
* consul/connect: Validate Connect service upstream address uniqueness within task group [[GH-7833](https://github.com/hashicorp/nomad/issues/7833)]
* deps: Update gopsutil for multisocket cpuinfo detection performance fix [[GH-10761](https://github.com/hashicorp/nomad/pull/10790)]
* docker: Tasks using `network.mode = "bridge"` that don't set their `network_mode` will receive a `/etc/hosts` file that includes the pause container's hostname and any `extra_hosts`. [[GH-10766](https://github.com/hashicorp/nomad/issues/10766)]
Expand Down
82 changes: 45 additions & 37 deletions command/operator_debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -654,52 +654,60 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
}
}

bs, err = client.Agent().Trace(opts, nil)
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof trace.prof, err: %v", path, err))
} else {
err := c.writeBytes(path, "trace.prof", bs)
if err != nil {
c.Ui.Error(err.Error())
}
// goroutine debug type 1 = legacy text format for human readable output
opts.Debug = 1
c.savePprofProfile(path, "goroutine", opts, client)

// goroutine debug type 2 = goroutine stacks in panic format
opts.Debug = 2
c.savePprofProfile(path, "goroutine", opts, client)

// Reset to pprof binary format
opts.Debug = 0

c.savePprofProfile(path, "goroutine", opts, client) // Stack traces of all current goroutines
c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program
c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations
c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads

// This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable
// c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives

// This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable
// c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes
}

// savePprofProfile retrieves a pprof profile and writes to disk
func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opts api.PprofOptions, client *api.Client) {
fileName := fmt.Sprintf("%s.prof", profile)
if opts.Debug > 0 {
fileName = fmt.Sprintf("%s-debug%d.txt", profile, opts.Debug)
}

bs, err = client.Agent().Lookup("goroutine", opts, nil)
bs, err := retrievePprofProfile(profile, opts, client)
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine.prof, err: %v", path, err))
} else {
err := c.writeBytes(path, "goroutine.prof", bs)
if err != nil {
c.Ui.Error(err.Error())
}
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %s", path, fileName, err.Error()))
}

// Gather goroutine text output - debug type 1
// debug type 1 writes the legacy text format for human readable output
opts.Debug = 1
bs, err = client.Agent().Lookup("goroutine", opts, nil)
err = c.writeBytes(path, fileName, bs)
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug1.txt, err: %v", path, err))
} else {
err := c.writeBytes(path, "goroutine-debug1.txt", bs)
if err != nil {
c.Ui.Error(err.Error())
}
c.Ui.Error(fmt.Sprintf("%s: Failed to write file %s, err: %s", path, fileName, err.Error()))
}
}

// Gather goroutine text output - debug type 2
// When printing the "goroutine" profile, debug=2 means to print the goroutine
// stacks in the same form that a Go program uses when dying due to an unrecovered panic.
opts.Debug = 2
bs, err = client.Agent().Lookup("goroutine", opts, nil)
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug2.txt, err: %v", path, err))
} else {
err := c.writeBytes(path, "goroutine-debug2.txt", bs)
if err != nil {
c.Ui.Error(err.Error())
}
// retrievePprofProfile fetches a single pprof profile from the agent
// addressed by opts. The CPU and execution-trace profiles have dedicated
// API endpoints and are special-cased by name; every other profile goes
// through the generic Lookup endpoint.
func retrievePprofProfile(profile string, opts api.PprofOptions, client *api.Client) (bs []byte, err error) {
	agent := client.Agent()
	if profile == "cpuprofile" {
		return agent.CPUProfile(opts, nil)
	}
	if profile == "trace" {
		return agent.Trace(opts, nil)
	}
	return agent.Lookup(profile, opts, nil)
}

// collectPeriodic runs for duration, capturing the cluster state every interval. It flushes and stops
Expand Down
7 changes: 6 additions & 1 deletion command/operator_debug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,11 +365,16 @@ func TestDebug_CapturedFiles(t *testing.T) {

// Monitor files are only created when selected
filepath.Join(path, "server", "leader", "monitor.log"),

// Pprof profiles
filepath.Join(path, "server", "leader", "profile.prof"),
filepath.Join(path, "server", "leader", "trace.prof"),
filepath.Join(path, "server", "leader", "goroutine.prof"),
filepath.Join(path, "server", "leader", "goroutine-debug1.txt"),
filepath.Join(path, "server", "leader", "goroutine-debug2.txt"),
filepath.Join(path, "server", "leader", "heap.prof"),
filepath.Join(path, "server", "leader", "allocs.prof"),
filepath.Join(path, "server", "leader", "threadcreate.prof"),

// Multiple snapshots are collected, 00 is always created
filepath.Join(path, "nomad", "0000", "jobs.json"),
Expand All @@ -382,7 +387,7 @@ func TestDebug_CapturedFiles(t *testing.T) {
filepath.Join(path, "nomad", "0001", "metrics.json"),
}

testutil.WaitForFiles(t, serverFiles)
testutil.WaitForFilesUntil(t, serverFiles, 2*time.Minute)
}

func TestDebug_ExistingOutput(t *testing.T) {
Expand Down
46 changes: 37 additions & 9 deletions testutil/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (

"github.com/hashicorp/nomad/nomad/structs"
"github.com/kr/pretty"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

Expand All @@ -35,6 +34,23 @@ func WaitForResultRetries(retries int64, test testFn, error errorFn) {
}
}

// WaitForResultUntil polls test until it reports success or the given
// duration has elapsed. On timeout, errorFunc receives the error returned
// by the most recent test invocation (nil if test never ran).
func WaitForResultUntil(until time.Duration, test testFn, errorFunc errorFn) {
	deadline := time.Now().Add(until)
	var lastErr error
	for time.Now().Before(deadline) {
		ok, err := test()
		if ok {
			return
		}
		lastErr = err
		// Back off for an arbitrary fraction of the total wait so we
		// poll roughly 30 times over the full duration.
		time.Sleep(until / 30)
	}
	errorFunc(lastErr)
}

// AssertUntil asserts the test function passes throughout the given duration.
// Otherwise error is called on failure.
func AssertUntil(until time.Duration, test testFn, error errorFn) {
Expand Down Expand Up @@ -217,16 +233,28 @@ func WaitForRunning(t testing.TB, rpc rpcFn, job *structs.Job) []*structs.AllocL

// WaitForFiles blocks until all the files in the slice are present
func WaitForFiles(t testing.TB, files []string) {
assert := assert.New(t)
WaitForResult(func() (bool, error) {
for _, f := range files {
exists := assert.FileExists(f)
if !exists {
return false, fmt.Errorf("expected file to exist %s", f)
}
}
return true, nil
return FilesExist(files), nil
}, func(err error) {
t.Fatalf("missing expected files: %v", err)
})
}

// WaitForFilesUntil blocks until duration or all the files in the slice are present
func WaitForFilesUntil(t testing.TB, files []string, until time.Duration) {
WaitForResultUntil(until, func() (bool, error) {
return FilesExist(files), nil
}, func(err error) {
t.Fatalf("missing expected files: %v", err)
})
}

// FilesExist verifies all files in the slice are present
func FilesExist(files []string) bool {
for _, f := range files {
if _, err := os.Stat(f); os.IsNotExist(err) {
return false
}
}
return true
}
42 changes: 42 additions & 0 deletions testutil/wait_test.go
Original file line number Diff line number Diff line change
@@ -1 +1,43 @@
package testutil

import (
"fmt"
"os"
"path/filepath"
"testing"
"time"

"github.com/stretchr/testify/require"
)

func TestWait_WaitForFilesUntil(t *testing.T) {

var files []string
for i := 1; i < 10; i++ {
filename := fmt.Sprintf("test%d.txt", i)
filepath := filepath.Join(os.TempDir(), filename)
files = append(files, filepath)

defer os.Remove(filepath)
}

go func() {
for _, filepath := range files {
t.Logf("Creating file %s...", filepath)
fh, err := os.Create(filepath)
fh.Close()

require.NoError(t, err, "error creating test file")
require.FileExists(t, filepath)

time.Sleep(250 * time.Millisecond)
}
}()

duration := 5 * time.Second
t.Log("Waiting 5 seconds for files...")
WaitForFilesUntil(t, files, duration)

t.Log("done")

}

0 comments on commit b430baf

Please sign in to comment.