Skip to content

Commit

Permalink
command/operator_debug: add pprof interval (#11938)
Browse files Browse the repository at this point in the history
  • Loading branch information
danishprakash committed Apr 4, 2022
1 parent 5de999d commit ff6ae5f
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 33 deletions.
3 changes: 3 additions & 0 deletions .changelog/11938.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
cli: added `-pprof-interval` to `nomad operator debug` command
```
66 changes: 58 additions & 8 deletions command/operator_debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ type OperatorDebugCommand struct {
collectDir string
duration time.Duration
interval time.Duration
pprofInterval time.Duration
pprofDuration time.Duration
logLevel string
maxNodes int
Expand Down Expand Up @@ -184,6 +185,10 @@ Debug Options:
-pprof-duration=<duration>
Duration for pprof collection. Defaults to 1s.
-pprof-interval=<pprof-interval>
The interval between pprof collections. Set interval equal to
duration to capture a single snapshot. Defaults to 250ms.
-server-id=<server1>,<server2>
Comma separated list of Nomad server names to monitor for logs, API
outputs, and pprof profiles. Accepts server names, "leader", or "all".
Expand Down Expand Up @@ -334,7 +339,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) }

var duration, interval, output, pprofDuration, eventTopic string
var duration, interval, pprofInterval, output, pprofDuration, eventTopic string
var eventIndex int64
var nodeIDs, serverIDs string
var allowStale bool
Expand All @@ -351,6 +356,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags.BoolVar(&allowStale, "stale", false, "")
flags.StringVar(&output, "output", "", "")
flags.StringVar(&pprofDuration, "pprof-duration", "1s", "")
flags.StringVar(&pprofInterval, "pprof-interval", "250ms", "")
flags.BoolVar(&c.verbose, "verbose", false, "")

c.consul = &external{tls: &api.TLSConfig{}}
Expand Down Expand Up @@ -400,6 +406,14 @@ func (c *OperatorDebugCommand) Run(args []string) int {
return 1
}

// Parse the pprof capture interval
pi, err := time.ParseDuration(pprofInterval)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing pprof-interval: %s: %s", pprofInterval, err.Error()))
return 1
}
c.pprofInterval = pi

// Parse the pprof capture duration
pd, err := time.ParseDuration(pprofDuration)
if err != nil {
Expand All @@ -408,6 +422,12 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
c.pprofDuration = pd

// Validate pprof interval
if pi.Seconds() > pd.Seconds() {
c.Ui.Error(fmt.Sprintf("pprof-interval %s must be less than pprof-duration %s", pprofInterval, pprofDuration))
return 1
}

// Parse event stream topic filter
t, err := topicsFromString(eventTopic)
if err != nil {
Expand Down Expand Up @@ -595,6 +615,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
c.Ui.Output(fmt.Sprintf(" Interval: %s", interval))
c.Ui.Output(fmt.Sprintf(" Duration: %s", duration))
c.Ui.Output(fmt.Sprintf(" pprof Interval: %s", pprofInterval))
if c.pprofDuration.Seconds() != 1 {
c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration))
}
Expand Down Expand Up @@ -663,7 +684,7 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
c.collectVault(clusterDir, vaultAddr)

c.collectAgentHosts(client)
c.collectPprofs(client)
go c.collectPeriodicPprofs(client)

c.collectPeriodic(client)

Expand Down Expand Up @@ -876,19 +897,47 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
c.writeJSON(path, "agent-host.json", host, err)
}

func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
duration := time.After(c.pprofDuration)
// Create a ticker to execute on every interval ticks
ticker := time.NewTicker(c.pprofInterval)

var pprofIntervalCount int
var name string

// Additionally, an out of loop execute to imitate first tick
c.collectPprofs(client, pprofIntervalCount)

for {
select {
case <-duration:
return

case <-ticker.C:
name = fmt.Sprintf("%04d", pprofIntervalCount)
c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %s", name))
c.collectPprofs(client, pprofIntervalCount)
pprofIntervalCount++

case <-c.ctx.Done():
return
}
}
}

// collectPprofs captures the /agent/pprof for each listed node
func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
for _, n := range c.nodeIDs {
c.collectPprof(clientDir, n, client)
c.collectPprof(clientDir, n, client, interval)
}

for _, n := range c.serverIDs {
c.collectPprof(serverDir, n, client)
c.collectPprof(serverDir, n, client, interval)
}
}

// collectPprof captures pprof data for the node
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) {
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, interval int) {
pprofDurationSeconds := int(c.pprofDuration.Seconds())
opts := api.PprofOptions{Seconds: pprofDurationSeconds}
if path == serverDir {
Expand All @@ -898,10 +947,11 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
}

path = filepath.Join(path, id)
filename := fmt.Sprintf("profile_%04d.prof", interval)

bs, err := client.Agent().CPUProfile(opts, c.queryOpts())
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof profile.prof, err: %v", path, err))
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %v", filename, path, err))
if structs.IsErrPermissionDenied(err) {
// All Profiles require the same permissions, so we only need to see
// one permission failure before we bail.
Expand All @@ -911,7 +961,7 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
return // only exit on 403
}
} else {
err := c.writeBytes(path, "profile.prof", bs)
err := c.writeBytes(path, filename, bs)
if err != nil {
c.Ui.Error(err.Error())
}
Expand Down
54 changes: 30 additions & 24 deletions command/operator_debug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ func TestDebug_ClientToServer(t *testing.T) {
},
{
name: "client1 address - verify no SIGSEGV panic",
args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-interval", "200ms", "-pprof-duration", "250ms"},
expectedCode: 0,
expectedOutputs: []string{"Created debug archive"},
},
Expand Down Expand Up @@ -211,12 +211,12 @@ func TestDebug_MultiRegion(t *testing.T) {
// Good
{
name: "no region - all servers, all clients",
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
},
{
name: "region1 - server1 address",
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region1 + "\n",
Expand All @@ -227,7 +227,7 @@ func TestDebug_MultiRegion(t *testing.T) {
},
{
name: "region1 - client1 address",
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region1 + "\n",
Expand All @@ -238,7 +238,7 @@ func TestDebug_MultiRegion(t *testing.T) {
},
{
name: "region2 - server2 address",
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region2 + "\n",
Expand All @@ -249,7 +249,7 @@ func TestDebug_MultiRegion(t *testing.T) {
},
{
name: "region2 - client2 address",
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region2 + "\n",
Expand All @@ -262,7 +262,7 @@ func TestDebug_MultiRegion(t *testing.T) {
// Bad
{
name: "invalid region - all servers, all clients",
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 1,
expectedError: "500 (No path to region)",
},
Expand Down Expand Up @@ -347,6 +347,11 @@ func TestDebug_Failures(t *testing.T) {
args: []string{"-pprof-duration", "baz"},
expectedCode: 1,
},
{
name: "Fails bad pprof interval",
args: []string{"-pprof-interval", "bar"},
expectedCode: 1,
},
{
name: "Fails bad address",
args: []string{"-address", url + "bogus"},
Expand Down Expand Up @@ -379,17 +384,18 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
// Setup mock UI
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir := t.TempDir()
defer os.Remove(testDir)

// Debug on the leader and all client nodes
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", os.TempDir()})
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", testDir})
assert.Equal(t, 0, code)

// Bad plugin name should be escaped before it reaches the sandbox test
require.NotContains(t, ui.ErrorWriter.String(), "file path escapes capture directory")
require.Contains(t, ui.OutputWriter.String(), "Starting debugger")

path := cmd.collectDir
defer os.Remove(path)

var pluginFiles []string
for _, pluginName := range cases {
Expand Down Expand Up @@ -436,7 +442,7 @@ func TestDebug_CapturedFiles(t *testing.T) {
"goroutine-debug2.txt",
"goroutine.prof",
"heap.prof",
"profile.prof",
"profile_0000.prof",
"threadcreate.prof",
"trace.prof",
}
Expand Down Expand Up @@ -470,25 +476,24 @@ func TestDebug_CapturedFiles(t *testing.T) {

ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir := t.TempDir()
defer os.Remove(testDir)

duration := 2 * time.Second
interval := 750 * time.Millisecond
waitTime := 2 * duration

code := cmd.Run([]string{
"-address", url,
"-output", os.TempDir(),
"-output", testDir,
"-server-id", serverName,
"-node-id", clientID,
"-duration", duration.String(),
"-interval", interval.String(),
"-pprof-duration", "0",
"-pprof-duration", "1s",
"-pprof-interval", "250ms",
})

// Get capture directory
path := cmd.collectDir
defer os.Remove(path)

// There should be no errors
require.Empty(t, ui.ErrorWriter.String())
require.Equal(t, 0, code)
Expand Down Expand Up @@ -529,11 +534,12 @@ func TestDebug_ExistingOutput(t *testing.T) {
// Fails existing output
format := "2006-01-02-150405Z"
stamped := "nomad-debug-" + time.Now().UTC().Format(format)
path := filepath.Join(os.TempDir(), stamped)
tempDir := t.TempDir()
path := filepath.Join(tempDir, stamped)
os.MkdirAll(path, 0755)
defer os.Remove(path)
defer os.Remove(tempDir)

code := cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms", "-interval", "50ms"})
code := cmd.Run([]string{"-output", tempDir, "-duration", "50ms", "-interval", "50ms"})
require.Equal(t, 2, code)
}

Expand Down Expand Up @@ -636,12 +642,12 @@ func TestDebug_WriteBytes_Nil(t *testing.T) {
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

testDir = os.TempDir()
testDir = t.TempDir()
defer os.Remove(testDir)
cmd.collectDir = testDir

testFile = "test_nil.json"
testPath = filepath.Join(testDir, testFile)
defer os.Remove(testPath)

// Write nil file at top level of collect directory
err := cmd.writeBytes("", testFile, testBytes)
Expand All @@ -655,7 +661,7 @@ func TestDebug_WriteBytes_PathEscapesSandbox(t *testing.T) {
var testDir, testFile string
var testBytes []byte

testDir = os.TempDir()
testDir = t.TempDir()
defer os.Remove(testDir)

testFile = "testing.json"
Expand Down Expand Up @@ -713,7 +719,7 @@ func TestDebug_CollectConsul(t *testing.T) {
c.consul = ce

// Setup capture directory
testDir := os.TempDir()
testDir := t.TempDir()
defer os.Remove(testDir)
c.collectDir = testDir

Expand Down Expand Up @@ -756,7 +762,7 @@ func TestDebug_CollectVault(t *testing.T) {
c.vault = ve

// Set capture directory
testDir := os.TempDir()
testDir := t.TempDir()
defer os.Remove(testDir)
c.collectDir = testDir

Expand Down
6 changes: 5 additions & 1 deletion website/content/docs/commands/operator/debug.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ true.

- `-pprof-duration=<duration>`: Duration for pprof collection. Defaults to 1s.

- `-pprof-interval=<pprof-interval>`: The interval between pprof
collections. Set interval equal to duration to capture a single
snapshot. Defaults to 250ms.

- `-server-id=<server1>,<server2>`: Comma separated list of Nomad server names to
monitor for logs, API outputs, and pprof profiles. Accepts server names, "leader", or
"all". Defaults to `all`.
Expand All @@ -75,7 +79,7 @@ true.
leadership, it may be necessary to get the configuration from a non-leader
server.

- `-event-topic=<allocation,evaluation,job,node,*>:<filter>`: Enable event
- `-event-topic=<allocation,evaluation,job,node,*>:<filter>`: Enable event
stream capture. Filter by comma delimited list of topic filters or "all".
Defaults to "none" (disabled). Refer to the [Events API](/api-docs/events) for
additional detail.
Expand Down

0 comments on commit ff6ae5f

Please sign in to comment.