Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

command/operator_debug: add pprof interval #11938

Merged
merged 10 commits into from
Apr 4, 2022
3 changes: 3 additions & 0 deletions .changelog/11938.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
cli: added `-pprof-interval` to `nomad operator debug` command
```
66 changes: 58 additions & 8 deletions command/operator_debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ type OperatorDebugCommand struct {
collectDir string
duration time.Duration
interval time.Duration
pprofInterval time.Duration
pprofDuration time.Duration
logLevel string
maxNodes int
Expand Down Expand Up @@ -184,6 +185,10 @@ Debug Options:
-pprof-duration=<duration>
Duration for pprof collection. Defaults to 1s.

-pprof-interval=<pprof-interval>
The interval between pprof collections. Set interval equal to
duration to capture a single snapshot. Defaults to 250ms.

-server-id=<server1>,<server2>
Comma separated list of Nomad server names to monitor for logs, API
outputs, and pprof profiles. Accepts server names, "leader", or "all".
Expand Down Expand Up @@ -334,7 +339,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) }

var duration, interval, output, pprofDuration, eventTopic string
var duration, interval, pprofInterval, output, pprofDuration, eventTopic string
var eventIndex int64
var nodeIDs, serverIDs string
var allowStale bool
Expand All @@ -351,6 +356,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags.BoolVar(&allowStale, "stale", false, "")
flags.StringVar(&output, "output", "", "")
flags.StringVar(&pprofDuration, "pprof-duration", "1s", "")
flags.StringVar(&pprofInterval, "pprof-interval", "250ms", "")
flags.BoolVar(&c.verbose, "verbose", false, "")

c.consul = &external{tls: &api.TLSConfig{}}
Expand Down Expand Up @@ -400,6 +406,14 @@ func (c *OperatorDebugCommand) Run(args []string) int {
return 1
}

// Parse the pprof capture interval
pi, err := time.ParseDuration(pprofInterval)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing pprof-interval: %s: %s", pprofInterval, err.Error()))
return 1
}
c.pprofInterval = pi

// Parse the pprof capture duration
pd, err := time.ParseDuration(pprofDuration)
if err != nil {
Expand All @@ -408,6 +422,12 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
c.pprofDuration = pd

// Validate pprof interval
if pi.Seconds() > pd.Seconds() {
c.Ui.Error(fmt.Sprintf("pprof-interval %s must be less than pprof-duration %s", pprofInterval, pprofDuration))
return 1
}

// Parse event stream topic filter
t, err := topicsFromString(eventTopic)
if err != nil {
Expand Down Expand Up @@ -595,6 +615,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
c.Ui.Output(fmt.Sprintf(" Interval: %s", interval))
c.Ui.Output(fmt.Sprintf(" Duration: %s", duration))
c.Ui.Output(fmt.Sprintf(" pprof Interval: %s", pprofInterval))
if c.pprofDuration.Seconds() != 1 {
c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration))
}
Expand Down Expand Up @@ -663,7 +684,7 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
c.collectVault(clusterDir, vaultAddr)

c.collectAgentHosts(client)
c.collectPprofs(client)
go c.collectPeriodicPprofs(client)
danishprakash marked this conversation as resolved.
Show resolved Hide resolved

c.collectPeriodic(client)

Expand Down Expand Up @@ -876,19 +897,47 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
c.writeJSON(path, "agent-host.json", host, err)
}

func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
tgross marked this conversation as resolved.
Show resolved Hide resolved
duration := time.After(c.pprofDuration)
// Create a ticker to execute on every interval ticks
ticker := time.NewTicker(c.pprofInterval)

var pprofIntervalCount int
var name string

// Additionally, an out of loop execute to imitate first tick
c.collectPprofs(client, pprofIntervalCount)

for {
select {
case <-duration:
return

case <-ticker.C:
name = fmt.Sprintf("%04d", pprofIntervalCount)
c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %s", name))
c.collectPprofs(client, pprofIntervalCount)
pprofIntervalCount++

case <-c.ctx.Done():
return
}
}
}

// collectPprofs captures the /agent/pprof for each listed node
func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
for _, n := range c.nodeIDs {
c.collectPprof(clientDir, n, client)
c.collectPprof(clientDir, n, client, interval)
}

for _, n := range c.serverIDs {
c.collectPprof(serverDir, n, client)
c.collectPprof(serverDir, n, client, interval)
}
}

// collectPprof captures pprof data for the node
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) {
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, interval int) {
pprofDurationSeconds := int(c.pprofDuration.Seconds())
opts := api.PprofOptions{Seconds: pprofDurationSeconds}
if path == serverDir {
Expand All @@ -898,10 +947,11 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
}

path = filepath.Join(path, id)
filename := fmt.Sprintf("profile_%04d.prof", interval)

bs, err := client.Agent().CPUProfile(opts, c.queryOpts())
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof profile.prof, err: %v", path, err))
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %v", filename, path, err))
if structs.IsErrPermissionDenied(err) {
// All Profiles require the same permissions, so we only need to see
// one permission failure before we bail.
Expand All @@ -911,7 +961,7 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
return // only exit on 403
}
} else {
err := c.writeBytes(path, "profile.prof", bs)
err := c.writeBytes(path, filename, bs)
if err != nil {
c.Ui.Error(err.Error())
}
Expand Down
54 changes: 30 additions & 24 deletions command/operator_debug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ func TestDebug_ClientToServer(t *testing.T) {
},
{
name: "client1 address - verify no SIGSEGV panic",
args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"},
args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-interval", "200ms", "-pprof-duration", "250ms"},
expectedCode: 0,
expectedOutputs: []string{"Created debug archive"},
},
Expand Down Expand Up @@ -211,12 +211,12 @@ func TestDebug_MultiRegion(t *testing.T) {
// Good
{
name: "no region - all servers, all clients",
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
},
{
name: "region1 - server1 address",
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region1 + "\n",
Expand All @@ -227,7 +227,7 @@ func TestDebug_MultiRegion(t *testing.T) {
},
{
name: "region1 - client1 address",
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region1 + "\n",
Expand All @@ -238,7 +238,7 @@ func TestDebug_MultiRegion(t *testing.T) {
},
{
name: "region2 - server2 address",
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region2 + "\n",
Expand All @@ -249,7 +249,7 @@ func TestDebug_MultiRegion(t *testing.T) {
},
{
name: "region2 - client2 address",
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region2 + "\n",
Expand All @@ -262,7 +262,7 @@ func TestDebug_MultiRegion(t *testing.T) {
// Bad
{
name: "invalid region - all servers, all clients",
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 1,
expectedError: "500 (No path to region)",
},
Expand Down Expand Up @@ -347,6 +347,11 @@ func TestDebug_Failures(t *testing.T) {
args: []string{"-pprof-duration", "baz"},
expectedCode: 1,
},
{
name: "Fails bad pprof interval",
args: []string{"-pprof-interval", "bar"},
expectedCode: 1,
},
{
name: "Fails bad address",
args: []string{"-address", url + "bogus"},
Expand Down Expand Up @@ -379,17 +384,18 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
// Setup mock UI
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir := t.TempDir()
defer os.Remove(testDir)

// Debug on the leader and all client nodes
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", os.TempDir()})
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", testDir})
assert.Equal(t, 0, code)

// Bad plugin name should be escaped before it reaches the sandbox test
require.NotContains(t, ui.ErrorWriter.String(), "file path escapes capture directory")
require.Contains(t, ui.OutputWriter.String(), "Starting debugger")

path := cmd.collectDir
defer os.Remove(path)

var pluginFiles []string
for _, pluginName := range cases {
Expand Down Expand Up @@ -436,7 +442,7 @@ func TestDebug_CapturedFiles(t *testing.T) {
"goroutine-debug2.txt",
"goroutine.prof",
"heap.prof",
"profile.prof",
"profile_0000.prof",
"threadcreate.prof",
"trace.prof",
}
Expand Down Expand Up @@ -470,25 +476,24 @@ func TestDebug_CapturedFiles(t *testing.T) {

ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
testDir := t.TempDir()
defer os.Remove(testDir)

duration := 2 * time.Second
interval := 750 * time.Millisecond
waitTime := 2 * duration

code := cmd.Run([]string{
"-address", url,
"-output", os.TempDir(),
"-output", testDir,
"-server-id", serverName,
"-node-id", clientID,
"-duration", duration.String(),
"-interval", interval.String(),
"-pprof-duration", "0",
"-pprof-duration", "1s",
"-pprof-interval", "250ms",
})

// Get capture directory
path := cmd.collectDir
defer os.Remove(path)

// There should be no errors
require.Empty(t, ui.ErrorWriter.String())
require.Equal(t, 0, code)
Expand Down Expand Up @@ -529,11 +534,12 @@ func TestDebug_ExistingOutput(t *testing.T) {
// Fails existing output
format := "2006-01-02-150405Z"
stamped := "nomad-debug-" + time.Now().UTC().Format(format)
path := filepath.Join(os.TempDir(), stamped)
tempDir := t.TempDir()
path := filepath.Join(tempDir, stamped)
os.MkdirAll(path, 0755)
defer os.Remove(path)
defer os.Remove(tempDir)

code := cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms", "-interval", "50ms"})
code := cmd.Run([]string{"-output", tempDir, "-duration", "50ms", "-interval", "50ms"})
require.Equal(t, 2, code)
}

Expand Down Expand Up @@ -636,12 +642,12 @@ func TestDebug_WriteBytes_Nil(t *testing.T) {
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

testDir = os.TempDir()
testDir = t.TempDir()
defer os.Remove(testDir)
cmd.collectDir = testDir

testFile = "test_nil.json"
testPath = filepath.Join(testDir, testFile)
defer os.Remove(testPath)

// Write nil file at top level of collect directory
err := cmd.writeBytes("", testFile, testBytes)
Expand All @@ -655,7 +661,7 @@ func TestDebug_WriteBytes_PathEscapesSandbox(t *testing.T) {
var testDir, testFile string
var testBytes []byte

testDir = os.TempDir()
testDir = t.TempDir()
defer os.Remove(testDir)

testFile = "testing.json"
Expand Down Expand Up @@ -713,7 +719,7 @@ func TestDebug_CollectConsul(t *testing.T) {
c.consul = ce

// Setup capture directory
testDir := os.TempDir()
testDir := t.TempDir()
defer os.Remove(testDir)
c.collectDir = testDir

Expand Down Expand Up @@ -756,7 +762,7 @@ func TestDebug_CollectVault(t *testing.T) {
c.vault = ve

// Set capture directory
testDir := os.TempDir()
testDir := t.TempDir()
defer os.Remove(testDir)
c.collectDir = testDir

Expand Down
6 changes: 5 additions & 1 deletion website/content/docs/commands/operator/debug.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ true.

- `-pprof-duration=<duration>`: Duration for pprof collection. Defaults to 1s.

- `-pprof-interval=<pprof-interval>`: The interval between pprof
collections. Set interval equal to duration to capture a single
snapshot. Defaults to 250ms.

- `-server-id=<server1>,<server2>`: Comma separated list of Nomad server names to
monitor for logs, API outputs, and pprof profiles. Accepts server names, "leader", or
"all". Defaults to `all`.
Expand All @@ -75,7 +79,7 @@ true.
leadership, it may be necessary to get the configuration from a non-leader
server.

- `-event-topic=<allocation,evaluation,job,node,*>:<filter>`: Enable event
- `-event-topic=<allocation,evaluation,job,node,*>:<filter>`: Enable event
stream capture. Filter by comma delimited list of topic filters or "all".
Defaults to "none" (disabled). Refer to the [Events API](/api-docs/events) for
additional detail.
Expand Down