Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

command/operator_debug: add pprof interval #11938

Merged
merged 10 commits into from
Apr 4, 2022
65 changes: 58 additions & 7 deletions command/operator_debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ type OperatorDebugCommand struct {
collectDir string
duration time.Duration
interval time.Duration
pprofInterval time.Duration
pprofDuration time.Duration
logLevel string
maxNodes int
Expand Down Expand Up @@ -166,6 +167,10 @@ Debug Options:
The interval between snapshots of the Nomad state. Set interval equal to
duration to capture a single snapshot. Defaults to 30s.

-pprof-interval=<pprof-interval>
The interval between pprof collections. Set interval equal to
duration to capture a single snapshot. Defaults to 250ms.

-log-level=<level>
The log level to monitor. Defaults to DEBUG.

Expand Down Expand Up @@ -334,7 +339,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) }

var duration, interval, output, pprofDuration, eventTopic string
var duration, interval, pprofInterval, output, pprofDuration, eventTopic string
var eventIndex int64
var nodeIDs, serverIDs string
var allowStale bool
Expand All @@ -351,6 +356,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags.BoolVar(&allowStale, "stale", false, "")
flags.StringVar(&output, "output", "", "")
flags.StringVar(&pprofDuration, "pprof-duration", "1s", "")
flags.StringVar(&pprofInterval, "pprof-interval", "250ms", "")
flags.BoolVar(&c.verbose, "verbose", false, "")

c.consul = &external{tls: &api.TLSConfig{}}
Expand Down Expand Up @@ -423,6 +429,20 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
c.index = uint64(eventIndex)

// Parse the pprof capture interval
pi, err := time.ParseDuration(pprofInterval)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing pprof-interval: %s: %s", pprofInterval, err.Error()))
return 1
}
c.pprofInterval = pi

// Validate interval
if pi.Seconds() > pd.Seconds() {
c.Ui.Error(fmt.Sprintf("pprof-interval %s must be less than pprof-duration %s", pprofInterval, pprofDuration))
return 1
}

// Verify there are no extra arguments
args = flags.Args()
if l := len(args); l != 0 {
Expand Down Expand Up @@ -595,6 +615,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
c.Ui.Output(fmt.Sprintf(" Interval: %s", interval))
c.Ui.Output(fmt.Sprintf(" Duration: %s", duration))
c.Ui.Output(fmt.Sprintf(" Pprof Interval: %s", pprofInterval))
if c.pprofDuration.Seconds() != 1 {
c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration))
}
Expand Down Expand Up @@ -663,7 +684,7 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
c.collectVault(clusterDir, vaultAddr)

c.collectAgentHosts(client)
c.collectPprofs(client)
c.collectPeriodicPprofs(client)

c.collectPeriodic(client)

Expand Down Expand Up @@ -876,19 +897,48 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
c.writeJSON(path, "agent-host.json", host, err)
}

func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
tgross marked this conversation as resolved.
Show resolved Hide resolved
duration := time.After(c.duration)
// Create a ticker to execute on every interval ticks
ticker := time.NewTicker(c.pprofInterval)

var pprofIntervalCount int
var name string

// Additionally, an out of loop execute to imitate first tick
c.collectPprofs(client, pprofIntervalCount)

for {
select {
case <-duration:
c.cancel()
return

case <-ticker.C:
name = fmt.Sprintf("%04d", pprofIntervalCount)
c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %s", name))
c.collectPprofs(client, pprofIntervalCount)
pprofIntervalCount++

case <-c.ctx.Done():
return
}
}
}

// collectPprofs captures the /agent/pprof for each listed node
func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
for _, n := range c.nodeIDs {
c.collectPprof(clientDir, n, client)
c.collectPprof(clientDir, n, client, interval)
}

for _, n := range c.serverIDs {
c.collectPprof(serverDir, n, client)
c.collectPprof(serverDir, n, client, interval)
}
}

// collectPprof captures pprof data for the node
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) {
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, interval int) {
pprofDurationSeconds := int(c.pprofDuration.Seconds())
opts := api.PprofOptions{Seconds: pprofDurationSeconds}
if path == serverDir {
Expand All @@ -911,7 +961,8 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
return // only exit on 403
}
} else {
err := c.writeBytes(path, "profile.prof", bs)
filename := fmt.Sprintf("profile_%04d.prof", interval)
danishprakash marked this conversation as resolved.
Show resolved Hide resolved
err := c.writeBytes(path, filename, bs)
if err != nil {
c.Ui.Error(err.Error())
}
Expand Down
20 changes: 13 additions & 7 deletions command/operator_debug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,12 +211,12 @@ func TestDebug_MultiRegion(t *testing.T) {
// Good
{
name: "no region - all servers, all clients",
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
},
{
name: "region1 - server1 address",
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region1 + "\n",
Expand All @@ -227,7 +227,7 @@ func TestDebug_MultiRegion(t *testing.T) {
},
{
name: "region1 - client1 address",
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrClient1, "-region", region1, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region1 + "\n",
Expand All @@ -238,7 +238,7 @@ func TestDebug_MultiRegion(t *testing.T) {
},
{
name: "region2 - server2 address",
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region2 + "\n",
Expand All @@ -249,7 +249,7 @@ func TestDebug_MultiRegion(t *testing.T) {
},
{
name: "region2 - client2 address",
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrClient2, "-region", region2, "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 0,
expectedOutputs: []string{
"Region: " + region2 + "\n",
Expand All @@ -262,7 +262,7 @@ func TestDebug_MultiRegion(t *testing.T) {
// Bad
{
name: "invalid region - all servers, all clients",
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "0"},
args: []string{"-address", addrServer1, "-region", "never", "-duration", "50ms", "-interval", "50ms", "-server-id", "all", "-node-id", "all", "-pprof-duration", "1s", "-pprof-interval", "250ms"},
expectedCode: 1,
expectedError: "500 (No path to region)",
},
Expand Down Expand Up @@ -347,6 +347,11 @@ func TestDebug_Failures(t *testing.T) {
args: []string{"-pprof-duration", "baz"},
expectedCode: 1,
},
{
name: "Fails bad pprof interval",
args: []string{"-pprof-interval", "bar"},
expectedCode: 1,
},
{
name: "Fails bad address",
args: []string{"-address", url + "bogus"},
Expand Down Expand Up @@ -482,7 +487,8 @@ func TestDebug_CapturedFiles(t *testing.T) {
"-node-id", clientID,
"-duration", duration.String(),
"-interval", interval.String(),
"-pprof-duration", "0",
"-pprof-duration", "1s",
"-pprof-interval", "250ms",
})

// Get capture directory
Expand Down