From 27b5d0bed5dfe8d76f96fbccb44c31b521605ccb Mon Sep 17 00:00:00 2001 From: danishprakash Date: Sun, 9 Jan 2022 18:55:29 +0530 Subject: [PATCH] command/operator_debug: add pprof interval --- command/operator_debug.go | 51 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 446ddc03edab..ff4efcfa4056 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -38,6 +38,7 @@ type OperatorDebugCommand struct { collectDir string duration time.Duration interval time.Duration + pprofInterval time.Duration pprofDuration time.Duration logLevel string maxNodes int @@ -166,6 +167,10 @@ Debug Options: The interval between snapshots of the Nomad state. Set interval equal to duration to capture a single snapshot. Defaults to 30s. + -pprofInterval= + The interval between pprof collections. Set interval equal to + duration to capture a single snapshot. Defaults to 30s. + -log-level= The log level to monitor. Defaults to DEBUG. 
@@ -334,7 +339,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 	flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
 	flags.Usage = func() { c.Ui.Output(c.Help()) }
 
-	var duration, interval, output, pprofDuration, eventTopic string
+	var duration, interval, pprofInterval, output, pprofDuration, eventTopic string
 	var eventIndex int64
 	var nodeIDs, serverIDs string
 	var allowStale bool
@@ -343,6 +348,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 	flags.Int64Var(&eventIndex, "event-index", 0, "")
 	flags.StringVar(&eventTopic, "event-topic", "none", "")
 	flags.StringVar(&interval, "interval", "30s", "")
+	flags.StringVar(&pprofInterval, "pprofInterval", "30s", "")
 	flags.StringVar(&c.logLevel, "log-level", "DEBUG", "")
 	flags.IntVar(&c.maxNodes, "max-nodes", 10, "")
 	flags.StringVar(&c.nodeClass, "node-class", "", "")
@@ -423,6 +429,20 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 	}
 	c.index = uint64(eventIndex)
 
+	// Parse the pprof capture interval
+	pi, err := time.ParseDuration(pprofInterval)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error parsing pprof interval: %s: %s", pprofInterval, err.Error()))
+		return 1
+	}
+	c.pprofInterval = pi
+
+	// Validate pprof interval. NOTE(review): confirm pd is the intended upper bound.
+	if pi.Seconds() > pd.Seconds() {
+		c.Ui.Error(fmt.Sprintf("Error parsing pprof interval: %s is greater than duration %s", pprofInterval, pd))
+		return 1
+	}
+
 	// Verify there are no extra arguments
 	args = flags.Args()
 	if l := len(args); l != 0 {
@@ -594,6 +614,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 		c.Ui.Output(fmt.Sprintf(" Node Class: %s", c.nodeClass))
 	}
 	c.Ui.Output(fmt.Sprintf(" Interval: %s", interval))
+	c.Ui.Output(fmt.Sprintf(" Pprof Interval: %s", pprofInterval))
 	c.Ui.Output(fmt.Sprintf(" Duration: %s", duration))
 	if c.pprofDuration.Seconds() != 1 {
 		c.Ui.Output(fmt.Sprintf(" pprof Duration: %s", c.pprofDuration))
@@ -663,7 +684,7 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
 	c.collectVault(clusterDir, vaultAddr)
 
c.collectAgentHosts(client) - c.collectPprofs(client) + c.collectPeriodicPprofs(client) c.collectPeriodic(client) @@ -876,6 +897,32 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli c.writeJSON(path, "agent-host.json", host, err) } +func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) { + duration := time.After(c.duration) + // Set interval to 0 so that we immediately execute, wait the interval next time + pprofInterval := time.After(0 * time.Second) + var pprofIntervalCount int + var name string + + for { + select { + case <-duration: + c.cancel() + return + + case <-pprofInterval: + name = fmt.Sprintf("%04d", pprofIntervalCount) + c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %s", name)) + c.collectPprofs(client) + pprofInterval = time.After(c.pprofInterval) + pprofIntervalCount++ + + case <-c.ctx.Done(): + return + } + } +} + // collectPprofs captures the /agent/pprof for each listed node func (c *OperatorDebugCommand) collectPprofs(client *api.Client) { for _, n := range c.nodeIDs {