Skip to content

Commit

Permalink
debug: add version constraint to avoid pprof panic (#12807)
Browse files Browse the repository at this point in the history
  • Loading branch information
davemay99 committed Apr 28, 2022
1 parent 9dccbb1 commit 522b630
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 21 deletions.
3 changes: 3 additions & 0 deletions .changelog/12807.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
cli: `operator debug` command now skips generating pprofs to avoid a panic on Nomad 0.11.2. 0.11.1, and 0.11.0
```
122 changes: 101 additions & 21 deletions command/operator_debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (

"github.com/hashicorp/go-cleanhttp"
"github.com/hashicorp/go-multierror"
goversion "github.com/hashicorp/go-version"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/api/contexts"
"github.com/hashicorp/nomad/helper"
Expand Down Expand Up @@ -54,14 +55,17 @@ type OperatorDebugCommand struct {
cancel context.CancelFunc
opts *api.QueryOptions
verbose bool
members *api.ServerMembers
nodes []*api.NodeListStub
}

const (
userAgent = "nomad operator debug"
clusterDir = "cluster"
clientDir = "client"
serverDir = "server"
intervalDir = "interval"
userAgent = "nomad operator debug"
clusterDir = "cluster"
clientDir = "client"
serverDir = "server"
intervalDir = "interval"
minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2"
)

func (c *OperatorDebugCommand) Help() string {
Expand Down Expand Up @@ -502,6 +506,16 @@ func (c *OperatorDebugCommand) Run(args []string) int {
AuthToken: c.Meta.token,
}

// Get complete list of client nodes
c.nodes, _, err = client.Nodes().List(c.queryOpts())
if err != nil {
c.Ui.Error(fmt.Sprintf("Error querying node info: %v", err))
return 1
}

// Write nodes to file
c.writeJSON(clusterDir, "nodes.json", c.nodes, err)

// Search all nodes If a node class is specified without a list of node id prefixes
if c.nodeClass != "" && nodeIDs == "" {
nodeIDs = "all"
Expand Down Expand Up @@ -565,17 +579,17 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}

// Resolve servers
members, err := client.Agent().MembersOpts(c.queryOpts())
c.members, err = client.Agent().MembersOpts(c.queryOpts())
if err != nil {
c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
return 1
}

// Write complete list of server members to file
c.writeJSON(clusterDir, "members.json", members, err)
c.writeJSON(clusterDir, "members.json", c.members, err)

// Filter for servers matching criteria
c.serverIDs, err = filterServerMembers(members, serverIDs, c.region)
c.serverIDs, err = filterServerMembers(c.members, serverIDs, c.region)
if err != nil {
c.Ui.Error(fmt.Sprintf("Failed to parse server list; err: %v", err))
return 1
Expand All @@ -584,8 +598,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
serversFound := 0
serverCaptureCount := 0

if members != nil {
serversFound = len(members.Members)
if c.members != nil {
serversFound = len(c.members.Members)
}
if c.serverIDs != nil {
serverCaptureCount = len(c.serverIDs)
Expand Down Expand Up @@ -900,9 +914,31 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli

func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {

pprofNodeIDs := []string{}
pprofServerIDs := []string{}

// threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions
for _, serverID := range c.serverIDs {
version := c.getNomadVersion(serverID, "")
err := checkVersion(version, minimumVersionPprofConstraint)
if err != nil {
c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
}
pprofServerIDs = append(pprofServerIDs, serverID)
}

for _, nodeID := range c.nodeIDs {
version := c.getNomadVersion("", nodeID)
err := checkVersion(version, minimumVersionPprofConstraint)
if err != nil {
c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
}
pprofNodeIDs = append(pprofNodeIDs, nodeID)
}

// Take the first set of pprofs synchronously...
c.Ui.Output(" Capture pprofInterval 0000")
c.collectPprofs(client, 0)
c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, 0)
if c.pprofInterval == c.pprofDuration {
return
}
Expand All @@ -921,7 +957,7 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
return
case <-timer.C:
c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount))
c.collectPprofs(client, pprofIntervalCount)
c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, pprofIntervalCount)
timer.Reset(c.pprofInterval)
pprofIntervalCount++
}
Expand All @@ -930,12 +966,12 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
}

// collectPprofs captures the /agent/pprof for each listed node
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
for _, n := range c.nodeIDs {
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, serverIDs, nodeIDs []string, interval int) {
for _, n := range nodeIDs {
c.collectPprof(clientDir, n, client, interval)
}

for _, n := range c.serverIDs {
for _, n := range serverIDs {
c.collectPprof(serverDir, n, client, interval)
}
}
Expand Down Expand Up @@ -987,12 +1023,6 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client,
c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations
c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads

// This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable
// c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives

// This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable
// c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes
}

// savePprofProfile retrieves a pprof profile and writes to disk
Expand Down Expand Up @@ -1714,3 +1744,53 @@ func isRedirectError(err error) bool {
const redirectErr string = `invalid character '<' looking for beginning of value`
return strings.Contains(err.Error(), redirectErr)
}

// getNomadVersion fetches the version of Nomad running on a given server/client node ID
func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) string {
if serverID == "" && nodeID == "" {
return ""
}

version := ""
if serverID != "" {
for _, server := range c.members.Members {
// Raft v2 server
if server.Name == serverID {
version = server.Tags["build"]
}

// Raft v3 server
if server.Tags["id"] == serverID {
version = server.Tags["version"]
}
}
}

if nodeID != "" {
for _, node := range c.nodes {
if node.ID == nodeID {
version = node.Version
}
}
}

return version
}

// checkVersion verifies that version satisfies the constraint
func checkVersion(version string, versionConstraint string) error {
v, err := goversion.NewVersion(version)
if err != nil {
return fmt.Errorf("error: %v", err)
}

c, err := goversion.NewConstraint(versionConstraint)
if err != nil {
return fmt.Errorf("error: %v", err)
}

if !c.Check(v) {
return nil
}
return fmt.Errorf("unsupported version=%s matches version filter %s", version, minimumVersionPprofConstraint)
}
30 changes: 30 additions & 0 deletions command/operator_debug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,36 @@ func TestDebug_Fail_Pprof(t *testing.T) {
require.Contains(t, ui.OutputWriter.String(), "Created debug archive") // Archive should be generated anyway
}

// TestDebug_PprofVersionCheck asserts that only versions < 0.12.0 are
// filtered by the version constraint.
func TestDebug_PprofVersionCheck(t *testing.T) {
cases := []struct {
version string
errMsg string
}{
{"0.8.7", ""},
{"0.11.1", "unsupported version=0.11.1 matches version filter >= 0.11.0, <= 0.11.2"},
{"0.11.2", "unsupported version=0.11.2 matches version filter >= 0.11.0, <= 0.11.2"},
{"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter >= 0.11.0, <= 0.11.2"},
{"0.11.3", ""},
{"0.11.3+ent", ""},
{"0.12.0", ""},
{"1.3.0", ""},
{"foo.bar", "error: Malformed version: foo.bar"},
}

for _, tc := range cases {
t.Run(tc.version, func(t *testing.T) {
err := checkVersion(tc.version, minimumVersionPprofConstraint)
if tc.errMsg == "" {
require.NoError(t, err, "expected no error from %s", tc.version)
} else {
require.EqualError(t, err, tc.errMsg)
}
})
}
}

func TestDebug_StringToSlice(t *testing.T) {
ci.Parallel(t)

Expand Down

0 comments on commit 522b630

Please sign in to comment.