Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

debug: add version constraint to avoid pprof panic #12807

Merged
merged 18 commits into from
Apr 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/12807.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
cli: `operator debug` command now skips generating pprofs to avoid a panic on Nomad 0.11.2. 0.11.1, and 0.11.0
```
122 changes: 101 additions & 21 deletions command/operator_debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (

"github.com/hashicorp/go-cleanhttp"
"github.com/hashicorp/go-multierror"
goversion "github.com/hashicorp/go-version"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/api/contexts"
"github.com/hashicorp/nomad/helper"
Expand Down Expand Up @@ -54,14 +55,17 @@ type OperatorDebugCommand struct {
cancel context.CancelFunc
opts *api.QueryOptions
verbose bool
members *api.ServerMembers
nodes []*api.NodeListStub
}

const (
userAgent = "nomad operator debug"
clusterDir = "cluster"
clientDir = "client"
serverDir = "server"
intervalDir = "interval"
userAgent = "nomad operator debug"
clusterDir = "cluster"
clientDir = "client"
serverDir = "server"
intervalDir = "interval"
minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2"
jazzyfresh marked this conversation as resolved.
Show resolved Hide resolved
)

func (c *OperatorDebugCommand) Help() string {
Expand Down Expand Up @@ -502,6 +506,16 @@ func (c *OperatorDebugCommand) Run(args []string) int {
AuthToken: c.Meta.token,
}

// Get complete list of client nodes
c.nodes, _, err = client.Nodes().List(c.queryOpts())
if err != nil {
c.Ui.Error(fmt.Sprintf("Error querying node info: %v", err))
return 1
}

// Write nodes to file
c.writeJSON(clusterDir, "nodes.json", c.nodes, err)

// Search all nodes If a node class is specified without a list of node id prefixes
if c.nodeClass != "" && nodeIDs == "" {
nodeIDs = "all"
Expand Down Expand Up @@ -565,17 +579,17 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}

// Resolve servers
members, err := client.Agent().MembersOpts(c.queryOpts())
c.members, err = client.Agent().MembersOpts(c.queryOpts())
if err != nil {
c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
return 1
}

// Write complete list of server members to file
c.writeJSON(clusterDir, "members.json", members, err)
c.writeJSON(clusterDir, "members.json", c.members, err)

// Filter for servers matching criteria
c.serverIDs, err = filterServerMembers(members, serverIDs, c.region)
c.serverIDs, err = filterServerMembers(c.members, serverIDs, c.region)
if err != nil {
c.Ui.Error(fmt.Sprintf("Failed to parse server list; err: %v", err))
return 1
Expand All @@ -584,8 +598,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
serversFound := 0
serverCaptureCount := 0

if members != nil {
serversFound = len(members.Members)
if c.members != nil {
serversFound = len(c.members.Members)
}
if c.serverIDs != nil {
serverCaptureCount = len(c.serverIDs)
Expand Down Expand Up @@ -900,9 +914,31 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli

func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {

pprofNodeIDs := []string{}
pprofServerIDs := []string{}

// threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions
for _, serverID := range c.serverIDs {
version := c.getNomadVersion(serverID, "")
err := checkVersion(version, minimumVersionPprofConstraint)
if err != nil {
c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
}
pprofServerIDs = append(pprofServerIDs, serverID)
}

for _, nodeID := range c.nodeIDs {
version := c.getNomadVersion("", nodeID)
err := checkVersion(version, minimumVersionPprofConstraint)
if err != nil {
c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
}
pprofNodeIDs = append(pprofNodeIDs, nodeID)
}

// Take the first set of pprofs synchronously...
c.Ui.Output(" Capture pprofInterval 0000")
c.collectPprofs(client, 0)
c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, 0)
if c.pprofInterval == c.pprofDuration {
return
}
Expand All @@ -921,7 +957,7 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
return
case <-timer.C:
c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount))
c.collectPprofs(client, pprofIntervalCount)
c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, pprofIntervalCount)
timer.Reset(c.pprofInterval)
pprofIntervalCount++
}
Expand All @@ -930,12 +966,12 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
}

// collectPprofs captures the /agent/pprof for each listed node
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
for _, n := range c.nodeIDs {
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, serverIDs, nodeIDs []string, interval int) {
for _, n := range nodeIDs {
c.collectPprof(clientDir, n, client, interval)
}

for _, n := range c.serverIDs {
for _, n := range serverIDs {
c.collectPprof(serverDir, n, client, interval)
}
}
Expand Down Expand Up @@ -987,12 +1023,6 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client,
c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations
c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads

// This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable
// c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives

// This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable
// c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes
}

// savePprofProfile retrieves a pprof profile and writes to disk
Expand Down Expand Up @@ -1714,3 +1744,53 @@ func isRedirectError(err error) bool {
const redirectErr string = `invalid character '<' looking for beginning of value`
return strings.Contains(err.Error(), redirectErr)
}

// getNomadVersion fetches the version of Nomad running on a given server/client node ID
func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) string {
if serverID == "" && nodeID == "" {
return ""
}

version := ""
if serverID != "" {
for _, server := range c.members.Members {
tgross marked this conversation as resolved.
Show resolved Hide resolved
// Raft v2 server
if server.Name == serverID {
version = server.Tags["build"]
}

// Raft v3 server
if server.Tags["id"] == serverID {
version = server.Tags["version"]
}
}
}

if nodeID != "" {
for _, node := range c.nodes {
if node.ID == nodeID {
version = node.Version
}
}
}

return version
}

// checkVersion verifies that version satisfies the constraint
func checkVersion(version string, versionConstraint string) error {
v, err := goversion.NewVersion(version)
if err != nil {
return fmt.Errorf("error: %v", err)
}

c, err := goversion.NewConstraint(versionConstraint)
if err != nil {
return fmt.Errorf("error: %v", err)
}

if !c.Check(v) {
return nil
}
return fmt.Errorf("unsupported version=%s matches version filter %s", version, minimumVersionPprofConstraint)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return fmt.Errorf("unsupported version=%s matches version filter %s", version, minimumVersionPprofConstraint)
return fmt.Errorf("unsupported version=%s matches version filter %s", version, versionConstraint)

should show the versionConstraint passed in since it used for checking

}
30 changes: 30 additions & 0 deletions command/operator_debug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,36 @@ func TestDebug_Fail_Pprof(t *testing.T) {
require.Contains(t, ui.OutputWriter.String(), "Created debug archive") // Archive should be generated anyway
}

// TestDebug_PprofVersionCheck asserts that only versions < 0.12.0 are
// filtered by the version constraint.
func TestDebug_PprofVersionCheck(t *testing.T) {
tgross marked this conversation as resolved.
Show resolved Hide resolved
cases := []struct {
version string
errMsg string
}{
{"0.8.7", ""},
{"0.11.1", "unsupported version=0.11.1 matches version filter >= 0.11.0, <= 0.11.2"},
{"0.11.2", "unsupported version=0.11.2 matches version filter >= 0.11.0, <= 0.11.2"},
{"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter >= 0.11.0, <= 0.11.2"},
{"0.11.3", ""},
{"0.11.3+ent", ""},
{"0.12.0", ""},
{"1.3.0", ""},
{"foo.bar", "error: Malformed version: foo.bar"},
}

for _, tc := range cases {
t.Run(tc.version, func(t *testing.T) {
err := checkVersion(tc.version, minimumVersionPprofConstraint)
if tc.errMsg == "" {
require.NoError(t, err, "expected no error from %s", tc.version)
} else {
require.EqualError(t, err, tc.errMsg)
}
})
}
}

func TestDebug_StringToSlice(t *testing.T) {
ci.Parallel(t)

Expand Down