From 6f411b5794f86052dfe29b8777acf3d0d7e60e4f Mon Sep 17 00:00:00 2001 From: Dave May Date: Wed, 27 Apr 2022 15:37:51 -0400 Subject: [PATCH 01/18] debug: add version check framework --- command/operator_debug.go | 22 ++++++++++++++++++++++ command/operator_debug_test.go | 30 ++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/command/operator_debug.go b/command/operator_debug.go index dbb643a49120..3c91b34cb797 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -23,6 +23,7 @@ import ( "github.com/hashicorp/go-cleanhttp" "github.com/hashicorp/go-multierror" + goversion "github.com/hashicorp/go-version" "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/api/contexts" "github.com/hashicorp/nomad/helper" @@ -1714,3 +1715,24 @@ func isRedirectError(err error) bool { const redirectErr string = `invalid character '<' looking for beginning of value` return strings.Contains(err.Error(), redirectErr) } + + +// checkVersion verifies that version satisfies the constraint +func checkVersion(version string, versionConstraint string) (bool, error) { + v, err := goversion.NewVersion(version) + if err != nil { + return false, err + } + + c, err := goversion.NewConstraint(versionConstraint) + if err != nil { + return false, err + } + + if c.Check(v) { + return true, nil + } + + err = fmt.Errorf("version %s did not match constraint: %s", version, c.String()) + return false, err +} diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index c50d8f922527..f4b3ef988fd7 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -578,6 +578,36 @@ func TestDebug_Fail_Pprof(t *testing.T) { require.Contains(t, ui.OutputWriter.String(), "Created debug archive") // Archive should be generated anyway } +// TestDebug_PprofVersionCheck asserts that only versions 0.11.0 -> 0.11.2 match +// the version constraint. +func TestDebug_PprofVersionCheck(t *testing.T) { + cases := []struct { + version string + constraint string + isMatch bool + isError bool + }{ + {"0.8.7", ">= 0.11.0, <= 0.11.2", false, true}, + {"0.11.0", ">= 0.11.0, <= 0.11.2", true, false}, + {"0.11.1", ">= 0.11.0, <= 0.11.2", true, false}, + {"0.11.2", ">= 0.11.0, <= 0.11.2", true, false}, + {"0.11.3", ">= 0.11.0, <= 0.11.2", false, true}, + {"0.12.0", ">= 0.11.0, <= 0.11.2", false, true}, + {"1.3.0", ">= 0.11.0, <= 0.11.2", false, true}, + } + + for _, tc := range cases { + match, err := checkVersion(tc.version, ">= 0.11.0, <= 0.11.2") + assert.Equal(t, tc.isMatch, match) + if tc.isError { + require.Error(t, err) + fmt.Println(err.Error()) + } else { + require.NoError(t, err) + } + } +} + func TestDebug_StringToSlice(t *testing.T) { ci.Parallel(t) From f66317516d3955ee6858d0fd64c6e4112ccc8565 Mon Sep 17 00:00:00 2001 From: Dave May Date: Wed, 27 Apr 2022 16:08:33 -0400 Subject: [PATCH 02/18] debug: get server/client agent versions --- command/operator_debug.go | 42 ++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 3c91b34cb797..a1449f2848e3 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -55,6 +55,8 @@ type OperatorDebugCommand struct { cancel context.CancelFunc opts *api.QueryOptions verbose bool + members *api.ServerMembers + nodes []*api.NodeListStub } const ( @@ -503,6 +505,14 @@ func (c *OperatorDebugCommand) Run(args []string) int { AuthToken: c.Meta.token, } + // Get complete list of client nodes + var qm *api.QueryMeta + c.nodes, qm, err = client.Nodes().List(c.queryOpts()) + c.verboseOutf("%d nodes retrieved in %s", len(c.nodes), qm.RequestTime.String()) + + // Write nodes to file + c.writeJSON(clusterDir, "nodes.json", c.nodes, err) + // Search all nodes If a node class is specified without a list of node id prefixes if c.nodeClass != "" && nodeIDs == "" { nodeIDs = "all" @@ -566,17 +576,17 @@ func (c *OperatorDebugCommand) Run(args []string) int { } // Resolve servers - members, err := client.Agent().MembersOpts(c.queryOpts()) + c.members, err = client.Agent().MembersOpts(c.queryOpts()) if err != nil { c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err)) return 1 } // Write complete list of server members to file - c.writeJSON(clusterDir, "members.json", members, err) + c.writeJSON(clusterDir, "members.json", c.members, err) // Filter for servers matching criteria - c.serverIDs, err = filterServerMembers(members, serverIDs, c.region) + c.serverIDs, err = filterServerMembers(c.members, serverIDs, c.region) if err != nil { c.Ui.Error(fmt.Sprintf("Failed to parse server list; err: %v", err)) return 1 @@ -585,8 +595,8 @@ func (c *OperatorDebugCommand) Run(args []string) int { serversFound := 0 serverCaptureCount := 0 - if members != nil { - serversFound = len(members.Members) + if c.members != nil { + serversFound = len(c.members.Members) } if c.serverIDs != nil { serverCaptureCount = len(c.serverIDs) @@ -1716,6 +1726,28 @@ func isRedirectError(err error) bool { return strings.Contains(err.Error(), redirectErr) } +// getNomadVersion fetches the version of Nomad running on a given server/client node ID +func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) string { + version := "" + + if serverID != "" { + for _, server := range c.members.Members { + if server.Tags["id"] == serverID { + version = server.Tags["id"] + } + } + } + + if nodeID != "" { + for _, node := range c.nodes { + if node.ID == nodeID { + version = node.Version + } + } + } + + return version +} // checkVersion verifies that version satisfies the constraint func checkVersion(version string, versionConstraint string) (bool, error) { From 9a73a0cfe6d6d67a96dbfd3f3e50ed6e57584e46 Mon Sep 17 00:00:00 2001 From: Dave May Date: Wed, 27 Apr 2022 16:10:49 -0400 Subject: [PATCH 03/18] debug: skip trace pprof for 0.11.0 to 0.11.2 --- command/operator_debug.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index a1449f2848e3..b5a3d9635294 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -994,7 +994,6 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, opts.Debug = 0 c.savePprofProfile(path, "goroutine", opts, client) // Stack traces of all current goroutines - c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample. c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads @@ -1004,6 +1003,19 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, // This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable // c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes + + // trace pprof causes a panic on Nomad 0.11.0 to 0.11.2, so verify the the + // Nomad version of this agent before capture + version := c.getNomadVersion(opts.NodeID, opts.ServerID) + // c.Ui.Output(fmt.Sprintf("Nomad Version: %v", version)) + + if _, err := checkVersion(version, ">= 0.11.0, <= 0.11.2"); err != nil { + c.Ui.Error((fmt.Sprintf("Not running trace pprof profile, err: %v", err))) + return + } + + c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program + } // savePprofProfile retrieves a pprof profile and writes to disk From 2bf2c2575857b3fd9d00749069dbbac6b0bb325e Mon Sep 17 00:00:00 2001 From: Dave May Date: Wed, 27 Apr 2022 17:24:27 -0400 Subject: [PATCH 04/18] debug: threadcreate is the actual culprit --- command/operator_debug.go | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index b5a3d9635294..4162402032cf 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -993,10 +993,10 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, // Reset to pprof binary format opts.Debug = 0 - c.savePprofProfile(path, "goroutine", opts, client) // Stack traces of all current goroutines - c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample. - c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations - c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads + c.savePprofProfile(path, "goroutine", opts, client) // Stack traces of all current goroutines + c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program + c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample. + c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations // This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable // c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives @@ -1004,18 +1004,14 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, // This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable // c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes - // trace pprof causes a panic on Nomad 0.11.0 to 0.11.2, so verify the the - // Nomad version of this agent before capture + // threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions version := c.getNomadVersion(opts.NodeID, opts.ServerID) - // c.Ui.Output(fmt.Sprintf("Nomad Version: %v", version)) - if _, err := checkVersion(version, ">= 0.11.0, <= 0.11.2"); err != nil { - c.Ui.Error((fmt.Sprintf("Not running trace pprof profile, err: %v", err))) + c.Ui.Error((fmt.Sprintf("Not running threadcreate pprof profile, err: %v", err))) return } - c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program - + c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads } // savePprofProfile retrieves a pprof profile and writes to disk From 80e63b01c5e173f4c5332b6422e88dd61e36cc66 Mon Sep 17 00:00:00 2001 From: Dave May Date: Wed, 27 Apr 2022 21:31:49 -0400 Subject: [PATCH 05/18] debug: bail early on error fetching nodes --- command/operator_debug.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 4162402032cf..ed01ab0ee5b7 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -506,9 +506,11 @@ func (c *OperatorDebugCommand) Run(args []string) int { } // Get complete list of client nodes - var qm *api.QueryMeta - c.nodes, qm, err = client.Nodes().List(c.queryOpts()) - c.verboseOutf("%d nodes retrieved in %s", len(c.nodes), qm.RequestTime.String()) + c.nodes, _, err = client.Nodes().List(c.queryOpts()) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error querying node info: %v", err)) + return 1 + } // Write nodes to file c.writeJSON(clusterDir, "nodes.json", c.nodes, err) From 83cd22ad7fcaaebdf83ea6001cd9305d0b54f5d8 Mon Sep 17 00:00:00 2001 From: Dave May Date: Wed, 27 Apr 2022 21:44:09 -0400 Subject: [PATCH 06/18] debug: clean up version check error handling --- command/operator_debug.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index ed01ab0ee5b7..245d7bc53e63 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -1008,7 +1008,7 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, // threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions version := c.getNomadVersion(opts.NodeID, opts.ServerID) - if _, err := checkVersion(version, ">= 0.11.0, <= 0.11.2"); err != nil { + if skip, err := checkVersion(version, ">= 0.11.0, <= 0.11.2"); skip || err != nil { c.Ui.Error((fmt.Sprintf("Not running threadcreate pprof profile, err: %v", err))) return } @@ -1775,6 +1775,5 @@ func checkVersion(version string, versionConstraint string) (bool, error) { return true, nil } - err = fmt.Errorf("version %s did not match constraint: %s", version, c.String()) - return false, err + return false, nil } From ce4c5098f6b1d78bcfe6149c598e2d8043308b1c Mon Sep 17 00:00:00 2001 From: Dave May Date: Wed, 27 Apr 2022 21:46:14 -0400 Subject: [PATCH 07/18] debug: cleanup nomad version fetch --- command/operator_debug.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 245d7bc53e63..16ec945b75cc 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -1007,7 +1007,7 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, // c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes // threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions - version := c.getNomadVersion(opts.NodeID, opts.ServerID) + version := c.getNomadVersion(opts.ServerID, opts.NodeID) if skip, err := checkVersion(version, ">= 0.11.0, <= 0.11.2"); skip || err != nil { c.Ui.Error((fmt.Sprintf("Not running threadcreate pprof profile, err: %v", err))) return @@ -1738,12 +1738,15 @@ func isRedirectError(err error) bool { // getNomadVersion fetches the version of Nomad running on a given server/client node ID func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) string { - version := "" + if serverID == "" && nodeID == "" { + return "" + } + version := "" if serverID != "" { for _, server := range c.members.Members { if server.Tags["id"] == serverID { - version = server.Tags["id"] + version = server.Tags["version"] } } } From 11d67a6fa478ba108b7dff1e9e3a6c7c921b9143 Mon Sep 17 00:00:00 2001 From: Dave May Date: Wed, 27 Apr 2022 23:05:39 -0400 Subject: [PATCH 08/18] debug: clarify reason for skipping pprof --- command/operator_debug.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 16ec945b75cc..71f9dce3fd59 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -1008,8 +1008,13 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, // threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions version := c.getNomadVersion(opts.ServerID, opts.NodeID) - if skip, err := checkVersion(version, ">= 0.11.0, <= 0.11.2"); skip || err != nil { - c.Ui.Error((fmt.Sprintf("Not running threadcreate pprof profile, err: %v", err))) + skip, err := checkVersion(version, ">= 0.11.0, <= 0.11.2") + if skip { + c.Ui.Warn(fmt.Sprintf("Skipping threadcreate pprof: unsupported version=%s matches constraint %s", version, ">= 0.11.0, <= 0.11.2")) + return + } + if err != nil { + c.Ui.Error((fmt.Sprintf("Skipping threadcreate pprof, error: %v", err))) return } From 299034d9ef08f1ed6b3c0d1c16084de59483974f Mon Sep 17 00:00:00 2001 From: Dave May Date: Wed, 27 Apr 2022 23:29:11 -0400 Subject: [PATCH 09/18] debug: another variant for obtaining version --- command/operator_debug.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/command/operator_debug.go b/command/operator_debug.go index 71f9dce3fd59..07c9a73aa092 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -1750,6 +1750,12 @@ func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) s version := "" if serverID != "" { for _, server := range c.members.Members { + // Raft v2 server + if server.Name == serverID { + version = server.Tags["build"] + } + + // Raft v3 server if server.Tags["id"] == serverID { version = server.Tags["version"] } From 7c674cc3052af34f8c9e23abad7a3b90cf99cd2d Mon Sep 17 00:00:00 2001 From: Dave May Date: Thu, 28 Apr 2022 00:36:01 -0400 Subject: [PATCH 10/18] debug: update test to match error behavior --- command/operator_debug_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index f4b3ef988fd7..98604ff36498 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -587,13 +587,14 @@ func TestDebug_PprofVersionCheck(t *testing.T) { isMatch bool isError bool }{ - {"0.8.7", ">= 0.11.0, <= 0.11.2", false, true}, + {"0.8.7", ">= 0.11.0, <= 0.11.2", false, false}, {"0.11.0", ">= 0.11.0, <= 0.11.2", true, false}, {"0.11.1", ">= 0.11.0, <= 0.11.2", true, false}, {"0.11.2", ">= 0.11.0, <= 0.11.2", true, false}, - {"0.11.3", ">= 0.11.0, <= 0.11.2", false, true}, - {"0.12.0", ">= 0.11.0, <= 0.11.2", false, true}, - {"1.3.0", ">= 0.11.0, <= 0.11.2", false, true}, + {"0.11.3", ">= 0.11.0, <= 0.11.2", false, false}, + {"0.12.0", ">= 0.11.0, <= 0.11.2", false, false}, + {"1.3.0", ">= 0.11.0, <= 0.11.2", false, false}, + {"foo.bar", ">= 0.11.0, <= 0.11.2", false, true}, } for _, tc := range cases { @@ -601,7 +602,6 @@ func TestDebug_PprofVersionCheck(t *testing.T) { assert.Equal(t, tc.isMatch, match) if tc.isError { require.Error(t, err) - fmt.Println(err.Error()) } else { require.NoError(t, err) } From bc0092c92b6adccdffab4b579fc7ee2d71b4f246 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 28 Apr 2022 09:21:09 -0400 Subject: [PATCH 11/18] constant-ify constraint --- command/operator_debug.go | 16 +++++++++------- command/operator_debug_test.go | 25 ++++++++++++------------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 07c9a73aa092..6074fbe0dc62 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -60,11 +60,12 @@ type OperatorDebugCommand struct { } const ( - userAgent = "nomad operator debug" - clusterDir = "cluster" - clientDir = "client" - serverDir = "server" - intervalDir = "interval" + userAgent = "nomad operator debug" + clusterDir = "cluster" + clientDir = "client" + serverDir = "server" + intervalDir = "interval" + minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2" ) func (c *OperatorDebugCommand) Help() string { @@ -1008,9 +1009,10 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, // threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions version := c.getNomadVersion(opts.ServerID, opts.NodeID) - skip, err := checkVersion(version, ">= 0.11.0, <= 0.11.2") + skip, err := checkVersion(version, minimumVersionPprofConstraint) + if skip { - c.Ui.Warn(fmt.Sprintf("Skipping threadcreate pprof: unsupported version=%s matches constraint %s", version, ">= 0.11.0, <= 0.11.2")) + c.Ui.Warn(fmt.Sprintf("Skipping threadcreate pprof: unsupported version=%s matches constraint %s", version, minimumVersionPprofConstraint)) return } if err != nil { diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index 98604ff36498..95fa8b8e7e4a 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -582,23 +582,22 @@ func TestDebug_Fail_Pprof(t *testing.T) { // the version constraint. func TestDebug_PprofVersionCheck(t *testing.T) { cases := []struct { - version string - constraint string - isMatch bool - isError bool + version string + isMatch bool + isError bool }{ - {"0.8.7", ">= 0.11.0, <= 0.11.2", false, false}, - {"0.11.0", ">= 0.11.0, <= 0.11.2", true, false}, - {"0.11.1", ">= 0.11.0, <= 0.11.2", true, false}, - {"0.11.2", ">= 0.11.0, <= 0.11.2", true, false}, - {"0.11.3", ">= 0.11.0, <= 0.11.2", false, false}, - {"0.12.0", ">= 0.11.0, <= 0.11.2", false, false}, - {"1.3.0", ">= 0.11.0, <= 0.11.2", false, false}, - {"foo.bar", ">= 0.11.0, <= 0.11.2", false, true}, + {"0.8.7", false, false}, + {"0.11.0", true, false}, + {"0.11.1", true, false}, + {"0.11.2", true, false}, + {"0.11.3", false, false}, + {"0.12.0", false, false}, + {"1.3.0", false, false}, + {"foo.bar", false, true}, } for _, tc := range cases { - match, err := checkVersion(tc.version, ">= 0.11.0, <= 0.11.2") + match, err := checkVersion(tc.version, minimumVersionPprofConstraint) assert.Equal(t, tc.isMatch, match) if tc.isError { require.Error(t, err) From 26b808046c80cf5a511871ea523742c9c7050bc1 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 28 Apr 2022 09:21:36 -0400 Subject: [PATCH 12/18] remove commented-out code --- command/operator_debug.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 6074fbe0dc62..8ee6723fd3d1 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -1001,12 +1001,6 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample. c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations - // This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable - // c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives - - // This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable - // c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes - // threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions version := c.getNomadVersion(opts.ServerID, opts.NodeID) skip, err := checkVersion(version, minimumVersionPprofConstraint) From 4431840a640bc964cbc3d3fbe46ea980a26d14cb Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 28 Apr 2022 09:28:46 -0400 Subject: [PATCH 13/18] move version check to top of pprof loop --- command/operator_debug.go | 62 ++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 8ee6723fd3d1..61d1da75034e 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -914,9 +914,39 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) { + pprofNodeIDs := []string{} + pprofServerIDs := []string{} + + // threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions + for _, serverID := range c.serverIDs { + version := c.getNomadVersion(serverID, "") + skip, err := checkVersion(version, minimumVersionPprofConstraint) + + if skip { + c.Ui.Warn(fmt.Sprintf("Skipping threadcreate pprof: unsupported version=%s matches constraint %s", version, minimumVersionPprofConstraint)) + } + if err != nil { + c.Ui.Error((fmt.Sprintf("Skipping threadcreate pprof, error: %v", err))) + } + pprofServerIDs = append(pprofServerIDs, serverID) + } + + for _, nodeID := range c.nodeIDs { + version := c.getNomadVersion("", nodeID) + skip, err := checkVersion(version, minimumVersionPprofConstraint) + + if skip { + c.Ui.Warn(fmt.Sprintf("Skipping threadcreate pprof: unsupported version=%s matches constraint %s", version, minimumVersionPprofConstraint)) + } + if err != nil { + c.Ui.Error((fmt.Sprintf("Skipping threadcreate pprof, error: %v", err))) + } + pprofNodeIDs = append(pprofNodeIDs, nodeID) + } + // Take the first set of pprofs synchronously... c.Ui.Output(" Capture pprofInterval 0000") - c.collectPprofs(client, 0) + c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, 0) if c.pprofInterval == c.pprofDuration { return } @@ -935,7 +965,7 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) { return case <-timer.C: c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount)) - c.collectPprofs(client, pprofIntervalCount) + c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, pprofIntervalCount) timer.Reset(c.pprofInterval) pprofIntervalCount++ } @@ -944,12 +974,12 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) { } // collectPprofs captures the /agent/pprof for each listed node -func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) { - for _, n := range c.nodeIDs { +func (c *OperatorDebugCommand) collectPprofs(client *api.Client, serverIDs, nodeIDs []string, interval int) { + for _, n := range nodeIDs { c.collectPprof(clientDir, n, client, interval) } - for _, n := range c.serverIDs { + for _, n := range serverIDs { c.collectPprof(serverDir, n, client, interval) } } @@ -996,24 +1026,10 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client, // Reset to pprof binary format opts.Debug = 0 - c.savePprofProfile(path, "goroutine", opts, client) // Stack traces of all current goroutines - c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program - c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample. - c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations - - // threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions - version := c.getNomadVersion(opts.ServerID, opts.NodeID) - skip, err := checkVersion(version, minimumVersionPprofConstraint) - - if skip { - c.Ui.Warn(fmt.Sprintf("Skipping threadcreate pprof: unsupported version=%s matches constraint %s", version, minimumVersionPprofConstraint)) - return - } - if err != nil { - c.Ui.Error((fmt.Sprintf("Skipping threadcreate pprof, error: %v", err))) - return - } - + c.savePprofProfile(path, "goroutine", opts, client) // Stack traces of all current goroutines + c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program + c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample. + c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads } From f61d205e3d07abf808428ac9c26acca90e6af89b Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 28 Apr 2022 09:43:29 -0400 Subject: [PATCH 14/18] update version filter to <0.12 --- command/operator_debug.go | 31 ++++++++++----------------- command/operator_debug_test.go | 38 +++++++++++++++++----------------- 2 files changed, 30 insertions(+), 39 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 61d1da75034e..1b291d5d733a 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -65,7 +65,7 @@ const ( clientDir = "client" serverDir = "server" intervalDir = "interval" - minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2" + minimumVersionPprofConstraint = "< 0.12.0" ) func (c *OperatorDebugCommand) Help() string { @@ -920,26 +920,18 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) { // threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions for _, serverID := range c.serverIDs { version := c.getNomadVersion(serverID, "") - skip, err := checkVersion(version, minimumVersionPprofConstraint) - - if skip { - c.Ui.Warn(fmt.Sprintf("Skipping threadcreate pprof: unsupported version=%s matches constraint %s", version, minimumVersionPprofConstraint)) - } + err := checkVersion(version, minimumVersionPprofConstraint) if err != nil { - c.Ui.Error((fmt.Sprintf("Skipping threadcreate pprof, error: %v", err))) + c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err)) } pprofServerIDs = append(pprofServerIDs, serverID) } for _, nodeID := range c.nodeIDs { version := c.getNomadVersion("", nodeID) - skip, err := checkVersion(version, minimumVersionPprofConstraint) - - if skip { - c.Ui.Warn(fmt.Sprintf("Skipping threadcreate pprof: unsupported version=%s matches constraint %s", version, minimumVersionPprofConstraint)) - } + err := checkVersion(version, minimumVersionPprofConstraint) if err != nil { - c.Ui.Error((fmt.Sprintf("Skipping threadcreate pprof, error: %v", err))) + c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err)) } pprofNodeIDs = append(pprofNodeIDs, nodeID) } @@ -1786,20 +1778,19 @@ func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) s } // checkVersion verifies that version satisfies the constraint -func checkVersion(version string, versionConstraint string) (bool, error) { +func checkVersion(version string, versionConstraint string) error { v, err := goversion.NewVersion(version) if err != nil { - return false, err + return fmt.Errorf("error: %v", err) } c, err := goversion.NewConstraint(versionConstraint) if err != nil { - return false, err + return fmt.Errorf("error: %v", err) } - if c.Check(v) { - return true, nil + if !c.Check(v) { + return nil } - - return false, nil + return fmt.Errorf("unsupported version=%s matches version filter %s", version, minimumVersionPprofConstraint) } diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index 95fa8b8e7e4a..29183b958b8f 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -578,32 +578,32 @@ func TestDebug_Fail_Pprof(t *testing.T) { require.Contains(t, ui.OutputWriter.String(), "Created debug archive") // Archive should be generated anyway } -// TestDebug_PprofVersionCheck asserts that only versions 0.11.0 -> 0.11.2 match -// the version constraint. +// TestDebug_PprofVersionCheck asserts that only versions < 0.12.0 are +// filtered by the version constraint. func TestDebug_PprofVersionCheck(t *testing.T) { cases := []struct { version string - isMatch bool - isError bool + errMsg string }{ - {"0.8.7", false, false}, - {"0.11.0", true, false}, - {"0.11.1", true, false}, - {"0.11.2", true, false}, - {"0.11.3", false, false}, - {"0.12.0", false, false}, - {"1.3.0", false, false}, - {"foo.bar", false, true}, + {"0.8.7", "unsupported version=0.8.7 matches version filter < 0.12.0"}, + {"0.11.0", "unsupported version=0.11.0 matches version filter < 0.12.0"}, + {"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter < 0.12.0"}, + {"0.11.3", "unsupported version=0.11.3 matches version filter < 0.12.0"}, + {"0.11.3+ent", "unsupported version=0.11.3+ent matches version filter < 0.12.0"}, + {"0.12.0", ""}, + {"1.3.0", ""}, + {"foo.bar", "error: Malformed version: foo.bar"}, } for _, tc := range cases { - match, err := checkVersion(tc.version, minimumVersionPprofConstraint) - assert.Equal(t, tc.isMatch, match) - if tc.isError { - require.Error(t, err) - } else { - require.NoError(t, err) - } + t.Run(tc.version, func(t *testing.T) { + err := checkVersion(tc.version, minimumVersionPprofConstraint) + if tc.errMsg == "" { + require.NoError(t, err, "expected no error from %s", tc.version) + } else { + require.EqualError(t, err, tc.errMsg) + } + }) } } From 16894c7fbbcb5290eaf389ef35bdad826397463c Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 28 Apr 2022 09:48:15 -0400 Subject: [PATCH 15/18] changelog entry --- .changelog/12807.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/12807.txt diff --git a/.changelog/12807.txt b/.changelog/12807.txt new file mode 100644 index 000000000000..60111c7471f9 --- /dev/null +++ b/.changelog/12807.txt @@ -0,0 +1,3 @@ +```release-note:improvement +cli: `operator debug` command now skips generating pprofs to avoid a panic on unsupported versions of Nomad older than 0.12.0 +``` From 2dfd818f3ced05bc8737ff5cd74168cfa80ce341 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 28 Apr 2022 11:55:34 -0400 Subject: [PATCH 16/18] re-narrow constraint --- command/operator_debug.go | 2 +- command/operator_debug_test.go | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 1b291d5d733a..6af31d21b3de 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -65,7 +65,7 @@ const ( clientDir = "client" serverDir = "server" intervalDir = "interval" - minimumVersionPprofConstraint = "< 0.12.0" + minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2" ) func (c *OperatorDebugCommand) Help() string { diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index 29183b958b8f..04471ed7c80f 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -585,11 +585,12 @@ func TestDebug_PprofVersionCheck(t *testing.T) { version string errMsg string }{ - {"0.8.7", "unsupported version=0.8.7 matches version filter < 0.12.0"}, - {"0.11.0", "unsupported version=0.11.0 matches version filter < 0.12.0"}, - {"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter < 0.12.0"}, - {"0.11.3", "unsupported version=0.11.3 matches version filter < 0.12.0"}, - {"0.11.3+ent", "unsupported version=0.11.3+ent matches version filter < 0.12.0"}, + {"0.8.7", ""}, + {"0.11.1", "unsupported version=0.11.1 matches version filter >= 0.11.0, <= 0.11.2"}, + {"0.11.2", "unsupported version=0.11.2 matches version filter >= 0.11.0, <= 0.11.2"}, + {"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter >= 0.11.0, <= 0.11.2"}, + {"0.11.3", ""}, + {"0.11.3+ent", ""}, {"0.12.0", ""}, {"1.3.0", ""}, {"foo.bar", "error: Malformed version: foo.bar"}, From 71eabe07b60b0a3fc1949e3ecd8f83c40415a781 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 28 Apr 2022 12:10:10 -0400 Subject: [PATCH 17/18] revise changelog entry --- .changelog/12807.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changelog/12807.txt b/.changelog/12807.txt index 60111c7471f9..79d05376f60e 100644 --- a/.changelog/12807.txt +++ b/.changelog/12807.txt @@ -1,3 +1,3 @@ ```release-note:improvement -cli: `operator debug` command now skips generating pprofs to avoid a panic on unsupported versions of Nomad older than 0.12.0 +cli: `operator debug` command now skips generating pprofs to avoid a panic on Nomad 0.11.2 and 0.11.1 ``` From c32df359cc43c853f6530b1801a0de028da8fbab Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 28 Apr 2022 13:14:42 -0400 Subject: [PATCH 18/18] fixup changelog --- .changelog/12807.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changelog/12807.txt b/.changelog/12807.txt index 79d05376f60e..af29d5e7c0e7 100644 --- a/.changelog/12807.txt +++ b/.changelog/12807.txt @@ -1,3 +1,3 @@ ```release-note:improvement -cli: `operator debug` command now skips generating pprofs to avoid a panic on Nomad 0.11.2 and 0.11.1 +cli: `operator debug` command now skips generating pprofs to avoid a panic on Nomad 0.11.2. 0.11.1, and 0.11.0 ```