From 98316317cf42523c20b5bffc87d65a76f4b99def Mon Sep 17 00:00:00 2001 From: davemay99 Date: Thu, 4 Mar 2021 10:27:14 -0500 Subject: [PATCH 1/8] Change default interval from 2m to 30s --- command/operator_debug.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index eeaf8ae5564c..72f5e17a5e46 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -74,8 +74,8 @@ Debug Options: The duration of the log monitor command. Defaults to 2m. -interval= - The interval between snapshots of the Nomad state. If unspecified, only one snapshot is - captured. + The interval between snapshots of the Nomad state. Set interval equal to + duration to capture a single snapshot. Defaults to 30s. -log-level= The log level to monitor. Defaults to DEBUG. @@ -195,7 +195,7 @@ func (c *OperatorDebugCommand) Run(args []string) int { var nodeIDs, serverIDs string flags.StringVar(&duration, "duration", "2m", "") - flags.StringVar(&interval, "interval", "2m", "") + flags.StringVar(&interval, "interval", "30s", "") flags.StringVar(&c.logLevel, "log-level", "DEBUG", "") flags.IntVar(&c.maxNodes, "max-nodes", 10, "") flags.StringVar(&c.nodeClass, "node-class", "", "") @@ -246,6 +246,13 @@ func (c *OperatorDebugCommand) Run(args []string) int { } c.interval = i + // Sanity check interval + if i.Seconds() > d.Seconds() { + c.Ui.Info(fmt.Sprintf("Interval %s is greater than duration %s, resetting to %s.", interval, duration, duration)) + interval = duration // Update string for summary + c.interval = c.duration // Update parsed interval + } + // Parse the pprof capture duration pd, err := time.ParseDuration(pprofDuration) if err != nil { From e3c17683cd3d7fd687b7273cb917f182240364f1 Mon Sep 17 00:00:00 2001 From: davemay99 Date: Thu, 4 Mar 2021 10:35:19 -0500 Subject: [PATCH 2/8] Change default to capture all servers --- command/operator_debug.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 72f5e17a5e46..cca16c139dba 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -94,8 +94,8 @@ Debug Options: Duration for pprof collection. Defaults to 1s. -server-id=, - Comma separated list of Nomad server names, "leader", or "all" to monitor for logs and include pprof - profiles. + Comma separated list of Nomad server names to monitor for logs, API outputs, and pprof profiles. + Accepts server names, "leader", or "all". Defaults to "all". -stale= If "false", the default, get membership data from the cluster leader. If the cluster is in @@ -200,7 +200,7 @@ func (c *OperatorDebugCommand) Run(args []string) int { flags.IntVar(&c.maxNodes, "max-nodes", 10, "") flags.StringVar(&c.nodeClass, "node-class", "", "") flags.StringVar(&nodeIDs, "node-id", "", "") - flags.StringVar(&serverIDs, "server-id", "", "") + flags.StringVar(&serverIDs, "server-id", "all", "") flags.BoolVar(&c.stale, "stale", false, "") flags.StringVar(&output, "output", "", "") flags.StringVar(&pprofDuration, "pprof-duration", "1s", "") From f0e4348639bce81da09af9546d78dd52d8028b27 Mon Sep 17 00:00:00 2001 From: davemay99 Date: Thu, 4 Mar 2021 10:39:10 -0500 Subject: [PATCH 3/8] gitignore debug bundles --- .gitignore | 3 +++ CHANGELOG.md | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 656ec0a7ec31..3ad9447e65fe 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,6 @@ azure-hashistack.pem # generated keys for e2e tests e2e/terraform/keys/ + +# debug bundles +nomad-debug-* diff --git a/CHANGELOG.md b/CHANGELOG.md index 6279f1855802..2883811a1932 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,9 @@ BUG FIXES: * api: Added missing devices block to AllocatedTaskResources [[GH-10064](https://github.com/hashicorp/nomad/pull/10064)] * cli: Fixed a bug where non-int proxy port would panic CLI [[GH-10072](https://github.com/hashicorp/nomad/issues/10072)] * cli: Fixed a bug where `nomad operator debug` incorrectly parsed https Consul API URLs. [[GH-10082](https://github.com/hashicorp/nomad/pull/10082)] + * cli: Update defaults for `nomad operator debug` flags `-interval` and `-server-id` to match common usage. [GH-10121] * client: Fixed log formatting when killing tasks. [[GH-10135](https://github.com/hashicorp/nomad/issues/10135)] - * ui: Fixed the rendering of interstitial components shown after processing a dynamic application sizing recommendation. [[GH-10094](https://github.com/hashicorp/nomad/pull/10094)] + * ui: Fixed the rendering of interstitial components shown after processing a dynamic application sizing recommendation. [[GH-10094](https://github.com/hashicorp/nomad/pull/10094)] IMPROVEMENTS: * consul/connect: Enable setting `local_bind_address` field on connect upstreams [[GH-6248](https://github.com/hashicorp/nomad/issues/6248)] From f6e8b735e614251142cf2e664d0a0e00198f11ac Mon Sep 17 00:00:00 2001 From: davemay99 Date: Fri, 5 Mar 2021 17:34:05 -0500 Subject: [PATCH 4/8] Organize help text and wrap at 80 columns --- command/operator_debug.go | 92 ++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index cca16c139dba..cce300ca5aa6 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -55,59 +55,26 @@ func (c *OperatorDebugCommand) Help() string { helpText := ` Usage: nomad operator debug [options] - Build an archive containing Nomad cluster configuration and state, and Consul and Vault - status. Include logs and pprof profiles for selected servers and client nodes. + Build an archive containing Nomad cluster configuration and state, and Consul + and Vault status. Include logs and pprof profiles for selected servers and + client nodes. If ACLs are enabled, this command will require a token with the 'node:read' capability to run. In order to collect information, the token will also require the 'agent:read' and 'operator:read' capabilities, as well as the 'list-jobs' capability for all namespaces. To collect pprof profiles the - token will also require 'agent:write', or enable_debug configuration set to true. + token will also require 'agent:write', or enable_debug configuration set to + true. General Options: ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + ` -Debug Options: - - -duration= - The duration of the log monitor command. Defaults to 2m. - - -interval= - The interval between snapshots of the Nomad state. Set interval equal to - duration to capture a single snapshot. Defaults to 30s. - - -log-level= - The log level to monitor. Defaults to DEBUG. - - -max-nodes= - Cap the maximum number of client nodes included in the capture. Defaults to 10, set to 0 for unlimited. - - -node-id=, - Comma separated list of Nomad client node ids, to monitor for logs and include pprof - profiles. Accepts id prefixes, and "all" to select all nodes (up to count = max-nodes). - - -node-class= - Filter client nodes based on node class. - - -pprof-duration= - Duration for pprof collection. Defaults to 1s. - - -server-id=, - Comma separated list of Nomad server names to monitor for logs, API outputs, and pprof profiles. - Accepts server names, "leader", or "all". Defaults to "all". - - -stale= - If "false", the default, get membership data from the cluster leader. If the cluster is in - an outage unable to establish leadership, it may be necessary to get the configuration from - a non-leader server. - - -output= - Path to the parent directory of the output directory. If not specified, an archive is built - in the current directory. +Consul Options: -consul-http-addr= - The address and port of the Consul HTTP agent. Overrides the CONSUL_HTTP_ADDR environment variable. + The address and port of the Consul HTTP agent. Overrides the + CONSUL_HTTP_ADDR environment variable. -consul-token= Token used to query Consul. Overrides the CONSUL_HTTP_TOKEN environment @@ -133,6 +100,8 @@ Debug Options: Path to a directory of PEM encoded CA cert files to verify the Consul certificate. Overrides the CONSUL_CAPATH environment variable. +Vault Options: + -vault-address= The address and port of the Vault HTTP agent. Overrides the VAULT_ADDR environment variable. @@ -156,6 +125,47 @@ Debug Options: -vault-ca-path= Path to a directory of PEM encoded CA cert files to verify the Vault certificate. Overrides the VAULT_CAPATH environment variable. + +Debug Options: + + -duration= + The duration of the log monitor command. Defaults to 2m. + + -interval= + The interval between snapshots of the Nomad state. Set interval equal to + duration to capture a single snapshot. Defaults to 30s. + + -log-level= + The log level to monitor. Defaults to DEBUG. + + -max-nodes= + Cap the maximum number of client nodes included in the capture. Defaults + to 10, set to 0 for unlimited. + + -node-id=, + Comma separated list of Nomad client node ids to monitor for logs, API + outputs, and pprof profiles. Accepts id prefixes, and "all" to select all + nodes (up to count = max-nodes). Defaults to "all". + + -node-class= + Filter client nodes based on node class. + + -pprof-duration= + Duration for pprof collection. Defaults to 1s. + + -server-id=, + Comma separated list of Nomad server names to monitor for logs, API + outputs, and pprof profiles. Accepts server names, "leader", or "all". + Defaults to "all". + + -stale= + If "false", the default, get membership data from the cluster leader. If + the cluster is in an outage unable to establish leadership, it may be + necessary to get the configuration from a non-leader server. + + -output= + Path to the parent directory of the output directory. If not specified, an + archive is built in the current directory. ` return strings.TrimSpace(helpText) } From 0f0e9a0d712cf4bfd685aba6b8359b8f844a16ab Mon Sep 17 00:00:00 2001 From: davemay99 Date: Fri, 5 Mar 2021 20:21:19 -0500 Subject: [PATCH 5/8] Make invalid intervals fail Update tests to reflect interval validation --- command/operator_debug.go | 7 +++---- command/operator_debug_test.go | 27 ++++++++++++++++----------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index cce300ca5aa6..5b8e6a6303fa 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -256,11 +256,10 @@ func (c *OperatorDebugCommand) Run(args []string) int { } c.interval = i - // Sanity check interval + // Validate interval if i.Seconds() > d.Seconds() { - c.Ui.Info(fmt.Sprintf("Interval %s is greater than duration %s, resetting to %s.", interval, duration, duration)) - interval = duration // Update string for summary - c.interval = c.duration // Update parsed interval + c.Ui.Error(fmt.Sprintf("Error parsing interval: %s is greater than duration %s", interval, duration)) + return 1 } // Parse the pprof capture duration diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index 1a3173d389e6..dff63b5c8611 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -119,7 +119,7 @@ func TestDebug_NodeClass(t *testing.T) { cases := testCases{ { name: "address=api, node-class=clienta, max-nodes=2", - args: []string{"-address", url, "-duration", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "clienta", "-max-nodes", "2"}, + args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "clienta", "-max-nodes", "2"}, expectedCode: 0, expectedOutputs: []string{ "Servers: (1/1)", @@ -132,7 +132,7 @@ func TestDebug_NodeClass(t *testing.T) { }, { name: "address=api, node-class=clientb, max-nodes=2", - args: []string{"-address", url, "-duration", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "clientb", "-max-nodes", "2"}, + args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "clientb", "-max-nodes", "2"}, expectedCode: 0, expectedOutputs: []string{ "Servers: (1/1)", @@ -189,19 +189,19 @@ func TestDebug_ClientToServer(t *testing.T) { var cases = testCases{ { name: "testAgent api server", - args: []string{"-address", url, "-duration", "250ms", "-server-id", "all", "-node-id", "all"}, + args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"}, expectedCode: 0, expectedOutputs: []string{"Created debug archive"}, }, { name: "server address", - args: []string{"-address", addrServer, "-duration", "250ms", "-server-id", "all", "-node-id", "all"}, + args: []string{"-address", addrServer, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"}, expectedCode: 0, expectedOutputs: []string{"Created debug archive"}, }, { name: "client1 address - verify no SIGSEGV panic", - args: []string{"-address", addrClient1, "-duration", "250ms", "-server-id", "all", "-node-id", "all"}, + args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"}, expectedCode: 0, expectedOutputs: []string{"Created debug archive"}, }, @@ -218,7 +218,7 @@ func TestDebug_SingleServer(t *testing.T) { var cases = testCases{ { name: "address=api, server-id=leader", - args: []string{"-address", url, "-duration", "250ms", "-server-id", "leader"}, + args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader"}, expectedCode: 0, expectedOutputs: []string{ "Servers: (1/1)", @@ -229,7 +229,7 @@ func TestDebug_SingleServer(t *testing.T) { }, { name: "address=api, server-id=all", - args: []string{"-address", url, "-duration", "250ms", "-server-id", "all"}, + args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all"}, expectedCode: 0, expectedOutputs: []string{ "Servers: (1/1)", @@ -261,7 +261,7 @@ func TestDebug_Failures(t *testing.T) { }, { name: "Fails missing node ids", - args: []string{"-node-id", "abc,def", "-duration", "250ms"}, + args: []string{"-node-id", "abc,def", "-duration", "250ms", "-interval", "250ms"}, expectedCode: 1, }, { @@ -274,6 +274,11 @@ func TestDebug_Failures(t *testing.T) { args: []string{"-interval", "bar"}, expectedCode: 1, }, + { + name: "Fails intervals greater than duration", + args: []string{"-duration", "5m", "-interval", "10m"}, + expectedCode: 1, + }, { name: "Fails bad address", args: []string{"-address", url + "bogus"}, @@ -307,7 +312,7 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) { cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} // Debug on the leader and all client nodes - code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "leader", "-node-id", "all", "-output", os.TempDir()}) + code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", os.TempDir()}) assert.Equal(t, 0, code) // Bad plugin name should be escaped before it reaches the sandbox test @@ -391,7 +396,7 @@ func TestDebug_ExistingOutput(t *testing.T) { os.MkdirAll(path, 0755) defer os.Remove(path) - code := cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms"}) + code := cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms", "-interval", "50ms"}) require.Equal(t, 2, code) } @@ -413,7 +418,7 @@ func TestDebug_Fail_Pprof(t *testing.T) { cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} // Debug on client - node class = "clienta" - code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "all"}) + code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all"}) assert.Equal(t, 0, code) // Pprof failure isn't fatal require.Contains(t, ui.OutputWriter.String(), "Starting debugger") From 4eb3199aed3e69e4150d32257984108f19845c00 Mon Sep 17 00:00:00 2001 From: Dave May Date: Mon, 8 Mar 2021 16:07:32 -0500 Subject: [PATCH 6/8] Update CHANGELOG.md Co-authored-by: Tim Gross --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2883811a1932..8cc1e57edd9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ BUG FIXES: * cli: Fixed a bug where `nomad operator debug` incorrectly parsed https Consul API URLs. [[GH-10082](https://github.com/hashicorp/nomad/pull/10082)] * cli: Update defaults for `nomad operator debug` flags `-interval` and `-server-id` to match common usage. [GH-10121] * client: Fixed log formatting when killing tasks. [[GH-10135](https://github.com/hashicorp/nomad/issues/10135)] - * ui: Fixed the rendering of interstitial components shown after processing a dynamic application sizing recommendation. [[GH-10094](https://github.com/hashicorp/nomad/pull/10094)] + * ui: Fixed the rendering of interstitial components shown after processing a dynamic application sizing recommendation. [[GH-10094](https://github.com/hashicorp/nomad/pull/10094)] IMPROVEMENTS: * consul/connect: Enable setting `local_bind_address` field on connect upstreams [[GH-6248](https://github.com/hashicorp/nomad/issues/6248)] From f737d5a30b108696e899d15ee414caa469af2969 Mon Sep 17 00:00:00 2001 From: davemay99 Date: Mon, 8 Mar 2021 17:17:27 -0500 Subject: [PATCH 7/8] Re-run make changelogfmt after rebase --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8cc1e57edd9a..6f57941b9eda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ BUG FIXES: * api: Added missing devices block to AllocatedTaskResources [[GH-10064](https://github.com/hashicorp/nomad/pull/10064)] * cli: Fixed a bug where non-int proxy port would panic CLI [[GH-10072](https://github.com/hashicorp/nomad/issues/10072)] * cli: Fixed a bug where `nomad operator debug` incorrectly parsed https Consul API URLs. [[GH-10082](https://github.com/hashicorp/nomad/pull/10082)] - * cli: Update defaults for `nomad operator debug` flags `-interval` and `-server-id` to match common usage. [GH-10121] + * cli: Update defaults for `nomad operator debug` flags `-interval` and `-server-id` to match common usage. [[GH-10121](https://github.com/hashicorp/nomad/issues/10121)] * client: Fixed log formatting when killing tasks. [[GH-10135](https://github.com/hashicorp/nomad/issues/10135)] * ui: Fixed the rendering of interstitial components shown after processing a dynamic application sizing recommendation. [[GH-10094](https://github.com/hashicorp/nomad/pull/10094)] From 2729bcf1a2fcdd7e49ffac71145db96071343668 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Tue, 9 Mar 2021 08:13:02 -0500 Subject: [PATCH 8/8] changelog entry should be under improvements, not bugfixes --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f57941b9eda..9b89de8fc94f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,12 +4,13 @@ BUG FIXES: * api: Added missing devices block to AllocatedTaskResources [[GH-10064](https://github.com/hashicorp/nomad/pull/10064)] * cli: Fixed a bug where non-int proxy port would panic CLI [[GH-10072](https://github.com/hashicorp/nomad/issues/10072)] * cli: Fixed a bug where `nomad operator debug` incorrectly parsed https Consul API URLs. [[GH-10082](https://github.com/hashicorp/nomad/pull/10082)] - * cli: Update defaults for `nomad operator debug` flags `-interval` and `-server-id` to match common usage. [[GH-10121](https://github.com/hashicorp/nomad/issues/10121)] * client: Fixed log formatting when killing tasks. [[GH-10135](https://github.com/hashicorp/nomad/issues/10135)] * ui: Fixed the rendering of interstitial components shown after processing a dynamic application sizing recommendation. [[GH-10094](https://github.com/hashicorp/nomad/pull/10094)] IMPROVEMENTS: + * cli: Update defaults for `nomad operator debug` flags `-interval` and `-server-id` to match common usage. [[GH-10121](https://github.com/hashicorp/nomad/issues/10121)] * consul/connect: Enable setting `local_bind_address` field on connect upstreams [[GH-6248](https://github.com/hashicorp/nomad/issues/6248)] + ## 1.0.4 (February 24, 2021) * driver/docker: Added support for optional extra container labels. [[GH-9885](https://github.com/hashicorp/nomad/issues/9885)]