diff --git a/.gitignore b/.gitignore index 656ec0a7ec31..3ad9447e65fe 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,6 @@ azure-hashistack.pem # generated keys for e2e tests e2e/terraform/keys/ + +# debug bundles +nomad-debug-* diff --git a/CHANGELOG.md b/CHANGELOG.md index 6279f1855802..9b89de8fc94f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,9 @@ BUG FIXES: * ui: Fixed the rendering of interstitial components shown after processing a dynamic application sizing recommendation. [[GH-10094](https://github.com/hashicorp/nomad/pull/10094)] IMPROVEMENTS: + * cli: Update defaults for `nomad operator debug` flags `-interval` and `-server-id` to match common usage. [[GH-10121](https://github.com/hashicorp/nomad/issues/10121)] * consul/connect: Enable setting `local_bind_address` field on connect upstreams [[GH-6248](https://github.com/hashicorp/nomad/issues/6248)] + ## 1.0.4 (February 24, 2021) * driver/docker: Added support for optional extra container labels. [[GH-9885](https://github.com/hashicorp/nomad/issues/9885)] diff --git a/command/operator_debug.go b/command/operator_debug.go index eeaf8ae5564c..5b8e6a6303fa 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -55,59 +55,26 @@ func (c *OperatorDebugCommand) Help() string { helpText := ` Usage: nomad operator debug [options] - Build an archive containing Nomad cluster configuration and state, and Consul and Vault - status. Include logs and pprof profiles for selected servers and client nodes. + Build an archive containing Nomad cluster configuration and state, and Consul + and Vault status. Include logs and pprof profiles for selected servers and + client nodes. If ACLs are enabled, this command will require a token with the 'node:read' capability to run. In order to collect information, the token will also require the 'agent:read' and 'operator:read' capabilities, as well as the 'list-jobs' capability for all namespaces. To collect pprof profiles the - token will also require 'agent:write', or enable_debug configuration set to true. + token will also require 'agent:write', or enable_debug configuration set to + true. General Options: ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + ` -Debug Options: - - -duration= - The duration of the log monitor command. Defaults to 2m. - - -interval= - The interval between snapshots of the Nomad state. If unspecified, only one snapshot is - captured. - - -log-level= - The log level to monitor. Defaults to DEBUG. - - -max-nodes= - Cap the maximum number of client nodes included in the capture. Defaults to 10, set to 0 for unlimited. - - -node-id=, - Comma separated list of Nomad client node ids, to monitor for logs and include pprof - profiles. Accepts id prefixes, and "all" to select all nodes (up to count = max-nodes). - - -node-class= - Filter client nodes based on node class. - - -pprof-duration= - Duration for pprof collection. Defaults to 1s. - - -server-id=, - Comma separated list of Nomad server names, "leader", or "all" to monitor for logs and include pprof - profiles. - - -stale= - If "false", the default, get membership data from the cluster leader. If the cluster is in - an outage unable to establish leadership, it may be necessary to get the configuration from - a non-leader server. - - -output= - Path to the parent directory of the output directory. If not specified, an archive is built - in the current directory. +Consul Options: -consul-http-addr= - The address and port of the Consul HTTP agent. Overrides the CONSUL_HTTP_ADDR environment variable. + The address and port of the Consul HTTP agent. Overrides the + CONSUL_HTTP_ADDR environment variable. -consul-token= Token used to query Consul. Overrides the CONSUL_HTTP_TOKEN environment @@ -133,6 +100,8 @@ Debug Options: Path to a directory of PEM encoded CA cert files to verify the Consul certificate. Overrides the CONSUL_CAPATH environment variable. +Vault Options: + -vault-address= The address and port of the Vault HTTP agent. Overrides the VAULT_ADDR environment variable. @@ -156,6 +125,47 @@ Debug Options: -vault-ca-path= Path to a directory of PEM encoded CA cert files to verify the Vault certificate. Overrides the VAULT_CAPATH environment variable. + +Debug Options: + + -duration= + The duration of the log monitor command. Defaults to 2m. + + -interval= + The interval between snapshots of the Nomad state. Set interval equal to + duration to capture a single snapshot. Defaults to 30s. + + -log-level= + The log level to monitor. Defaults to DEBUG. + + -max-nodes= + Cap the maximum number of client nodes included in the capture. Defaults + to 10, set to 0 for unlimited. + + -node-id=, + Comma separated list of Nomad client node ids to monitor for logs, API + outputs, and pprof profiles. Accepts id prefixes, and "all" to select all + nodes (up to count = max-nodes). Defaults to "all". + + -node-class= + Filter client nodes based on node class. + + -pprof-duration= + Duration for pprof collection. Defaults to 1s. + + -server-id=, + Comma separated list of Nomad server names to monitor for logs, API + outputs, and pprof profiles. Accepts server names, "leader", or "all". + Defaults to "all". + + -stale= + If "false", the default, get membership data from the cluster leader. If + the cluster is in an outage unable to establish leadership, it may be + necessary to get the configuration from a non-leader server. + + -output= + Path to the parent directory of the output directory. If not specified, an + archive is built in the current directory. ` return strings.TrimSpace(helpText) } @@ -195,12 +205,12 @@ func (c *OperatorDebugCommand) Run(args []string) int { var nodeIDs, serverIDs string flags.StringVar(&duration, "duration", "2m", "") - flags.StringVar(&interval, "interval", "2m", "") + flags.StringVar(&interval, "interval", "30s", "") flags.StringVar(&c.logLevel, "log-level", "DEBUG", "") flags.IntVar(&c.maxNodes, "max-nodes", 10, "") flags.StringVar(&c.nodeClass, "node-class", "", "") flags.StringVar(&nodeIDs, "node-id", "", "") - flags.StringVar(&serverIDs, "server-id", "", "") + flags.StringVar(&serverIDs, "server-id", "all", "") flags.BoolVar(&c.stale, "stale", false, "") flags.StringVar(&output, "output", "", "") flags.StringVar(&pprofDuration, "pprof-duration", "1s", "") @@ -246,6 +256,12 @@ func (c *OperatorDebugCommand) Run(args []string) int { } c.interval = i + // Validate interval + if i.Seconds() > d.Seconds() { + c.Ui.Error(fmt.Sprintf("Error parsing interval: %s is greater than duration %s", interval, duration)) + return 1 + } + // Parse the pprof capture duration pd, err := time.ParseDuration(pprofDuration) if err != nil { diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index 1a3173d389e6..dff63b5c8611 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -119,7 +119,7 @@ func TestDebug_NodeClass(t *testing.T) { cases := testCases{ { name: "address=api, node-class=clienta, max-nodes=2", - args: []string{"-address", url, "-duration", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "clienta", "-max-nodes", "2"}, + args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "clienta", "-max-nodes", "2"}, expectedCode: 0, expectedOutputs: []string{ "Servers: (1/1)", @@ -132,7 +132,7 @@ func TestDebug_NodeClass(t *testing.T) { }, { name: "address=api, node-class=clientb, max-nodes=2", - args: []string{"-address", url, "-duration", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "clientb", "-max-nodes", "2"}, + args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "clientb", "-max-nodes", "2"}, expectedCode: 0, expectedOutputs: []string{ "Servers: (1/1)", @@ -189,19 +189,19 @@ func TestDebug_ClientToServer(t *testing.T) { var cases = testCases{ { name: "testAgent api server", - args: []string{"-address", url, "-duration", "250ms", "-server-id", "all", "-node-id", "all"}, + args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"}, expectedCode: 0, expectedOutputs: []string{"Created debug archive"}, }, { name: "server address", - args: []string{"-address", addrServer, "-duration", "250ms", "-server-id", "all", "-node-id", "all"}, + args: []string{"-address", addrServer, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"}, expectedCode: 0, expectedOutputs: []string{"Created debug archive"}, }, { name: "client1 address - verify no SIGSEGV panic", - args: []string{"-address", addrClient1, "-duration", "250ms", "-server-id", "all", "-node-id", "all"}, + args: []string{"-address", addrClient1, "-duration", "250ms", "-interval", "250ms", "-server-id", "all", "-node-id", "all"}, expectedCode: 0, expectedOutputs: []string{"Created debug archive"}, }, @@ -218,7 +218,7 @@ func TestDebug_SingleServer(t *testing.T) { var cases = testCases{ { name: "address=api, server-id=leader", - args: []string{"-address", url, "-duration", "250ms", "-server-id", "leader"}, + args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader"}, expectedCode: 0, expectedOutputs: []string{ "Servers: (1/1)", @@ -229,7 +229,7 @@ func TestDebug_SingleServer(t *testing.T) { }, { name: "address=api, server-id=all", - args: []string{"-address", url, "-duration", "250ms", "-server-id", "all"}, + args: []string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all"}, expectedCode: 0, expectedOutputs: []string{ "Servers: (1/1)", @@ -261,7 +261,7 @@ func TestDebug_Failures(t *testing.T) { }, { name: "Fails missing node ids", - args: []string{"-node-id", "abc,def", "-duration", "250ms"}, + args: []string{"-node-id", "abc,def", "-duration", "250ms", "-interval", "250ms"}, expectedCode: 1, }, { @@ -274,6 +274,11 @@ func TestDebug_Failures(t *testing.T) { args: []string{"-interval", "bar"}, expectedCode: 1, }, + { + name: "Fails intervals greater than duration", + args: []string{"-duration", "5m", "-interval", "10m"}, + expectedCode: 1, + }, { name: "Fails bad address", args: []string{"-address", url + "bogus"}, @@ -307,7 +312,7 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) { cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} // Debug on the leader and all client nodes - code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "leader", "-node-id", "all", "-output", os.TempDir()}) + code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "leader", "-node-id", "all", "-output", os.TempDir()}) assert.Equal(t, 0, code) // Bad plugin name should be escaped before it reaches the sandbox test @@ -391,7 +396,7 @@ func TestDebug_ExistingOutput(t *testing.T) { os.MkdirAll(path, 0755) defer os.Remove(path) - code := cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms"}) + code := cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms", "-interval", "50ms"}) require.Equal(t, 2, code) } @@ -413,7 +418,7 @@ func TestDebug_Fail_Pprof(t *testing.T) { cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} // Debug on client - node class = "clienta" - code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "all"}) + code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-interval", "250ms", "-server-id", "all"}) assert.Equal(t, 0, code) // Pprof failure isn't fatal require.Contains(t, ui.OutputWriter.String(), "Starting debugger")