diff --git a/.changelog/11678.txt b/.changelog/11678.txt
new file mode 100644
index 000000000000..c82272549fb4
--- /dev/null
+++ b/.changelog/11678.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+cli: Fixed a bug where the `-stale` flag was not respected by `nomad operator debug`
+```
diff --git a/api/agent.go b/api/agent.go
index 39bfb95443cd..424e9ad95d28 100644
--- a/api/agent.go
+++ b/api/agent.go
@@ -147,6 +147,17 @@ func (a *Agent) Members() (*ServerMembers, error) {
 	return resp, nil
 }
 
+// MembersOpts is used to query all of the known server members
+// with the ability to set QueryOptions
+func (a *Agent) MembersOpts(opts *QueryOptions) (*ServerMembers, error) {
+	var resp *ServerMembers
+	_, err := a.client.query("/v1/agent/members", &resp, opts)
+	if err != nil {
+		return nil, err
+	}
+	return resp, nil
+}
+
 // ForceLeave is used to eject an existing node from the cluster.
 func (a *Agent) ForceLeave(node string) error {
 	_, err := a.client.write("/v1/agent/force-leave?node="+node, nil, nil, nil)
diff --git a/api/nodes.go b/api/nodes.go
index 488f5eb625df..2c86651dbb9e 100644
--- a/api/nodes.go
+++ b/api/nodes.go
@@ -49,6 +49,15 @@ func (n *Nodes) PrefixList(prefix string) ([]*NodeListStub, *QueryMeta, error) {
 	return n.List(&QueryOptions{Prefix: prefix})
 }
 
+func (n *Nodes) PrefixListOpts(prefix string, opts *QueryOptions) ([]*NodeListStub, *QueryMeta, error) {
+	if opts == nil {
+		opts = &QueryOptions{Prefix: prefix}
+	} else {
+		opts.Prefix = prefix
+	}
+	return n.List(opts)
+}
+
 // Info is used to query a specific node by its ID.
 func (n *Nodes) Info(nodeID string, q *QueryOptions) (*Node, *QueryMeta, error) {
 	var resp Node
diff --git a/api/regions.go b/api/regions.go
index c94ce297a892..98df011d04e0 100644
--- a/api/regions.go
+++ b/api/regions.go
@@ -12,7 +12,8 @@ func (c *Client) Regions() *Regions {
 	return &Regions{client: c}
 }
 
-// List returns a list of all of the regions.
+// List returns a list of all of the regions from the server
+// that serves the request. It is never forwarded to a leader.
 func (r *Regions) List() ([]string, error) {
 	var resp []string
 	if _, err := r.client.query("/v1/regions", &resp, nil); err != nil {
diff --git a/command/agent/agent_endpoint.go b/command/agent/agent_endpoint.go
index dc4afd1ef6c7..798d65487f80 100644
--- a/command/agent/agent_endpoint.go
+++ b/command/agent/agent_endpoint.go
@@ -71,7 +71,9 @@ func (s *HTTPServer) AgentSelfRequest(resp http.ResponseWriter, req *http.Reques
 		member = srv.LocalMember()
 		aclObj, err = srv.ResolveToken(secret)
 	} else {
-		// Not a Server; use the Client for token resolution
+		// Not a Server, so use the Client for token resolution. Note
+		// this gets forwarded to a server with AllowStale = true if
+		// the local ACL cache TTL has expired (30s by default)
 		aclObj, err = s.agent.Client().ResolveToken(secret)
 	}
 
@@ -677,7 +679,9 @@ func (s *HTTPServer) AgentHostRequest(resp http.ResponseWriter, req *http.Reques
 		aclObj, err = srv.ResolveToken(secret)
 		enableDebug = srv.GetConfig().EnableDebug
 	} else {
-		// Not a Server; use the Client for token resolution
+		// Not a Server, so use the Client for token resolution. Note
+		// this gets forwarded to a server with AllowStale = true if
+		// the local ACL cache TTL has expired (30s by default)
 		aclObj, err = s.agent.Client().ResolveToken(secret)
 		enableDebug = s.agent.Client().GetConfig().EnableDebug
 	}
diff --git a/command/operator_debug.go b/command/operator_debug.go
index 245be9086f1b..21b503c7906d 100644
--- a/command/operator_debug.go
+++ b/command/operator_debug.go
@@ -38,7 +38,6 @@ type OperatorDebugCommand struct {
 	interval      time.Duration
 	pprofDuration time.Duration
 	logLevel      string
-	stale         bool
 	maxNodes      int
 	nodeClass     string
 	nodeIDs       []string
@@ -48,6 +47,7 @@ type OperatorDebugCommand struct {
 	manifest []string
 	ctx      context.Context
 	cancel   context.CancelFunc
+	opts     *api.QueryOptions
 }
 
 const (
@@ -140,7 +140,7 @@ Debug Options:
      nodes at "log-level". Defaults to 2m.
 
   -interval=
-     The interval between snapshots of the Nomad state. Set interval equal to 
+     The interval between snapshots of the Nomad state. Set interval equal to
      duration to capture a single snapshot. Defaults to 30s.
 
   -log-level=
@@ -172,7 +172,7 @@ Debug Options:
      necessary to get the configuration from a non-leader server.
 
   -output=
-     Path to the parent directory of the output directory. If specified, no 
+     Path to the parent directory of the output directory. If specified, no
      archive is built. Defaults to the current directory.
 `
 	return strings.TrimSpace(helpText)
@@ -211,7 +211,12 @@ func NodePredictor(factory ApiClientFactory) complete.Predictor {
 			return nil
 		}
 
-		resp, _, err := client.Search().PrefixSearch(a.Last, contexts.Nodes, nil)
+		// note we can't use the -stale flag here because we're in the
+		// predictor, but a stale query should be safe for prediction;
+		// we also can't use region forwarding because we can't rely
+		// on the server being up
+		resp, _, err := client.Search().PrefixSearch(
+			a.Last, contexts.Nodes, &api.QueryOptions{AllowStale: true})
 		if err != nil {
 			return []string{}
 		}
@@ -228,7 +233,11 @@ func NodeClassPredictor(factory ApiClientFactory) complete.Predictor {
 			return nil
 		}
 
-		nodes, _, err := client.Nodes().List(nil) // TODO: should be *api.QueryOptions that matches region
+		// note we can't use the -stale flag here because we're in the
+		// predictor, but a stale query should be safe for prediction;
+		// we also can't use region forwarding because we can't rely
+		// on the server being up
+		nodes, _, err := client.Nodes().List(&api.QueryOptions{AllowStale: true})
 		if err != nil {
 			return []string{}
 		}
@@ -259,7 +268,12 @@ func ServerPredictor(factory ApiClientFactory) complete.Predictor {
 		if err != nil {
 			return nil
 		}
-		members, err := client.Agent().Members()
+
+		// note we can't use the -stale flag here because we're in the
+		// predictor, but a stale query should be safe for prediction;
+		// we also can't use region forwarding because we can't rely
+		// on the server being up
+		members, err := client.Agent().MembersOpts(&api.QueryOptions{AllowStale: true})
 		if err != nil {
 			return []string{}
 		}
@@ -276,6 +290,15 @@ func ServerPredictor(factory ApiClientFactory) complete.Predictor {
 	})
 }
 
+// queryOpts returns a copy of the shared api.QueryOptions so
+// that api package methods can safely modify the options
+func (c *OperatorDebugCommand) queryOpts() *api.QueryOptions {
+	qo := new(api.QueryOptions)
+	*qo = *c.opts
+	qo.Params = helper.CopyMapStringString(c.opts.Params)
+	return qo
+}
+
 func (c *OperatorDebugCommand) Name() string { return "debug" }
 
 func (c *OperatorDebugCommand) Run(args []string) int {
@@ -284,6 +307,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 
 	var duration, interval, output, pprofDuration string
 	var nodeIDs, serverIDs string
+	var allowStale bool
 
 	flags.StringVar(&duration, "duration", "2m", "")
 	flags.StringVar(&interval, "interval", "30s", "")
@@ -292,7 +316,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 	flags.StringVar(&c.nodeClass, "node-class", "", "")
 	flags.StringVar(&nodeIDs, "node-id", "all", "")
 	flags.StringVar(&serverIDs, "server-id", "all", "")
-	flags.BoolVar(&c.stale, "stale", false, "")
+	flags.BoolVar(&allowStale, "stale", false, "")
 	flags.StringVar(&output, "output", "", "")
 	flags.StringVar(&pprofDuration, "pprof-duration", "1s", "")
 
@@ -403,6 +427,12 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 		return 1
 	}
 
+	c.opts = &api.QueryOptions{
+		Region:     c.Meta.region,
+		AllowStale: allowStale,
+		AuthToken:  c.Meta.token,
+	}
+
 	// Search all nodes If a node class is specified without a list of node id prefixes
 	if c.nodeClass != "" && nodeIDs == "" {
 		nodeIDs = "all"
@@ -421,7 +451,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 			// Capture from nodes starting with prefix id
 			id = sanitizeUUIDPrefix(id)
 		}
-		nodes, _, err := client.Nodes().PrefixList(id)
+		nodes, _, err := client.Nodes().PrefixListOpts(id, c.queryOpts())
 		if err != nil {
 			c.Ui.Error(fmt.Sprintf("Error querying node info: %s", err))
 			return 1
@@ -466,7 +496,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 	}
 
 	// Resolve servers
-	members, err := client.Agent().Members()
+	members, err := client.Agent().MembersOpts(c.queryOpts())
 	if err != nil {
 		c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
 		return 1
@@ -559,8 +589,7 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
 	self, err := client.Agent().Self()
 	c.writeJSON(clusterDir, "agent-self.json", self, err)
 
-	var qo *api.QueryOptions
-	namespaces, _, err := client.Namespaces().List(qo)
+	namespaces, _, err := client.Namespaces().List(c.queryOpts())
 	c.writeJSON(clusterDir, "namespaces.json", namespaces, err)
 
 	regions, err := client.Regions().List()
@@ -635,6 +664,7 @@ func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *
 			idKey:       nodeID,
 			"log_level": c.logLevel,
 		},
+		AllowStale: c.queryOpts().AllowStale,
 	}
 	outCh, errCh := client.Agent().Monitor(c.ctx.Done(), &qo)
 
@@ -672,9 +702,9 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
 	var host *api.HostDataResponse
 	var err error
 	if path == serverDir {
-		host, err = client.Agent().Host(id, "", nil)
+		host, err = client.Agent().Host(id, "", c.queryOpts())
 	} else {
-		host, err = client.Agent().Host("", id, nil)
+		host, err = client.Agent().Host("", id, c.queryOpts())
 	}
 
 	if err != nil {
@@ -714,7 +744,7 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
 
 	path = filepath.Join(path, id)
 
-	bs, err := client.Agent().CPUProfile(opts, nil)
+	bs, err := client.Agent().CPUProfile(opts, c.queryOpts())
 	if err != nil {
 		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof profile.prof, err: %v", path, err))
 		if structs.IsErrPermissionDenied(err) {
@@ -763,7 +793,7 @@ func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opt
 		fileName = fmt.Sprintf("%s-debug%d.txt", profile, opts.Debug)
 	}
 
-	bs, err := retrievePprofProfile(profile, opts, client)
+	bs, err := retrievePprofProfile(profile, opts, client, c.queryOpts())
 	if err != nil {
 		c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %s", path, fileName, err.Error()))
 	}
@@ -774,22 +804,23 @@ func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opt
 	}
 }
 
-// retrievePprofProfile gets a pprof profile from the node specified in opts using the API client
-func retrievePprofProfile(profile string, opts api.PprofOptions, client *api.Client) (bs []byte, err error) {
+// retrievePprofProfile gets a pprof profile from the node specified
+// in opts using the API client
+func retrievePprofProfile(profile string, opts api.PprofOptions, client *api.Client, qopts *api.QueryOptions) (bs []byte, err error) {
 	switch profile {
 	case "cpuprofile":
-		bs, err = client.Agent().CPUProfile(opts, nil)
+		bs, err = client.Agent().CPUProfile(opts, qopts)
 	case "trace":
-		bs, err = client.Agent().Trace(opts, nil)
+		bs, err = client.Agent().Trace(opts, qopts)
 	default:
-		bs, err = client.Agent().Lookup(profile, opts, nil)
+		bs, err = client.Agent().Lookup(profile, opts, qopts)
 	}
 
 	return bs, err
 }
 
-// collectPeriodic runs for duration, capturing the cluster state every interval. It flushes and stops
-// the monitor requests
+// collectPeriodic runs for duration, capturing the cluster state
+// every interval. It flushes and stops the monitor requests
 func (c *OperatorDebugCommand) collectPeriodic(client *api.Client) {
 	duration := time.After(c.duration)
 	// Set interval to 0 so that we immediately execute, wait the interval next time
@@ -820,61 +851,60 @@ func (c *OperatorDebugCommand) collectPeriodic(client *api.Client) {
 
 // collectOperator captures some cluster meta information
 func (c *OperatorDebugCommand) collectOperator(dir string, client *api.Client) {
-	rc, err := client.Operator().RaftGetConfiguration(nil)
+	rc, err := client.Operator().RaftGetConfiguration(c.queryOpts())
 	c.writeJSON(dir, "operator-raft.json", rc, err)
 
-	sc, _, err := client.Operator().SchedulerGetConfiguration(nil)
+	sc, _, err := client.Operator().SchedulerGetConfiguration(c.queryOpts())
 	c.writeJSON(dir, "operator-scheduler.json", sc, err)
 
-	ah, _, err := client.Operator().AutopilotServerHealth(nil)
+	ah, _, err := client.Operator().AutopilotServerHealth(c.queryOpts())
 	c.writeJSON(dir, "operator-autopilot-health.json", ah, err)
 
-	lic, _, err := client.Operator().LicenseGet(nil)
+	lic, _, err := client.Operator().LicenseGet(c.queryOpts())
 	c.writeJSON(dir, "license.json", lic, err)
 }
 
 // collectNomad captures the nomad cluster state
 func (c *OperatorDebugCommand) collectNomad(dir string, client *api.Client) error {
-	var qo *api.QueryOptions
 
-	js, _, err := client.Jobs().List(qo)
+	js, _, err := client.Jobs().List(c.queryOpts())
 	c.writeJSON(dir, "jobs.json", js, err)
 
-	ds, _, err := client.Deployments().List(qo)
+	ds, _, err := client.Deployments().List(c.queryOpts())
 	c.writeJSON(dir, "deployments.json", ds, err)
 
-	es, _, err := client.Evaluations().List(qo)
+	es, _, err := client.Evaluations().List(c.queryOpts())
 	c.writeJSON(dir, "evaluations.json", es, err)
 
-	as, _, err := client.Allocations().List(qo)
+	as, _, err := client.Allocations().List(c.queryOpts())
 	c.writeJSON(dir, "allocations.json", as, err)
 
-	ns, _, err := client.Nodes().List(qo)
+	ns, _, err := client.Nodes().List(c.queryOpts())
 	c.writeJSON(dir, "nodes.json", ns, err)
 
 	// CSI Plugins - /v1/plugins?type=csi
-	ps, _, err := client.CSIPlugins().List(qo)
+	ps, _, err := client.CSIPlugins().List(c.queryOpts())
 	c.writeJSON(dir, "csi-plugins.json", ps, err)
 
 	// CSI Plugin details - /v1/plugin/csi/:plugin_id
 	for _, p := range ps {
-		csiPlugin, _, err := client.CSIPlugins().Info(p.ID, qo)
+		csiPlugin, _, err := client.CSIPlugins().Info(p.ID, c.queryOpts())
 		csiPluginFileName := fmt.Sprintf("csi-plugin-id-%s.json", p.ID)
fmt.Sprintf("csi-plugin-id-%s.json", p.ID) c.writeJSON(dir, csiPluginFileName, csiPlugin, err) } // CSI Volumes - /v1/volumes?type=csi - csiVolumes, _, err := client.CSIVolumes().List(qo) + csiVolumes, _, err := client.CSIVolumes().List(c.queryOpts()) c.writeJSON(dir, "csi-volumes.json", csiVolumes, err) // CSI Volume details - /v1/volumes/csi/:volume-id for _, v := range csiVolumes { - csiVolume, _, err := client.CSIVolumes().Info(v.ID, qo) + csiVolume, _, err := client.CSIVolumes().Info(v.ID, c.queryOpts()) csiFileName := fmt.Sprintf("csi-volume-id-%s.json", v.ID) c.writeJSON(dir, csiFileName, csiVolume, err) } - metrics, _, err := client.Operator().MetricsSummary(qo) + metrics, _, err := client.Operator().MetricsSummary(c.queryOpts()) c.writeJSON(dir, "metrics.json", metrics, err) return nil diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index c61419faf15c..50f3a8756eee 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -740,3 +740,54 @@ func TestDebug_CollectVault(t *testing.T) { require.FileExists(t, filepath.Join(testDir, "test", "vault-sys-health.json")) } + +// TestDebug_StaleLeadership verifies that APIs that are required to +// complete a debug run have their query options configured with the +// -stale flag +func TestDebug_StaleLeadership(t *testing.T) { + srv, _, url := testServerWithoutLeader(t, false, nil) + addrServer := srv.HTTPAddr() + + t.Logf("[TEST] testAgent api address: %s", url) + t.Logf("[TEST] Server api address: %s", addrServer) + + var cases = testCases{ + { + name: "no leader without stale flag", + args: []string{"-address", addrServer, + "-duration", "250ms", "-interval", "250ms", + "-server-id", "all", "-node-id", "all"}, + expectedCode: 1, + }, + { + name: "no leader with stale flag", + args: []string{ + "-address", addrServer, + "-duration", "250ms", "-interval", "250ms", + "-server-id", "all", "-node-id", "all", + "-stale"}, + expectedCode: 0, + expectedOutputs: []string{"Created debug archive"}, + }, + } + + runTestCases(t, cases) +} + +func testServerWithoutLeader(t *testing.T, runClient bool, cb func(*agent.Config)) (*agent.TestAgent, *api.Client, string) { + // Make a new test server + a := agent.NewTestAgent(t, t.Name(), func(config *agent.Config) { + config.Client.Enabled = runClient + config.Server.Enabled = true + config.Server.NumSchedulers = helper.IntToPtr(0) + config.Server.BootstrapExpect = 3 + + if cb != nil { + cb(config) + } + }) + t.Cleanup(func() { a.Shutdown() }) + + c := a.Client() + return a, c, a.HTTPAddr() +}