diff --git a/.changelog/11307.txt b/.changelog/11307.txt
new file mode 100644
index 000000000000..7c558320d7df
--- /dev/null
+++ b/.changelog/11307.txt
@@ -0,0 +1,3 @@
+```release-note:breaking-change
+cli: Renamed folders in `nomad operator debug` bundle for clarity
+```
diff --git a/command/operator_debug.go b/command/operator_debug.go
index 1646f062d8f2..183722f35a5e 100644
--- a/command/operator_debug.go
+++ b/command/operator_debug.go
@@ -49,7 +49,11 @@ type OperatorDebugCommand struct {
 }
 
 const (
-    userAgent = "nomad operator debug"
+    userAgent   = "nomad operator debug"
+    clusterDir  = "cluster"
+    clientDir   = "client"
+    serverDir   = "server"
+    intervalDir = "interval"
 )
 
 func (c *OperatorDebugCommand) Help() string {
@@ -458,7 +462,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
     }
 
     // Write complete list of server members to file
-    c.writeJSON("version", "members.json", members, err)
+    c.writeJSON(clusterDir, "members.json", members, err)
 
     // Filter for servers matching criteria
     c.serverIDs, err = filterServerMembers(members, serverIDs, c.region)
@@ -538,18 +542,17 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 }
 
 // collect collects data from our endpoints and writes the archive bundle
 func (c *OperatorDebugCommand) collect(client *api.Client) error {
-    // Version contains cluster meta information
-    dir := "version"
+    // Collect cluster data
 
     self, err := client.Agent().Self()
-    c.writeJSON(dir, "agent-self.json", self, err)
+    c.writeJSON(clusterDir, "agent-self.json", self, err)
 
     var qo *api.QueryOptions
     namespaces, _, err := client.Namespaces().List(qo)
-    c.writeJSON(dir, "namespaces.json", namespaces, err)
+    c.writeJSON(clusterDir, "namespaces.json", namespaces, err)
 
     regions, err := client.Regions().List()
-    c.writeJSON(dir, "regions.json", regions, err)
+    c.writeJSON(clusterDir, "regions.json", regions, err)
     // Fetch data directly from consul and vault. Ignore errors
     var consul, vault string
@@ -582,8 +585,8 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
         }
     }
 
-    c.collectConsul(dir, consul)
-    c.collectVault(dir, vault)
+    c.collectConsul(clusterDir, consul)
+    c.collectVault(clusterDir, vault)
     c.collectAgentHosts(client)
     c.collectPprofs(client)
@@ -616,11 +619,11 @@ func (c *OperatorDebugCommand) mkdir(paths ...string) error {
 
 // startMonitors starts go routines for each node and client
 func (c *OperatorDebugCommand) startMonitors(client *api.Client) {
     for _, id := range c.nodeIDs {
-        go c.startMonitor("client", "node_id", id, client)
+        go c.startMonitor(clientDir, "node_id", id, client)
     }
 
     for _, id := range c.serverIDs {
-        go c.startMonitor("server", "server_id", id, client)
+        go c.startMonitor(serverDir, "server_id", id, client)
     }
 }
@@ -664,11 +667,11 @@ func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *
 
 // collectAgentHosts calls collectAgentHost for each selected node
 func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) {
     for _, n := range c.nodeIDs {
-        c.collectAgentHost("client", n, client)
+        c.collectAgentHost(clientDir, n, client)
     }
 
     for _, n := range c.serverIDs {
-        c.collectAgentHost("server", n, client)
+        c.collectAgentHost(serverDir, n, client)
     }
 }
@@ -676,7 +679,7 @@ func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) {
 func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Client) {
     var host *api.HostDataResponse
     var err error
-    if path == "server" {
+    if path == serverDir {
         host, err = client.Agent().Host(id, "", nil)
     } else {
         host, err = client.Agent().Host("", id, nil)
@@ -699,11 +702,11 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
 
 // collectPprofs captures the /agent/pprof for each listed node
 func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
     for _, n := range c.nodeIDs {
-        c.collectPprof("client", n, client)
+        c.collectPprof(clientDir, n, client)
     }
 
     for _, n := range c.serverIDs {
-        c.collectPprof("server", n, client)
+        c.collectPprof(serverDir, n, client)
     }
 }
@@ -711,7 +714,7 @@ func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
 func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) {
     pprofDurationSeconds := int(c.pprofDuration.Seconds())
     opts := api.PprofOptions{Seconds: pprofDurationSeconds}
-    if path == "server" {
+    if path == serverDir {
         opts.ServerID = id
     } else {
         opts.NodeID = id
@@ -810,7 +813,7 @@ func (c *OperatorDebugCommand) collectPeriodic(client *api.Client) {
 
         case <-interval:
             name = fmt.Sprintf("%04d", intervalCount)
-            dir = filepath.Join("nomad", name)
+            dir = filepath.Join(intervalDir, name)
             c.Ui.Output(fmt.Sprintf(" Capture interval %s", name))
             c.collectNomad(dir, client)
             c.collectOperator(dir, client)
@@ -859,7 +862,7 @@ func (c *OperatorDebugCommand) collectNomad(dir string, client *api.Client) erro
 
     // CSI Plugins - /v1/plugins?type=csi
     ps, _, err := client.CSIPlugins().List(qo)
-    c.writeJSON(dir, "plugins.json", ps, err)
+    c.writeJSON(dir, "csi-plugins.json", ps, err)
 
     // CSI Plugin details - /v1/plugin/csi/:plugin_id
     for _, p := range ps {
diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go
index 92b5fdb6076b..8ecea5d0037d 100644
--- a/command/operator_debug_test.go
+++ b/command/operator_debug_test.go
@@ -346,68 +346,126 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
     var pluginFiles []string
     for _, pluginName := range cases {
         pluginFile := fmt.Sprintf("csi-plugin-id-%s.json", helper.CleanFilename(pluginName, "_"))
-        pluginFile = filepath.Join(path, "nomad", "0000", pluginFile)
+        pluginFile = filepath.Join(path, intervalDir, "0000", pluginFile)
         pluginFiles = append(pluginFiles, pluginFile)
     }
 
     testutil.WaitForFiles(t, pluginFiles)
 }
 
+func buildPathSlice(path string, files []string) []string {
+    paths := []string{}
+    for _, file := range files {
+        paths = append(paths, filepath.Join(path, file))
+    }
+    return paths
+}
+
 func TestDebug_CapturedFiles(t *testing.T) {
-    srv, _, url := testServer(t, false, nil)
+    srv, _, url := testServer(t, true, nil)
     testutil.WaitForLeader(t, srv.Agent.RPC)
 
+    serverNodeName := srv.Config.NodeName
+    region := srv.Config.Region
+    serverName := fmt.Sprintf("%s.%s", serverNodeName, region)
+    clientID := srv.Agent.Client().NodeID()
+
+    t.Logf("serverName: %s, clientID, %s", serverName, clientID)
+
+    // Setup file slices
+    clusterFiles := []string{
+        "agent-self.json",
+        "consul-agent-members.json",
+        "consul-agent-self.json",
+        "members.json",
+        "namespaces.json",
+        "regions.json",
+        "vault-sys-health.json",
+    }
+
+    pprofFiles := []string{
+        "allocs.prof",
+        "goroutine-debug1.txt",
+        "goroutine-debug2.txt",
+        "goroutine.prof",
+        "heap.prof",
+        "profile.prof",
+        "threadcreate.prof",
+        "trace.prof",
+    }
+
+    clientFiles := []string{
+        "agent-host.json",
+        "monitor.log",
+    }
+    clientFiles = append(clientFiles, pprofFiles...)
+
+    serverFiles := []string{
+        "agent-host.json",
+        "monitor.log",
+    }
+    serverFiles = append(serverFiles, pprofFiles...)
+
+    intervalFiles := []string{
+        "allocations.json",
+        "csi-plugins.json",
+        "csi-volumes.json",
+        "deployments.json",
+        "evaluations.json",
+        "jobs.json",
+        "license.json",
+        "metrics.json",
+        "nodes.json",
+        "operator-autopilot-health.json",
+        "operator-raft.json",
+        "operator-scheduler.json",
+    }
+
     ui := cli.NewMockUi()
     cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
 
     code := cmd.Run([]string{
         "-address", url,
         "-output", os.TempDir(),
-        "-server-id", "leader",
+        "-server-id", serverName,
+        "-node-id", clientID,
         "-duration", "1300ms",
         "-interval", "600ms",
     })
 
+    // Get capture directory
     path := cmd.collectDir
     defer os.Remove(path)
 
+    // There should be no errors
     require.Empty(t, ui.ErrorWriter.String())
     require.Equal(t, 0, code)
     ui.ErrorWriter.Reset()
 
-    serverFiles := []string{
-        // Version is always captured
-        filepath.Join(path, "version", "agent-self.json"),
-
-        // Consul and Vault contain results or errors
-        filepath.Join(path, "version", "consul-agent-self.json"),
-        filepath.Join(path, "version", "vault-sys-health.json"),
-
-        // Monitor files are only created when selected
-        filepath.Join(path, "server", "leader", "monitor.log"),
-
-        // Pprof profiles
-        filepath.Join(path, "server", "leader", "profile.prof"),
-        filepath.Join(path, "server", "leader", "trace.prof"),
-        filepath.Join(path, "server", "leader", "goroutine.prof"),
-        filepath.Join(path, "server", "leader", "goroutine-debug1.txt"),
-        filepath.Join(path, "server", "leader", "goroutine-debug2.txt"),
-        filepath.Join(path, "server", "leader", "heap.prof"),
-        filepath.Join(path, "server", "leader", "allocs.prof"),
-        filepath.Join(path, "server", "leader", "threadcreate.prof"),
-
-        // Multiple snapshots are collected, 00 is always created
-        filepath.Join(path, "nomad", "0000", "jobs.json"),
-        filepath.Join(path, "nomad", "0000", "nodes.json"),
-        filepath.Join(path, "nomad", "0000", "metrics.json"),
-
-        // Multiple snapshots are collected, 01 requires two intervals
-        filepath.Join(path, "nomad", "0001", "jobs.json"),
-        filepath.Join(path, "nomad", "0001", "nodes.json"),
-        filepath.Join(path, "nomad", "0001", "metrics.json"),
-    }
-
-    testutil.WaitForFilesUntil(t, serverFiles, 2*time.Minute)
+    // Verify cluster files
+    clusterPaths := buildPathSlice(cmd.path(clusterDir), clusterFiles)
+    t.Logf("Waiting for cluster files in path: %s", clusterDir)
+    testutil.WaitForFilesUntil(t, clusterPaths, 2*time.Minute)
+
+    // Verify client files
+    clientPaths := buildPathSlice(cmd.path(clientDir, clientID), clientFiles)
+    t.Logf("Waiting for client files in path: %s", clientDir)
+    testutil.WaitForFilesUntil(t, clientPaths, 2*time.Minute)
+
+    // Verify server files
+    serverPaths := buildPathSlice(cmd.path(serverDir, serverName), serverFiles)
+    t.Logf("Waiting for server files in path: %s", serverDir)
+    testutil.WaitForFilesUntil(t, serverPaths, 2*time.Minute)
+
+    // Verify interval 0000 files
+    intervalPaths0 := buildPathSlice(cmd.path(intervalDir, "0000"), intervalFiles)
+    t.Logf("Waiting for interval 0000 files in path: %s", intervalDir)
+    testutil.WaitForFilesUntil(t, intervalPaths0, 2*time.Minute)
+
+    // Verify interval 0001 files
+    intervalPaths1 := buildPathSlice(cmd.path(intervalDir, "0001"), intervalFiles)
+    t.Logf("Waiting for interval 0001 files in path: %s", intervalDir)
+    testutil.WaitForFilesUntil(t, intervalPaths1, 2*time.Minute)
 }
 
 func TestDebug_ExistingOutput(t *testing.T) {
diff --git a/testutil/wait.go b/testutil/wait.go
index fee5b6b05921..4f9965a6a82e 100644
--- a/testutil/wait.go
+++ b/testutil/wait.go
@@ -241,7 +241,7 @@ func WaitForRunning(t testing.TB, rpc rpcFn, job *structs.Job) []*structs.AllocL
 // WaitForFiles blocks until all the files in the slice are present
 func WaitForFiles(t testing.TB, files []string) {
     WaitForResult(func() (bool, error) {
-        return FilesExist(files), nil
+        return FilesExist(files)
     }, func(err error) {
         t.Fatalf("missing expected files: %v", err)
     })
@@ -250,18 +250,18 @@ func WaitForFiles(t testing.TB, files []string) {
 // WaitForFilesUntil blocks until duration or all the files in the slice are present
 func WaitForFilesUntil(t testing.TB, files []string, until time.Duration) {
     WaitForResultUntil(until, func() (bool, error) {
-        return FilesExist(files), nil
+        return FilesExist(files)
     }, func(err error) {
         t.Fatalf("missing expected files: %v", err)
     })
 }
 
 // FilesExist verifies all files in the slice are present
-func FilesExist(files []string) bool {
+func FilesExist(files []string) (bool, error) {
     for _, f := range files {
         if _, err := os.Stat(f); os.IsNotExist(err) {
-            return false
+            return false, fmt.Errorf("expected file not found: %v", f)
         }
     }
-    return true
+    return true, nil
 }