Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cli: rename paths in debug bundle for clarity #11307

Merged
merged 5 commits into from
Oct 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/11307.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:breaking-change
cli: Renamed folders in `nomad operator debug` bundle for clarity
```
41 changes: 22 additions & 19 deletions command/operator_debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,11 @@ type OperatorDebugCommand struct {
}

const (
userAgent = "nomad operator debug"
userAgent = "nomad operator debug"
clusterDir = "cluster"
clientDir = "client"
serverDir = "server"
intervalDir = "interval"
)

func (c *OperatorDebugCommand) Help() string {
Expand Down Expand Up @@ -458,7 +462,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}

// Write complete list of server members to file
c.writeJSON("version", "members.json", members, err)
c.writeJSON(clusterDir, "members.json", members, err)

// Filter for servers matching criteria
c.serverIDs, err = filterServerMembers(members, serverIDs, c.region)
Expand Down Expand Up @@ -538,18 +542,17 @@ func (c *OperatorDebugCommand) Run(args []string) int {

// collect collects data from our endpoints and writes the archive bundle
func (c *OperatorDebugCommand) collect(client *api.Client) error {
// Version contains cluster meta information
dir := "version"
// Collect cluster data

self, err := client.Agent().Self()
c.writeJSON(dir, "agent-self.json", self, err)
c.writeJSON(clusterDir, "agent-self.json", self, err)

var qo *api.QueryOptions
namespaces, _, err := client.Namespaces().List(qo)
c.writeJSON(dir, "namespaces.json", namespaces, err)
c.writeJSON(clusterDir, "namespaces.json", namespaces, err)

regions, err := client.Regions().List()
c.writeJSON(dir, "regions.json", regions, err)
c.writeJSON(clusterDir, "regions.json", regions, err)

// Fetch data directly from consul and vault. Ignore errors
var consul, vault string
Expand Down Expand Up @@ -582,8 +585,8 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
}
}

c.collectConsul(dir, consul)
c.collectVault(dir, vault)
c.collectConsul(clusterDir, consul)
c.collectVault(clusterDir, vault)
c.collectAgentHosts(client)
c.collectPprofs(client)

Expand Down Expand Up @@ -616,11 +619,11 @@ func (c *OperatorDebugCommand) mkdir(paths ...string) error {
// startMonitors launches one monitor goroutine per selected client node and
// one per selected server. Client monitors are started with clientDir as
// their path argument and "node_id" as the ID key; server monitors use
// serverDir and "server_id". Each ID is started exactly once. The goroutines
// are not waited on here.
func (c *OperatorDebugCommand) startMonitors(client *api.Client) {
	for _, id := range c.nodeIDs {
		go c.startMonitor(clientDir, "node_id", id, client)
	}

	for _, id := range c.serverIDs {
		go c.startMonitor(serverDir, "server_id", id, client)
	}
}

Expand Down Expand Up @@ -664,19 +667,19 @@ func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *
// collectAgentHosts calls collectAgentHost once for every selected client
// node (under clientDir) and once for every selected server (under
// serverDir).
func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) {
	for _, n := range c.nodeIDs {
		c.collectAgentHost(clientDir, n, client)
	}

	for _, n := range c.serverIDs {
		c.collectAgentHost(serverDir, n, client)
	}
}

// collectAgentHost gets the agent host data
func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Client) {
var host *api.HostDataResponse
var err error
if path == "server" {
if path == serverDir {
host, err = client.Agent().Host(id, "", nil)
} else {
host, err = client.Agent().Host("", id, nil)
Expand All @@ -699,19 +702,19 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
// collectPprofs captures the /agent/pprof data for each listed node by
// calling collectPprof once per selected client node (under clientDir) and
// once per selected server (under serverDir).
func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
	for _, n := range c.nodeIDs {
		c.collectPprof(clientDir, n, client)
	}

	for _, n := range c.serverIDs {
		c.collectPprof(serverDir, n, client)
	}
}

// collectPprof captures pprof data for the node
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) {
pprofDurationSeconds := int(c.pprofDuration.Seconds())
opts := api.PprofOptions{Seconds: pprofDurationSeconds}
if path == "server" {
if path == serverDir {
opts.ServerID = id
} else {
opts.NodeID = id
Expand Down Expand Up @@ -810,7 +813,7 @@ func (c *OperatorDebugCommand) collectPeriodic(client *api.Client) {

case <-interval:
name = fmt.Sprintf("%04d", intervalCount)
dir = filepath.Join("nomad", name)
dir = filepath.Join(intervalDir, name)
c.Ui.Output(fmt.Sprintf(" Capture interval %s", name))
c.collectNomad(dir, client)
c.collectOperator(dir, client)
Expand Down Expand Up @@ -859,7 +862,7 @@ func (c *OperatorDebugCommand) collectNomad(dir string, client *api.Client) erro

// CSI Plugins - /v1/plugins?type=csi
ps, _, err := client.CSIPlugins().List(qo)
c.writeJSON(dir, "plugins.json", ps, err)
c.writeJSON(dir, "csi-plugins.json", ps, err)

// CSI Plugin details - /v1/plugin/csi/:plugin_id
for _, p := range ps {
Expand Down
130 changes: 94 additions & 36 deletions command/operator_debug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -346,68 +346,126 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
var pluginFiles []string
for _, pluginName := range cases {
pluginFile := fmt.Sprintf("csi-plugin-id-%s.json", helper.CleanFilename(pluginName, "_"))
pluginFile = filepath.Join(path, "nomad", "0000", pluginFile)
pluginFile = filepath.Join(path, intervalDir, "0000", pluginFile)
pluginFiles = append(pluginFiles, pluginFile)
}

testutil.WaitForFiles(t, pluginFiles)
}

// buildPathSlice joins path with each entry of files and returns the
// resulting paths in the same order. The result is always a non-nil slice,
// even when files is empty.
func buildPathSlice(path string, files []string) []string {
	joined := make([]string, len(files))
	for i := range files {
		joined[i] = filepath.Join(path, files[i])
	}
	return joined
}

// TestDebug_CapturedFiles runs the operator debug command against a test
// agent, selecting one server and one client node, and then waits for the
// expected files to appear in the cluster, client, server and interval
// directories of the capture bundle.
func TestDebug_CapturedFiles(t *testing.T) {
	// NOTE(review): the second argument presumably enables the client agent,
	// which the node ID lookup below relies on - confirm against testServer.
	srv, _, url := testServer(t, true, nil)
	testutil.WaitForLeader(t, srv.Agent.RPC)

	serverNodeName := srv.Config.NodeName
	region := srv.Config.Region
	serverName := fmt.Sprintf("%s.%s", serverNodeName, region)
	clientID := srv.Agent.Client().NodeID()

	t.Logf("serverName: %s, clientID: %s", serverName, clientID)

	// Setup file slices
	clusterFiles := []string{
		"agent-self.json",
		"consul-agent-members.json",
		"consul-agent-self.json",
		"members.json",
		"namespaces.json",
		"regions.json",
		"vault-sys-health.json",
	}

	pprofFiles := []string{
		"allocs.prof",
		"goroutine-debug1.txt",
		"goroutine-debug2.txt",
		"goroutine.prof",
		"heap.prof",
		"profile.prof",
		"threadcreate.prof",
		"trace.prof",
	}

	clientFiles := []string{
		"agent-host.json",
		"monitor.log",
	}
	clientFiles = append(clientFiles, pprofFiles...)

	serverFiles := []string{
		"agent-host.json",
		"monitor.log",
	}
	serverFiles = append(serverFiles, pprofFiles...)

	intervalFiles := []string{
		"allocations.json",
		"csi-plugins.json",
		"csi-volumes.json",
		"deployments.json",
		"evaluations.json",
		"jobs.json",
		"license.json",
		"metrics.json",
		"nodes.json",
		"operator-autopilot-health.json",
		"operator-raft.json",
		"operator-scheduler.json",
	}

	ui := cli.NewMockUi()
	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

	code := cmd.Run([]string{
		"-address", url,
		"-output", os.TempDir(),
		"-server-id", serverName,
		"-node-id", clientID,
		"-duration", "1300ms",
		"-interval", "600ms",
	})

	// Get capture directory. RemoveAll is required because the directory is
	// populated by the run; os.Remove would fail on a non-empty directory and
	// leave the bundle behind.
	path := cmd.collectDir
	defer os.RemoveAll(path)

	// There should be no errors
	require.Empty(t, ui.ErrorWriter.String())
	require.Equal(t, 0, code)
	ui.ErrorWriter.Reset()

	// Verify cluster files
	clusterPaths := buildPathSlice(cmd.path(clusterDir), clusterFiles)
	t.Logf("Waiting for cluster files in path: %s", clusterDir)
	testutil.WaitForFilesUntil(t, clusterPaths, 2*time.Minute)

	// Verify client files
	clientPaths := buildPathSlice(cmd.path(clientDir, clientID), clientFiles)
	t.Logf("Waiting for client files in path: %s", clientDir)
	testutil.WaitForFilesUntil(t, clientPaths, 2*time.Minute)

	// Verify server files
	serverPaths := buildPathSlice(cmd.path(serverDir, serverName), serverFiles)
	t.Logf("Waiting for server files in path: %s", serverDir)
	testutil.WaitForFilesUntil(t, serverPaths, 2*time.Minute)

	// Verify interval 0000 files
	intervalPaths0 := buildPathSlice(cmd.path(intervalDir, "0000"), intervalFiles)
	t.Logf("Waiting for interval 0000 files in path: %s", intervalDir)
	testutil.WaitForFilesUntil(t, intervalPaths0, 2*time.Minute)

	// Verify interval 0001 files
	intervalPaths1 := buildPathSlice(cmd.path(intervalDir, "0001"), intervalFiles)
	t.Logf("Waiting for interval 0001 files in path: %s", intervalDir)
	testutil.WaitForFilesUntil(t, intervalPaths1, 2*time.Minute)
}

func TestDebug_ExistingOutput(t *testing.T) {
Expand Down
10 changes: 5 additions & 5 deletions testutil/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ func WaitForRunning(t testing.TB, rpc rpcFn, job *structs.Job) []*structs.AllocL
// WaitForFiles blocks until all the files in the slice are present
func WaitForFiles(t testing.TB, files []string) {
WaitForResult(func() (bool, error) {
return FilesExist(files), nil
return FilesExist(files)
}, func(err error) {
t.Fatalf("missing expected files: %v", err)
})
Expand All @@ -250,18 +250,18 @@ func WaitForFiles(t testing.TB, files []string) {
// WaitForFilesUntil blocks until duration or all the files in the slice are
// present. The check is delegated to WaitForResultUntil with FilesExist as
// the test function, so the error naming the missing file is passed through
// to the failure callback, which aborts the test with t.Fatalf.
func WaitForFilesUntil(t testing.TB, files []string, until time.Duration) {
	WaitForResultUntil(until, func() (bool, error) {
		return FilesExist(files)
	}, func(err error) {
		t.Fatalf("missing expected files: %v", err)
	})
}

// FilesExist verifies all files in the slice are present.
//
// It returns (true, nil) when every path can be stat'ed, including when
// files is empty. It stops at the first path that cannot be stat'ed and
// returns false with an error naming that path. A failure other than
// "does not exist" (for example a permission error or an invalid path) is
// also reported as false, with the underlying error wrapped, rather than
// being mistaken for a present file.
func FilesExist(files []string) (bool, error) {
	for _, f := range files {
		_, err := os.Stat(f)
		switch {
		case err == nil:
			continue
		case os.IsNotExist(err):
			return false, fmt.Errorf("expected file not found: %v", f)
		default:
			return false, fmt.Errorf("unable to stat expected file %v: %w", f, err)
		}
	}
	return true, nil
}