From d47678074bf8ae9ff2da3c91d0729bf03aee8446 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 1 Feb 2022 15:55:36 -0800 Subject: [PATCH] connect: write envoy bootstrap debugging info When Consul Connect just works, it's wonderful. When it doesn't work, it can be exceedingly difficult to debug: operators have to check task events, Nomad logs, Consul logs, Consul APIs, and even then critical information is missing. Using Consul to generate a bootstrap config for Envoy is notoriously difficult. Nomad doesn't even log stderr, so operators are left trying to piece together what went wrong. This patch attempts to provide *maximal* context, which unfortunately includes secrets. **Secrets are always restricted to the secrets/ directory.** This makes debugging a little harder, but allows operators to know exactly what operation Nomad was trying to perform. What's added: - stderr is sent to alloc/logs/envoy_bootstrap.stderr.0 - the CLI is written to secrets/.envoy_bootstrap.cmd - the environment is written to secrets/.envoy_bootstrap.env as JSON Accessing this information is unfortunately awkward: ``` nomad alloc exec -task connect-proxy-count-countdash b36a cat secrets/.envoy_bootstrap.env nomad alloc exec -task connect-proxy-count-countdash b36a cat secrets/.envoy_bootstrap.cmd nomad alloc fs b36a alloc/logs/envoy_bootstrap.stderr.0 ``` The above assumes an alloc id that starts with `b36a` and a Connect sidecar proxy for a service named `count-countdash`. If the alloc is unable to start successfully, the debugging files are only accessible from the host filesystem. 
--- .changelog/11975.txt | 3 + .../taskrunner/envoy_bootstrap_hook.go | 88 +++++++++++-------- 2 files changed, 56 insertions(+), 35 deletions(-) create mode 100644 .changelog/11975.txt diff --git a/.changelog/11975.txt b/.changelog/11975.txt new file mode 100644 index 000000000000..2fb3b408d832 --- /dev/null +++ b/.changelog/11975.txt @@ -0,0 +1,3 @@ +```release-note:improvement +consul/connect: write Envoy bootstrapping information to disk for debugging +``` diff --git a/client/allocrunner/taskrunner/envoy_bootstrap_hook.go b/client/allocrunner/taskrunner/envoy_bootstrap_hook.go index c5aab12ed3c2..0ae9d236cf32 100644 --- a/client/allocrunner/taskrunner/envoy_bootstrap_hook.go +++ b/client/allocrunner/taskrunner/envoy_bootstrap_hook.go @@ -1,15 +1,17 @@ package taskrunner import ( - "bytes" "context" + "encoding/json" "fmt" + "io" "io/ioutil" "net" "os" "os/exec" "path/filepath" "strconv" + "strings" "time" "github.com/hashicorp/go-hclog" @@ -261,6 +263,11 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart // it to the secrets directory like Vault tokens. 
bootstrapFilePath := filepath.Join(req.TaskDir.SecretsDir, "envoy_bootstrap.json") + // Write everything related to the command to enable debugging + bootstrapStderrPath := filepath.Join(req.TaskDir.LogDir, "envoy_bootstrap.stderr.0") + bootstrapEnvPath := filepath.Join(req.TaskDir.SecretsDir, ".envoy_bootstrap.env") + bootstrapCmdPath := filepath.Join(req.TaskDir.SecretsDir, ".envoy_bootstrap.cmd") + siToken, err := h.maybeLoadSIToken(req.Task.Name, req.TaskDir.SecretsDir) if err != nil { h.logger.Error("failed to generate envoy bootstrap config", "sidecar_for", service.Name) @@ -269,16 +276,46 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart h.logger.Debug("check for SI token for task", "task", req.Task.Name, "exists", siToken != "") bootstrap := h.newEnvoyBootstrapArgs(h.alloc.TaskGroup, service, grpcAddr, envoyAdminBind, envoyReadyBind, siToken, bootstrapFilePath) + + // Create command line arguments bootstrapArgs := bootstrap.args() + + // Write args to file for debugging + argsFile, err := os.Create(bootstrapCmdPath) + if err != nil { + return errors.Wrap(err, "failed to write bootstrap command line") + } + defer argsFile.Close() + if _, err := io.WriteString(argsFile, strings.Join(bootstrapArgs, " ")+"\n"); err != nil { + return errors.Wrap(err, "failed to encode bootstrap command line") + } + + // Create environment bootstrapEnv := bootstrap.env(os.Environ()) + // Write env to file for debugging + envFile, err := os.Create(bootstrapEnvPath) + if err != nil { + return errors.Wrap(err, "failed to write bootstrap environment") + } + defer envFile.Close() + envEnc := json.NewEncoder(envFile) + envEnc.SetIndent("", " ") + if err := envEnc.Encode(bootstrapEnv); err != nil { + return errors.Wrap(err, "failed to encode bootstrap environment") + } + // keep track of latest error returned from exec-ing consul envoy bootstrap var cmdErr error // Since Consul services are registered asynchronously with this task // hook running, retry 
until timeout or success. - if backoffErr := decay.Backoff(func() (bool, error) { - + backoffOpts := decay.BackoffOptions{ + MaxSleepTime: h.envoyBootstrapWaitTime, + InitialGapSize: h.envoyBoostrapInitialGap, + MaxJitterSize: h.envoyBootstrapMaxJitter, + } + backoffErr := decay.Backoff(func() (bool, error) { // If hook is killed, just stop. select { case <-ctx.Done(): @@ -291,22 +328,24 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart cmd.Env = bootstrapEnv // Redirect stdout to secrets/envoy_bootstrap.json. - fd, fileErr := os.Create(bootstrapFilePath) + stdout, fileErr := os.Create(bootstrapFilePath) if fileErr != nil { return false, fmt.Errorf("failed to create secrets/envoy_bootstrap.json for envoy: %w", fileErr) } - cmd.Stdout = fd + defer stdout.Close() + cmd.Stdout = stdout - // Redirect stderr into a buffer for later reading. - buf := bytes.NewBuffer(nil) - cmd.Stderr = buf + // Redirect stderr into another file for later debugging. + stderr, fileErr := os.OpenFile(bootstrapStderrPath, os.O_RDWR|os.O_CREATE, 0644) + if fileErr != nil { + return false, fmt.Errorf("failed to create alloc/logs/envoy_bootstrap.stderr.0 for envoy: %w", fileErr) + } + defer stderr.Close() + cmd.Stderr = stderr // Generate bootstrap cmdErr = cmd.Run() - // Close bootstrap.json regardless of any command errors. - _ = fd.Close() - // Command succeeded, exit. if cmdErr == nil { // Bootstrap written. Mark as done and move on. @@ -324,11 +363,9 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart _ = os.Remove(bootstrapFilePath) return true, cmdErr - }, decay.BackoffOptions{ - MaxSleepTime: h.envoyBootstrapWaitTime, - InitialGapSize: h.envoyBoostrapInitialGap, - MaxJitterSize: h.envoyBootstrapMaxJitter, - }); backoffErr != nil { + }, backoffOpts) + + if backoffErr != nil { // Wrap the last error from Consul and set that as our status. 
_, recoverable := cmdErr.(*exec.ExitError) return structs.NewRecoverableError( @@ -394,25 +431,6 @@ func (h *envoyBootstrapHook) writeConfig(filename, config string) error { return nil } -func (h *envoyBootstrapHook) execute(cmd *exec.Cmd) (string, error) { - var ( - stdout bytes.Buffer - stderr bytes.Buffer - ) - - cmd.Stdout = &stdout - cmd.Stderr = &stderr - - if err := cmd.Run(); err != nil { - _, recoverable := err.(*exec.ExitError) - // ExitErrors are recoverable since they indicate the - // command was runnable but exited with a unsuccessful - // error code. - return stderr.String(), structs.NewRecoverableError(err, recoverable) - } - return stdout.String(), nil -} - // grpcAddress determines the Consul gRPC endpoint address to use. // // In host networking this will default to 127.0.0.1:8502.