Skip to content

Commit

Permalink
connect: use exp backoff when waiting on consul envoy bootstrap
Browse files Browse the repository at this point in the history
This PR wraps the use of the consul envoy bootstrap command in
an expoenential backoff closure, configured to timeout after 60
seconds. This is an increase over the current behavior of making
3 attempts over 6 seconds.

Should help with #10451
  • Loading branch information
shoenig committed Apr 26, 2021
1 parent accf39a commit 0143218
Show file tree
Hide file tree
Showing 3 changed files with 324 additions and 46 deletions.
129 changes: 83 additions & 46 deletions client/allocrunner/taskrunner/envoy_bootstrap_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,29 @@ import (
"github.com/hashicorp/nomad/client/taskenv"
agentconsul "github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/exptime"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/nomad/structs/config"
"github.com/pkg/errors"
)

const envoyBootstrapHookName = "envoy_bootstrap"

const (
// envoyBootstrapWaitTime is the amount of time this hook should wait on Consul
// objects to be created before giving up.
envoyBootstrapWaitTime = 60 * time.Second

// envoyBootstrapInitialGap is the initial amount of time the envoy bootstrap
// retry loop will wait, exponentially increasing each iteration, not including
// jitter.
envoyBoostrapInitialGap = 1 * time.Second

// envoyBootstrapMaxJitter is the maximum amount of jitter applied to the
// wait gap each iteration of the envoy bootstrap retry loop.
envoyBootstrapMaxJitter = 500 * time.Millisecond
)

type consulTransportConfig struct {
HTTPAddr string // required
Auth string // optional, env CONSUL_HTTP_AUTH
Expand Down Expand Up @@ -100,16 +116,32 @@ type envoyBootstrapHook struct {
// consulNamespace is the Consul namespace as set by in the job
consulNamespace string

// envoyBootstrapWaitTime is the total amount of time hook will wait for Consul
envoyBootstrapWaitTime time.Duration

// envoyBootstrapInitialGap is the initial wait gap when retyring
envoyBoostrapInitialGap time.Duration

// envoyBootstrapMaxJitter is the maximum amount of jitter applied to retries
envoyBootstrapMaxJitter time.Duration

// envoyBootstrapExpSleep controls exponential waiting
envoyBootstrapExpSleep func(time.Duration)

// logger is used to log things
logger hclog.Logger
}

func newEnvoyBootstrapHook(c *envoyBootstrapHookConfig) *envoyBootstrapHook {
return &envoyBootstrapHook{
alloc: c.alloc,
consulConfig: c.consul,
consulNamespace: c.consulNamespace,
logger: c.logger.Named(envoyBootstrapHookName),
alloc: c.alloc,
consulConfig: c.consul,
consulNamespace: c.consulNamespace,
envoyBootstrapWaitTime: envoyBootstrapWaitTime,
envoyBoostrapInitialGap: envoyBoostrapInitialGap,
envoyBootstrapMaxJitter: envoyBootstrapMaxJitter,
envoyBootstrapExpSleep: time.Sleep,
logger: c.logger.Named(envoyBootstrapHookName),
}
}

Expand Down Expand Up @@ -221,66 +253,71 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart
bootstrapArgs := bootstrap.args()
bootstrapEnv := bootstrap.env(os.Environ())

// keep track of latest error returned from exec-ing consul envoy bootstrap
var cmdErr error

// Since Consul services are registered asynchronously with this task
// hook running, retry a small number of times with backoff.
for tries := 3; ; tries-- {
// hook running, retry until timeout or success.
if backoffErr := exptime.Backoff(func() (bool, error) {

// If hook is killed, just stop.
select {
case <-ctx.Done():
return false, nil
default:
}

// Prepare bootstrap command to run.
cmd := exec.CommandContext(ctx, "consul", bootstrapArgs...)
cmd.Env = bootstrapEnv

// Redirect output to secrets/envoy_bootstrap.json
fd, err := os.Create(bootstrapFilePath)
if err != nil {
return fmt.Errorf("error creating secrets/envoy_bootstrap.json for envoy: %v", err)
// Redirect stdout to secrets/envoy_bootstrap.json.
fd, fileErr := os.Create(bootstrapFilePath)
if fileErr != nil {
return false, fmt.Errorf("failed to create secrets/envoy_bootstrap.json for envoy: %w", fileErr)
}
cmd.Stdout = fd

// Redirect stderr into a buffer for later reading.
buf := bytes.NewBuffer(nil)
cmd.Stderr = buf

// Generate bootstrap
err = cmd.Run()
cmdErr = cmd.Run()

// Close bootstrap.json
fd.Close()
// Close bootstrap.json regardless of any command errors.
_ = fd.Close()

if err == nil {
// Happy path! Bootstrap was created, exit.
break
// Command succeeded, exit.
if cmdErr == nil {
// Bootstrap written. Mark as done and move on.
resp.Done = true
return false, nil
}

// Check for error from command
if tries == 0 {
h.logger.Error("error creating bootstrap configuration for Connect proxy sidecar", "error", err, "stderr", buf.String())

// Cleanup the bootstrap file. An errors here is not
// important as (a) we test to ensure the deletion
// occurs, and (b) the file will either be rewritten on
// retry or eventually garbage collected if the task
// fails.
os.Remove(bootstrapFilePath)

// ExitErrors are recoverable since they indicate the
// command was runnable but exited with a unsuccessful
// error code.
_, recoverable := err.(*exec.ExitError)
return structs.NewRecoverableError(
fmt.Errorf("error creating bootstrap configuration for Connect proxy sidecar: %v", err),
recoverable,
)
}

// Sleep before retrying to give Consul services time to register
select {
case <-time.After(2 * time.Second):
case <-ctx.Done():
// Killed before bootstrap, exit without setting Done
return nil
}
// Command failed, prepare for retry
//
// Cleanup the bootstrap file. An errors here is not
// important as (a) we test to ensure the deletion
// occurs, and (b) the file will either be rewritten on
// retry or eventually garbage collected if the task
// fails.
_ = os.Remove(bootstrapFilePath)

return true, cmdErr
}, exptime.BackoffOptions{
MaxSleepTime: h.envoyBootstrapWaitTime,
InitialGapSize: h.envoyBoostrapInitialGap,
MaxJitterSize: h.envoyBootstrapMaxJitter,
}); backoffErr != nil {
// Wrap the last error from Consul and set that as our status.
_, recoverable := cmdErr.(*exec.ExitError)
return structs.NewRecoverableError(
fmt.Errorf("error creating bootstrap configuration for Connect proxy sidecar: %v", cmdErr),
recoverable,
)
}

// Bootstrap written. Mark as done and move on.
resp.Done = true
return nil
}

Expand Down
99 changes: 99 additions & 0 deletions client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"os"
"path/filepath"
"testing"
"time"

consulapi "github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/client/allocdir"
Expand Down Expand Up @@ -625,6 +626,11 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) {
h := newEnvoyBootstrapHook(newEnvoyBootstrapHookConfig(alloc, &config.ConsulConfig{
Addr: testConsul.HTTPAddr,
}, consulNamespace, logger))

// Lower the allowable wait time for testing
h.envoyBootstrapWaitTime = 1 * time.Second
h.envoyBoostrapInitialGap = 100 * time.Millisecond

req := &interfaces.TaskPrestartRequest{
Task: sidecarTask,
TaskDir: allocDir.NewTaskDir(sidecarTask.Name),
Expand All @@ -648,6 +654,99 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) {
require.True(t, os.IsNotExist(err))
}

func TestTaskRunner_EnvoyBootstrapHook_retryTimeout(t *testing.T) {
t.Parallel()
logger := testlog.HCLogger(t)

testConsul := getTestConsul(t)
defer testConsul.Stop()

begin := time.Now()

// Setup an Allocation
alloc := mock.ConnectAlloc()
alloc.AllocatedResources.Shared.Networks = []*structs.NetworkResource{
{
Mode: "bridge",
IP: "10.0.0.1",
DynamicPorts: []structs.Port{
{
Label: "connect-proxy-foo",
Value: 9999,
To: 9999,
},
},
},
}
tg := alloc.Job.TaskGroups[0]
tg.Services = []*structs.Service{
{
Name: "foo",
PortLabel: "9999", // Just need a valid port, nothing will bind to it
Connect: &structs.ConsulConnect{
SidecarService: &structs.ConsulSidecarService{},
},
},
}
sidecarTask := &structs.Task{
Name: "sidecar",
Kind: structs.NewTaskKind(structs.ConnectProxyPrefix, "foo"),
}
tg.Tasks = append(tg.Tasks, sidecarTask)
allocDir, cleanupAlloc := allocdir.TestAllocDir(t, logger, "EnvoyBootstrapRetryTimeout")
defer cleanupAlloc()

// Get a Consul client
consulConfig := consulapi.DefaultConfig()
consulConfig.Address = testConsul.HTTPAddr

// Do NOT register group services, causing the hook to retry until timeout

// Run Connect bootstrap hook
h := newEnvoyBootstrapHook(newEnvoyBootstrapHookConfig(alloc, &config.ConsulConfig{
Addr: consulConfig.Address,
}, consulNamespace, logger))

// Keep track of the retry backoff iterations
iterations := 0

// Lower the allowable wait time for testing
h.envoyBootstrapWaitTime = 3 * time.Second
h.envoyBoostrapInitialGap = 1 * time.Second
h.envoyBootstrapExpSleep = func(d time.Duration) {
iterations++
time.Sleep(d)
}

// Create the prestart request
req := &interfaces.TaskPrestartRequest{
Task: sidecarTask,
TaskDir: allocDir.NewTaskDir(sidecarTask.Name),
TaskEnv: taskenv.NewEmptyTaskEnv(),
}
require.NoError(t, req.TaskDir.Build(false, nil))

var resp interfaces.TaskPrestartResponse

// Run the hook and get the error
err := h.Prestart(context.Background(), req, &resp)
require.EqualError(t, err, "error creating bootstrap configuration for Connect proxy sidecar: exit status 1")

// Current time should be at least start time + total wait time
minimum := begin.Add(h.envoyBootstrapWaitTime)
require.True(t, time.Now().After(minimum))

// Should hit at least 2 iterations
require.Greater(t, 2, iterations)

// Make sure we captured the recoverable-ness of the error
_, ok := err.(*structs.RecoverableError)
require.True(t, ok)

// Assert the hook is not done (it failed)
require.False(t, resp.Done)
}

func TestTaskRunner_EnvoyBootstrapHook_extractNameAndKind(t *testing.T) {
t.Run("connect sidecar", func(t *testing.T) {
kind, name, err := (*envoyBootstrapHook)(nil).extractNameAndKind(
Expand Down
Loading

0 comments on commit 0143218

Please sign in to comment.