Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

consul/connect: dynamically select envoy sidecar at runtime #8945

Merged
merged 1 commit into from
Oct 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions client/allocrunner/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ type allocRunner struct {
// registering services and checks
consulClient consul.ConsulServiceAPI

// consulProxiesClient is the client used by the envoy version hook for
// looking up supported envoy versions of the consul agent.
consulProxiesClient consul.SupportedProxiesAPI

// sidsClient is the client used by the service identity hook for
// managing SI tokens
sidsClient consul.ServiceIdentityAPI
Expand Down Expand Up @@ -186,6 +190,7 @@ func NewAllocRunner(config *Config) (*allocRunner, error) {
alloc: alloc,
clientConfig: config.ClientConfig,
consulClient: config.Consul,
consulProxiesClient: config.ConsulProxies,
sidsClient: config.ConsulSI,
vaultClient: config.Vault,
tasks: make(map[string]*taskrunner.TaskRunner, len(tg.Tasks)),
Expand Down Expand Up @@ -236,7 +241,7 @@ func NewAllocRunner(config *Config) (*allocRunner, error) {
// initTaskRunners creates task runners but does *not* run them.
func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
for _, task := range tasks {
config := &taskrunner.Config{
trConfig := &taskrunner.Config{
Alloc: ar.alloc,
ClientConfig: ar.clientConfig,
Task: task,
Expand All @@ -246,6 +251,7 @@ func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
StateUpdater: ar,
DynamicRegistry: ar.dynamicRegistry,
Consul: ar.consulClient,
ConsulProxies: ar.consulProxiesClient,
ConsulSI: ar.sidsClient,
Vault: ar.vaultClient,
DeviceStatsReporter: ar.deviceStatsReporter,
Expand All @@ -257,7 +263,7 @@ func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
}

// Create, but do not Run, the task runner
tr, err := taskrunner.NewTaskRunner(config)
tr, err := taskrunner.NewTaskRunner(trConfig)
if err != nil {
return fmt.Errorf("failed creating runner for task %q: %v", task.Name, err)
}
Expand Down
4 changes: 4 additions & 0 deletions client/allocrunner/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ type Config struct {
// Consul is the Consul client used to register task services and checks
Consul consul.ConsulServiceAPI

// ConsulProxies is the Consul client used to lookup supported envoy versions
// of the Consul agent.
ConsulProxies consul.SupportedProxiesAPI

// ConsulSI is the Consul client used to manage service identity tokens.
ConsulSI consul.ServiceIdentityAPI

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (

"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
ifs "github.com/hashicorp/nomad/client/allocrunner/interfaces"
agentconsul "github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
Expand Down Expand Up @@ -152,7 +152,7 @@ func (h *envoyBootstrapHook) lookupService(svcKind, svcName, tgName string) (*st
// Prestart creates an envoy bootstrap config file.
//
// Must be aware of both launching envoy as a sidecar proxy, as well as a connect gateway.
func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error {
func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestartRequest, resp *ifs.TaskPrestartResponse) error {
if !req.Task.Kind.IsConnectProxy() && !req.Task.Kind.IsAnyConnectGateway() {
// Not a Connect proxy sidecar
resp.Done = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,7 @@ func TestTaskRunner_EnvoyBootstrapHook_gateway_ok(t *testing.T) {
// Run the hook
require.NoError(t, h.Prestart(context.Background(), req, &resp))

// Assert the hook is done
// Assert the hook is Done
require.True(t, resp.Done)
require.NotNil(t, resp.Env)

Expand Down
170 changes: 170 additions & 0 deletions client/allocrunner/taskrunner/envoy_version_hook.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
package taskrunner

import (
"context"
"strings"

"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-version"
ifs "github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/client/consul"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/pkg/errors"
)

const (
// envoyVersionHookName is the name of this hook and appears in logs.
envoyVersionHookName = "envoy_version"

// envoyLegacyImage is used when the version of Consul is too old to support
// the SupportedProxies field in the self API.
//
// This is the version defaulted by Nomad before v0.13.0 and/or when using versions
// of Consul before v1.7.8, v1.8.5, and v1.9.0.
envoyLegacyImage = "envoyproxy/envoy:v1.11.2@sha256:a7769160c9c1a55bb8d07a3b71ce5d64f72b1f665f10d81aa1581bc3cf850d09"
)

type envoyVersionHookConfig struct {
alloc *structs.Allocation
proxiesClient consul.SupportedProxiesAPI
logger hclog.Logger
}

func newEnvoyVersionHookConfig(alloc *structs.Allocation, proxiesClient consul.SupportedProxiesAPI, logger hclog.Logger) *envoyVersionHookConfig {
return &envoyVersionHookConfig{
alloc: alloc,
logger: logger,
proxiesClient: proxiesClient,
}
}

// envoyVersionHook is used to determine and set the Docker image used for Consul
// Connect sidecar proxy tasks. It will query Consul for a set of preferred Envoy
// versions if the task image is unset or references ${NOMAD_envoy_version}. Nomad
// will fallback the image to the previous default Envoy v1.11.2 if Consul is too old
// to support the supported proxies API.
type envoyVersionHook struct {
// alloc is the allocation with the envoy task being rewritten.
alloc *structs.Allocation

// proxiesClient is the subset of the Consul API for getting information
// from Consul about the versions of Envoy it supports.
proxiesClient consul.SupportedProxiesAPI

// logger is used to log things.
logger hclog.Logger
}

func newEnvoyVersionHook(c *envoyVersionHookConfig) *envoyVersionHook {
return &envoyVersionHook{
alloc: c.alloc,
proxiesClient: c.proxiesClient,
logger: c.logger.Named(envoyVersionHookName),
}
}

func (envoyVersionHook) Name() string {
return envoyVersionHookName
}

func (h *envoyVersionHook) Prestart(_ context.Context, request *ifs.TaskPrestartRequest, response *ifs.TaskPrestartResponse) error {
if h.skip(request) {
response.Done = true
return nil
}

// We either need to acquire Consul's preferred Envoy version or fallback
// to the legacy default. Query Consul and use the (possibly empty) result.
proxies, err := h.proxiesClient.Proxies()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If Consul is being restarted when this is called it will fail the task and go into a restart loop which... seems ok?

In configurations where Consul does not start before Nomad there could be spurious task failures, but I think that's already true for Connect? Bootstrap has that awkward bulitin retry that avoids spurious task failures, but it's not a very robust approach.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The envoy bootstrap hook's retry block I believe really targets the race condition between launching the sidecar and Nomad actually registering the service to be proxied - potentially a common occurrence. I'm thinking /v1/agent/self response failures are either rare enough or catastrophic enough that letting the task do a restart in those cases is probably fine

if err != nil {
return errors.Wrap(err, "error retrieving supported Envoy versions from Consul")
}

// Determine the concrete Envoy image identifier by applying version string
// substitution (${NOMAD_envoy_version}).
image, err := h.tweakImage(h.taskImage(request.Task.Config), proxies)
if err != nil {
return errors.Wrap(err, "error interpreting desired Envoy version from Consul")
}

// Set the resulting image.
h.logger.Trace("setting task envoy image", "image", image)
request.Task.Config["image"] = image
response.Done = true
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we should actually skip this to ensure we re-run it in the event Envoy fails? Seems cheap enough we might as well.

Bootstrap has a similar problem though, so it might not be worth worrying about here if there's a chance we're just going to fail later due to bootstrap changes not being picked up or something.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking the opposite - if Envoy gets restarted the generated envoy config and the envoy version that gets launched should continue to work together. It's kind of interesting in that Envoy will try twice - interpreting the config at versions N and N-1, but it'd be more consistent to just have the version and bootstrap hooks behave the same. OTOH, maybe now it would make sense to query the version and regenerate the config on every sidecar task restart.

return nil
}

// skip will return true if the request does not contain a task that should have
// its envoy proxy version resolved automatically.
func (h *envoyVersionHook) skip(request *ifs.TaskPrestartRequest) bool {
switch {
case request.Task.Driver != "docker":
return true
case !request.Task.UsesConnectSidecar():
return true
case !h.needsVersion(request.Task.Config):
return true
}
return false
}

// getConfiguredImage extracts the configured config.image value from the request.
// If the image is empty or not a string, Nomad will fallback to the normal
// official Envoy image as if the setting was not configured. This is also what
// Nomad would do if the sidecar_task was not set in the first place.
func (_ *envoyVersionHook) taskImage(config map[string]interface{}) string {
value, exists := config["image"]
if !exists {
return structs.EnvoyImageFormat
}

image, ok := value.(string)
if !ok {
return structs.EnvoyImageFormat
}

return image
}

// needsVersion returns true if the docker.config.image is making use of the
// ${NOMAD_envoy_version} faux environment variable.
// Nomad does not need to query Consul to get the preferred Envoy version, etc.)
func (h *envoyVersionHook) needsVersion(config map[string]interface{}) bool {
if len(config) == 0 {
return false
}

image := h.taskImage(config)

return strings.Contains(image, structs.EnvoyVersionVar)
}

// image determines the best Envoy version to use. If supported is nil or empty
// Nomad will fallback to the legacy envoy image used before Nomad v0.13.
func (_ *envoyVersionHook) tweakImage(configured string, supported map[string][]string) (string, error) {
versions := supported["envoy"]
if len(versions) == 0 {
return envoyLegacyImage, nil
}

latest, err := semver(versions[0])
if err != nil {
return "", err
}

return strings.ReplaceAll(configured, structs.EnvoyVersionVar, latest), nil
}

// semver sanitizes the envoy version string coming from Consul into the format
// used by the Envoy project when publishing images (i.e. proper semver). This
// resulting string value does NOT contain the 'v' prefix for 2 reasons:
// 1) the version library does not include the 'v'
// 2) its plausible unofficial images use the 3 numbers without the prefix for
// tagging their own images
func semver(chosen string) (string, error) {
v, err := version.NewVersion(chosen)
if err != nil {
return "", errors.Wrap(err, "unexpected envoy version format")
}
return v.String(), nil
}
Loading