template: trigger change_mode for dynamic secrets on restore (#9636)
When a task is restored after a client restart, the template runner will
create a new lease for any dynamic secret (ex. Consul or PKI secrets
engines). But because this lease is being created in the prestart hook, we
don't trigger the `change_mode`.

This changeset uses the existence of the task handle to detect a
previously running task that's been restored, so that we can trigger the
template `change_mode` if the template has changed, which it will have
only for dynamic secrets.
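As a rough illustration of the fix, here is a minimal sketch, not Nomad's actual hook code: the TaskLifecycle interface is trimmed to the one method this commit adds, and firstRender is a hypothetical stand-in for the template runner's first-render path.

package main

import "fmt"

// TaskLifecycle is trimmed here to the single method this commit adds; the
// real interface lives in client/allocrunner/taskrunner/interfaces/lifecycle.go.
type TaskLifecycle interface {
    IsRunning() bool
}

// firstRender is a hypothetical stand-in for the template runner's
// first-render path. On a fresh task the first render is just part of
// startup, but on a restored task a changed template (e.g. a new dynamic
// secret lease) must fire change_mode.
func firstRender(lc TaskLifecycle, templateChanged bool) string {
    if templateChanged && lc.IsRunning() {
        return "restored task, template changed: trigger change_mode"
    }
    return "fresh task: first render, no change_mode"
}

type stubLifecycle struct{ running bool }

func (s stubLifecycle) IsRunning() bool { return s.running }

func main() {
    fmt.Println(firstRender(stubLifecycle{running: false}, true))
    fmt.Println(firstRender(stubLifecycle{running: true}, true))
}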
tgross committed Dec 16, 2020
1 parent 33a4188 commit 004f1c9
Showing 6 changed files with 249 additions and 88 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,8 @@
## 1.0.2 (Unreleased)

BUG FIXES:
* template: Fixed a bug where dynamic secrets did not trigger the template `change_mode` after a client restart. [[GH-9636](https://github.com/hashicorp/nomad/issues/9636)]

## 1.0.1 (Unreleased)

IMPROVEMENTS:
8 changes: 8 additions & 0 deletions client/allocrunner/taskrunner/interfaces/lifecycle.go
@@ -16,4 +16,12 @@ type TaskLifecycle interface {

    // Kill a task permanently.
    Kill(ctx context.Context, event *structs.TaskEvent) error

    // IsRunning returns true if the task runner has a handle to the task
    // driver, which is useful for distinguishing restored tasks during
    // prestart hooks. But note that the driver handle could go away after you
    // check this, so callers should make sure they're handling that case
    // safely. Ideally prestart hooks should be idempotent whenever possible
    // to handle restored tasks; use this as an escape hatch.
    IsRunning() bool
}
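To make the caveat in that comment concrete, here is a hedged sketch of a caller honoring it (hypothetical helper names; only IsRunning comes from the interface above): treat the result as a point-in-time hint and tolerate the handle vanishing before the follow-up action.

package sketch

import "log"

// TaskLifecycle mirrors the one method added above.
type TaskLifecycle interface {
    IsRunning() bool
}

// restoreAwarePrestart is a hypothetical caller: IsRunning is only a
// point-in-time hint, so the follow-up action must tolerate the driver
// handle vanishing in between. fireChangeMode is an assumed callback,
// not a Nomad API.
func restoreAwarePrestart(lc TaskLifecycle, fireChangeMode func() error) {
    if !lc.IsRunning() {
        return // fresh task: idempotent prestart work covers it
    }
    if err := fireChangeMode(); err != nil {
        // The task may have stopped concurrently; log and move on.
        log.Printf("change_mode skipped: %v", err)
    }
}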
4 changes: 4 additions & 0 deletions client/allocrunner/taskrunner/lifecycle.go
@@ -86,3 +86,7 @@ func (tr *TaskRunner) Kill(ctx context.Context, event *structs.TaskEvent) error

    return tr.getKillErr()
}

func (tr *TaskRunner) IsRunning() bool {
    return tr.getDriverHandle() != nil
}
192 changes: 104 additions & 88 deletions client/allocrunner/taskrunner/template/template.go
@@ -273,11 +273,22 @@ WAIT:
                continue
            }

            dirty := false
            for _, event := range events {
                // This template hasn't been rendered
                if event.LastWouldRender.IsZero() {
                    continue WAIT
                }
                if event.WouldRender && event.DidRender {
                    dirty = true
                }
            }

            // if there's a driver handle then the task is already running and
            // that changes how we want to behave on first render
            if dirty && tm.config.Lifecycle.IsRunning() {
                handledRenders := make(map[string]time.Time, len(tm.config.Templates))
                tm.onTemplateRendered(handledRenders, time.Time{})
            }

            break WAIT
@@ -368,112 +379,117 @@ func (tm *TaskTemplateManager) handleTemplateRerenders(allRenderedTime time.Time) {
                    SetFailsTask().
                    SetDisplayMessage(fmt.Sprintf("Template failed: %v", err)))
        case <-tm.runner.TemplateRenderedCh():
            tm.onTemplateRendered(handledRenders, allRenderedTime)
        }
    }
}

func (tm *TaskTemplateManager) onTemplateRendered(handledRenders map[string]time.Time, allRenderedTime time.Time) {

    var handling []string
    signals := make(map[string]struct{})
    restart := false
    var splay time.Duration

    events := tm.runner.RenderEvents()
    for id, event := range events {

        // First time through
        if allRenderedTime.After(event.LastDidRender) || allRenderedTime.Equal(event.LastDidRender) {
            handledRenders[id] = allRenderedTime
            continue
        }

        // We have already handled this one
        if htime := handledRenders[id]; htime.After(event.LastDidRender) || htime.Equal(event.LastDidRender) {
            continue
        }

        // Lookup the template and determine what to do
        tmpls, ok := tm.lookup[id]
        if !ok {
            tm.config.Lifecycle.Kill(context.Background(),
                structs.NewTaskEvent(structs.TaskKilling).
                    SetFailsTask().
                    SetDisplayMessage(fmt.Sprintf("Template runner returned unknown template id %q", id)))
            return
        }

        // Read environment variables from templates
        envMap, err := loadTemplateEnv(tm.config.Templates, tm.config.TaskDir, tm.config.EnvBuilder.Build())
        if err != nil {
            tm.config.Lifecycle.Kill(context.Background(),
                structs.NewTaskEvent(structs.TaskKilling).
                    SetFailsTask().
                    SetDisplayMessage(fmt.Sprintf("Template failed to read environment variables: %v", err)))
            return
        }
        tm.config.EnvBuilder.SetTemplateEnv(envMap)

        for _, tmpl := range tmpls {
            switch tmpl.ChangeMode {
            case structs.TemplateChangeModeSignal:
                signals[tmpl.ChangeSignal] = struct{}{}
            case structs.TemplateChangeModeRestart:
                restart = true
            case structs.TemplateChangeModeNoop:
                continue
            }

            if tmpl.Splay > splay {
                splay = tmpl.Splay
            }
        }

        handling = append(handling, id)
    }

    if restart || len(signals) != 0 {
        if splay != 0 {
            ns := splay.Nanoseconds()
            offset := rand.Int63n(ns)
            t := time.Duration(offset)

            select {
            case <-time.After(t):
            case <-tm.shutdownCh:
                return
            }
        }

        // Update handle time
        for _, id := range handling {
            handledRenders[id] = events[id].LastDidRender
        }

        if restart {
            tm.config.Lifecycle.Restart(context.Background(),
                structs.NewTaskEvent(structs.TaskRestartSignal).
                    SetDisplayMessage("Template with change_mode restart re-rendered"), false)
        } else if len(signals) != 0 {
            var mErr multierror.Error
            for signal := range signals {
                s := tm.signals[signal]
                event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetDisplayMessage("Template re-rendered")
                if err := tm.config.Lifecycle.Signal(event, signal); err != nil {
                    multierror.Append(&mErr, err)
                }
            }

            if err := mErr.ErrorOrNil(); err != nil {
                flat := make([]os.Signal, 0, len(signals))
                for signal := range signals {
                    flat = append(flat, tm.signals[signal])
                }

                tm.config.Lifecycle.Kill(context.Background(),
                    structs.NewTaskEvent(structs.TaskKilling).
                        SetFailsTask().
                        SetDisplayMessage(fmt.Sprintf("Template failed to send signals %v: %v", flat, err)))
            }
        }
    }

}

// allTemplatesNoop returns whether all the managed templates have change mode noop.
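A detail worth noting in the diff above: the restore path calls onTemplateRendered with a fresh handledRenders map and a zero time.Time{} as allRenderedTime, so the "first time through" guard never matches and any changed template falls through to its change_mode handling. A standalone sketch of just that comparison (plain stdlib, no Nomad types):

package main

import (
    "fmt"
    "time"
)

func main() {
    lastDidRender := time.Now() // a template that rendered just now

    // Normal path: allRenderedTime is the first-render timestamp, so the
    // event is skipped as "first time through".
    allRenderedTime := lastDidRender
    fmt.Println(allRenderedTime.After(lastDidRender) ||
        allRenderedTime.Equal(lastDidRender)) // true -> skip event

    // Restore path (this commit): the zero time precedes any real render,
    // so the guard fails and change_mode handling proceeds.
    var zero time.Time
    fmt.Println(zero.After(lastDidRender) ||
        zero.Equal(lastDidRender)) // false -> handle render
}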
106 changes: 106 additions & 0 deletions client/allocrunner/taskrunner/template/template_test.go
@@ -57,6 +57,9 @@ type MockTaskHooks struct {

    Events      []*structs.TaskEvent
    EmitEventCh chan *structs.TaskEvent

    // hasHandle can be set to simulate restoring a task after client restart
    hasHandle bool
}

func NewMockTaskHooks() *MockTaskHooks {
@@ -98,6 +101,10 @@ func (m *MockTaskHooks) Kill(ctx context.Context, event *structs.TaskEvent) error {
    return nil
}

func (m *MockTaskHooks) IsRunning() bool {
    return m.hasHandle
}

func (m *MockTaskHooks) EmitEvent(event *structs.TaskEvent) {
    m.Events = append(m.Events, event)
    select {
@@ -760,6 +767,105 @@ func TestTaskTemplateManager_Unblock_Multi_Template(t *testing.T) {
    }
}

// TestTaskTemplateManager_FirstRender_Restored tests that a task that's been
// restored renders and triggers its change mode if the template has changed
func TestTaskTemplateManager_FirstRender_Restored(t *testing.T) {
    t.Parallel()
    require := require.New(t)

    // Make a template that will render based on a key in Vault
    vaultPath := "secret/data/password"
    key := "password"
    content := "barbaz"
    embedded := fmt.Sprintf(`{{with secret "%s"}}{{.Data.data.%s}}{{end}}`, vaultPath, key)
    file := "my.tmpl"
    template := &structs.Template{
        EmbeddedTmpl: embedded,
        DestPath:     file,
        ChangeMode:   structs.TemplateChangeModeRestart,
    }

    harness := newTestHarness(t, []*structs.Template{template}, false, true)
    harness.start(t)
    defer harness.stop()

    // Ensure no unblock
    select {
    case <-harness.mockHooks.UnblockCh:
        require.Fail("Task unblock should not have been called")
    case <-time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second):
    }

    // Write the secret to Vault
    logical := harness.vault.Client.Logical()
    _, err := logical.Write(vaultPath, map[string]interface{}{"data": map[string]interface{}{key: content}})
    require.NoError(err)

    // Wait for the unblock
    select {
    case <-harness.mockHooks.UnblockCh:
    case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second):
        require.Fail("Task unblock should have been called")
    }

    // Check the file is there
    path := filepath.Join(harness.taskDir, file)
    raw, err := ioutil.ReadFile(path)
    require.NoError(err, "Failed to read rendered template from %q", path)
    require.Equal(content, string(raw), "Unexpected template data; got %s, want %q", raw, content)

    // task is now running
    harness.mockHooks.hasHandle = true

    // simulate a client restart
    harness.manager.Stop()
    harness.mockHooks.UnblockCh = make(chan struct{}, 1)
    harness.start(t)

    // Wait for the unblock
    select {
    case <-harness.mockHooks.UnblockCh:
    case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second):
        require.Fail("Task unblock should have been called")
    }

    select {
    case <-harness.mockHooks.RestartCh:
        require.Fail("should not have restarted", harness.mockHooks)
    case <-harness.mockHooks.SignalCh:
require.Fail("should not have restarted", harness.mockHooks)
    case <-time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second):
    }

    // simulate a client restart and TTL expiry
    harness.manager.Stop()
    content = "bazbar"
    _, err = logical.Write(vaultPath, map[string]interface{}{"data": map[string]interface{}{key: content}})
    require.NoError(err)
    harness.mockHooks.UnblockCh = make(chan struct{}, 1)
    harness.start(t)

    // Wait for the unblock
    select {
    case <-harness.mockHooks.UnblockCh:
    case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second):
        require.Fail("Task unblock should have been called")
    }

    // Wait for restart
    timeout := time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second)
OUTER:
    for {
        select {
        case <-harness.mockHooks.RestartCh:
            break OUTER
        case <-harness.mockHooks.SignalCh:
            require.Fail("Signal with restart policy", harness.mockHooks)
        case <-timeout:
            require.Fail("Should have received a restart", harness.mockHooks)
        }
    }
}

func TestTaskTemplateManager_Rerender_Noop(t *testing.T) {
    t.Parallel()
    // Make a template that will render based on a key in Consul