diff --git a/CHANGELOG.md b/CHANGELOG.md
index 84129c4425db..5768ae5aaefb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 1.0.2 (Unreleased)
+
+BUG FIXES:
+ * template: Fixed a bug where dynamic secrets did not trigger the template `change_mode` after a client restart. [[GH-9636](https://github.com/hashicorp/nomad/issues/9636)]
+
 ## 1.0.1 (Unreleased)

 IMPROVEMENTS:
diff --git a/client/allocrunner/taskrunner/interfaces/lifecycle.go b/client/allocrunner/taskrunner/interfaces/lifecycle.go
index 1890471bf56a..325a13e387ee 100644
--- a/client/allocrunner/taskrunner/interfaces/lifecycle.go
+++ b/client/allocrunner/taskrunner/interfaces/lifecycle.go
@@ -16,4 +16,12 @@ type TaskLifecycle interface {

     // Kill a task permanently.
     Kill(ctx context.Context, event *structs.TaskEvent) error
+
+    // IsRunning returns true if the task runner has a handle to the task
+    // driver, which is useful for distinguishing restored tasks during
+    // prestart hooks. But note that the driver handle could go away after you
+    // check this, so callers should make sure they're handling that case
+    // safely. Ideally prestart hooks should be idempotent whenever possible
+    // to handle restored tasks; use this as an escape hatch.
+    IsRunning() bool
 }
diff --git a/client/allocrunner/taskrunner/lifecycle.go b/client/allocrunner/taskrunner/lifecycle.go
index c9fc10a539e2..2b8c7350ed8c 100644
--- a/client/allocrunner/taskrunner/lifecycle.go
+++ b/client/allocrunner/taskrunner/lifecycle.go
@@ -86,3 +86,7 @@ func (tr *TaskRunner) Kill(ctx context.Context, event *structs.TaskEvent) error

     return tr.getKillErr()
 }
+
+func (tr *TaskRunner) IsRunning() bool {
+    return tr.getDriverHandle() != nil
+}
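To make the intent of the new `IsRunning` escape hatch concrete, here is a standalone sketch of how a prestart hook might branch on it for restored tasks. The interface and hook shape below are hypothetical stand-ins for illustration only, not Nomad's actual hook API:

```go
// Hypothetical, minimal illustration of the "escape hatch" described in the
// IsRunning comment above; the interface and hook here are stand-ins, not
// Nomad's real prestart hook API.
package main

import "fmt"

// taskLifecycle is a pared-down stand-in for TaskLifecycle; only the new
// IsRunning method matters for this sketch.
type taskLifecycle interface {
	IsRunning() bool
}

type examplePrestartHook struct {
	lifecycle taskLifecycle
}

// Prestart skips non-idempotent, first-start-only work when the task runner
// already holds a driver handle (i.e. the task was restored after a client
// restart).
func (h *examplePrestartHook) Prestart() error {
	if h.lifecycle.IsRunning() {
		// The driver handle could still go away right after this check, so
		// anything skipped here must be safe to leave undone.
		fmt.Println("restored task: skipping one-time setup")
		return nil
	}
	fmt.Println("new task: running one-time setup")
	return nil
}

type fakeRunner struct{ hasHandle bool }

func (f *fakeRunner) IsRunning() bool { return f.hasHandle }

func main() {
	restored := &examplePrestartHook{lifecycle: &fakeRunner{hasHandle: true}}
	fresh := &examplePrestartHook{lifecycle: &fakeRunner{hasHandle: false}}
	_ = restored.Prestart()
	_ = fresh.Prestart()
}
```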
diff --git a/client/allocrunner/taskrunner/template/template.go b/client/allocrunner/taskrunner/template/template.go
index bbc9fea489bf..3f6c8f1b12b0 100644
--- a/client/allocrunner/taskrunner/template/template.go
+++ b/client/allocrunner/taskrunner/template/template.go
@@ -273,11 +273,22 @@ WAIT:
                 continue
             }

+            dirty := false
             for _, event := range events {
                 // This template hasn't been rendered
                 if event.LastWouldRender.IsZero() {
                     continue WAIT
                 }
+                if event.WouldRender && event.DidRender {
+                    dirty = true
+                }
+            }
+
+            // if there's a driver handle then the task is already running and
+            // that changes how we want to behave on first render
+            if dirty && tm.config.Lifecycle.IsRunning() {
+                handledRenders := make(map[string]time.Time, len(tm.config.Templates))
+                tm.onTemplateRendered(handledRenders, time.Time{})
             }

             break WAIT
@@ -368,112 +379,117 @@ func (tm *TaskTemplateManager) handleTemplateRerenders(allRenderedTime time.Time
                     SetFailsTask().
                     SetDisplayMessage(fmt.Sprintf("Template failed: %v", err)))
         case <-tm.runner.TemplateRenderedCh():
-            // A template has been rendered, figure out what to do
-            var handling []string
-            signals := make(map[string]struct{})
-            restart := false
-            var splay time.Duration
+            tm.onTemplateRendered(handledRenders, allRenderedTime)
+        }
+    }
+}

-            events := tm.runner.RenderEvents()
-            for id, event := range events {
+func (tm *TaskTemplateManager) onTemplateRendered(handledRenders map[string]time.Time, allRenderedTime time.Time) {

-                // First time through
-                if allRenderedTime.After(event.LastDidRender) || allRenderedTime.Equal(event.LastDidRender) {
-                    handledRenders[id] = allRenderedTime
-                    continue
-                }
+    var handling []string
+    signals := make(map[string]struct{})
+    restart := false
+    var splay time.Duration

-                // We have already handled this one
-                if htime := handledRenders[id]; htime.After(event.LastDidRender) || htime.Equal(event.LastDidRender) {
-                    continue
-                }
+    events := tm.runner.RenderEvents()
+    for id, event := range events {

-                // Lookup the template and determine what to do
-                tmpls, ok := tm.lookup[id]
-                if !ok {
-                    tm.config.Lifecycle.Kill(context.Background(),
-                        structs.NewTaskEvent(structs.TaskKilling).
-                            SetFailsTask().
-                            SetDisplayMessage(fmt.Sprintf("Template runner returned unknown template id %q", id)))
-                    return
-                }
+        // First time through
+        if allRenderedTime.After(event.LastDidRender) || allRenderedTime.Equal(event.LastDidRender) {
+            handledRenders[id] = allRenderedTime
+            continue
+        }

-                // Read environment variables from templates
-                envMap, err := loadTemplateEnv(tm.config.Templates, tm.config.TaskDir, tm.config.EnvBuilder.Build())
-                if err != nil {
-                    tm.config.Lifecycle.Kill(context.Background(),
-                        structs.NewTaskEvent(structs.TaskKilling).
-                            SetFailsTask().
-                            SetDisplayMessage(fmt.Sprintf("Template failed to read environment variables: %v", err)))
-                    return
-                }
-                tm.config.EnvBuilder.SetTemplateEnv(envMap)
-
-                for _, tmpl := range tmpls {
-                    switch tmpl.ChangeMode {
-                    case structs.TemplateChangeModeSignal:
-                        signals[tmpl.ChangeSignal] = struct{}{}
-                    case structs.TemplateChangeModeRestart:
-                        restart = true
-                    case structs.TemplateChangeModeNoop:
-                        continue
-                    }
+        // We have already handled this one
+        if htime := handledRenders[id]; htime.After(event.LastDidRender) || htime.Equal(event.LastDidRender) {
+            continue
+        }

-                    if tmpl.Splay > splay {
-                        splay = tmpl.Splay
-                    }
-                }
+        // Lookup the template and determine what to do
+        tmpls, ok := tm.lookup[id]
+        if !ok {
+            tm.config.Lifecycle.Kill(context.Background(),
+                structs.NewTaskEvent(structs.TaskKilling).
+                    SetFailsTask().
+                    SetDisplayMessage(fmt.Sprintf("Template runner returned unknown template id %q", id)))
+            return
+        }

-                handling = append(handling, id)
+        // Read environment variables from templates
+        envMap, err := loadTemplateEnv(tm.config.Templates, tm.config.TaskDir, tm.config.EnvBuilder.Build())
+        if err != nil {
+            tm.config.Lifecycle.Kill(context.Background(),
+                structs.NewTaskEvent(structs.TaskKilling).
+                    SetFailsTask().
+                    SetDisplayMessage(fmt.Sprintf("Template failed to read environment variables: %v", err)))
+            return
+        }
+        tm.config.EnvBuilder.SetTemplateEnv(envMap)
+
+        for _, tmpl := range tmpls {
+            switch tmpl.ChangeMode {
+            case structs.TemplateChangeModeSignal:
+                signals[tmpl.ChangeSignal] = struct{}{}
+            case structs.TemplateChangeModeRestart:
+                restart = true
+            case structs.TemplateChangeModeNoop:
+                continue
             }

-            if restart || len(signals) != 0 {
-                if splay != 0 {
-                    ns := splay.Nanoseconds()
-                    offset := rand.Int63n(ns)
-                    t := time.Duration(offset)
+            if tmpl.Splay > splay {
+                splay = tmpl.Splay
+            }
+        }

-                    select {
-                    case <-time.After(t):
-                    case <-tm.shutdownCh:
-                        return
-                    }
-                }
+        handling = append(handling, id)
+    }

-                // Update handle time
-                for _, id := range handling {
-                    handledRenders[id] = events[id].LastDidRender
-                }
+    if restart || len(signals) != 0 {
+        if splay != 0 {
+            ns := splay.Nanoseconds()
+            offset := rand.Int63n(ns)
+            t := time.Duration(offset)

-                if restart {
-                    tm.config.Lifecycle.Restart(context.Background(),
-                        structs.NewTaskEvent(structs.TaskRestartSignal).
-                            SetDisplayMessage("Template with change_mode restart re-rendered"), false)
-                } else if len(signals) != 0 {
-                    var mErr multierror.Error
-                    for signal := range signals {
-                        s := tm.signals[signal]
-                        event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetDisplayMessage("Template re-rendered")
-                        if err := tm.config.Lifecycle.Signal(event, signal); err != nil {
-                            multierror.Append(&mErr, err)
-                        }
-                    }
+            select {
+            case <-time.After(t):
+            case <-tm.shutdownCh:
+                return
+            }
+        }
+
+        // Update handle time
+        for _, id := range handling {
+            handledRenders[id] = events[id].LastDidRender
+        }

-                if err := mErr.ErrorOrNil(); err != nil {
-                    flat := make([]os.Signal, 0, len(signals))
-                    for signal := range signals {
-                        flat = append(flat, tm.signals[signal])
-                    }
+        if restart {
+            tm.config.Lifecycle.Restart(context.Background(),
+                structs.NewTaskEvent(structs.TaskRestartSignal).
+                    SetDisplayMessage("Template with change_mode restart re-rendered"), false)
+        } else if len(signals) != 0 {
+            var mErr multierror.Error
+            for signal := range signals {
+                s := tm.signals[signal]
+                event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetDisplayMessage("Template re-rendered")
+                if err := tm.config.Lifecycle.Signal(event, signal); err != nil {
+                    multierror.Append(&mErr, err)
+                }
+            }

-                    tm.config.Lifecycle.Kill(context.Background(),
-                        structs.NewTaskEvent(structs.TaskKilling).
-                            SetFailsTask().
-                            SetDisplayMessage(fmt.Sprintf("Template failed to send signals %v: %v", flat, err)))
-                }
+            if err := mErr.ErrorOrNil(); err != nil {
+                flat := make([]os.Signal, 0, len(signals))
+                for signal := range signals {
+                    flat = append(flat, tm.signals[signal])
                 }
+
+                tm.config.Lifecycle.Kill(context.Background(),
+                    structs.NewTaskEvent(structs.TaskKilling).
+                        SetFailsTask().
+                        SetDisplayMessage(fmt.Sprintf("Template failed to send signals %v: %v", flat, err)))
             }
         }
     }
+
 }

 // allTemplatesNoop returns whether all the managed templates have change mode noop.
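A condensed, self-contained sketch of the decision the first-render path above makes on the restore case. `renderEvent` and `shouldReplayChangeMode` are simplified stand-ins for the runner's render events and the new `dirty`/`IsRunning` check, not real Nomad APIs:

```go
package main

import "fmt"

// renderEvent is a simplified stand-in for the runner's render events used
// above: WouldRender means the template has all the data it needs, DidRender
// means this render pass actually wrote new content to disk.
type renderEvent struct {
	WouldRender bool
	DidRender   bool
}

// shouldReplayChangeMode mirrors the "dirty" check added to the first-render
// wait loop: only when some template wrote new content and the task is
// already running (a restored task) is change_mode handling replayed.
func shouldReplayChangeMode(events []renderEvent, taskRunning bool) bool {
	dirty := false
	for _, ev := range events {
		if ev.WouldRender && ev.DidRender {
			dirty = true
		}
	}
	return dirty && taskRunning
}

func main() {
	// Secret rotated while the client was down: the restore pass rewrites the
	// file, so the task gets its restart/signal.
	changed := []renderEvent{{WouldRender: true, DidRender: true}}
	// Nothing changed: the file on disk already matches, so the task is left
	// alone, as before.
	unchanged := []renderEvent{{WouldRender: true, DidRender: false}}

	fmt.Println(shouldReplayChangeMode(changed, true))   // true
	fmt.Println(shouldReplayChangeMode(unchanged, true)) // false
}
```

When the check fires, the code above hands `onTemplateRendered` a fresh `handledRenders` map and a zero `allRenderedTime`, which is what lets every re-rendered template fall through to its `change_mode`.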
diff --git a/client/allocrunner/taskrunner/template/template_test.go b/client/allocrunner/taskrunner/template/template_test.go
index 5bb989900ff5..ff6fa682d2fe 100644
--- a/client/allocrunner/taskrunner/template/template_test.go
+++ b/client/allocrunner/taskrunner/template/template_test.go
@@ -57,6 +57,9 @@ type MockTaskHooks struct {

     Events      []*structs.TaskEvent
     EmitEventCh chan *structs.TaskEvent
+
+    // hasHandle can be set to simulate restoring a task after client restart
+    hasHandle bool
 }

 func NewMockTaskHooks() *MockTaskHooks {
@@ -98,6 +101,10 @@ func (m *MockTaskHooks) Kill(ctx context.Context, event *structs.TaskEvent) erro
     return nil
 }

+func (m *MockTaskHooks) IsRunning() bool {
+    return m.hasHandle
+}
+
 func (m *MockTaskHooks) EmitEvent(event *structs.TaskEvent) {
     m.Events = append(m.Events, event)
     select {
@@ -760,6 +767,105 @@ func TestTaskTemplateManager_Unblock_Multi_Template(t *testing.T) {
     }
 }

+// TestTaskTemplateManager_FirstRender_Restored tests that a task that's been
+// restored renders and triggers its change mode if the template has changed
+func TestTaskTemplateManager_FirstRender_Restored(t *testing.T) {
+    t.Parallel()
+    require := require.New(t)
+    // Make a template that will render based on a key in Vault
+    vaultPath := "secret/data/password"
+    key := "password"
+    content := "barbaz"
+    embedded := fmt.Sprintf(`{{with secret "%s"}}{{.Data.data.%s}}{{end}}`, vaultPath, key)
+    file := "my.tmpl"
+    template := &structs.Template{
+        EmbeddedTmpl: embedded,
+        DestPath:     file,
+        ChangeMode:   structs.TemplateChangeModeRestart,
+    }
+
+    harness := newTestHarness(t, []*structs.Template{template}, false, true)
+    harness.start(t)
+    defer harness.stop()
+
+    // Ensure no unblock
+    select {
+    case <-harness.mockHooks.UnblockCh:
+        require.Fail("Task unblock should not have been called")
+    case <-time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second):
+    }
+
+    // Write the secret to Vault
+    logical := harness.vault.Client.Logical()
+    _, err := logical.Write(vaultPath, map[string]interface{}{"data": map[string]interface{}{key: content}})
+    require.NoError(err)
+
+    // Wait for the unblock
+    select {
+    case <-harness.mockHooks.UnblockCh:
+    case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second):
+        require.Fail("Task unblock should have been called")
+    }
+
+    // Check the file is there
+    path := filepath.Join(harness.taskDir, file)
+    raw, err := ioutil.ReadFile(path)
+    require.NoError(err, "Failed to read rendered template from %q", path)
+    require.Equal(content, string(raw), "Unexpected template data; got %s, want %q", raw, content)
+
+    // task is now running
+    harness.mockHooks.hasHandle = true
+
+    // simulate a client restart
+    harness.manager.Stop()
+    harness.mockHooks.UnblockCh = make(chan struct{}, 1)
+    harness.start(t)
+
+    // Wait for the unblock
+    select {
+    case <-harness.mockHooks.UnblockCh:
+    case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second):
+        require.Fail("Task unblock should have been called")
+    }
+
+    select {
+    case <-harness.mockHooks.RestartCh:
+        require.Fail("should not have restarted", harness.mockHooks)
+    case <-harness.mockHooks.SignalCh:
+        require.Fail("should not have restarted", harness.mockHooks)
+    case <-time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second):
+    }
+
+    // simulate a client restart and TTL expiry
+    harness.manager.Stop()
+    content = "bazbar"
+    _, err = logical.Write(vaultPath, map[string]interface{}{"data": map[string]interface{}{key: content}})
+    require.NoError(err)
+
+    harness.mockHooks.UnblockCh = make(chan struct{}, 1)
+    harness.start(t)
+
+    // Wait for the unblock
+    select {
+    case <-harness.mockHooks.UnblockCh:
+    case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second):
+        require.Fail("Task unblock should have been called")
+    }
+
+    // Wait for restart
+    timeout := time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second)
+OUTER:
+    for {
+        select {
+        case <-harness.mockHooks.RestartCh:
+            break OUTER
+        case <-harness.mockHooks.SignalCh:
+            require.Fail("Signal with restart policy", harness.mockHooks)
+        case <-timeout:
+            require.Fail("Should have received a restart", harness.mockHooks)
+        }
+    }
+}
+
 func TestTaskTemplateManager_Rerender_Noop(t *testing.T) {
     t.Parallel()
     // Make a template that will render based on a key in Consul
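For reference, the per-event skip rules that `onTemplateRendered` applies — and that the restored-task path defeats by passing `time.Time{}` and an empty map — can be sketched in isolation. The helper below is hypothetical, but the comparisons mirror the ones in the code above:

```go
package main

import (
	"fmt"
	"time"
)

// needsHandling mirrors the two skip checks in onTemplateRendered: renders
// that happened during (or before) the initial render pass, or that were
// already handled, are ignored; everything else falls through to the
// template's change_mode handling.
func needsHandling(lastDidRender, allRenderedTime, handledAt time.Time) bool {
	// "First time through": the render is not newer than the initial render
	// of all templates.
	if allRenderedTime.After(lastDidRender) || allRenderedTime.Equal(lastDidRender) {
		return false
	}
	// Already handled this render.
	if handledAt.After(lastDidRender) || handledAt.Equal(lastDidRender) {
		return false
	}
	return true
}

func main() {
	rendered := time.Now()

	// Normal path: the initial render is not treated as a change.
	fmt.Println(needsHandling(rendered, rendered, time.Time{})) // false

	// Restored-task path: zero allRenderedTime and no handled entry, so a
	// re-rendered (changed) secret does trigger its change_mode.
	fmt.Println(needsHandling(rendered, time.Time{}, time.Time{})) // true
}
```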
diff --git a/website/pages/docs/upgrade/upgrade-specific.mdx b/website/pages/docs/upgrade/upgrade-specific.mdx
index 2eaf31239189..a6e3f812782f 100644
--- a/website/pages/docs/upgrade/upgrade-specific.mdx
+++ b/website/pages/docs/upgrade/upgrade-specific.mdx
@@ -14,6 +14,27 @@ upgrade. However, specific versions of Nomad may have more details provided
 for their upgrades as a result of new features or changed behavior. This
 page is used to document those details separately from the standard
 upgrade flow.
+
+## Nomad 1.0.2
+
+#### Dynamic secrets trigger template changes on client restart
+
+Nomad 1.0.2 changed the behavior of template `change_mode` triggers when a
+client node restarts. In Nomad 1.0.1 and earlier, the first rendering of a
+template after a client restart would not trigger the `change_mode`. For
+dynamic secrets such as the Vault PKI secrets engine, this resulted in the
+secret being updated but not restarting or signalling the task. When the
+secret's lease expired at some later time, the task workload might fail
+because of the stale secret. For example, a web server's SSL certificate would
+be expired and browsers would be unable to connect.
+
+In Nomad 1.0.2, when a client node is restarted, any task with Vault secrets
+that are regenerated or have expired will have its `change_mode` triggered. If
+`change_mode = "restart"`, this will result in the task being restarted, to
+avoid the task failing unexpectedly at some point in the future. This change
+only impacts tasks using dynamic Vault secrets engines such as [PKI][pki], or
+when secrets are rotated. Secrets that don't change in Vault will not trigger
+a `change_mode` on client restart.
+
 ## Nomad 1.0.1

 #### Envoy worker threads
@@ -963,3 +984,4 @@ deleted and then Nomad 0.3.0 can be launched.
 [vault_grace]: /docs/job-specification/template
 [node drain]: https://www.nomadproject.io/docs/upgrade#5-upgrade-clients
 [`template.disable_file_sandbox`]: /docs/configuration/client#template-parameters
+[pki]: https://www.vaultproject.io/docs/secrets/pki