diff --git a/CHANGELOG.md b/CHANGELOG.md
index a972dc6a175b..e6a0628b50b4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 BUG FIXES:
 
+ * consul: Fixed a bug where failing tasks with group services would only cause the allocation to restart once instead of respecting the `restart` field. [[GH-9869](https://github.com/hashicorp/nomad/issues/9869)]
  * consul/connect: Fixed a bug where gateway proxy connection default timeout not set [[GH-9851](https://github.com/hashicorp/nomad/pull/9851)]
  * consul/connect: Fixed a bug preventing more than one connect gateway per Nomad client [[GH-9849](https://github.com/hashicorp/nomad/pull/9849)]
  * scheduler: Fixed a bug where shared ports were not persisted during inplace updates for service jobs. [[GH-9830](https://github.com/hashicorp/nomad/issues/9830)]
@@ -16,7 +17,7 @@ IMPROVEMENTS:
  * consul/connect: Interpolate the connect, service meta, and service canary meta blocks with the task environment [[GH-9586](https://github.com/hashicorp/nomad/pull/9586)]
  * consul/connect: enable configuring custom gateway task [[GH-9639](https://github.com/hashicorp/nomad/pull/9639)]
  * cli: Added JSON/go template formatting to agent-info command. [[GH-9788](https://github.com/hashicorp/nomad/pull/9788)]
-
+
 BUG FIXES:
 
  * client: Fixed a bug where non-`docker` tasks with network isolation were restarted on client restart. [[GH-9757](https://github.com/hashicorp/nomad/issues/9757)]
diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go
index 526cb3e3bb37..279f5f9e48a2 100644
--- a/client/allocrunner/alloc_runner.go
+++ b/client/allocrunner/alloc_runner.go
@@ -1138,6 +1138,9 @@ func (ar *allocRunner) Restart(ctx context.Context, event *structs.TaskEvent, fa
     var err *multierror.Error
     var errMutex sync.Mutex
 
+    // run alloc task restart hooks
+    ar.taskRestartHooks()
+
     go func() {
         var wg sync.WaitGroup
         defer close(waitCh)
@@ -1170,6 +1173,9 @@ func (ar *allocRunner) Restart(ctx context.Context, event *structs.TaskEvent, fa
 func (ar *allocRunner) RestartAll(taskEvent *structs.TaskEvent) error {
     var err *multierror.Error
 
+    // run alloc task restart hooks
+    ar.taskRestartHooks()
+
     for tn := range ar.tasks {
         rerr := ar.RestartTask(tn, taskEvent.Copy())
         if rerr != nil {
diff --git a/client/allocrunner/alloc_runner_hooks.go b/client/allocrunner/alloc_runner_hooks.go
index df797e44c196..2ad59febfad9 100644
--- a/client/allocrunner/alloc_runner_hooks.go
+++ b/client/allocrunner/alloc_runner_hooks.go
@@ -360,3 +360,28 @@ func (ar *allocRunner) shutdownHooks() {
         }
     }
 }
+
+func (ar *allocRunner) taskRestartHooks() {
+    for _, hook := range ar.runnerHooks {
+        re, ok := hook.(interfaces.RunnerTaskRestartHook)
+        if !ok {
+            continue
+        }
+
+        name := re.Name()
+        var start time.Time
+        if ar.logger.IsTrace() {
+            start = time.Now()
+            ar.logger.Trace("running alloc task restart hook",
+                "name", name, "start", start)
+        }
+
+        re.PreTaskRestart()
+
+        if ar.logger.IsTrace() {
+            end := time.Now()
+            ar.logger.Trace("finished alloc task restart hook",
+                "name", name, "end", end, "duration", end.Sub(start))
+        }
+    }
+}
diff --git a/client/allocrunner/groupservice_hook.go b/client/allocrunner/groupservice_hook.go
index 616e7f2cf69b..3e0e52a3c6b7 100644
--- a/client/allocrunner/groupservice_hook.go
+++ b/client/allocrunner/groupservice_hook.go
@@ -93,7 +93,10 @@ func (h *groupServiceHook) Prerun() error {
         h.prerun = true
         h.mu.Unlock()
     }()
+    return h.prerunLocked()
+}
 
+func (h *groupServiceHook) prerunLocked() error {
     if len(h.services) == 0 {
         return nil
     }
@@ -145,10 +148,26 @@ func (h *groupServiceHook) Update(req *interfaces.RunnerUpdateRequest) error {
     return h.consulClient.UpdateWorkload(oldWorkloadServices, newWorkloadServices)
 }
 
+func (h *groupServiceHook) PreTaskRestart() error {
+    h.mu.Lock()
+    defer func() {
+        // Mark prerun as true to unblock Updates
+        h.prerun = true
+        h.mu.Unlock()
+    }()
+
+    h.preKillLocked()
+    return h.prerunLocked()
+}
+
 func (h *groupServiceHook) PreKill() {
     h.mu.Lock()
     defer h.mu.Unlock()
+    h.preKillLocked()
+}
 
+// implements the PreKill hook but requires the caller hold the lock
+func (h *groupServiceHook) preKillLocked() {
     // If we have a shutdown delay deregister
     // group services and then wait
     // before continuing to kill tasks
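For hook authors, a minimal sketch of what the new task-restart lifecycle looks like from the hook side follows. The `exampleHook` type and its logger field are hypothetical; `RunnerTaskRestartHook` and `PreTaskRestart` come from the interface added to `runner_lifecycle.go` later in this diff, and `taskRestartHooks()` above discovers implementations with the same type assertion it uses here.

```go
package allocrunner

import (
	hclog "github.com/hashicorp/go-hclog"

	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
)

// exampleHook is a hypothetical runner hook used only to illustrate the new
// interface; it is not part of this change.
type exampleHook struct {
	logger hclog.Logger
}

// Name satisfies interfaces.RunnerHook.
func (h *exampleHook) Name() string { return "example" }

// PreTaskRestart satisfies interfaces.RunnerTaskRestartHook. The alloc runner
// calls it from taskRestartHooks() before restarting the allocation's tasks,
// mirroring how groupServiceHook.PreTaskRestart above deregisters and then
// re-registers group services. Note the walker above ignores the returned
// error.
func (h *exampleHook) PreTaskRestart() error {
	h.logger.Debug("tasks are about to restart")
	return nil
}

// Compile-time assertions in the style used by groupservice_hook_test.go.
var (
	_ interfaces.RunnerHook            = (*exampleHook)(nil)
	_ interfaces.RunnerTaskRestartHook = (*exampleHook)(nil)
)
```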
diff --git a/client/allocrunner/groupservice_hook_test.go b/client/allocrunner/groupservice_hook_test.go
index 405a2c8337bc..e1bb2ef19d26 100644
--- a/client/allocrunner/groupservice_hook_test.go
+++ b/client/allocrunner/groupservice_hook_test.go
@@ -22,6 +22,7 @@ var _ interfaces.RunnerPrerunHook = (*groupServiceHook)(nil)
 var _ interfaces.RunnerUpdateHook = (*groupServiceHook)(nil)
 var _ interfaces.RunnerPostrunHook = (*groupServiceHook)(nil)
 var _ interfaces.RunnerPreKillHook = (*groupServiceHook)(nil)
+var _ interfaces.RunnerTaskRestartHook = (*groupServiceHook)(nil)
 
 // TestGroupServiceHook_NoGroupServices asserts calling group service hooks
 // without group services does not error.
@@ -50,11 +51,17 @@ func TestGroupServiceHook_NoGroupServices(t *testing.T) {
 
     require.NoError(t, h.Postrun())
 
+    require.NoError(t, h.PreTaskRestart())
+
     ops := consulClient.GetOps()
-    require.Len(t, ops, 4)
-    require.Equal(t, "add", ops[0].Op)
-    require.Equal(t, "update", ops[1].Op)
-    require.Equal(t, "remove", ops[2].Op)
+    require.Len(t, ops, 7)
+    require.Equal(t, "add", ops[0].Op)    // Prerun
+    require.Equal(t, "update", ops[1].Op) // Update
+    require.Equal(t, "remove", ops[2].Op) // Postrun (1st)
+    require.Equal(t, "remove", ops[3].Op) // Postrun (2nd)
+    require.Equal(t, "remove", ops[4].Op) // Restart -> preKill (1st)
+    require.Equal(t, "remove", ops[5].Op) // Restart -> preKill (2nd)
+    require.Equal(t, "add", ops[6].Op)    // Restart -> preRun
 }
 
 // TestGroupServiceHook_ShutdownDelayUpdate asserts calling group service hooks
@@ -117,15 +124,21 @@ func TestGroupServiceHook_GroupServices(t *testing.T) {
 
     require.NoError(t, h.Postrun())
 
+    require.NoError(t, h.PreTaskRestart())
+
     ops := consulClient.GetOps()
-    require.Len(t, ops, 4)
-    require.Equal(t, "add", ops[0].Op)
-    require.Equal(t, "update", ops[1].Op)
-    require.Equal(t, "remove", ops[2].Op)
+    require.Len(t, ops, 7)
+    require.Equal(t, "add", ops[0].Op)    // Prerun
+    require.Equal(t, "update", ops[1].Op) // Update
+    require.Equal(t, "remove", ops[2].Op) // Postrun (1st)
+    require.Equal(t, "remove", ops[3].Op) // Postrun (2nd)
+    require.Equal(t, "remove", ops[4].Op) // Restart -> preKill (1st)
+    require.Equal(t, "remove", ops[5].Op) // Restart -> preKill (2nd)
+    require.Equal(t, "add", ops[6].Op)    // Restart -> preRun
 }
 
 // TestGroupServiceHook_Error asserts group service hooks with group
-// services but no group network returns an error.
+// services but no group network is handled gracefully.
 func TestGroupServiceHook_NoNetwork(t *testing.T) {
     t.Parallel()
 
@@ -159,11 +172,17 @@ func TestGroupServiceHook_NoNetwork(t *testing.T) {
 
     require.NoError(t, h.Postrun())
 
+    require.NoError(t, h.PreTaskRestart())
+
     ops := consulClient.GetOps()
-    require.Len(t, ops, 4)
-    require.Equal(t, "add", ops[0].Op)
-    require.Equal(t, "update", ops[1].Op)
-    require.Equal(t, "remove", ops[2].Op)
+    require.Len(t, ops, 7)
+    require.Equal(t, "add", ops[0].Op)    // Prerun
+    require.Equal(t, "update", ops[1].Op) // Update
+    require.Equal(t, "remove", ops[2].Op) // Postrun (1st)
+    require.Equal(t, "remove", ops[3].Op) // Postrun (2nd)
+    require.Equal(t, "remove", ops[4].Op) // Restart -> preKill (1st)
+    require.Equal(t, "remove", ops[5].Op) // Restart -> preKill (2nd)
+    require.Equal(t, "add", ops[6].Op)    // Restart -> preRun
 }
 
 func TestGroupServiceHook_getWorkloadServices(t *testing.T) {
diff --git a/client/allocrunner/interfaces/runner_lifecycle.go b/client/allocrunner/interfaces/runner_lifecycle.go
index 33713b2c1001..7855deaa3f4d 100644
--- a/client/allocrunner/interfaces/runner_lifecycle.go
+++ b/client/allocrunner/interfaces/runner_lifecycle.go
@@ -55,6 +55,14 @@ type RunnerUpdateRequest struct {
     Alloc *structs.Allocation
 }
 
+// RunnerTaskRestartHooks are executed just before the allocation runner is
+// going to restart all tasks.
+type RunnerTaskRestartHook interface {
+    RunnerHook
+
+    PreTaskRestart() error
+}
+
 // ShutdownHook may be implemented by AllocRunner or TaskRunner hooks and will
 // be called when the agent process is being shutdown gracefully.
 type ShutdownHook interface {
diff --git a/e2e/consul/check_restart.go b/e2e/consul/check_restart.go
new file mode 100644
index 000000000000..63afe5d20223
--- /dev/null
+++ b/e2e/consul/check_restart.go
@@ -0,0 +1,106 @@
+package consul
+
+import (
+    "fmt"
+    "os"
+    "reflect"
+    "regexp"
+    "strings"
+    "time"
+
+    e2e "github.com/hashicorp/nomad/e2e/e2eutil"
+    "github.com/hashicorp/nomad/e2e/framework"
+    "github.com/hashicorp/nomad/helper/uuid"
+)
+
+const ns = ""
+
+type CheckRestartE2ETest struct {
+    framework.TC
+    jobIds []string
+}
+
+func (tc *CheckRestartE2ETest) BeforeAll(f *framework.F) {
+    e2e.WaitForLeader(f.T(), tc.Nomad())
+    e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1)
+}
+
+func (tc *CheckRestartE2ETest) AfterEach(f *framework.F) {
+    if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
+        return
+    }
+
+    for _, id := range tc.jobIds {
+        _, err := e2e.Command("nomad", "job", "stop", "-purge", id)
+        f.Assert().NoError(err)
+    }
+    tc.jobIds = []string{}
+    _, err := e2e.Command("nomad", "system", "gc")
+    f.Assert().NoError(err)
+}
+
+// TestGroupCheckRestart runs a job with a group service that will never
+// become healthy. Both tasks should be restarted up to the 'restart' limit.
+func (tc *CheckRestartE2ETest) TestGroupCheckRestart(f *framework.F) {
+
+    jobID := "test-group-check-restart-" + uuid.Short()
+    f.NoError(e2e.Register(jobID, "consul/input/checks_group_restart.nomad"))
+    tc.jobIds = append(tc.jobIds, jobID)
+
+    var allocID string
+
+    f.NoError(
+        e2e.WaitForAllocStatusComparison(
+            func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
+            func(got []string) bool { return reflect.DeepEqual(got, []string{"failed"}) },
+            &e2e.WaitConfig{Interval: time.Second * 10, Retries: 30},
+        ))
+
+    expected := "Exceeded allowed attempts 2 in interval 5m0s and mode is \"fail\""
+
+    out, err := e2e.Command("nomad", "alloc", "status", allocID)
+    f.NoError(err, "could not get allocation status")
+    f.Contains(out, expected,
+        fmt.Errorf("expected '%s', got\n%v", expected, out))
+
+    re := regexp.MustCompile(`Total Restarts += (.*)\n`)
+    match := re.FindAllStringSubmatch(out, -1)
+    for _, m := range match {
+        f.Equal("2", strings.TrimSpace(m[1]),
+            fmt.Errorf("expected exactly 2 restarts for both tasks, got:\n%v", out))
+    }
+}
+
+// TestTaskCheckRestart runs a job with a task service that will never become
+// healthy. Only the failed task should be restarted up to the 'restart'
+// limit.
+func (tc *CheckRestartE2ETest) TestTaskCheckRestart(f *framework.F) {
+
+    jobID := "test-task-check-restart-" + uuid.Short()
+    f.NoError(e2e.Register(jobID, "consul/input/checks_task_restart.nomad"))
+    tc.jobIds = append(tc.jobIds, jobID)
+
+    var allocID string
+
+    f.NoError(
+        e2e.WaitForAllocStatusComparison(
+            func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
+            func(got []string) bool { return reflect.DeepEqual(got, []string{"failed"}) },
+            &e2e.WaitConfig{Interval: time.Second * 10, Retries: 30},
+        ))
+
+    expected := "Exceeded allowed attempts 2 in interval 5m0s and mode is \"fail\""
+
+    out, err := e2e.Command("nomad", "alloc", "status", allocID)
+    f.NoError(err, "could not get allocation status")
+    f.Contains(out, expected,
+        fmt.Errorf("expected '%s', got\n%v", expected, out))
+
+    re := regexp.MustCompile(`Total Restarts += (.*)\n`)
+    match := re.FindAllStringSubmatch(out, -1)
+    f.Equal("2", strings.TrimSpace(match[0][1]),
+        fmt.Errorf("expected exactly 2 restarts for failed task, got:\n%v", out))
+
+    f.Equal("0", strings.TrimSpace(match[1][1]),
+        fmt.Errorf("expected exactly no restarts for healthy task, got:\n%v", out))
+}
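One gap worth noting in the new e2e tests: both declare `var allocID string` but never assign it before passing it to `nomad alloc status`, so the status lookup would query an empty ID. A minimal sketch of the missing lookup, assuming the `e2e.AllocsForJob` helper from `e2e/e2eutil` (not shown in this diff) returns one row per allocation keyed by column name, might look like this; each test could then set `allocID = resolveSingleAllocID(f, jobID)` right after the wait completes.

```go
// resolveSingleAllocID is a hypothetical helper for the tests above: after
// WaitForAllocStatusComparison reports the allocation as "failed", look up
// its ID so the `nomad alloc status` call has something to query.
// Assumes e2e.AllocsForJob(jobID, ns) ([]map[string]string, error) exists.
func resolveSingleAllocID(f *framework.F, jobID string) string {
	allocs, err := e2e.AllocsForJob(jobID, ns)
	f.NoError(err, "could not query allocations for job")
	f.Len(allocs, 1, "expected exactly one allocation")
	return allocs[0]["ID"]
}
```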
diff --git a/e2e/consul/consul.go b/e2e/consul/consul.go
index 109db29cf817..20212480ffa1 100644
--- a/e2e/consul/consul.go
+++ b/e2e/consul/consul.go
@@ -35,6 +35,7 @@ func init() {
         Cases: []framework.TestCase{
             new(ConsulE2ETest),
             new(ScriptChecksE2ETest),
+            new(CheckRestartE2ETest),
         },
     })
 }
diff --git a/e2e/consul/input/checks_group_restart.nomad b/e2e/consul/input/checks_group_restart.nomad
new file mode 100644
index 000000000000..0f520cbbc1ab
--- /dev/null
+++ b/e2e/consul/input/checks_group_restart.nomad
@@ -0,0 +1,64 @@
+job "group_check_restart" {
+  datacenters = ["dc1"]
+  type        = "service"
+
+  constraint {
+    attribute = "${attr.kernel.name}"
+    value     = "linux"
+  }
+
+  group "group_check_restart" {
+    network {
+      mode = "bridge"
+    }
+
+    restart {
+      attempts = 2
+      delay    = "1s"
+      interval = "5m"
+      mode     = "fail"
+    }
+
+    service {
+      name = "group-service-1"
+      port = "9003"
+
+      # this check should always time out and so the service
+      # should not be marked healthy, resulting in the tasks
+      # getting restarted
+      check {
+        name     = "always-dead"
+        type     = "script"
+        task     = "fail"
+        interval = "2s"
+        timeout  = "1s"
+        command  = "sleep"
+        args     = ["10"]
+
+        check_restart {
+          limit           = 2
+          grace           = "5s"
+          ignore_warnings = false
+        }
+      }
+    }
+
+    task "fail" {
+      driver = "raw_exec"
+
+      config {
+        command = "bash"
+        args    = ["-c", "sleep 15000"]
+      }
+    }
+
+    task "ok" {
+      driver = "raw_exec"
+
+      config {
+        command = "bash"
+        args    = ["-c", "sleep 15000"]
+      }
+    }
+  }
+}
["10"] + + check_restart { + limit = 2 + grace = "5s" + ignore_warnings = false + } + } + } + + task "fail" { + driver = "raw_exec" + + config { + command = "bash" + args = ["-c", "sleep 15000"] + } + } + + task "ok" { + driver = "raw_exec" + + config { + command = "bash" + args = ["-c", "sleep 15000"] + } + } + } +} diff --git a/e2e/consul/input/checks_task_restart.nomad b/e2e/consul/input/checks_task_restart.nomad new file mode 100644 index 000000000000..f36ed77b7379 --- /dev/null +++ b/e2e/consul/input/checks_task_restart.nomad @@ -0,0 +1,63 @@ +job "task_check" { + datacenters = ["dc1"] + type = "service" + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "task_check" { + count = 1 + + restart { + attempts = 2 + delay = "1s" + interval = "5m" + mode = "fail" + } + + task "fail" { + + service { + name = "task-service-1" + + # this check should always time out and so the service + # should not be marked healthy + check { + name = "always-dead" + type = "script" + interval = "2s" + timeout = "1s" + command = "sleep" + args = ["10"] + + check_restart { + limit = 2 + grace = "5s" + ignore_warnings = false + } + + } + } + + driver = "raw_exec" + + config { + command = "bash" + args = ["-c", "sleep 15000"] + } + } + + + task "ok" { + driver = "raw_exec" + + config { + command = "bash" + args = ["-c", "sleep 15000"] + } + } + + } +} diff --git a/website/content/docs/job-specification/check_restart.mdx b/website/content/docs/job-specification/check_restart.mdx index 8617b664a83d..52b85d002d9e 100644 --- a/website/content/docs/job-specification/check_restart.mdx +++ b/website/content/docs/job-specification/check_restart.mdx @@ -16,10 +16,6 @@ description: |- ]} /> -~> The `check_restart` stanza in Nomad is only supported for task networks, -_not_ group networks. Please follow [#9176][gh-9176] to be notified when -this is fixed. - As of Nomad 0.7 the `check_restart` stanza instructs Nomad when to restart tasks with unhealthy service checks. When a health check in Consul has been unhealthy for the `limit` specified in a `check_restart` stanza, it is