From 4d86fbebad8056f801c6d42eeb4b80d165311400 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Fri, 12 May 2023 19:35:50 +0000 Subject: [PATCH 1/3] client: ignore restart issued to terminal allocations This PR fixes a bug where issuing a restart to a terminal allocation would cause the allocation to run its hooks anyway. This was particularly apparent with group_service_hook who would then register services but then never deregister them - as the allocation would be effectively in a "zombie" state where it is prepped to run tasks but never will. Fixes #17079 Fixes #16238 Fixes #14618 --- .changelog/17175.txt | 3 +++ client/allocrunner/alloc_runner.go | 6 ++++++ 2 files changed, 9 insertions(+) create mode 100644 .changelog/17175.txt diff --git a/.changelog/17175.txt b/.changelog/17175.txt new file mode 100644 index 000000000000..05f5fcc9c59a --- /dev/null +++ b/.changelog/17175.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: Fixed a bug where restart of terminal allocation would turn it into zombie +``` diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 58026c650bf6..d82b238ef4e8 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -1273,6 +1273,12 @@ func (ar *allocRunner) RestartAll(event *structs.TaskEvent) error { // restartTasks restarts all task runners concurrently. func (ar *allocRunner) restartTasks(ctx context.Context, event *structs.TaskEvent, failure bool, force bool) error { + + // ensure we are not trying to restart an alloc that is terminal + if !ar.shouldRun() { + return fmt.Errorf("restart of an alloc that should not run") + } + waitCh := make(chan struct{}) var err *multierror.Error var errMutex sync.Mutex From 509a6bfe34c114f7705f02d8cdf78182088c131a Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Mon, 15 May 2023 14:27:58 +0000 Subject: [PATCH 2/3] e2e: add e2e test for alloc restart zombies --- e2e/clientstate/allocs_test.go | 61 ++++++++++++++++++++++++ e2e/clientstate/input/alloc_zombie.nomad | 59 +++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 e2e/clientstate/allocs_test.go create mode 100644 e2e/clientstate/input/alloc_zombie.nomad diff --git a/e2e/clientstate/allocs_test.go b/e2e/clientstate/allocs_test.go new file mode 100644 index 000000000000..b4f963ea6812 --- /dev/null +++ b/e2e/clientstate/allocs_test.go @@ -0,0 +1,61 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package clientstate + +import ( + "testing" + "time" + + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/shoenig/test/must" + "github.com/shoenig/test/wait" +) + +func TestClientAllocs(t *testing.T) { + nomad := e2eutil.NomadClient(t) + + e2eutil.WaitForLeader(t, nomad) + e2eutil.WaitForNodesReady(t, nomad, 1) + + t.Run("testAllocZombie", testAllocZombie) +} + +// testAllocZombie ensures that a restart of a dead allocation does not cause +// it to come back to life in a not-quite alive state. +// +// https://github.com/hashicorp/nomad/issues/17079 +func testAllocZombie(t *testing.T) { + nomad := e2eutil.NomadClient(t) + + jobID := "alloc-zombie-" + uuid.Short() + jobIDs := []string{jobID} + t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs)) + + // start the job and wait for alloc to become failed + err := e2eutil.Register(jobID, "./input/alloc_zombie.nomad") + must.NoError(t, err) + + allocID := e2eutil.SingleAllocID(t, jobID, "", 0) + + // wait for alloc to be marked as failed + e2eutil.WaitForAllocStatus(t, nomad, allocID, "failed") + + // wait for additional failures to know we got rescheduled + must.Wait(t, wait.InitialSuccess( + wait.BoolFunc(func() bool { + statuses, err := e2eutil.AllocStatusesRescheduled(jobID, "") + must.NoError(t, err) + return len(statuses) > 2 + }), + wait.Timeout(1*time.Minute), + wait.Gap(1*time.Second), + )) + + // now attempt to restart our initial allocation + // which should do nothing but give us an error + output, err := e2eutil.Command("nomad", "alloc", "restart", allocID) + must.ErrorContains(t, err, "restart of an alloc that should not run") + must.StrContains(t, output, "Failed to restart allocation") +} diff --git a/e2e/clientstate/input/alloc_zombie.nomad b/e2e/clientstate/input/alloc_zombie.nomad new file mode 100644 index 000000000000..ac8ecdad1d21 --- /dev/null +++ b/e2e/clientstate/input/alloc_zombie.nomad @@ -0,0 +1,59 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: MPL-2.0 + +job "alloc_zombie" { + + group "group" { + network { + mode = "host" + port "http" {} + } + + service { + name = "alloczombie" + port = "http" + provider = "nomad" + + check { + name = "alloczombiecheck" + type = "http" + port = "http" + path = "/does/not/exist.txt" + interval = "2s" + timeout = "1s" + check_restart { + limit = 1 + grace = "3s" + } + } + } + + reschedule { + attempts = 3 + interval = "1m" + delay = "5s" + delay_function = "constant" + unlimited = false + } + + restart { + attempts = 0 + delay = "5s" + mode = "fail" + } + + task "python" { + driver = "raw_exec" + + config { + command = "python3" + args = ["-m", "http.server", "${NOMAD_PORT_http}", "--directory", "/tmp"] + } + + resources { + cpu = 10 + memory = 64 + } + } + } +} From 499c285faf68c28eb0d0c69dc98051e67ff60a5a Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Tue, 16 May 2023 10:19:00 -0500 Subject: [PATCH 3/3] cl: tweak text Co-authored-by: Tim Gross --- .changelog/17175.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changelog/17175.txt b/.changelog/17175.txt index 05f5fcc9c59a..fd2c2f34573e 100644 --- a/.changelog/17175.txt +++ b/.changelog/17175.txt @@ -1,3 +1,3 @@ ```release-note:bug -client: Fixed a bug where restart of terminal allocation would turn it into zombie +client: Fixed a bug where restarting a terminal allocation turns it into a zombie where allocation and task hooks will run unexpectedly ```