From 85b226375a2502fbbde44cf9e16391eec32d3a9f Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Tue, 30 Nov 2021 17:03:16 -0500
Subject: [PATCH] provide `-no-shutdown-delay` flag for job/alloc stop

Some operators use very long group/task `shutdown_delay` settings to
safely drain network connections to their workloads after service
deregistration. But during incident response, they may want to skip that
drain so they can shed load quickly.

Provide a `-no-shutdown-delay` flag on the `nomad alloc stop` and
`nomad job stop` commands that bypasses the delay. This sets a new
desired transition state on the affected allocations, which the
allocation/task runner will identify during pre-kill on the client.

Note (as documented here) that using this flag will almost always result
in failed inbound network connections for workloads, as tasks will exit
before clients receive updated service discovery information, and
connections won't be gracefully drained.

Also note that an allocation that is already waiting for the
`shutdown_delay` will not be interrupted by running `job stop` or
`alloc stop` again with `-no-shutdown-delay` set; once we've set an
allocation to stop, the allocation runner ignores further updates.
---
 .changelog/11596.txt                          |  3 +
 api/jobs.go                                   |  9 +-
 client/allocrunner/alloc_runner_hooks.go      |  2 +-
 client/allocrunner/groupservice_hook.go       | 12 +--
 .../interfaces/runner_lifecycle.go            |  2 +-
 client/allocrunner/taskrunner/task_runner.go  |  2 +-
 .../taskrunner/task_runner_test.go            | 76 +++++++++++++++-
 command/agent/alloc_endpoint.go               | 12 ++-
 command/agent/job_endpoint.go                 | 12 +++
 command/alloc_stop.go                         | 18 +++-
 command/job_stop.go                           | 34 ++++---
 nomad/alloc_endpoint.go                       |  3 +-
 nomad/fsm.go                                  | 22 ++++-
 nomad/job_endpoint_test.go                    | 91 +++++++++++++++++++
 nomad/state/state_store.go                    |  6 +-
 nomad/structs/structs.go                      | 23 ++++-
 website/content/docs/commands/alloc/stop.mdx  |  9 ++
 website/content/docs/commands/job/stop.mdx    |  9 ++
 18 files changed, 308 insertions(+), 37 deletions(-)
 create mode 100644 .changelog/11596.txt

diff --git a/.changelog/11596.txt b/.changelog/11596.txt
new file mode 100644
index 000000000000..74b451c02787
--- /dev/null
+++ b/.changelog/11596.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+cli: provide `-no-shutdown-delay` option to `job stop` and `alloc stop` commands to ignore `shutdown_delay`
+```
diff --git a/api/jobs.go b/api/jobs.go
index 146f65cf7131..b3c3944bc8c7 100644
--- a/api/jobs.go
+++ b/api/jobs.go
@@ -299,6 +299,11 @@ type DeregisterOptions struct {
 	// is useful when an operator wishes to push through a job deregistration
 	// in busy clusters with a large evaluation backlog.
 	EvalPriority int
+
+	// NoShutdownDelay, if set to true, will override the group and
+	// task shutdown_delay configuration and set it to "0s" for any
+	// allocations stopped as a result of this Deregister call.
+	NoShutdownDelay bool
 }
 
 // DeregisterOpts is used to remove an existing job. See DeregisterOptions
@@ -312,8 +317,8 @@ func (j *Jobs) DeregisterOpts(jobID string, opts *DeregisterOptions, q *WriteOpt
 	// Protect against nil opts. url.Values expects a string, and so using
 	// fmt.Sprintf is the best way to do this.
if opts != nil { - endpoint += fmt.Sprintf("?purge=%t&global=%t&eval_priority=%v", - opts.Purge, opts.Global, opts.EvalPriority) + endpoint += fmt.Sprintf("?purge=%t&global=%t&eval_priority=%v&no_shutdown_delay=%t", + opts.Purge, opts.Global, opts.EvalPriority, opts.NoShutdownDelay) } wm, err := j.client.delete(endpoint, &resp, q) diff --git a/client/allocrunner/alloc_runner_hooks.go b/client/allocrunner/alloc_runner_hooks.go index 9624e633c730..f6e95726fd27 100644 --- a/client/allocrunner/alloc_runner_hooks.go +++ b/client/allocrunner/alloc_runner_hooks.go @@ -338,7 +338,7 @@ func (ar *allocRunner) preKillHooks() { ar.logger.Trace("running alloc pre shutdown hook", "name", name, "start", start) } - pre.PreKill() + pre.PreKill(ar.alloc) if ar.logger.IsTrace() { end := time.Now() diff --git a/client/allocrunner/groupservice_hook.go b/client/allocrunner/groupservice_hook.go index 778109e65f49..8abee3d22bce 100644 --- a/client/allocrunner/groupservice_hook.go +++ b/client/allocrunner/groupservice_hook.go @@ -29,7 +29,6 @@ type groupServiceHook struct { consulClient consul.ConsulServiceAPI consulNamespace string prerun bool - delay time.Duration deregistered bool networkStatusGetter networkStatusGetter @@ -41,6 +40,7 @@ type groupServiceHook struct { networks structs.Networks ports structs.AllocatedPorts taskEnvBuilder *taskenv.Builder + delay time.Duration // Since Update() may be called concurrently with any other hook all // hook methods must be fully serialized @@ -164,24 +164,24 @@ func (h *groupServiceHook) PreTaskRestart() error { h.mu.Unlock() }() - h.preKillLocked() + h.preKillLocked(nil) return h.prerunLocked() } -func (h *groupServiceHook) PreKill() { +func (h *groupServiceHook) PreKill(alloc *structs.Allocation) { h.mu.Lock() defer h.mu.Unlock() - h.preKillLocked() + h.preKillLocked(alloc) } // implements the PreKill hook but requires the caller hold the lock -func (h *groupServiceHook) preKillLocked() { +func (h *groupServiceHook) preKillLocked(alloc *structs.Allocation) { // If we have a shutdown delay deregister group services and then wait // before continuing to kill tasks. 
h.deregister() h.deregistered = true - if h.delay == 0 { + if h.delay == 0 || (alloc != nil && alloc.DesiredTransition.ShouldIgnoreShutdownDelay()) { return } diff --git a/client/allocrunner/interfaces/runner_lifecycle.go b/client/allocrunner/interfaces/runner_lifecycle.go index 7855deaa3f4d..92ef52b83002 100644 --- a/client/allocrunner/interfaces/runner_lifecycle.go +++ b/client/allocrunner/interfaces/runner_lifecycle.go @@ -22,7 +22,7 @@ type RunnerPrerunHook interface { type RunnerPreKillHook interface { RunnerHook - PreKill() + PreKill(*structs.Allocation) } // RunnerPostrunHooks are executed after calling TaskRunner.Run, even for diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 68df0827f60e..e50ad37ff130 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -889,7 +889,7 @@ func (tr *TaskRunner) handleKill(resultCh <-chan *drivers.ExitResult) *drivers.E // Wait for task ShutdownDelay after running prekill hooks // This allows for things like service de-registration to run // before waiting to kill task - if delay := tr.Task().ShutdownDelay; delay != 0 { + if delay := tr.Task().ShutdownDelay; delay != 0 && !tr.alloc.DesiredTransition.ShouldIgnoreShutdownDelay() { tr.logger.Debug("waiting before killing task", "shutdown_delay", delay) select { diff --git a/client/allocrunner/taskrunner/task_runner_test.go b/client/allocrunner/taskrunner/task_runner_test.go index f3ef206cfd68..8b166c4b6818 100644 --- a/client/allocrunner/taskrunner/task_runner_test.go +++ b/client/allocrunner/taskrunner/task_runner_test.go @@ -14,6 +14,10 @@ import ( "time" "github.com/golang/snappy" + "github.com/kr/pretty" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/allocrunner/interfaces" "github.com/hashicorp/nomad/client/config" @@ -26,6 +30,7 @@ import ( agentconsul "github.com/hashicorp/nomad/command/agent/consul" mockdriver "github.com/hashicorp/nomad/drivers/mock" "github.com/hashicorp/nomad/drivers/rawexec" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" @@ -33,9 +38,6 @@ import ( "github.com/hashicorp/nomad/plugins/device" "github.com/hashicorp/nomad/plugins/drivers" "github.com/hashicorp/nomad/testutil" - "github.com/kr/pretty" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" ) type MockTaskStateUpdater struct { @@ -996,6 +998,74 @@ WAIT: } } +// TestTaskRunner_NoShutdownDelay asserts services are removed from +// Consul and tasks are killed without waiting for ${shutdown_delay} +// when the alloc has the NoShutdownDelay transition flag set. 
+func TestTaskRunner_NoShutdownDelay(t *testing.T) {
+	t.Parallel()
+
+	alloc := mock.Alloc()
+	alloc.DesiredTransition = structs.DesiredTransition{NoShutdownDelay: helper.BoolToPtr(true)}
+	task := alloc.Job.TaskGroups[0].Tasks[0]
+	task.Services[0].Tags = []string{"tag1"}
+	task.Services = task.Services[:1] // only need 1 for this test
+	task.Driver = "mock_driver"
+	task.Config = map[string]interface{}{
+		"run_for": "1000s",
+	}
+
+	// set the shutdown delay high; we'll time out well before this
+	task.ShutdownDelay = 1 * time.Minute
+
+	tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name)
+	defer cleanup()
+
+	mockConsul := conf.Consul.(*consulapi.MockConsulServiceClient)
+
+	testWaitForTaskToStart(t, tr)
+
+	testutil.WaitForResult(func() (bool, error) {
+		ops := mockConsul.GetOps()
+		if n := len(ops); n != 1 {
+			return false, fmt.Errorf("expected 1 consul operation. Found %d", n)
+		}
+		return ops[0].Op == "add", fmt.Errorf("consul operation was not a registration: %#v", ops[0])
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
+
+	killed := make(chan error)
+	go func() {
+		err := tr.Kill(context.Background(), structs.NewTaskEvent("test"))
+		killed <- err
+	}()
+
+	// Wait for first de-registration call. Note that unlike
+	// TestTaskRunner_ShutdownDelay, we're racing with task exit
+	// and can't assert that we only get the first deregistration op
+	// (from serviceHook.PreKill).
+	testutil.WaitForResult(func() (bool, error) {
+		ops := mockConsul.GetOps()
+		if n := len(ops); n < 2 {
+			return false, fmt.Errorf("expected at least 2 consul operations")
+		}
+		return ops[1].Op == "remove", fmt.Errorf(
+			"consul operation was not a deregistration: %#v", ops[1])
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
+
+	// Wait for the task to exit
+	select {
+	case <-tr.WaitCh():
+	case <-time.After(time.Duration(testutil.TestMultiplier()) * time.Second):
+		t.Fatalf("task kill did not ignore shutdown delay")
+	}
+
+	err := <-killed
+	require.NoError(t, err, "killing task returned unexpected error")
+}
+
 // TestTaskRunner_Dispatch_Payload asserts that a dispatch job runs and the
 // payload was written to disk.
 func TestTaskRunner_Dispatch_Payload(t *testing.T) {
diff --git a/command/agent/alloc_endpoint.go b/command/agent/alloc_endpoint.go
index d1a7e210c5d2..f6f724001bae 100644
--- a/command/agent/alloc_endpoint.go
+++ b/command/agent/alloc_endpoint.go
@@ -138,8 +138,18 @@ func (s *HTTPServer) allocStop(allocID string, resp http.ResponseWriter, req *ht
 		return nil, CodedError(405, ErrInvalidMethod)
 	}
 
+	noShutdownDelay := false
+	if noShutdownDelayQS := req.URL.Query().Get("no_shutdown_delay"); noShutdownDelayQS != "" {
+		var err error
+		noShutdownDelay, err = strconv.ParseBool(noShutdownDelayQS)
+		if err != nil {
+			return nil, fmt.Errorf("no_shutdown_delay value is not a boolean: %v", err)
+		}
+	}
+
 	sr := &structs.AllocStopRequest{
-		AllocID: allocID,
+		AllocID:         allocID,
+		NoShutdownDelay: noShutdownDelay,
 	}
 
 	s.parseWriteRequest(req, &sr.WriteRequest)
diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go
index 8a9da76febae..1ff8a7bde07c 100644
--- a/command/agent/job_endpoint.go
+++ b/command/agent/job_endpoint.go
@@ -453,6 +453,18 @@ func (s *HTTPServer) jobDelete(resp http.ResponseWriter, req *http.Request,
 		return nil, err
 	}
 
+	// Identify the no_shutdown_delay query param and parse.
+	noShutdownDelayStr := req.URL.Query().Get("no_shutdown_delay")
+	var noShutdownDelay bool
+	if noShutdownDelayStr != "" {
+		var err error
+		noShutdownDelay, err = strconv.ParseBool(noShutdownDelayStr)
+		if err != nil {
+			return nil, fmt.Errorf("Failed to parse value of %q (%v) as a bool: %v", "no_shutdown_delay", noShutdownDelayStr, err)
+		}
+	}
+	args.NoShutdownDelay = noShutdownDelay
+
 	// Validate the evaluation priority if the user supplied a non-default
 	// value. It's more efficient to do it here, within the agent rather than
 	// sending a bad request for the server to reject.
diff --git a/command/alloc_stop.go b/command/alloc_stop.go
index 3c8f9cbd93c9..2875f62598e8 100644
--- a/command/alloc_stop.go
+++ b/command/alloc_stop.go
@@ -38,6 +38,14 @@ Stop Specific Options:
     screen, which can be used to examine the rescheduling evaluation using the
     eval-status command.
 
+  -no-shutdown-delay
+    Ignore the group and task shutdown_delay configuration so there is no
+    delay between service deregistration and task shutdown. Note that using
+    this flag will result in failed network connections to the allocation
+    being stopped. Also note that an allocation that is already waiting for
+    the shutdown_delay will not be interrupted by running alloc stop again
+    with this flag.
+
   -verbose
     Show full information.
 `
@@ -47,12 +55,13 @@ Stop Specific Options:
 func (c *AllocStopCommand) Name() string { return "alloc stop" }
 
 func (c *AllocStopCommand) Run(args []string) int {
-	var detach, verbose bool
+	var detach, verbose, noShutdownDelay bool
 
 	flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
 	flags.Usage = func() { c.Ui.Output(c.Help()) }
 	flags.BoolVar(&detach, "detach", false, "")
 	flags.BoolVar(&verbose, "verbose", false, "")
+	flags.BoolVar(&noShutdownDelay, "no-shutdown-delay", false, "")
 
 	if err := flags.Parse(args); err != nil {
 		return 1
@@ -115,7 +124,12 @@ func (c *AllocStopCommand) Run(args []string) int {
 		return 1
 	}
 
-	resp, err := client.Allocations().Stop(alloc, nil)
+	var opts *api.QueryOptions
+	if noShutdownDelay {
+		opts = &api.QueryOptions{Params: map[string]string{"no_shutdown_delay": "true"}}
+	}
+
+	resp, err := client.Allocations().Stop(alloc, opts)
 	if err != nil {
 		c.Ui.Error(fmt.Sprintf("Error stopping allocation: %s", err))
 		return 1
diff --git a/command/job_stop.go b/command/job_stop.go
index 8dd5d8a11972..e007db7a49c1 100644
--- a/command/job_stop.go
+++ b/command/job_stop.go
@@ -43,14 +43,22 @@ Stop Options:
     Override the priority of the evaluations produced as a result of this job
     deregistration. By default, this is set to the priority of the job.
 
-  -purge
-    Purge is used to stop the job and purge it from the system. If not set, the
-    job will still be queryable and will be purged by the garbage collector.
-
   -global
     Stop a multi-region job in all its regions. By default job stop will stop
     only a single region at a time. Ignored for single-region jobs.
 
+  -no-shutdown-delay
+    Ignore the group and task shutdown_delay configuration so there is no
+    delay between service deregistration and task shutdown. Note that using
+    this flag will result in failed network connections to the allocation
+    being stopped. Also note that an allocation that is already waiting for
+    the shutdown_delay will not be interrupted by running job stop again with
+    this flag.
+
+  -purge
+    Purge is used to stop the job and purge it from the system. If not set, the
+    job will still be queryable and will be purged by the garbage collector.
+
   -yes
     Automatic yes to prompts.
@@ -67,12 +75,13 @@ func (c *JobStopCommand) Synopsis() string { func (c *JobStopCommand) AutocompleteFlags() complete.Flags { return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), complete.Flags{ - "-detach": complete.PredictNothing, - "-eval-priority": complete.PredictNothing, - "-purge": complete.PredictNothing, - "-global": complete.PredictNothing, - "-yes": complete.PredictNothing, - "-verbose": complete.PredictNothing, + "-detach": complete.PredictNothing, + "-eval-priority": complete.PredictNothing, + "-purge": complete.PredictNothing, + "-global": complete.PredictNothing, + "-no-shutdown-delay": complete.PredictNothing, + "-yes": complete.PredictNothing, + "-verbose": complete.PredictNothing, }) } @@ -94,7 +103,7 @@ func (c *JobStopCommand) AutocompleteArgs() complete.Predictor { func (c *JobStopCommand) Name() string { return "job stop" } func (c *JobStopCommand) Run(args []string) int { - var detach, purge, verbose, global, autoYes bool + var detach, purge, verbose, global, autoYes, noShutdownDelay bool var evalPriority int flags := c.Meta.FlagSet(c.Name(), FlagSetClient) @@ -102,6 +111,7 @@ func (c *JobStopCommand) Run(args []string) int { flags.BoolVar(&detach, "detach", false, "") flags.BoolVar(&verbose, "verbose", false, "") flags.BoolVar(&global, "global", false, "") + flags.BoolVar(&noShutdownDelay, "no-shutdown-delay", false, "") flags.BoolVar(&autoYes, "yes", false, "") flags.BoolVar(&purge, "purge", false, "") flags.IntVar(&evalPriority, "eval-priority", 0, "") @@ -199,7 +209,7 @@ func (c *JobStopCommand) Run(args []string) int { } // Invoke the stop - opts := &api.DeregisterOptions{Purge: purge, Global: global, EvalPriority: evalPriority} + opts := &api.DeregisterOptions{Purge: purge, Global: global, EvalPriority: evalPriority, NoShutdownDelay: noShutdownDelay} wq := &api.WriteOptions{Namespace: jobs[0].JobSummary.Namespace} evalID, _, err := client.Jobs().DeregisterOpts(*job.ID, opts, wq) if err != nil { diff --git a/nomad/alloc_endpoint.go b/nomad/alloc_endpoint.go index 3a32b5f19646..0b44175adf62 100644 --- a/nomad/alloc_endpoint.go +++ b/nomad/alloc_endpoint.go @@ -320,7 +320,8 @@ func (a *Alloc) Stop(args *structs.AllocStopRequest, reply *structs.AllocStopRes Evals: []*structs.Evaluation{eval}, Allocs: map[string]*structs.DesiredTransition{ args.AllocID: { - Migrate: helper.BoolToPtr(true), + Migrate: helper.BoolToPtr(true), + NoShutdownDelay: helper.BoolToPtr(args.NoShutdownDelay), }, }, } diff --git a/nomad/fsm.go b/nomad/fsm.go index 84721014560f..45725858fd1b 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -11,6 +11,7 @@ import ( log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-msgpack/codec" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -605,7 +606,7 @@ func (n *nomadFSM) applyDeregisterJob(msgType structs.MessageType, buf []byte, i } err := n.state.WithWriteTransaction(msgType, index, func(tx state.Txn) error { - err := n.handleJobDeregister(index, req.JobID, req.Namespace, req.Purge, tx) + err := n.handleJobDeregister(index, req.JobID, req.Namespace, req.Purge, req.NoShutdownDelay, tx) if err != nil { n.logger.Error("deregistering job failed", @@ -645,7 +646,7 @@ func (n *nomadFSM) applyBatchDeregisterJob(msgType structs.MessageType, buf []by // evals for jobs whose deregistering didn't get committed yet. 
err := n.state.WithWriteTransaction(msgType, index, func(tx state.Txn) error { for jobNS, options := range req.Jobs { - if err := n.handleJobDeregister(index, jobNS.ID, jobNS.Namespace, options.Purge, tx); err != nil { + if err := n.handleJobDeregister(index, jobNS.ID, jobNS.Namespace, options.Purge, false, tx); err != nil { n.logger.Error("deregistering job failed", "job", jobNS.ID, "error", err) return err } @@ -670,12 +671,27 @@ func (n *nomadFSM) applyBatchDeregisterJob(msgType structs.MessageType, buf []by // handleJobDeregister is used to deregister a job. Leaves error logging up to // caller. -func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, purge bool, tx state.Txn) error { +func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, purge bool, noShutdownDelay bool, tx state.Txn) error { // If it is periodic remove it from the dispatcher if err := n.periodicDispatcher.Remove(namespace, jobID); err != nil { return fmt.Errorf("periodicDispatcher.Remove failed: %w", err) } + if noShutdownDelay { + ws := memdb.NewWatchSet() + allocs, err := n.state.AllocsByJob(ws, namespace, jobID, false) + if err != nil { + return err + } + transition := &structs.DesiredTransition{NoShutdownDelay: helper.BoolToPtr(true)} + for _, alloc := range allocs { + err := n.state.UpdateAllocDesiredTransitionTxn(tx, index, alloc.ID, transition) + if err != nil { + return err + } + } + } + if purge { if err := n.state.DeleteJobTxn(index, namespace, jobID, tx); err != nil { return fmt.Errorf("DeleteJob failed: %w", err) diff --git a/nomad/job_endpoint_test.go b/nomad/job_endpoint_test.go index a3e12332f311..92351c86ceec 100644 --- a/nomad/job_endpoint_test.go +++ b/nomad/job_endpoint_test.go @@ -3737,6 +3737,97 @@ func TestJobEndpoint_Deregister_EvalCreation_Legacy(t *testing.T) { }) } +func TestJobEndpoint_Deregister_NoShutdownDelay(t *testing.T) { + t.Parallel() + require := require.New(t) + + s1, cleanupS1 := TestServer(t, func(c *Config) { + c.NumSchedulers = 0 // Prevent automatic dequeue + }) + defer cleanupS1() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the register requests + job := mock.Job() + reg := &structs.JobRegisterRequest{ + Job: job, + WriteRequest: structs.WriteRequest{ + Region: "global", + Namespace: job.Namespace, + }, + } + + // Fetch the response + var resp0 structs.JobRegisterResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", reg, &resp0)) + + // Deregister but don't purge + dereg1 := &structs.JobDeregisterRequest{ + JobID: job.ID, + WriteRequest: structs.WriteRequest{ + Region: "global", + Namespace: job.Namespace, + }, + } + var resp1 structs.JobDeregisterResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Deregister", dereg1, &resp1)) + require.NotZero(resp1.Index) + + // Check for the job in the FSM + state := s1.fsm.State() + out, err := state.JobByID(nil, job.Namespace, job.ID) + require.NoError(err) + require.NotNil(out) + require.True(out.Stop) + + // Lookup the evaluation + eval, err := state.EvalByID(nil, resp1.EvalID) + require.NoError(err) + require.NotNil(eval) + require.EqualValues(resp1.EvalCreateIndex, eval.CreateIndex) + require.Equal(structs.EvalTriggerJobDeregister, eval.TriggeredBy) + + // Lookup allocation transitions + var ws memdb.WatchSet + allocs, err := state.AllocsByJob(ws, job.Namespace, job.ID, true) + require.NoError(err) + + for _, alloc := range allocs { + require.Nil(alloc.DesiredTransition) + } + + // Deregister with no shutdown delay + dereg2 := 
&structs.JobDeregisterRequest{
+		JobID:           job.ID,
+		NoShutdownDelay: true,
+		WriteRequest: structs.WriteRequest{
+			Region:    "global",
+			Namespace: job.Namespace,
+		},
+	}
+	var resp2 structs.JobDeregisterResponse
+	require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Deregister", dereg2, &resp2))
+	require.NotZero(resp2.Index)
+
+	// Lookup the evaluation
+	eval, err = state.EvalByID(nil, resp2.EvalID)
+	require.NoError(err)
+	require.NotNil(eval)
+	require.EqualValues(resp2.EvalCreateIndex, eval.CreateIndex)
+	require.Equal(structs.EvalTriggerJobDeregister, eval.TriggeredBy)
+
+	// Lookup allocation transitions
+	allocs, err = state.AllocsByJob(ws, job.Namespace, job.ID, true)
+	require.NoError(err)
+
+	for _, alloc := range allocs {
+		require.NotNil(alloc.DesiredTransition)
+		require.True(*(alloc.DesiredTransition.NoShutdownDelay))
+	}
+}
+
 func TestJobEndpoint_BatchDeregister(t *testing.T) {
 	t.Parallel()
 	require := require.New(t)
diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go
index 3d08fdca768a..4f2086ad4d15 100644
--- a/nomad/state/state_store.go
+++ b/nomad/state/state_store.go
@@ -1600,7 +1600,7 @@ func (s *StateStore) upsertJobImpl(index uint64, job *structs.Job, keepVersion b
 	}
 
 	if err := s.updateJobCSIPlugins(index, job, existingJob, txn); err != nil {
-		return fmt.Errorf("unable to update job scaling policies: %v", err)
+		return fmt.Errorf("unable to update job csi plugins: %v", err)
 	}
 
 	// Insert the job
@@ -3371,7 +3371,7 @@ func (s *StateStore) UpdateAllocsDesiredTransitions(msgType structs.MessageType,
 
 	// Handle each of the updated allocations
 	for id, transition := range allocs {
-		if err := s.nestedUpdateAllocDesiredTransition(txn, index, id, transition); err != nil {
+		if err := s.UpdateAllocDesiredTransitionTxn(txn, index, id, transition); err != nil {
 			return err
 		}
 	}
@@ -3392,7 +3392,7 @@ func (s *StateStore) UpdateAllocsDesiredTransitions(msgType structs.MessageType,
 
-// nestedUpdateAllocDesiredTransition is used to nest an update of an
-// allocations desired transition
-func (s *StateStore) nestedUpdateAllocDesiredTransition(
+// UpdateAllocDesiredTransitionTxn is used to nest an update of an
+// allocation's desired transition within an existing transaction
+func (s *StateStore) UpdateAllocDesiredTransitionTxn(
 	txn *txn, index uint64, allocID string, transition *structs.DesiredTransition) error {
diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go
index 50f9f640b7cb..55be2cbe886b 100644
--- a/nomad/structs/structs.go
+++ b/nomad/structs/structs.go
@@ -621,6 +621,11 @@ type JobDeregisterRequest struct {
 	// in busy clusters with a large evaluation backlog.
 	EvalPriority int
 
+	// NoShutdownDelay, if set to true, will override the group and
+	// task shutdown_delay configuration and set it to "0s" for any
+	// allocations stopped as a result of this Deregister call.
+	NoShutdownDelay bool
+
 	// Eval is the evaluation to create that's associated with job deregister
 	Eval *Evaluation
 
@@ -934,7 +939,8 @@ type AllocUpdateDesiredTransitionRequest struct {
 
 // AllocStopRequest is used to stop and reschedule a running Allocation.
 type AllocStopRequest struct {
-	AllocID string
+	AllocID         string
+	NoShutdownDelay bool
 
 	WriteRequest
 }
 
@@ -9119,6 +9125,11 @@ type DesiredTransition struct {
 	// This field is only used when operators want to force a placement even if
 	// a failed allocation is not eligible to be rescheduled
 	ForceReschedule *bool
+
+	// NoShutdownDelay is used to indicate that whatever transition is
+	// desired should be applied immediately, without waiting for
+	// shutdown delays.
+	NoShutdownDelay *bool
 }
 
 // Merge merges the two desired transitions, preferring the values from the
@@ -9135,6 +9146,10 @@ func (d *DesiredTransition) Merge(o *DesiredTransition) {
 	if o.ForceReschedule != nil {
 		d.ForceReschedule = o.ForceReschedule
 	}
+
+	if o.NoShutdownDelay != nil {
+		d.NoShutdownDelay = o.NoShutdownDelay
+	}
 }
 
 // ShouldMigrate returns whether the transition object dictates a migration.
@@ -9157,6 +9172,12 @@ func (d *DesiredTransition) ShouldForceReschedule() bool {
 	return d.ForceReschedule != nil && *d.ForceReschedule
 }
 
+// ShouldIgnoreShutdownDelay returns whether the transition object dictates an
+// immediate transition, skipping shutdown delays.
+func (d *DesiredTransition) ShouldIgnoreShutdownDelay() bool {
+	return d.NoShutdownDelay != nil && *d.NoShutdownDelay
+}
+
 const (
 	AllocDesiredStatusRun  = "run"  // Allocation should run
 	AllocDesiredStatusStop = "stop" // Allocation should stop
diff --git a/website/content/docs/commands/alloc/stop.mdx b/website/content/docs/commands/alloc/stop.mdx
index b82195172a44..a92eaf322271 100644
--- a/website/content/docs/commands/alloc/stop.mdx
+++ b/website/content/docs/commands/alloc/stop.mdx
@@ -42,6 +42,14 @@ allocation's namespace.
 
 - `-verbose`: Display verbose output.
 
+- `-no-shutdown-delay`: Ignore the group and task [`shutdown_delay`]
+  configuration so there is no delay between service deregistration
+  and task shutdown. Note that using this flag will result in failed
+  network connections to the allocation being stopped. Also note
+  that an allocation that is already waiting for the
+  `shutdown_delay` will not be interrupted by running `alloc stop`
+  again with this flag.
+
 ## Examples
 
 ```shell-session
@@ -58,3 +66,4 @@ $ nomad alloc stop -detach eb17e557
 ```
 
 [eval status]: /docs/commands/eval-status
+[`shutdown_delay`]: /docs/job-specification/group#shutdown_delay
diff --git a/website/content/docs/commands/job/stop.mdx b/website/content/docs/commands/job/stop.mdx
index 004520b978e2..ee8fdc17ce9a 100644
--- a/website/content/docs/commands/job/stop.mdx
+++ b/website/content/docs/commands/job/stop.mdx
@@ -55,6 +55,14 @@ When ACLs are enabled, this command requires a token with the `submit-job`,
   Stop a [multi-region] job in all its regions. By default, `job stop` will
   stop only a single region at a time. Ignored for single-region jobs.
 
+- `-no-shutdown-delay`: Ignore the group and task [`shutdown_delay`]
+  configuration so there is no delay between service deregistration
+  and task shutdown. Note that using this flag will result in failed
+  network connections to the allocation being stopped. Also note
+  that an allocation that is already waiting for the
+  `shutdown_delay` will not be interrupted by running `job stop`
+  again with this flag.
+
 ## Examples
 
 Stop the job with ID "job1":
@@ -75,3 +83,4 @@ $ nomad job stop -detach job1
 ```
 
 [eval status]: /docs/commands/eval-status
 [multi-region]: /docs/job-specification/multiregion
+[`shutdown_delay`]: /docs/job-specification/group#shutdown_delay
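
---

For reviewers who want to exercise the new option end to end, here is a minimal sketch of driving the same deregister path programmatically with the Go `api` package. The `NoShutdownDelay` field and the `DeregisterOpts` signature come from the `api/jobs.go` hunk above; the job name `example` and a locally reachable agent are assumptions for illustration only:

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Connect to the agent using the default configuration
	// (honors NOMAD_ADDR and related environment variables).
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Deregister the (assumed) job "example", skipping any group/task
	// shutdown_delay on the allocations being stopped. This is the same
	// path as `nomad job stop -no-shutdown-delay example` and maps to
	// the no_shutdown_delay=true query parameter added in this patch.
	opts := &api.DeregisterOptions{NoShutdownDelay: true}
	evalID, _, err := client.Jobs().DeregisterOpts("example", opts, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("deregister eval:", evalID)
}
```

The per-allocation path takes the same toggle as a query parameter rather than a typed option, which is why the `command/alloc_stop.go` hunk above passes `&api.QueryOptions{Params: map[string]string{"no_shutdown_delay": "true"}}` to `client.Allocations().Stop`.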