drain: use client status to determine drain is complete #14348

Merged: 1 commit, Apr 13, 2023
3 changes: 3 additions & 0 deletions .changelog/14348.txt
@@ -0,0 +1,3 @@
```release-note:bug
drain: Fixed a bug where drains would complete based on the server status and not the client status of an allocation
```
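The one-line release note hides the crux: an allocation has two notions of "terminal". The sketch below uses simplified stand-ins for Nomad's `structs.Allocation` methods (the status strings and method shapes follow upstream, but this is an illustration, not the actual source) to show why gating a drain on the server-side check completes too early:

```go
package main

import "fmt"

// Allocation is a simplified stand-in for Nomad's structs.Allocation.
type Allocation struct {
	DesiredStatus string // the server's intent: "run", "stop", "evict"
	ClientStatus  string // the client's report: "pending", "running", "complete", "failed", "lost"
}

// ClientTerminalStatus reports whether the client says the alloc is finished.
func (a *Allocation) ClientTerminalStatus() bool {
	switch a.ClientStatus {
	case "complete", "failed", "lost":
		return true
	}
	return false
}

// TerminalStatus is also true as soon as the server *wants* the alloc
// stopped, even while the client is still killing the task. Gating the
// drain on this check is the bug this PR fixes.
func (a *Allocation) TerminalStatus() bool {
	switch a.DesiredStatus {
	case "stop", "evict":
		return true
	}
	return a.ClientTerminalStatus()
}

func main() {
	// A drain told the server to stop the alloc, but on the client the task
	// is still inside its kill_timeout window:
	alloc := &Allocation{DesiredStatus: "stop", ClientStatus: "running"}
	fmt.Println(alloc.TerminalStatus())       // true: the server's view says done
	fmt.Println(alloc.ClientTerminalStatus()) // false: the task is still running
}
```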
43 changes: 43 additions & 0 deletions e2e/nodedrain/input/drain_killtimeout.nomad
@@ -0,0 +1,43 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: MPL-2.0

job "drain_killtimeout" {

constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}

group "group" {

task "task" {
driver = "docker"

kill_timeout = "30s" # matches the agent's max_kill_timeout

config {
image = "busybox:1"
command = "/bin/sh"
args = ["local/script.sh"]
}

# The task traps SIGINT (signal 2) and then stalls for 60s, longer than the
# 30s kill_timeout, so we can assert that the drain is forced to wait until
# the client status has been updated.
template {
data = <<EOF
#!/bin/sh
trap 'sleep 60' 2
sleep 600
EOF

destination = "local/script.sh"
change_mode = "noop"
}

resources {
cpu = 256
memory = 128
}
}
}
}
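To reproduce by hand what the test below automates, the job can be registered and its node drained with the standard CLI (assumes a Linux client running Docker; the node ID is a placeholder):

```sh
nomad job run e2e/nodedrain/input/drain_killtimeout.nomad
nomad node drain -enable -yes -detach <node-id>
```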
57 changes: 57 additions & 0 deletions e2e/nodedrain/node_drain_test.go
@@ -30,6 +30,7 @@ func TestNodeDrain(t *testing.T) {
t.Run("IgnoreSystem", testIgnoreSystem)
t.Run("EphemeralMigrate", testEphemeralMigrate)
t.Run("KeepIneligible", testKeepIneligible)
t.Run("KillTimeout", testKillTimeout)
t.Run("DeadlineFlag", testDeadlineFlag)
t.Run("ForceFlag", testForceFlag)
}
@@ -184,6 +185,62 @@ func testKeepIneligible(t *testing.T) {
}
}

// testKillTimeout tests that a drain blocks until the allocation's client
// status has been updated, not just its server status.
func testKillTimeout(t *testing.T) {

nomadClient := e2eutil.NomadClient(t)
t.Cleanup(cleanupDrainState(t))

jobID := "test-node-drain-" + uuid.Short()

must.NoError(t, e2eutil.Register(jobID, "./input/drain_killtimeout.nomad"))
allocs := waitForRunningAllocs(t, nomadClient, jobID, 1)

t.Cleanup(cleanupJobState(t, jobID))
oldAllocID := allocs[0].ID
oldNodeID := allocs[0].NodeID

t.Logf("draining node %v", oldNodeID)
out, err := e2eutil.Command(
"nomad", "node", "drain",
"-enable", "-yes", "-detach", oldNodeID)
must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID, err, out))

// the task traps SIGINT and stalls, so the drain will hang for up to the
// 30s kill_timeout. Assert that the drain does not complete before that
// window expires; we can't guarantee this assertion starts with exactly
// 30s left on the clock, so keep the deadline just under the window to
// avoid test flakiness.
t.Log("waiting for kill_timeout to expire")
must.Wait(t, wait.ContinualSuccess(
wait.BoolFunc(func() bool {
node, _, err := nomadClient.Nodes().Info(oldNodeID, nil)
must.NoError(t, err)
return node.DrainStrategy != nil
}),
wait.Timeout(time.Second*25),
wait.Gap(500*time.Millisecond),
))

// the allocation will then get force-killed, so wait for the alloc to be
// migrated and for the node's drain to be complete
t.Log("waiting for migration to complete")
newAllocs := waitForAllocDrainComplete(t, nomadClient, jobID,
oldAllocID, oldNodeID, time.Second*60)
must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))

must.Wait(t, wait.InitialSuccess(
wait.BoolFunc(func() bool {
node, _, err := nomadClient.Nodes().Info(oldNodeID, nil)
must.NoError(t, err)
return node.DrainStrategy == nil
}),
wait.Timeout(time.Second*5),
wait.Gap(500*time.Millisecond),
))
}
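To run only this new case, the usual Go subtest filter should work against an already-provisioned e2e cluster (the NOMAD_ADDR value is an assumption; provisioning the cluster is out of scope here):

```sh
# assumes an e2e cluster is up and NOMAD_ADDR points at it
NOMAD_ADDR=http://<cluster-addr>:4646 \
    go test -v ./e2e/nodedrain -run 'TestNodeDrain/KillTimeout'
```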

// testDeadlineFlag tests the enforcement of the node drain deadline so that
// allocations are moved even if max_parallel says we should be waiting
func testDeadlineFlag(t *testing.T) {
2 changes: 1 addition & 1 deletion nomad/drainer/draining_node.go
@@ -76,7 +76,7 @@ func (n *drainingNode) IsDone() (bool, error) {
}

// If there is a non-terminal we aren't done
-if !alloc.TerminalStatus() {
+if !alloc.ClientTerminalStatus() {
return false, nil
}
}
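For context, this one-line change sits inside the drainer's completion check: `IsDone` walks the allocations still recorded on the draining node and only reports the node done once every non-system alloc is terminal. Roughly (a simplified sketch of the surrounding function, with locking, validation, and error wording elided; not the verbatim source):

```go
func (n *drainingNode) IsDone() (bool, error) {
	// allocations currently placed on this node
	allocs, err := n.state.AllocsByNode(nil, n.node.ID)
	if err != nil {
		return false, err
	}

	for _, alloc := range allocs {
		// system jobs are only stopped at the very end of a drain
		if alloc.Job.Type == structs.JobTypeSystem {
			continue
		}

		// If there is a non-terminal alloc, now judged by the client's
		// report rather than the server's, we aren't done.
		if !alloc.ClientTerminalStatus() {
			return false, nil
		}
	}
	return true, nil
}
```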
10 changes: 5 additions & 5 deletions nomad/drainer/watch_jobs.go
@@ -367,10 +367,10 @@ func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGrou
}

// Check if the alloc should be considered migrated. A migrated
-// allocation is one that is terminal, is on a draining
-// allocation, and has only happened since our last handled index to
+// allocation is one that is terminal on the client, is on a draining
+// node, and has been updated since our last handled index to
// avoid emitting many duplicate migrate events.
-if alloc.TerminalStatus() &&
+if alloc.ClientTerminalStatus() &&
onDrainingNode &&
alloc.ModifyIndex > lastHandledIndex {
result.migrated = append(result.migrated, alloc)
Expand All @@ -385,8 +385,8 @@ func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGrou

// An alloc can't be considered for migration if:
// - It isn't on a draining node
-// - It is already terminal
-if !onDrainingNode || alloc.TerminalStatus() {
+// - It is already terminal on the client
+if !onDrainingNode || alloc.ClientTerminalStatus() {
continue
}

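Taken together, the watcher's "migrated" accounting now keys off the client's view. A small self-contained demo of the predicate's shape, including the ModifyIndex guard against duplicate migrate events (the names here are hypothetical stand-ins, not the upstream function):

```go
package main

import "fmt"

// alloc holds just the fields the drainer's check reads (illustrative only).
type alloc struct {
	clientTerminal bool   // ClientTerminalStatus() after this PR
	modifyIndex    uint64 // bumped on every update to the alloc
}

// migrated mirrors the shape of the check in handleTaskGroup: the client
// (not merely the server's desired status) reports the alloc finished, it
// sits on a draining node, and it changed since the last handled index so
// the same alloc isn't reported as migrated twice.
func migrated(a alloc, onDrainingNode bool, lastHandledIndex uint64) bool {
	return a.clientTerminal && onDrainingNode && a.modifyIndex > lastHandledIndex
}

func main() {
	// server marked it stopped, but the client is still killing the task
	fmt.Println(migrated(alloc{clientTerminal: false, modifyIndex: 10}, true, 5)) // false
	// client has since reported it terminal: counts as migrated
	fmt.Println(migrated(alloc{clientTerminal: true, modifyIndex: 10}, true, 5)) // true
	// already handled at this index: skipped to avoid a duplicate event
	fmt.Println(migrated(alloc{clientTerminal: true, modifyIndex: 5}, true, 5)) // false
}
```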