scheduler: system scheduler should reconcile overlapping live allocs
The system scheduler uses a separate code path for reconciliation. During the
investigation into the "plan for node rejected" bug that was fixed in #16401,
it was discovered that this code path doesn't maintain the invariant that no more
than one allocation per system job task group (or `count` allocations for sysbatch
jobs) should be left running on a given client. While this condition should be
impossible to encounter in practice, the scheduler should reconcile it if it does occur.

Add a new `ensureSingleSystemAlloc` function that enforces the invariant that
there can be only a single desired-running allocation for a given system job on
a given node.
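
The scheduler files that add `ensureSingleSystemAlloc` are not expanded in the diff
below, so the following is only a minimal, self-contained sketch of the invariant the
commit message describes: of the desired-running allocations a node holds for one
system job task group, keep one and mark the rest for stopping. The `allocStub` type,
the `reconcileNodeGroup` helper name, and the "prefer the newest job version" tie-break
are illustrative assumptions, not the actual Nomad implementation.

```go
// Illustrative sketch only -- not the Nomad implementation. It models the
// invariant from the commit message: at most one desired-running allocation
// per system job task group on a node; overlapping (older) allocations stop.
package main

import (
	"fmt"
	"sort"
)

// allocStub is a simplified stand-in for *structs.Allocation.
type allocStub struct {
	ID         string
	TaskGroup  string
	JobVersion uint64
}

// reconcileNodeGroup keeps one allocation per task group and returns the
// overlapping ones that should be stopped. (Hypothetical helper; the real
// work is done by ensureSingleSystemAlloc in the unshown scheduler change.)
func reconcileNodeGroup(running []*allocStub) (keep, stop []*allocStub) {
	byGroup := map[string][]*allocStub{}
	for _, a := range running {
		byGroup[a.TaskGroup] = append(byGroup[a.TaskGroup], a)
	}
	for _, group := range byGroup {
		// Assumed tie-break: keep the allocation from the newest job version.
		sort.Slice(group, func(i, j int) bool {
			return group[i].JobVersion > group[j].JobVersion
		})
		keep = append(keep, group[0])
		stop = append(stop, group[1:]...)
	}
	return keep, stop
}

func main() {
	// Mirrors the new test below: two overlapping allocs for task group "web".
	old := &allocStub{ID: "alloc-old", TaskGroup: "web", JobVersion: 0}
	cur := &allocStub{ID: "alloc-new", TaskGroup: "web", JobVersion: 1}

	keep, stop := reconcileNodeGroup([]*allocStub{old, cur})
	fmt.Printf("keep=%s stop=%s\n", keep[0].ID, stop[0].ID) // keep=alloc-new stop=alloc-old
}
```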
tgross committed Mar 13, 2023
1 parent 5f37b2f commit 8e7a610
Showing 5 changed files with 350 additions and 67 deletions.
3 changes: 3 additions & 0 deletions .changelog/16097.txt
@@ -0,0 +1,3 @@
```release-note:bug
scheduler: Fixed a bug where overlapping system allocations would not be stopped
```
7 changes: 4 additions & 3 deletions nomad/mock/job.go
@@ -437,9 +437,10 @@ func SystemJob() *structs.Job {
 		Meta: map[string]string{
 			"owner": "armon",
 		},
-		Status:      structs.JobStatusPending,
-		CreateIndex: 42,
-		ModifyIndex: 99,
+		Status:         structs.JobStatusPending,
+		CreateIndex:    42,
+		ModifyIndex:    99,
+		JobModifyIndex: 99,
 	}
 	job.Canonicalize()
 	return job
69 changes: 67 additions & 2 deletions scheduler/scheduler_system_test.go
@@ -8,14 +8,16 @@ import (
 	"time"

 	memdb "github.com/hashicorp/go-memdb"
+	"github.com/shoenig/test"
+	"github.com/shoenig/test/must"
+	"github.com/stretchr/testify/require"

 	"github.com/hashicorp/nomad/ci"
 	"github.com/hashicorp/nomad/helper"
 	"github.com/hashicorp/nomad/helper/pointer"
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/mock"
 	"github.com/hashicorp/nomad/nomad/structs"
-	"github.com/shoenig/test/must"
-	"github.com/stretchr/testify/require"
 )

func TestSystemSched_JobRegister(t *testing.T) {
@@ -3084,3 +3086,66 @@ func TestSystemSched_CSITopology(t *testing.T) {
must.Eq(t, structs.EvalStatusComplete, h.Evals[0].Status)

}

func TestSystemSched_OverlappingAllocations(t *testing.T) {
ci.Parallel(t)
h := NewHarness(t)

node := mock.Node()
node.Name = "good-node-1"
must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))

// Create 2 allocs for 2 versions of the same system job on the same node.
// This should not happen but has been observed, so the scheduler must "fix"
// this by stopping the old version.
oldJob := mock.SystemJob()
oldJob.ID = "my-job"
must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), oldJob))

job := oldJob.Copy()
job.Version++
must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), job))

allocOld := mock.Alloc()
allocOld.Job = oldJob
allocOld.JobID = oldJob.ID
allocOld.NodeID = node.ID
allocOld.Name = "my-job.web[0]"

allocCurrent := allocOld.Copy()
allocCurrent.ID = uuid.Generate()
allocCurrent.Job = job

must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(),
[]*structs.Allocation{
allocOld, allocCurrent}))

// Create a mock evaluation to trigger scheduling for the job registration
eval := &structs.Evaluation{
Namespace: structs.DefaultNamespace,
ID: uuid.Generate(),
Priority: job.Priority,
TriggeredBy: structs.EvalTriggerJobRegister,
JobID: job.ID,
Status: structs.EvalStatusPending,
}
lastIndex := h.NextIndex()
must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup,
lastIndex, []*structs.Evaluation{eval}))

// Process the evaluation
must.NoError(t, h.Process(NewSystemScheduler, eval))

// Ensure a single plan
must.Len(t, 1, h.Plans)
plan := h.Plans[0]
must.Nil(t, plan.Annotations, must.Sprint("expected no annotations"))

test.Len(t, 1, plan.NodeUpdate[node.ID], test.Sprint("expected 1 evict/stop"))
if len(plan.NodeUpdate[node.ID]) > 0 {
test.Eq(t, allocOld.ID, plan.NodeUpdate[node.ID][0].ID,
test.Sprintf("expected alloc=%s to be evicted/stopped", allocOld.ID))
}

test.Len(t, 0, plan.NodeAllocation[node.ID], test.Sprint("expected no updates"))
}