From 3716a67b307c8f7e413d197542de4a5a286cbb81 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Tue, 3 Dec 2019 06:14:49 -0800
Subject: [PATCH] scheduler: fix job update placement on prev node penalized
 (#6781)

Fixes #5856

When the scheduler looks for a placement for an allocation that's
replacing another allocation, it's supposed to penalize the previous
node only if the allocation had been rescheduled or failed. But we're
currently always penalizing the node, which leads to unnecessary
migrations on job update.

This commit leaves in place the existing behavior where, if the
previous alloc was itself rescheduled, its previous nodes are also
penalized. This is conservative but the right behavior, especially on
larger clusters where a group of hosts might be having correlated
trouble (like an AZ failure).
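Not part of the patch, but a minimal, self-contained sketch of the
penalty-node selection after this change. The alloc type, status
constant, and penaltyNodes helper below are simplified stand-ins for
Nomad's structs package and the getSelectOptions logic in the diff,
not the real API:

    package main

    import "fmt"

    const clientStatusFailed = "failed"

    type rescheduleEvent struct{ PrevNodeID string }

    type alloc struct {
        NodeID           string
        ClientStatus     string
        RescheduleEvents []rescheduleEvent
    }

    // penaltyNodes mirrors the fixed behavior: only a failed alloc
    // penalizes its own node, while nodes the alloc was previously
    // rescheduled away from are always penalized.
    func penaltyNodes(prev *alloc) map[string]struct{} {
        nodes := make(map[string]struct{})
        if prev == nil {
            return nodes
        }
        if prev.ClientStatus == clientStatusFailed {
            nodes[prev.NodeID] = struct{}{}
        }
        for _, ev := range prev.RescheduleEvents {
            nodes[ev.PrevNodeID] = struct{}{}
        }
        return nodes
    }

    func main() {
        // A healthy alloc replaced by a job update: no penalty, so the
        // replacement is free to stay on the same node (no migration).
        updated := &alloc{NodeID: "node-1", ClientStatus: "running"}
        fmt.Println(penaltyNodes(updated)) // map[]

        // A failed alloc that was already rescheduled once: its current
        // node and its earlier node are both penalized.
        failed := &alloc{
            NodeID:           "node-3",
            ClientStatus:     clientStatusFailed,
            RescheduleEvents: []rescheduleEvent{{PrevNodeID: "node-2"}},
        }
        fmt.Println(penaltyNodes(failed)) // map[node-2:{} node-3:{}]
    }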

Co-Authored-By: Michael Schurter
---
 scheduler/generic_sched.go      |   7 +-
 scheduler/generic_sched_test.go | 130 ++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go
index 3f27783dc57c..d9e5c7cc31c9 100644
--- a/scheduler/generic_sched.go
+++ b/scheduler/generic_sched.go
@@ -570,7 +570,12 @@ func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs
 	selectOptions := &SelectOptions{}
 	if prevAllocation != nil {
 		penaltyNodes := make(map[string]struct{})
-		penaltyNodes[prevAllocation.NodeID] = struct{}{}
+
+		// If the alloc failed, penalize the node it failed on to
+		// encourage rescheduling on a new node.
+		if prevAllocation.ClientStatus == structs.AllocClientStatusFailed {
+			penaltyNodes[prevAllocation.NodeID] = struct{}{}
+		}
 		if prevAllocation.RescheduleTracker != nil {
 			for _, reschedEvent := range prevAllocation.RescheduleTracker.Events {
 				penaltyNodes[reschedEvent.PrevNodeID] = struct{}{}

diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go
index 18ec4160670d..e689cfddc84a 100644
--- a/scheduler/generic_sched_test.go
+++ b/scheduler/generic_sched_test.go
@@ -2359,6 +2359,136 @@ func TestServiceSched_JobModify_DistinctProperty(t *testing.T) {
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
 
+// TestServiceSched_JobModify_NodeReschedulePenalty ensures that
+// a failing allocation gets rescheduled with a penalty on the old
+// node, but that an updated job doesn't apply the penalty.
+func TestServiceSched_JobModify_NodeReschedulePenalty(t *testing.T) {
+	h := NewHarness(t)
+	require := require.New(t)
+
+	// Create some nodes
+	var nodes []*structs.Node
+	for i := 0; i < 10; i++ {
+		node := mock.Node()
+		nodes = append(nodes, node)
+		require.NoError(h.State.UpsertNode(h.NextIndex(), node))
+	}
+
+	// Generate a fake job with allocations and a reschedule policy.
+	job := mock.Job()
+	job.TaskGroups[0].Count = 2
+	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
+		Attempts:      1,
+		Interval:      15 * time.Minute,
+		Delay:         5 * time.Second,
+		MaxDelay:      1 * time.Minute,
+		DelayFunction: "constant",
+	}
+	tgName := job.TaskGroups[0].Name
+	now := time.Now()
+
+	require.NoError(h.State.UpsertJob(h.NextIndex(), job))
+
+	var allocs []*structs.Allocation
+	for i := 0; i < 2; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = nodes[i].ID
+		alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
+		allocs = append(allocs, alloc)
+	}
+	// Mark one of the allocations as failed
+	allocs[1].ClientStatus = structs.AllocClientStatusFailed
+	allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "dead",
+		StartedAt:  now.Add(-1 * time.Hour),
+		FinishedAt: now.Add(-10 * time.Second)}}
+	failedAlloc := allocs[1]
+	failedAllocID := failedAlloc.ID
+	successAllocID := allocs[0].ID
+
+	require.NoError(h.State.UpsertAllocs(h.NextIndex(), allocs))
+
+	// Create and process a mock evaluation
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+	require.NoError(h.Process(NewServiceScheduler, eval))
+
+	// Ensure we have one plan
+	require.Equal(1, len(h.Plans))
+
+	// Lookup the allocations by JobID
+	ws := memdb.NewWatchSet()
+	out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
+	require.NoError(err)
+
+	// Verify that one new allocation got created with its restart tracker info
+	require.Equal(3, len(out))
+	var newAlloc *structs.Allocation
+	for _, alloc := range out {
+		if alloc.ID != successAllocID && alloc.ID != failedAllocID {
+			newAlloc = alloc
+		}
+	}
+	require.Equal(failedAllocID, newAlloc.PreviousAllocation)
+	require.Equal(1, len(newAlloc.RescheduleTracker.Events))
+	require.Equal(failedAllocID, newAlloc.RescheduleTracker.Events[0].PrevAllocID)
+
+	// Verify that the node-reschedule penalty was applied to the new alloc
+	for _, scoreMeta := range newAlloc.Metrics.ScoreMetaData {
+		if scoreMeta.NodeID == failedAlloc.NodeID {
+			require.Equal(-1.0, scoreMeta.Scores["node-reschedule-penalty"],
+				"eval to replace failed alloc missing node-reschedule-penalty: %v",
+				scoreMeta.Scores,
+			)
+		}
+	}
+
+	// Update the job such that it cannot be done in-place
+	job2 := job.Copy()
+	job2.TaskGroups[0].Tasks[0].Config["command"] = "/bin/other"
+	require.NoError(h.State.UpsertJob(h.NextIndex(), job2))
+
+	// Create and process a mock evaluation
+	eval = &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.NoError(h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+	require.NoError(h.Process(NewServiceScheduler, eval))
+
+	// Lookup the new allocations by JobID
+	out, err = h.State.AllocsByJob(ws, job.Namespace, job2.ID, false)
+	require.NoError(err)
+	out, _ = structs.FilterTerminalAllocs(out)
+	require.Equal(2, len(out))
+
+	// No new allocs have node-reschedule-penalty
+	for _, alloc := range out {
+		require.Nil(alloc.RescheduleTracker)
+		require.NotNil(alloc.Metrics)
+		for _, scoreMeta := range alloc.Metrics.ScoreMetaData {
+			if scoreMeta.NodeID != failedAlloc.NodeID {
+				require.Equal(0.0,
+					scoreMeta.Scores["node-reschedule-penalty"],
+					"eval for updated job should not include node-reschedule-penalty: %v",
+					scoreMeta.Scores,
+				)
+			}
+		}
+	}
+}
+
 func TestServiceSched_JobDeregister_Purged(t *testing.T) {
 	h := NewHarness(t)