Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop already rescheduled but somehow running allocs #8886

Merged
merged 2 commits into from
Sep 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions scheduler/generic_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5501,3 +5501,77 @@ func TestServiceSched_Migrate_CanaryStatus(t *testing.T) {
require.Equal(t, updateDeployment, node2Allocs[3].DeploymentID)
}
}

// TestServiceSched_RunningWithNextAllocation asserts that if a running allocation has
// NextAllocation Set, the allocation is not ignored and will be stopped
func TestServiceSched_RunningWithNextAllocation(t *testing.T) {
h := NewHarness(t)

node1 := mock.Node()
require.NoError(t, h.State.UpsertNode(h.NextIndex(), node1))

totalCount := 2
job := mock.Job()
job.Version = 0
job.Stable = true
job.TaskGroups[0].Count = totalCount
job.TaskGroups[0].Update = nil
require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))

var allocs []*structs.Allocation
for i := 0; i < totalCount+1; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = node1.ID
alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
allocs = append(allocs, alloc)
}

// simulate a case where .NextAllocation is set but alloc is still running
allocs[2].PreviousAllocation = allocs[0].ID
allocs[0].NextAllocation = allocs[2].ID
require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))

// new update with new task group
job2 := job.Copy()
job2.Version = 1
job2.TaskGroups[0].Tasks[0].Config["command"] = "/bin/other"
require.NoError(t, h.State.UpsertJob(h.NextIndex(), job2))

// Create a mock evaluation
eval := &structs.Evaluation{
Namespace: structs.DefaultNamespace,
ID: uuid.Generate(),
Priority: 50,
TriggeredBy: structs.EvalTriggerJobRegister,
JobID: job.ID,
Status: structs.EvalStatusPending,
}
require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))

// Process the evaluation
err := h.Process(NewServiceScheduler, eval)
require.NoError(t, err)

// assert that all original allocations have been stopped
for _, alloc := range allocs {
updated, err := h.State.AllocByID(nil, alloc.ID)
require.NoError(t, err)
require.Equalf(t, structs.AllocDesiredStatusStop, updated.DesiredStatus, "alloc %v", alloc.ID)
}

// assert that the new job has proper allocations

jobAllocs, err := h.State.AllocsByJob(nil, job.Namespace, job.ID, true)
require.NoError(t, err)

require.Len(t, jobAllocs, 5)

allocsByVersion := map[uint64][]string{}
for _, alloc := range jobAllocs {
allocsByVersion[alloc.Job.Version] = append(allocsByVersion[alloc.Job.Version], alloc.ID)
}
require.Len(t, allocsByVersion[1], 2)
require.Len(t, allocsByVersion[0], 3)
}
6 changes: 4 additions & 2 deletions scheduler/reconcile_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,10 @@ func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID str
var eligibleNow, eligibleLater bool
var rescheduleTime time.Time

// Ignore allocs that have already been rescheduled
if alloc.NextAllocation != "" {
// Ignore failing allocs that have already been rescheduled
// only failed allocs should be rescheduled, but protect against a bug allowing rescheduling
// running allocs
if alloc.NextAllocation != "" && alloc.TerminalStatus() {
continue
}

Expand Down