
Commit

Merge pull request #6902 from hashicorp/b-update-tainted-evict-to-lost
Update evicted allocations to lost when the node is lost
drewbailey committed Jan 7, 2020
2 parents 93cab3f + 8d73cda commit 41686ce
Showing 3 changed files with 104 additions and 81 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -17,6 +17,7 @@ BUG FIXES:
 * cli: Fixed a bug where `nomad monitor -node-id` would cause a cli panic when no nodes were found. [[GH-6828](https://github.com/hashicorp/nomad/issues/6828)]
 * config: Fixed a bug where agent startup would fail if the `consul.timeout` configuration was set. [[GH-6907](https://github.com/hashicorp/nomad/issues/6907)]
 * consul/connect: Fixed a bug where Connect-enabled jobs failed to validate when service names used interpolation. [[GH-6855](https://github.com/hashicorp/nomad/issues/6855)]
+* scheduler: Fixed a bug that caused evicted allocs on a lost node to be stuck in running. [[GH-6902](https://github.com/hashicorp/nomad/issues/6902)]
 
 ## 0.10.2 (December 4, 2019)
 
177 changes: 99 additions & 78 deletions scheduler/generic_sched_test.go
@@ -2628,99 +2628,120 @@ func TestServiceSched_JobDeregister_Stopped(t *testing.T) {
 }
 
 func TestServiceSched_NodeDown(t *testing.T) {
-	h := NewHarness(t)
-
-	// Register a node
-	node := mock.Node()
-	node.Status = structs.NodeStatusDown
-	require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
-
-	// Generate a fake job with allocations and an update policy.
-	job := mock.Job()
-	require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
-
-	var allocs []*structs.Allocation
-	for i := 0; i < 10; i++ {
-		alloc := mock.Alloc()
-		alloc.Job = job
-		alloc.JobID = job.ID
-		alloc.NodeID = node.ID
-		alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
-		allocs = append(allocs, alloc)
+	cases := []struct {
+		desired    string
+		client     string
+		migrate    bool
+		reschedule bool
+		terminal   bool
+		lost       bool
+	}{
+		{
+			desired: structs.AllocDesiredStatusStop,
+			client:  structs.AllocClientStatusRunning,
+			lost:    true,
+		},
+		{
+			desired: structs.AllocDesiredStatusRun,
+			client:  structs.AllocClientStatusPending,
+			migrate: true,
+		},
+		{
+			desired: structs.AllocDesiredStatusRun,
+			client:  structs.AllocClientStatusRunning,
+			migrate: true,
+		},
+		{
+			desired:  structs.AllocDesiredStatusRun,
+			client:   structs.AllocClientStatusLost,
+			terminal: true,
+		},
+		{
+			desired:  structs.AllocDesiredStatusRun,
+			client:   structs.AllocClientStatusComplete,
+			terminal: true,
+		},
+		{
+			desired:    structs.AllocDesiredStatusRun,
+			client:     structs.AllocClientStatusFailed,
+			reschedule: true,
+		},
+		{
+			desired: structs.AllocDesiredStatusEvict,
+			client:  structs.AllocClientStatusRunning,
+			lost:    true,
+		},
 	}
 
-	// Cover each terminal case and ensure it doesn't change to lost
-	allocs[7].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[7].ClientStatus = structs.AllocClientStatusLost
-	allocs[8].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[8].ClientStatus = structs.AllocClientStatusFailed
-	allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[9].ClientStatus = structs.AllocClientStatusComplete
-
-	toBeRescheduled := map[string]bool{allocs[8].ID: true}
+	for i, tc := range cases {
+		t.Run(fmt.Sprintf(""), func(t *testing.T) {
+			h := NewHarness(t)
 
-	// Mark some allocs as running
-	for i := 0; i < 4; i++ {
-		out := allocs[i]
-		out.ClientStatus = structs.AllocClientStatusRunning
-	}
+			// Register a node
+			node := mock.Node()
+			node.Status = structs.NodeStatusDown
+			require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
 
-	// Mark appropriate allocs for migration
-	toBeMigrated := map[string]bool{}
-	for i := 0; i < 3; i++ {
-		out := allocs[i]
-		out.DesiredTransition.Migrate = helper.BoolToPtr(true)
-		toBeMigrated[out.ID] = true
-	}
+			// Generate a fake job with allocations and an update policy.
+			job := mock.Job()
+			require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
 
-	toBeLost := map[string]bool{}
-	for i := len(toBeMigrated); i < 7; i++ {
-		toBeLost[allocs[i].ID] = true
-	}
+			alloc := mock.Alloc()
+			alloc.Job = job
+			alloc.JobID = job.ID
+			alloc.NodeID = node.ID
+			alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
 
-	require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
+			alloc.DesiredStatus = tc.desired
+			alloc.ClientStatus = tc.client
 
-	// Create a mock evaluation to deal with drain
-	eval := &structs.Evaluation{
-		Namespace:   structs.DefaultNamespace,
-		ID:          uuid.Generate(),
-		Priority:    50,
-		TriggeredBy: structs.EvalTriggerNodeUpdate,
-		JobID:       job.ID,
-		NodeID:      node.ID,
-		Status:      structs.EvalStatusPending,
-	}
-	require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+			// Mark for migration if necessary
+			alloc.DesiredTransition.Migrate = helper.BoolToPtr(tc.migrate)
 
-	// Process the evaluation
-	err := h.Process(NewServiceScheduler, eval)
-	require.NoError(t, err)
+			allocs := []*structs.Allocation{alloc}
+			require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
 
-	// Ensure a single plan
-	require.Len(t, h.Plans, 1)
-	plan := h.Plans[0]
-
-	// Test the scheduler marked all non-terminal allocations as lost
-	require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
+			// Create a mock evaluation to deal with drain
+			eval := &structs.Evaluation{
+				Namespace:   structs.DefaultNamespace,
+				ID:          uuid.Generate(),
+				Priority:    50,
+				TriggeredBy: structs.EvalTriggerNodeUpdate,
+				JobID:       job.ID,
+				NodeID:      node.ID,
+				Status:      structs.EvalStatusPending,
+			}
+			require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
 
-	for _, out := range plan.NodeUpdate[node.ID] {
-		t.Run("alloc "+out.ID, func(t *testing.T) {
-			require.Equal(t, structs.AllocDesiredStatusStop, out.DesiredStatus)
+			// Process the evaluation
+			err := h.Process(NewServiceScheduler, eval)
+			require.NoError(t, err)
 
-			if toBeMigrated[out.ID] {
-				// there is no indicator on job itself that marks it as migrated
-				require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
-			} else if toBeLost[out.ID] {
-				require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
-			} else if toBeRescheduled[out.ID] {
-				require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
+			if tc.terminal {
+				// No plan for terminal state allocs
+				require.Len(t, h.Plans, 0)
 			} else {
-				require.Fail(t, "unexpected alloc update")
+				require.Len(t, h.Plans, 1)
+
+				plan := h.Plans[0]
+				out := plan.NodeUpdate[node.ID]
+				require.Len(t, out, 1)
+
+				outAlloc := out[0]
+				if tc.migrate {
+					require.NotEqual(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+				} else if tc.reschedule {
+					require.Equal(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
+				} else if tc.lost {
+					require.Equal(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+				} else {
+					require.Fail(t, "unexpected alloc update")
+				}
 			}
+
+			h.AssertEvalStatus(t, structs.EvalStatusComplete)
 		})
 	}
-
-	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
 
 func TestServiceSched_NodeUpdate(t *testing.T) {
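The rewritten test above replaces ten hand-wired allocations with a table-driven layout: each desired/client status pair is one case, and each case runs in its own subtest against a fresh harness, node, job, and single allocation. A minimal, self-contained sketch of that pattern (the markLost helper, statuses, and case names below are hypothetical stand-ins, not Nomad's scheduler API):

package sketch

import "testing"

// markLost is a stand-in for the behavior under test: a non-terminal alloc on a
// down node whose desired status is stop or evict should be reported as lost.
func markLost(desired, client string) string {
	if (desired == "stop" || desired == "evict") &&
		(client == "running" || client == "pending") {
		return "lost"
	}
	return client
}

func TestMarkLost(t *testing.T) {
	cases := []struct {
		name    string
		desired string
		client  string
		want    string
	}{
		{name: "stopped but still running becomes lost", desired: "stop", client: "running", want: "lost"},
		{name: "evicted but still running becomes lost", desired: "evict", client: "running", want: "lost"},
		{name: "already complete stays complete", desired: "run", client: "complete", want: "complete"},
	}

	for _, tc := range cases {
		tc := tc // capture the range variable for the subtest closure
		t.Run(tc.name, func(t *testing.T) {
			if got := markLost(tc.desired, tc.client); got != tc.want {
				t.Fatalf("markLost(%q, %q) = %q, want %q", tc.desired, tc.client, got, tc.want)
			}
		})
	}
}

Naming the cases also allows selective runs, e.g. go test -run 'TestMarkLost/evicted'.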
7 changes: 4 additions & 3 deletions scheduler/util.go
@@ -809,9 +809,10 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc
 			continue
 		}
 
-		// If the scheduler has marked it as stop already but the alloc wasn't
-		// terminal on the client change the status to lost.
-		if alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
+		// If the scheduler has marked it as stop or evict already but the alloc
+		// wasn't terminal on the client change the status to lost.
+		if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
+			alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
 			(alloc.ClientStatus == structs.AllocClientStatusRunning ||
 				alloc.ClientStatus == structs.AllocClientStatusPending) {
 			plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost)
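Read on its own, the behavioral change in this hunk is narrow: the predicate that rewrites a stopped-but-still-running allocation on a lost node to lost now also covers allocations the scheduler has marked for eviction. A short before/after sketch of that condition, using hypothetical string statuses rather than Nomad's structs constants:

package sketch

// Hypothetical simplified statuses; the real code compares the
// structs.AllocDesiredStatus* and structs.AllocClientStatus* constants and,
// when the condition holds, calls
// plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost).

// Before: allocations already marked evict but still running on a lost node
// were skipped, so they stayed stuck in running.
func shouldMarkLostBefore(desired, client string) bool {
	return desired == "stop" &&
		(client == "running" || client == "pending")
}

// After: "evict" is treated the same as "stop".
func shouldMarkLostAfter(desired, client string) bool {
	return (desired == "stop" || desired == "evict") &&
		(client == "running" || client == "pending")
}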
