Skip to content

Commit

Permalink
Merge pull request #1715 from hashicorp/b-dead-system-nodes
Browse files Browse the repository at this point in the history
Fix bug where dead nodes weren't properly handled by system scheduler
  • Loading branch information
dadgar committed Sep 19, 2016
2 parents 887c1b1 + 926f5ee commit 60e114a
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 24 deletions.
87 changes: 87 additions & 0 deletions scheduler/system_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1228,3 +1228,90 @@ func TestSystemSched_ChainedAlloc(t *testing.T) {
t.Fatalf("expected: %v, actual: %v", 2, len(newAllocs))
}
}

func TestSystemSched_PlanWithDrainedNode(t *testing.T) {
h := NewHarness(t)

// Register two nodes with two different classes
node := mock.Node()
node.NodeClass = "green"
node.Drain = true
node.ComputeClass()
noErr(t, h.State.UpsertNode(h.NextIndex(), node))

node2 := mock.Node()
node2.NodeClass = "blue"
node2.ComputeClass()
noErr(t, h.State.UpsertNode(h.NextIndex(), node2))

// Create a Job with two task groups, each constrianed on node class
job := mock.SystemJob()
tg1 := job.TaskGroups[0]
tg1.Constraints = append(tg1.Constraints,
&structs.Constraint{
LTarget: "${node.class}",
RTarget: "green",
Operand: "==",
})

tg2 := tg1.Copy()
tg2.Name = "web2"
tg2.Constraints[0].RTarget = "blue"
job.TaskGroups = append(job.TaskGroups, tg2)
noErr(t, h.State.UpsertJob(h.NextIndex(), job))

// Create an allocation on each node
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = node.ID
alloc.Name = "my-job.web[0]"
alloc.TaskGroup = "web"

alloc2 := mock.Alloc()
alloc2.Job = job
alloc2.JobID = job.ID
alloc2.NodeID = node2.ID
alloc2.Name = "my-job.web2[0]"
alloc2.TaskGroup = "web2"
noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc, alloc2}))

// Create a mock evaluation to deal with drain
eval := &structs.Evaluation{
ID: structs.GenerateUUID(),
Priority: 50,
TriggeredBy: structs.EvalTriggerNodeUpdate,
JobID: job.ID,
NodeID: node.ID,
}

// Process the evaluation
err := h.Process(NewSystemScheduler, eval)
if err != nil {
t.Fatalf("err: %v", err)
}

// Ensure a single plan
if len(h.Plans) != 1 {
t.Fatalf("bad: %#v", h.Plans)
}
plan := h.Plans[0]

// Ensure the plan evicted the alloc on the failed node
planned := plan.NodeUpdate[node.ID]
if len(planned) != 1 {
t.Fatalf("bad: %#v", plan)
}

// Ensure the plan didn't place
if len(plan.NodeAllocation) != 0 {
t.Fatalf("bad: %#v", plan)
}

// Ensure the allocations is stopped
if planned[0].DesiredStatus != structs.AllocDesiredStatusStop {
t.Fatalf("bad: %#v", planned[0])
}

h.AssertEvalStatus(t, structs.EvalStatusComplete)
}
23 changes: 14 additions & 9 deletions scheduler/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,15 +190,20 @@ func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[
for nodeID, allocs := range nodeAllocs {
diff := diffAllocs(job, taintedNodes, required, allocs, terminalAllocs)

// Mark the alloc as being for a specific node.
for i := range diff.place {
alloc := &diff.place[i]

// If the new allocation isn't annotated with a previous allocation
// or if the previous allocation isn't from the same node then we
// annotate the allocTuple with a new Allocation
if alloc.Alloc == nil || alloc.Alloc.NodeID != nodeID {
alloc.Alloc = &structs.Allocation{NodeID: nodeID}
// If the node is tainted there should be no placements made
if _, ok := taintedNodes[nodeID]; ok {
diff.place = nil
} else {
// Mark the alloc as being for a specific node.
for i := range diff.place {
alloc := &diff.place[i]

// If the new allocation isn't annotated with a previous allocation
// or if the previous allocation isn't from the same node then we
// annotate the allocTuple with a new Allocation
if alloc.Alloc == nil || alloc.Alloc.NodeID != nodeID {
alloc.Alloc = &structs.Allocation{NodeID: nodeID}
}
}
}

Expand Down
30 changes: 15 additions & 15 deletions scheduler/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,26 +175,26 @@ func TestDiffAllocs(t *testing.T) {
func TestDiffSystemAllocs(t *testing.T) {
job := mock.SystemJob()

// Create three alive nodes.
nodes := []*structs.Node{{ID: "foo"}, {ID: "bar"}, {ID: "baz"},
{ID: "pipe"}}

// The "old" job has a previous modify index
oldJob := new(structs.Job)
*oldJob = *job
oldJob.JobModifyIndex -= 1

drainNode := mock.Node()
drainNode.Drain = true

deadNode := mock.Node()
deadNode.Status = structs.NodeStatusDown

tainted := map[string]*structs.Node{
"dead": deadNode,
"drainNode": drainNode,
deadNode.ID: deadNode,
drainNode.ID: drainNode,
}

// Create three alive nodes.
nodes := []*structs.Node{{ID: "foo"}, {ID: "bar"}, {ID: "baz"},
{ID: "pipe"}, {ID: drainNode.ID}, {ID: deadNode.ID}}

// The "old" job has a previous modify index
oldJob := new(structs.Job)
*oldJob = *job
oldJob.JobModifyIndex -= 1

allocs := []*structs.Allocation{
// Update allocation on baz
&structs.Allocation{
Expand All @@ -215,14 +215,14 @@ func TestDiffSystemAllocs(t *testing.T) {
// Stop allocation on draining node.
&structs.Allocation{
ID: structs.GenerateUUID(),
NodeID: "drainNode",
NodeID: drainNode.ID,
Name: "my-job.web[0]",
Job: oldJob,
},
// Mark as lost on a dead node
&structs.Allocation{
ID: structs.GenerateUUID(),
NodeID: "dead",
NodeID: deadNode.ID,
Name: "my-job.web[0]",
Job: oldJob,
},
Expand Down Expand Up @@ -272,8 +272,8 @@ func TestDiffSystemAllocs(t *testing.T) {
}

// We should place 1
if len(place) != 2 {
t.Fatalf("bad: %#v", place)
if l := len(place); l != 2 {
t.Fatalf("bad: %#v", l)
}

// Ensure that the allocations which are replacements of terminal allocs are
Expand Down

0 comments on commit 60e114a

Please sign in to comment.