Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug where dead nodes weren't properly handled by system scheduler #1715

Merged
merged 1 commit into from
Sep 19, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions scheduler/system_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1228,3 +1228,90 @@ func TestSystemSched_ChainedAlloc(t *testing.T) {
t.Fatalf("expected: %v, actual: %v", 2, len(newAllocs))
}
}

// TestSystemSched_PlanWithDrainedNode verifies that a node-update evaluation
// for a draining node evicts the allocation on that node, leaves allocations
// on healthy nodes untouched, and makes no new placements.
func TestSystemSched_PlanWithDrainedNode(t *testing.T) {
	h := NewHarness(t)

	// Register two nodes with two different classes; the first is draining.
	node := mock.Node()
	node.NodeClass = "green"
	node.Drain = true
	noErr(t, node.ComputeClass())
	noErr(t, h.State.UpsertNode(h.NextIndex(), node))

	node2 := mock.Node()
	node2.NodeClass = "blue"
	noErr(t, node2.ComputeClass())
	noErr(t, h.State.UpsertNode(h.NextIndex(), node2))

	// Create a system job with two task groups, each constrained on node
	// class so each group can only be placed on one of the two nodes.
	job := mock.SystemJob()
	tg1 := job.TaskGroups[0]
	tg1.Constraints = append(tg1.Constraints,
		&structs.Constraint{
			LTarget: "${node.class}",
			RTarget: "green",
			Operand: "==",
		})

	tg2 := tg1.Copy()
	tg2.Name = "web2"
	tg2.Constraints[0].RTarget = "blue"
	job.TaskGroups = append(job.TaskGroups, tg2)
	noErr(t, h.State.UpsertJob(h.NextIndex(), job))

	// Create an allocation on each node.
	alloc := mock.Alloc()
	alloc.Job = job
	alloc.JobID = job.ID
	alloc.NodeID = node.ID
	alloc.Name = "my-job.web[0]"
	alloc.TaskGroup = "web"

	alloc2 := mock.Alloc()
	alloc2.Job = job
	alloc2.JobID = job.ID
	alloc2.NodeID = node2.ID
	alloc2.Name = "my-job.web2[0]"
	alloc2.TaskGroup = "web2"
	noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc, alloc2}))

	// Create a mock evaluation to deal with the drain.
	eval := &structs.Evaluation{
		ID:          structs.GenerateUUID(),
		Priority:    50,
		TriggeredBy: structs.EvalTriggerNodeUpdate,
		JobID:       job.ID,
		NodeID:      node.ID,
	}

	// Process the evaluation.
	err := h.Process(NewSystemScheduler, eval)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Ensure a single plan was submitted.
	if len(h.Plans) != 1 {
		t.Fatalf("bad: %#v", h.Plans)
	}
	plan := h.Plans[0]

	// Ensure the plan evicted the alloc on the draining node.
	planned := plan.NodeUpdate[node.ID]
	if len(planned) != 1 {
		t.Fatalf("bad: %#v", plan)
	}

	// Ensure the plan made no new placements.
	if len(plan.NodeAllocation) != 0 {
		t.Fatalf("bad: %#v", plan)
	}

	// Ensure the evicted allocation is marked stopped.
	if planned[0].DesiredStatus != structs.AllocDesiredStatusStop {
		t.Fatalf("bad: %#v", planned[0])
	}

	h.AssertEvalStatus(t, structs.EvalStatusComplete)
}
23 changes: 14 additions & 9 deletions scheduler/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,15 +190,20 @@ func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[
for nodeID, allocs := range nodeAllocs {
diff := diffAllocs(job, taintedNodes, required, allocs, terminalAllocs)

// Mark the alloc as being for a specific node.
for i := range diff.place {
alloc := &diff.place[i]

// If the new allocation isn't annotated with a previous allocation
// or if the previous allocation isn't from the same node then we
// annotate the allocTuple with a new Allocation
if alloc.Alloc == nil || alloc.Alloc.NodeID != nodeID {
alloc.Alloc = &structs.Allocation{NodeID: nodeID}
// If the node is tainted there should be no placements made
if _, ok := taintedNodes[nodeID]; ok {
diff.place = nil
} else {
// Mark the alloc as being for a specific node.
for i := range diff.place {
alloc := &diff.place[i]

// If the new allocation isn't annotated with a previous allocation
// or if the previous allocation isn't from the same node then we
// annotate the allocTuple with a new Allocation
if alloc.Alloc == nil || alloc.Alloc.NodeID != nodeID {
alloc.Alloc = &structs.Allocation{NodeID: nodeID}
}
}
}

Expand Down
30 changes: 15 additions & 15 deletions scheduler/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,26 +175,26 @@ func TestDiffAllocs(t *testing.T) {
func TestDiffSystemAllocs(t *testing.T) {
job := mock.SystemJob()

// Create three alive nodes.
nodes := []*structs.Node{{ID: "foo"}, {ID: "bar"}, {ID: "baz"},
{ID: "pipe"}}

// The "old" job has a previous modify index
oldJob := new(structs.Job)
*oldJob = *job
oldJob.JobModifyIndex -= 1

drainNode := mock.Node()
drainNode.Drain = true

deadNode := mock.Node()
deadNode.Status = structs.NodeStatusDown

tainted := map[string]*structs.Node{
"dead": deadNode,
"drainNode": drainNode,
deadNode.ID: deadNode,
drainNode.ID: drainNode,
}

// Create three alive nodes.
nodes := []*structs.Node{{ID: "foo"}, {ID: "bar"}, {ID: "baz"},
{ID: "pipe"}, {ID: drainNode.ID}, {ID: deadNode.ID}}

// The "old" job has a previous modify index
oldJob := new(structs.Job)
*oldJob = *job
oldJob.JobModifyIndex -= 1

allocs := []*structs.Allocation{
// Update allocation on baz
&structs.Allocation{
Expand All @@ -215,14 +215,14 @@ func TestDiffSystemAllocs(t *testing.T) {
// Stop allocation on draining node.
&structs.Allocation{
ID: structs.GenerateUUID(),
NodeID: "drainNode",
NodeID: drainNode.ID,
Name: "my-job.web[0]",
Job: oldJob,
},
// Mark as lost on a dead node
&structs.Allocation{
ID: structs.GenerateUUID(),
NodeID: "dead",
NodeID: deadNode.ID,
Name: "my-job.web[0]",
Job: oldJob,
},
Expand Down Expand Up @@ -272,8 +272,8 @@ func TestDiffSystemAllocs(t *testing.T) {
}

// We should place 1
if len(place) != 2 {
t.Fatalf("bad: %#v", place)
if l := len(place); l != 2 {
t.Fatalf("bad: %#v", l)
}

// Ensure that the allocations which are replacements of terminal allocs are
Expand Down