Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not restart successful batch jobs when the node is tainted (removed/drained) #1205

Merged
merged 3 commits into from
May 25, 2016
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -1904,6 +1904,21 @@ func (ts *TaskState) Failed() bool {
}
}

// Successful returns whether a task finished successfully.
func (ts *TaskState) Successful() bool {
l := len(ts.Events)
if ts.State != TaskStateDead || l == 0 {
return false
}

e := ts.Events[l-1]
if e.Type != TaskTerminated {
return false
}

return e.ExitCode == 0
}

const (
// TaskDriveFailure indicates that the task could not be started due to a
// failure in the driver.
Expand Down Expand Up @@ -2336,6 +2351,23 @@ func (a *Allocation) TerminalStatus() bool {
}
}

// RanSuccessfully returns whether the client has ran the allocation and all
// tasks finished successfully
func (a *Allocation) RanSuccessfully() bool {
// Handle the case the client hasn't started the allocation.
if len(a.TaskStates) == 0 {
return false
}

// Check to see if all the tasks finised successfully in the allocation
allSuccess := true
for _, state := range a.TaskStates {
allSuccess = allSuccess && state.Successful()
}

return allSuccess
}

// Stub returns a list stub for the allocation
func (a *Allocation) Stub() *AllocListStub {
return &AllocListStub{
Expand Down
8 changes: 4 additions & 4 deletions scheduler/generic_sched.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,12 +235,12 @@ func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) []
filter := func(a *structs.Allocation) bool {
if s.batch {
// Allocs from batch jobs should be filtered when the desired status
// is terminal or when the client status is failed so that they will
// be replaced. If they are complete but not failed, they shouldn't
// be replaced.
// is terminal and the client did not finish or when the client
// status is failed so that they will be replaced. If they are
// complete but not failed, they shouldn't be replaced.
switch a.DesiredStatus {
case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict, structs.AllocDesiredStatusFailed:
return true
return !a.RanSuccessfully()
default:
}

Expand Down
68 changes: 68 additions & 0 deletions scheduler/generic_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1316,3 +1316,71 @@ func TestBatchSched_Run_FailedAlloc(t *testing.T) {

h.AssertEvalStatus(t, structs.EvalStatusComplete)
}

func TestBatchSched_ReRun_SuccessfullyFinishedAlloc(t *testing.T) {
h := NewHarness(t)

// Create two nodes, one that is drained and has a successfully finished
// alloc and a fresh undrained one
node := mock.Node()
node.Drain = true
node2 := mock.Node()
noErr(t, h.State.UpsertNode(h.NextIndex(), node))
noErr(t, h.State.UpsertNode(h.NextIndex(), node2))

// Create a job
job := mock.Job()
job.Type = structs.JobTypeBatch
job.TaskGroups[0].Count = 1
noErr(t, h.State.UpsertJob(h.NextIndex(), job))

// Create a successful alloc
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = node.ID
alloc.Name = "my-job.web[0]"
alloc.ClientStatus = structs.AllocClientStatusComplete
alloc.TaskStates = map[string]*structs.TaskState{
"web": &structs.TaskState{
State: structs.TaskStateDead,
Events: []*structs.TaskEvent{
{
Type: structs.TaskTerminated,
ExitCode: 0,
},
},
},
}
noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc}))

// Create a mock evaluation to rerun the job
eval := &structs.Evaluation{
ID: structs.GenerateUUID(),
Priority: job.Priority,
TriggeredBy: structs.EvalTriggerJobRegister,
JobID: job.ID,
}

// Process the evaluation
err := h.Process(NewBatchScheduler, eval)
if err != nil {
t.Fatalf("err: %v", err)
}

// Ensure no plan
if len(h.Plans) != 0 {
t.Fatalf("bad: %#v", h.Plans)
}

// Lookup the allocations by JobID
out, err := h.State.AllocsByJob(job.ID)
noErr(t, err)

// Ensure no replacement alloc was placed.
if len(out) != 1 {
t.Fatalf("bad: %#v", out)
}

h.AssertEvalStatus(t, structs.EvalStatusComplete)
}
7 changes: 6 additions & 1 deletion scheduler/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,12 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
continue
}

// If we are on a tainted node, we must migrate
// If we are on a tainted node, we must migrate if we are a service or
// if the batch allocation did not finish
if taintedNodes[exist.NodeID] {
if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add comment as to why only batch, since this seems like it could work for any.

goto IGNORE
}
result.migrate = append(result.migrate, allocTuple{
Name: name,
TaskGroup: tg,
Expand All @@ -102,6 +106,7 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
}

// Everything is up-to-date
IGNORE:
result.ignore = append(result.ignore, allocTuple{
Name: name,
TaskGroup: tg,
Expand Down
3 changes: 3 additions & 0 deletions scheduler/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,15 @@ func TestDiffAllocs(t *testing.T) {
ID: structs.GenerateUUID(),
NodeID: "zip",
Name: "my-job.web[10]",
Job: oldJob,
},

// Migrate the 3rd
&structs.Allocation{
ID: structs.GenerateUUID(),
NodeID: "dead",
Name: "my-job.web[2]",
Job: oldJob,
},
}

Expand Down Expand Up @@ -155,6 +157,7 @@ func TestDiffSystemAllocs(t *testing.T) {
ID: structs.GenerateUUID(),
NodeID: "dead",
Name: "my-job.web[0]",
Job: oldJob,
},
}

Expand Down