Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix detection of successful batch allocations #3717

Merged
merged 4 commits into from
Jan 4, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

BUG FIXES:
* core: Fix search endpoint forwarding for multi-region clusters [[GH-3680](https://github.com/hashicorp/nomad/issues/3680)]
* core: Fix an issue in which batch jobs with queued placements and lost
allocations could result in improper placement counts [[GH-3717](https://github.com/hashicorp/nomad/issues/3717)]
* config: Revert minimum CPU limit back to 20 from 100.

## 0.7.1 (December 19, 2017)
Expand Down
24 changes: 21 additions & 3 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -3813,7 +3813,9 @@ func (ts *TaskState) Copy() *TaskState {
return copy
}

// Successful returns whether a task finished successfully.
// Successful returns whether a task finished successfully. This doesn't really
// have meaning on a non-batch allocation because a service and system
// allocation should not finish.
func (ts *TaskState) Successful() bool {
l := len(ts.Events)
if ts.State != TaskStateDead || l == 0 {
Expand Down Expand Up @@ -5019,9 +5021,25 @@ func (a *Allocation) Terminated() bool {
}

// RanSuccessfully returns whether the client has run the allocation and all
// tasks finished successfully. Critically this function returns whether the
// allocation has run to completion and not just that the alloc has converged to
// its desired state. That is to say that every task of a batch allocation must
// report Successful(). This doesn't really have meaning on a non-batch
// allocation because service and system allocations should not finish.
//
// It returns false when no task states have been recorded, which is the case
// when the client hasn't started the allocation.
func (a *Allocation) RanSuccessfully() bool {
	// Handle the case the client hasn't started the allocation.
	if len(a.TaskStates) == 0 {
		return false
	}

	// Every task must have finished successfully; bail out on the first one
	// that did not. The client status alone is deliberately not consulted.
	for _, state := range a.TaskStates {
		if !state.Successful() {
			return false
		}
	}

	return true
}

// ShouldMigrate returns if the allocation needs data migration
Expand Down
99 changes: 99 additions & 0 deletions scheduler/generic_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestServiceSched_JobRegister(t *testing.T) {
Expand Down Expand Up @@ -2825,6 +2826,94 @@ func TestBatchSched_Run_FailedAlloc(t *testing.T) {
h.AssertEvalStatus(t, structs.EvalStatusComplete)
}

// TestBatchSched_Run_LostAlloc verifies that the batch scheduler places a
// replacement for a batch job with Count = 3 that has two running allocations
// and one stopped allocation sharing a name with one of the running ones.
func TestBatchSched_Run_LostAlloc(t *testing.T) {
	h := NewHarness(t)

	// Create a node
	node := mock.Node()
	noErr(t, h.State.UpsertNode(h.NextIndex(), node))

	// Create a job
	job := mock.Job()
	job.ID = "my-job"
	job.Type = structs.JobTypeBatch
	job.TaskGroups[0].Count = 3
	noErr(t, h.State.UpsertJob(h.NextIndex(), job))

	// Desired = 3
	// Mark one as lost and then schedule
	// [(0, run, running), (1, run, running), (1, stop, lost)]

	// Create two running allocations
	var allocs []*structs.Allocation
	for i := 0; i < 2; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = node.ID
		alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
		alloc.ClientStatus = structs.AllocClientStatusRunning
		allocs = append(allocs, alloc)
	}

	// Create the stopped alloc that stands in for the lost one. Its client
	// status is complete but no task states are set on it here.
	// NOTE(review): this presumably relies on mock.Alloc() leaving TaskStates
	// empty so the alloc is not treated as having run successfully - confirm.
	alloc := mock.Alloc()
	alloc.Job = job
	alloc.JobID = job.ID
	alloc.NodeID = node.ID
	alloc.Name = "my-job.web[1]"
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
	alloc.ClientStatus = structs.AllocClientStatusComplete
	allocs = append(allocs, alloc)
	noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs))

	// Create a mock evaluation to register the job
	eval := &structs.Evaluation{
		Namespace:   structs.DefaultNamespace,
		ID:          uuid.Generate(),
		Priority:    job.Priority,
		TriggeredBy: structs.EvalTriggerJobRegister,
		JobID:       job.ID,
		Status:      structs.EvalStatusPending,
	}
	noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))

	// Process the evaluation
	err := h.Process(NewBatchScheduler, eval)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Ensure a plan
	if len(h.Plans) != 1 {
		t.Fatalf("bad: %#v", h.Plans)
	}

	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
	noErr(t, err)

	// Ensure a replacement alloc was placed.
	if len(out) != 4 {
		t.Fatalf("bad: %#v", out)
	}

	// Assert that we have the correct number of each alloc name
	expected := map[string]int{
		"my-job.web[0]": 1,
		"my-job.web[1]": 2,
		"my-job.web[2]": 1,
	}
	actual := make(map[string]int, 3)
	for _, a := range out {
		actual[a.Name]++
	}
	// testify expects (t, expected, actual); keeping that order makes the
	// failure output label the two maps correctly.
	require.Equal(t, expected, actual)

	h.AssertEvalStatus(t, structs.EvalStatusComplete)
}

func TestBatchSched_Run_FailedAllocQueuedAllocations(t *testing.T) {
h := NewHarness(t)

Expand Down Expand Up @@ -3177,6 +3266,16 @@ func TestBatchSched_NodeDrain_Complete(t *testing.T) {
alloc.NodeID = node.ID
alloc.Name = "my-job.web[0]"
alloc.ClientStatus = structs.AllocClientStatusComplete
alloc.TaskStates = make(map[string]*structs.TaskState)
alloc.TaskStates["web"] = &structs.TaskState{
State: structs.TaskStateDead,
Events: []*structs.TaskEvent{
{
Type: structs.TaskTerminated,
ExitCode: 0,
},
},
}
noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc}))

// Create a mock evaluation to register the job
Expand Down
28 changes: 28 additions & 0 deletions vendor/github.com/stretchr/testify/require/doc.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading