Skip to content

Commit

Permalink
Merge pull request #3717 from hashicorp/b-lost-batch
Browse files Browse the repository at this point in the history
Fix detection of successful batch allocations
  • Loading branch information
dadgar committed Jan 4, 2018
2 parents 9d93baa + 73303b4 commit b5425a7
Show file tree
Hide file tree
Showing 11 changed files with 1,844 additions and 3 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

BUG FIXES:
* core: Fix search endpoint forwarding for multi-region clusters [[GH-3680](https://github.com/hashicorp/nomad/issues/3680)]
* core: Fix an issue in which batch jobs with queued placements and lost
allocations could result in improper placement counts [[GH-3717](https://github.com/hashicorp/nomad/issues/3717)]
* config: Revert minimum CPU limit back to 20 from 100.

## 0.7.1 (December 19, 2017)
Expand Down
24 changes: 21 additions & 3 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -3813,7 +3813,9 @@ func (ts *TaskState) Copy() *TaskState {
return copy
}

// Successful returns whether a task finished successfully.
// Successful returns whether a task finished successfully. This doesn't really
// have meaning for a non-batch allocation because service and system
// allocations should not finish.
func (ts *TaskState) Successful() bool {
l := len(ts.Events)
if ts.State != TaskStateDead || l == 0 {
Expand Down Expand Up @@ -5019,9 +5021,25 @@ func (a *Allocation) Terminated() bool {
}

// RanSuccessfully returns whether the client has run the allocation and all
// tasks finished successfully. Critically, this function returns whether the
// allocation has run to completion and not just that the alloc has converged to
// its desired state. That is to say, every task in the allocation must report
// success via TaskState.Successful. This doesn't really have meaning for a
// non-batch allocation because service and system allocations should not
// finish.
func (a *Allocation) RanSuccessfully() bool {
	// Handle the case the client hasn't started the allocation: with no task
	// states recorded there is nothing that could have finished.
	if len(a.TaskStates) == 0 {
		return false
	}

	// Every task must have finished successfully; bail out on the first one
	// that did not.
	for _, state := range a.TaskStates {
		if !state.Successful() {
			return false
		}
	}

	return true
}

// ShouldMigrate returns if the allocation needs data migration
Expand Down
99 changes: 99 additions & 0 deletions scheduler/generic_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestServiceSched_JobRegister(t *testing.T) {
Expand Down Expand Up @@ -2825,6 +2826,94 @@ func TestBatchSched_Run_FailedAlloc(t *testing.T) {
h.AssertEvalStatus(t, structs.EvalStatusComplete)
}

// TestBatchSched_Run_LostAlloc checks that the batch scheduler places a
// replacement for an allocation that was stopped without recorded evidence of
// having run to completion, and that the queued placement for the missing
// index is made as well.
func TestBatchSched_Run_LostAlloc(t *testing.T) {
	h := NewHarness(t)

	// Create a node
	node := mock.Node()
	noErr(t, h.State.UpsertNode(h.NextIndex(), node))

	// Create a job
	job := mock.Job()
	job.ID = "my-job"
	job.Type = structs.JobTypeBatch
	job.TaskGroups[0].Count = 3
	noErr(t, h.State.UpsertJob(h.NextIndex(), job))

	// Desired = 3
	// Mark one as lost and then schedule
	// [(0, run, running), (1, run, running), (1, stop, lost)]

	// Create two running allocations
	var allocs []*structs.Allocation
	for i := 0; i <= 1; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = node.ID
		alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
		alloc.ClientStatus = structs.AllocClientStatusRunning
		allocs = append(allocs, alloc)
	}

	// Create the "lost" alloc: it is desired-stop and its client status reads
	// complete, but no successful task states are set on it here.
	// NOTE(review): presumably mock.Alloc() leaves TaskStates empty — confirm.
	alloc := mock.Alloc()
	alloc.Job = job
	alloc.JobID = job.ID
	alloc.NodeID = node.ID
	alloc.Name = "my-job.web[1]"
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
	alloc.ClientStatus = structs.AllocClientStatusComplete
	allocs = append(allocs, alloc)
	noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs))

	// Create a mock evaluation to register the job
	eval := &structs.Evaluation{
		Namespace:   structs.DefaultNamespace,
		ID:          uuid.Generate(),
		Priority:    job.Priority,
		TriggeredBy: structs.EvalTriggerJobRegister,
		JobID:       job.ID,
		Status:      structs.EvalStatusPending,
	}
	noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))

	// Process the evaluation
	err := h.Process(NewBatchScheduler, eval)
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Ensure a plan
	if len(h.Plans) != 1 {
		t.Fatalf("bad: %#v", h.Plans)
	}

	// Lookup the allocations by JobID
	ws := memdb.NewWatchSet()
	out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
	noErr(t, err)

	// Ensure a replacement alloc was placed.
	if len(out) != 4 {
		t.Fatalf("bad: %#v", out)
	}

	// Assert that we have the correct number of each alloc name
	expected := map[string]int{
		"my-job.web[0]": 1,
		"my-job.web[1]": 2,
		"my-job.web[2]": 1,
	}
	actual := make(map[string]int, 3)
	for _, alloc := range out {
		actual[alloc.Name]++
	}
	// testify takes (expected, actual); keeping that order makes failure
	// output label the two values correctly.
	require.Equal(t, expected, actual)

	h.AssertEvalStatus(t, structs.EvalStatusComplete)
}

func TestBatchSched_Run_FailedAllocQueuedAllocations(t *testing.T) {
h := NewHarness(t)

Expand Down Expand Up @@ -3177,6 +3266,16 @@ func TestBatchSched_NodeDrain_Complete(t *testing.T) {
alloc.NodeID = node.ID
alloc.Name = "my-job.web[0]"
alloc.ClientStatus = structs.AllocClientStatusComplete
alloc.TaskStates = make(map[string]*structs.TaskState)
alloc.TaskStates["web"] = &structs.TaskState{
State: structs.TaskStateDead,
Events: []*structs.TaskEvent{
{
Type: structs.TaskTerminated,
ExitCode: 0,
},
},
}
noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc}))

// Create a mock evaluation to register the job
Expand Down
28 changes: 28 additions & 0 deletions vendor/github.com/stretchr/testify/require/doc.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit b5425a7

Please sign in to comment.