Skip to content

Commit

Permalink
Merge pull request #2432 from hashicorp/b-batch-gc
Browse files Browse the repository at this point in the history
Eval GC will collect allocs from stopped batch job
  • Loading branch information
dadgar committed Mar 13, 2017
2 parents 86da4de + 0e8715d commit 8ab86b0
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 8 deletions.
9 changes: 2 additions & 7 deletions nomad/core_sched.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,6 @@ func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {

// The Evaluation GC should not handle batch jobs since those need to be
// garbage collected in one shot
// XXX believe there is a bug that if a batch job gets stopped, there is no
// way for it to GC the eval/allocs
gc, allocs, err := c.gcEval(eval, oldThreshold, false)
if err != nil {
return err
Expand Down Expand Up @@ -239,18 +237,15 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
// terminal allocations get GC'd the scheduler would re-run the
// allocations.
if eval.Type == structs.JobTypeBatch {
if !allowBatch {
return false, nil, nil
}

// Check if the job is running
job, err := c.snap.JobByID(ws, eval.JobID)
if err != nil {
return false, nil, err
}

// We don't want to gc anything related to a job which is not dead
if job != nil && job.Status != structs.JobStatusDead {
// If the batch job doesn't exist we can GC it regardless of allowBatch
if job != nil && (!allowBatch || job.Status != structs.JobStatusDead) {
return false, nil, nil
}
}
Expand Down
90 changes: 89 additions & 1 deletion nomad/core_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ func TestCoreScheduler_EvalGC(t *testing.T) {
}
}

// An EvalGC should never reap a batch job
// An EvalGC should never reap a batch job that has not been stopped
func TestCoreScheduler_EvalGC_Batch(t *testing.T) {
s1 := testServer(t, nil)
defer s1.Shutdown()
Expand Down Expand Up @@ -190,6 +190,94 @@ func TestCoreScheduler_EvalGC_Batch(t *testing.T) {
}
}

// TestCoreScheduler_EvalGC_BatchStopped checks that an EvalGC reaps the
// evaluation and allocations belonging to a batch job that has been stopped.
//
// NOTE(review): the job built below is never written to the state store, so
// the job lookup performed during GC presumably returns nil; confirm that a
// missing job is the intended representation of a "stopped" batch job.
func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) {
	srv := testServer(t, nil)
	defer srv.Shutdown()
	testutil.WaitForLeader(t, srv.RPC)

	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
	srv.fsm.timetable.table = make([]TimeTableEntry, 1, 10)

	store := srv.fsm.State()

	// Build a batch job marked dead. Only its ID is used below; the job itself
	// is not inserted into the state store by this test.
	job := mock.Job()
	job.Type = structs.JobTypeBatch
	job.Status = structs.JobStatusDead

	// Store a completed batch evaluation that references the job.
	eval := mock.Eval()
	eval.Status = structs.EvalStatusComplete
	eval.Type = structs.JobTypeBatch
	eval.JobID = job.ID
	if err := store.UpsertEvals(1001, []*structs.Evaluation{eval}); err != nil {
		t.Fatalf("err: %v", err)
	}

	// First alloc: desired status is stop.
	stopped := mock.Alloc()
	stopped.JobID = job.ID
	stopped.EvalID = eval.ID
	stopped.DesiredStatus = structs.AllocDesiredStatusStop

	// Second alloc: still desired to run, but the client reports it lost.
	lost := mock.Alloc()
	lost.JobID = job.ID
	lost.EvalID = eval.ID
	lost.DesiredStatus = structs.AllocDesiredStatusRun
	lost.ClientStatus = structs.AllocClientStatusLost

	allocs := []*structs.Allocation{stopped, lost}
	if err := store.UpsertAllocs(1002, allocs); err != nil {
		t.Fatalf("err: %v", err)
	}

	// Record index 2000 in the time table at exactly one EvalGCThreshold in the
	// past, so the objects written at indexes 1001 and 1002 fall behind it.
	timeTable := srv.fsm.TimeTable()
	timeTable.Witness(2000, time.Now().UTC().Add(-1*srv.config.EvalGCThreshold))

	// Run the eval GC core job against a snapshot of the current state.
	snap, err := store.Snapshot()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	core := NewCoreScheduler(srv, snap)
	if err := core.Process(srv.coreJobEval(structs.CoreJobEvalGC, 2000)); err != nil {
		t.Fatalf("err: %v", err)
	}

	// The evaluation must have been removed from the live state store.
	ws := memdb.NewWatchSet()
	gotEval, err := store.EvalByID(ws, eval.ID)
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if gotEval != nil {
		t.Fatalf("bad: %v", gotEval)
	}

	// Both allocations must be gone as well, checked in insertion order.
	for _, a := range allocs {
		found, lookupErr := store.AllocByID(ws, a.ID)
		if lookupErr != nil {
			t.Fatalf("err: %v", lookupErr)
		}
		if found != nil {
			t.Fatalf("bad: %v", found)
		}
	}
}

func TestCoreScheduler_EvalGC_Partial(t *testing.T) {
s1 := testServer(t, nil)
defer s1.Shutdown()
Expand Down

0 comments on commit 8ab86b0

Please sign in to comment.