From 0e8715d1f9cf2893c2ba3960cfad1b3508a5b41c Mon Sep 17 00:00:00 2001
From: Alex Dadgar <alex.dadgar@gmail.com>
Date: Sat, 11 Mar 2017 15:48:57 -0800
Subject: [PATCH] Eval GC will collect allocs from stopped batch job

This PR fixes a bug in which allocations from stopped batch jobs could
not be garbage collected.
---
 nomad/core_sched.go      |  9 +---
 nomad/core_sched_test.go | 90 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 91 insertions(+), 8 deletions(-)

diff --git a/nomad/core_sched.go b/nomad/core_sched.go
index e5dbea5a7064..a191effddbfa 100644
--- a/nomad/core_sched.go
+++ b/nomad/core_sched.go
@@ -196,8 +196,6 @@ func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
 
 		// The Evaluation GC should not handle batch jobs since those need to be
 		// garbage collected in one shot
-		// XXX believe there is a bug that if a batch job gets stopped, there is no
-		// way for it to GC the eval/allocs
 		gc, allocs, err := c.gcEval(eval, oldThreshold, false)
 		if err != nil {
 			return err
@@ -239,10 +237,6 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
 	// terminal allocations get GC'd the scheduler would re-run the
 	// allocations.
 	if eval.Type == structs.JobTypeBatch {
-		if !allowBatch {
-			return false, nil, nil
-		}
-
 		// Check if the job is running
 		job, err := c.snap.JobByID(ws, eval.JobID)
 		if err != nil {
@@ -250,7 +244,8 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
 		}
 
 		// We don't want to gc anything related to a job which is not dead
-		if job != nil && job.Status != structs.JobStatusDead {
+		// If the batch job doesn't exist we can GC it regardless of allowBatch
+		if job != nil && (!allowBatch || job.Status != structs.JobStatusDead) {
 			return false, nil, nil
 		}
 	}
diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go
index 3f1c6d247167..ce1e39cd3524 100644
--- a/nomad/core_sched_test.go
+++ b/nomad/core_sched_test.go
@@ -90,7 +90,7 @@ func TestCoreScheduler_EvalGC(t *testing.T) {
 	}
 }
 
-// An EvalGC should never reap a batch job
+// An EvalGC should never reap a batch job that has not been stopped
 func TestCoreScheduler_EvalGC_Batch(t *testing.T) {
 	s1 := testServer(t, nil)
 	defer s1.Shutdown()
@@ -190,6 +190,94 @@ func TestCoreScheduler_EvalGC_Batch(t *testing.T) {
 	}
 }
 
+// An EvalGC should  reap a batch job that has been stopped
+func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) {
+	s1 := testServer(t, nil)
+	defer s1.Shutdown()
+	testutil.WaitForLeader(t, s1.RPC)
+
+	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
+	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
+
+	// Create a "dead" job
+	state := s1.fsm.State()
+	job := mock.Job()
+	job.Type = structs.JobTypeBatch
+	job.Status = structs.JobStatusDead
+
+	// Insert "complete" eval
+	eval := mock.Eval()
+	eval.Status = structs.EvalStatusComplete
+	eval.Type = structs.JobTypeBatch
+	eval.JobID = job.ID
+	err := state.UpsertEvals(1001, []*structs.Evaluation{eval})
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Insert "failed" alloc
+	alloc := mock.Alloc()
+	alloc.JobID = job.ID
+	alloc.EvalID = eval.ID
+	alloc.DesiredStatus = structs.AllocDesiredStatusStop
+
+	// Insert "lost" alloc
+	alloc2 := mock.Alloc()
+	alloc2.JobID = job.ID
+	alloc2.EvalID = eval.ID
+	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
+	alloc2.ClientStatus = structs.AllocClientStatusLost
+
+	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2})
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Update the time tables to make this work
+	tt := s1.fsm.TimeTable()
+	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))
+
+	// Create a core scheduler
+	snap, err := state.Snapshot()
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	core := NewCoreScheduler(s1, snap)
+
+	// Attempt the GC
+	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
+	err = core.Process(gc)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Everything should be gone
+	ws := memdb.NewWatchSet()
+	out, err := state.EvalByID(ws, eval.ID)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	if out != nil {
+		t.Fatalf("bad: %v", out)
+	}
+
+	outA, err := state.AllocByID(ws, alloc.ID)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	if outA != nil {
+		t.Fatalf("bad: %v", outA)
+	}
+
+	outA2, err := state.AllocByID(ws, alloc2.ID)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	if outA2 != nil {
+		t.Fatalf("bad: %v", outA2)
+	}
+}
+
 func TestCoreScheduler_EvalGC_Partial(t *testing.T) {
 	s1 := testServer(t, nil)
 	defer s1.Shutdown()