don't emit follow-up eval for core jobs

hashicorp · Aug 17, 2020 · 025a613 · 025a613
1 parent 5895555
commit 025a613
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 21 deletions.
diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go
@@ -2449,7 +2449,9 @@ func TestCoreScheduler_FailLoop(t *testing.T) {
 
 	out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5)
 	require.NoError(err)
-	require.Nil(out,
-		"failed core jobs should not result in follow-up. TriggeredBy: %v",
-		out.TriggeredBy)
+	if out != nil {
+		t.Fatalf(
+			"failed core jobs should not result in follow-up. TriggeredBy: %v",
+			out.TriggeredBy)
+	}
 }
diff --git a/nomad/leader.go b/nomad/leader.go
@@ -640,25 +640,31 @@ func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
 			updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
 			s.logger.Warn("eval reached delivery limit, marking as failed", "eval", updateEval.GoString())
 
-			// Create a follow-up evaluation that will be used to retry the
-			// scheduling for the job after the cluster is hopefully more stable
-			// due to the fairly large backoff.
-			followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
-				time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
-
-			followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
-			updateEval.NextEval = followupEval.ID
-			updateEval.UpdateModifyTime()
-
-			// Update via Raft
-			req := structs.EvalUpdateRequest{
-				Evals: []*structs.Evaluation{updateEval, followupEval},
-			}
-			if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
-				s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err)
-				continue
+			// Core job evals that fail or span leader elections will never
+			// succeed because the follow-up doesn't have the leader ACL. We
+			// rely on the leader to schedule new core jobs periodically
+			// instead.
+			if eval.Type != structs.JobTypeCore {
+
+				// Create a follow-up evaluation that will be used to retry the
+				// scheduling for the job after the cluster is hopefully more stable
+				// due to the fairly large backoff.
+				followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
+					time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))
+
+				followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
+				updateEval.NextEval = followupEval.ID
+				updateEval.UpdateModifyTime()
+
+				// Update via Raft
+				req := structs.EvalUpdateRequest{
+					Evals: []*structs.Evaluation{updateEval, followupEval},
+				}
+				if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
+					s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err)
+					continue
+				}
 			}
-
 			// Ack completion
 			s.evalBroker.Ack(eval.ID, token)
 		}