Skip to content

Commit

Permalink
don't emit follow-up eval for core jobs
Browse files: browse the repository at this point in the history
  • Loading branch information
tgross committed Aug 17, 2020
1 parent 5895555 commit 025a613
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 21 deletions.
8 changes: 5 additions & 3 deletions nomad/core_sched_test.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -2449,7 +2449,9 @@ func TestCoreScheduler_FailLoop(t *testing.T) {

out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5)
require.NoError(err)
require.Nil(out,
"failed core jobs should not result in follow-up. TriggeredBy: %v",
out.TriggeredBy)
if out != nil {
t.Fatalf(
"failed core jobs should not result in follow-up. TriggeredBy: %v",
out.TriggeredBy)
}
}
42 changes: 24 additions & 18 deletions nomad/leader.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -640,25 +640,31 @@ func (s *Server) reapFailedEvaluations(stopCh chan struct{}) {
updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit)
s.logger.Warn("eval reached delivery limit, marking as failed", "eval", updateEval.GoString())

// Create a follow-up evaluation that will be used to retry the
// scheduling for the job after the cluster is hopefully more stable
// due to the fairly large backoff.
followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))

followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
updateEval.NextEval = followupEval.ID
updateEval.UpdateModifyTime()

// Update via Raft
req := structs.EvalUpdateRequest{
Evals: []*structs.Evaluation{updateEval, followupEval},
}
if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err)
continue
// Core job evals that fail or span leader elections will never
// succeed because the follow-up doesn't have the leader ACL. We
// rely on the leader to schedule new core jobs periodically
// instead.
if eval.Type != structs.JobTypeCore {

// Create a follow-up evaluation that will be used to retry the
// scheduling for the job after the cluster is hopefully more stable
// due to the fairly large backoff.
followupEvalWait := s.config.EvalFailedFollowupBaselineDelay +
time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange)))

followupEval := eval.CreateFailedFollowUpEval(followupEvalWait)
updateEval.NextEval = followupEval.ID
updateEval.UpdateModifyTime()

// Update via Raft
req := structs.EvalUpdateRequest{
Evals: []*structs.Evaluation{updateEval, followupEval},
}
if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil {
s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err)
continue
}
}

// Ack completion
s.evalBroker.Ack(eval.ID, token)
}
Expand Down

0 comments on commit 025a613

Please sign in to comment.