diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index a075b6377ce4..6ecb078fa487 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -2394,3 +2394,64 @@ func TestCoreScheduler_CSIVolumeClaimGC(t *testing.T) { }, time.Second*1, 10*time.Millisecond, "claims were released unexpectedly") } + +func TestCoreScheduler_FailLoop(t *testing.T) { + t.Parallel() + require := require.New(t) + + srv, cleanupSrv := TestServer(t, func(c *Config) { + c.NumSchedulers = 0 // Prevent automatic dequeue + c.EvalDeliveryLimit = 2 + c.EvalFailedFollowupBaselineDelay = time.Duration(50 * time.Millisecond) + c.EvalFailedFollowupDelayRange = time.Duration(1 * time.Millisecond) + }) + defer cleanupSrv() + codec := rpcClient(t, srv) + sched := []string{structs.JobTypeCore} + + testutil.WaitForResult(func() (bool, error) { + return srv.evalBroker.Enabled(), nil + }, func(err error) { + t.Fatalf("should enable eval broker") + }) + + // Enqueue a core job eval that can never succeed because it was enqueued + // by another leader that's now gone + expected := srv.coreJobEval(structs.CoreJobCSIPluginGC, 100) + expected.LeaderACL = "nonsense" + srv.evalBroker.Enqueue(expected) + + nack := func(evalID, token string) error { + req := &structs.EvalAckRequest{ + EvalID: evalID, + Token: token, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var resp structs.GenericResponse + return msgpackrpc.CallWithCodec(codec, "Eval.Nack", req, &resp) + } + + out, token, err := srv.evalBroker.Dequeue(sched, time.Second*5) + require.NoError(err) + require.NotNil(out) + require.Equal(expected, out) + + // first fail + require.NoError(nack(out.ID, token)) + + out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5) + require.NoError(err) + require.NotNil(out) + require.Equal(expected, out) + + // second fail, should not result in failed-follow-up + require.NoError(nack(out.ID, token)) + + out, token, err = srv.evalBroker.Dequeue(sched, time.Second*5) + require.NoError(err) + if out != nil { + t.Fatalf( + "failed core jobs should not result in follow-up. TriggeredBy: %v", + out.TriggeredBy) + } +} diff --git a/nomad/leader.go b/nomad/leader.go index c19b2159b261..8ad929b181eb 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -640,25 +640,31 @@ func (s *Server) reapFailedEvaluations(stopCh chan struct{}) { updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit) s.logger.Warn("eval reached delivery limit, marking as failed", "eval", updateEval.GoString()) - // Create a follow-up evaluation that will be used to retry the - // scheduling for the job after the cluster is hopefully more stable - // due to the fairly large backoff. - followupEvalWait := s.config.EvalFailedFollowupBaselineDelay + - time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange))) - - followupEval := eval.CreateFailedFollowUpEval(followupEvalWait) - updateEval.NextEval = followupEval.ID - updateEval.UpdateModifyTime() - - // Update via Raft - req := structs.EvalUpdateRequest{ - Evals: []*structs.Evaluation{updateEval, followupEval}, - } - if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil { - s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err) - continue + // Core job evals that fail or span leader elections will never + // succeed because the follow-up doesn't have the leader ACL. We + // rely on the leader to schedule new core jobs periodically + // instead. + if eval.Type != structs.JobTypeCore { + + // Create a follow-up evaluation that will be used to retry the + // scheduling for the job after the cluster is hopefully more stable + // due to the fairly large backoff. + followupEvalWait := s.config.EvalFailedFollowupBaselineDelay + + time.Duration(rand.Int63n(int64(s.config.EvalFailedFollowupDelayRange))) + + followupEval := eval.CreateFailedFollowUpEval(followupEvalWait) + updateEval.NextEval = followupEval.ID + updateEval.UpdateModifyTime() + + // Update via Raft + req := structs.EvalUpdateRequest{ + Evals: []*structs.Evaluation{updateEval, followupEval}, + } + if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil { + s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err) + continue + } } - // Ack completion s.evalBroker.Ack(eval.ID, token) }