
Commit

Delayed evaluations for stop_after_client_disconnect can cause unwanted extra followup evaluations around job garbage collection (#8099)

* client/heartbeatstop: reversed time condition for startup grace

* scheduler/generic_sched: use `delayInstead` to avoid a loop

Without protecting the loop that creates followUpEvals, a delayed eval
is allowed to create an immediate subsequent delayed eval. For both
`stop_after_client_disconnect` and the `reschedule` block, a delayed
eval should always produce some immediate result (running or blocked)
and then only after the outcome of that eval produce a second delayed
eval.

* scheduler/reconcile: lostLater are different than delayedReschedules

Just slightly. `lostLater` allocs should be used to create batched
evaluations, but `handleDelayedReschedules` assumes that the
allocations are in the untainted set. When it creates the in-place
updates to those allocations at the end, it causes the allocation to
be treated as running over in the planner, which causes the initial
`stop_after_client_disconnect` evaluation to be retried by the worker.
langmartin committed Jun 3, 2020
1 parent c7bf94e commit 846a2e5
Showing 6 changed files with 36 additions and 23 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -15,6 +15,7 @@ BUG FIXES:
 * core: Fixed a bug where new leader may take a long time until it can process requests [[GH-8036](https://github.com/hashicorp/nomad/issues/8036)]
 * core: Fixed a bug where an internal metadata, ClusterID, may not be initialized properly upon a slow server upgrade [[GH-8078](https://github.com/hashicorp/nomad/issues/8078)].
 * api: Fixed a bug where setting connect sidecar task resources could fail [[GH-7993](https://github.com/hashicorp/nomad/issues/7993)]
+* core: Fixed a bug where stop_after_client_disconnect could cause the server to become unresponsive [[GH-8098](https://github.com/hashicorp/nomad/issues/8098)]
 * client: Fixed a bug where artifact downloads failed on redirects [[GH-7854](https://github.com/hashicorp/nomad/issues/7854)]
 * csi: Validate empty volume arguments in API. [[GH-8027](https://github.com/hashicorp/nomad/issues/8027)]

2 changes: 1 addition & 1 deletion client/heartbeatstop.go
@@ -60,7 +60,7 @@ func (h *heartbeatStop) shouldStop(alloc *structs.Allocation) bool {
 func (h *heartbeatStop) shouldStopAfter(now time.Time, interval time.Duration) bool {
     lastOk := h.getLastOk()
     if lastOk.IsZero() {
-        return h.startupGrace.After(now)
+        return now.After(h.startupGrace)
     }
     return now.After(lastOk.Add(interval))
 }
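
To make the reversed comparison concrete, here is a small standalone sketch (not part of the commit) of how the two expressions behave while a client is still inside its startup grace window. The `startupGrace` variable mirrors the `heartbeatStop.startupGrace` deadline shown above; the 30-second value is just illustrative.

package main

import (
    "fmt"
    "time"
)

func main() {
    // Assumed grace deadline: no heartbeat has been seen yet, and the client
    // gets 30s before allocations may be stopped.
    startupGrace := time.Now().Add(30 * time.Second)

    now := time.Now() // still inside the grace window

    // Old expression: true while we are still *before* the deadline, so
    // allocations would be stopped immediately on client startup.
    fmt.Println("old:", startupGrace.After(now)) // true (bug)

    // New expression: only true once the deadline has passed.
    fmt.Println("new:", now.After(startupGrace)) // false (keep waiting)

    later := startupGrace.Add(time.Second)
    fmt.Println("new, after deadline:", later.After(startupGrace)) // true (stop allowed)
}
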
2 changes: 1 addition & 1 deletion nomad/plan_apply.go
@@ -474,7 +474,7 @@ func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan
     if !fit {
         // Log the reason why the node's allocations could not be made
         if reason != "" {
-            logger.Debug("plan for node rejected", "node_id", nodeID, "reason", reason)
+            logger.Debug("plan for node rejected", "node_id", nodeID, "reason", reason, "eval_id", plan.EvalID)
         }
         // Set that this is a partial commit
         partialCommit = true
17 changes: 11 additions & 6 deletions scheduler/generic_sched.go
@@ -87,6 +87,8 @@ type GenericScheduler struct {
     ctx   *EvalContext
     stack *GenericStack
 
+    // followUpEvals are evals with WaitUntil set, which are delayed until that time
+    // before being rescheduled
     followUpEvals []*structs.Evaluation
 
     deployment *structs.Deployment
@@ -258,11 +260,13 @@ func (s *GenericScheduler) process() (bool, error) {
 
     // If there are failed allocations, we need to create a blocked evaluation
     // to place the failed allocations when resources become available. If the
-    // current evaluation is already a blocked eval, we reuse it by submitting
-    // a new eval to the planner in createBlockedEval. If the current eval is
-    // pending with WaitUntil set, it's delayed rather than blocked.
+    // current evaluation is already a blocked eval, we reuse it. If not, submit
+    // a new eval to the planner in createBlockedEval. If rescheduling should
+    // be delayed, do that instead.
+    delayInstead := len(s.followUpEvals) > 0 && s.eval.WaitUntil.IsZero()
+
     if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil &&
-        s.eval.WaitUntil.IsZero() {
+        !delayInstead {
         if err := s.createBlockedEval(false); err != nil {
             s.logger.Error("failed to make blocked eval", "error", err)
             return false, err
@@ -276,8 +280,9 @@ func (s *GenericScheduler) process() (bool, error) {
         return true, nil
     }
 
-    // Create follow up evals for any delayed reschedule eligible allocations
-    if len(s.followUpEvals) > 0 {
+    // Create follow up evals for any delayed reschedule eligible allocations, except in
+    // the case that this evaluation was already delayed.
+    if delayInstead {
         for _, eval := range s.followUpEvals {
             eval.PreviousEval = s.eval.ID
             // TODO(preetha) this should be batching evals before inserting them
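
A rough sketch of the decision `process` now makes, condensing the two guards in the hunks above. The function and parameter names here are illustrative only; they stand in for `len(s.followUpEvals) > 0`, `!s.eval.WaitUntil.IsZero()`, and `len(s.failedTGAllocs) != 0` in the real scheduler.

package main

import "fmt"

// nextStep is a simplified model, not the real scheduler logic.
func nextStep(hasFollowUps, evalAlreadyDelayed, hasFailedAllocs bool) string {
    delayInstead := hasFollowUps && !evalAlreadyDelayed
    if delayInstead {
        // First pass: park the work in follow-up evals with WaitUntil set.
        return "create delayed follow-up evals"
    }
    if hasFailedAllocs {
        // A delayed eval that still cannot place its work produces an
        // immediate blocked eval rather than another delayed one, which is
        // what breaks the followup-eval loop described in the commit message.
        return "create blocked eval"
    }
    return "done"
}

func main() {
    fmt.Println(nextStep(true, false, true))   // initial eval: delay
    fmt.Println(nextStep(true, true, true))    // the delayed eval itself: block, don't re-delay
    fmt.Println(nextStep(false, false, false)) // nothing left to do
}
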
4 changes: 2 additions & 2 deletions scheduler/generic_sched_test.go
@@ -2846,8 +2846,8 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) {
     require.Equal(t, h.Evals[0].Status, structs.EvalStatusComplete)
     require.Len(t, h.Plans, 1, "plan")
 
-    // Followup eval created
-    require.True(t, len(h.CreateEvals) > 0)
+    // One followup eval created, either delayed or blocked
+    require.Len(t, h.CreateEvals, 1)
     e := h.CreateEvals[0]
     require.Equal(t, eval.ID, e.PreviousEval)

33 changes: 20 additions & 13 deletions scheduler/reconcile.go
@@ -355,18 +355,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 
     // Find delays for any lost allocs that have stop_after_client_disconnect
     lostLater := lost.delayByStopAfterClientDisconnect()
-    rescheduleLater = append(rescheduleLater, lostLater...)
+    a.handleDelayedLost(lostLater, all, tg.Name)
 
     // Create batched follow up evaluations for allocations that are
     // reschedulable later and mark the allocations for in place updating
     a.handleDelayedReschedules(rescheduleLater, all, tg.Name)
 
-    // Allocs that are lost and delayed have an attributeUpdate that correctly links to
-    // the eval, but incorrectly has the current (running) status
-    for _, d := range lostLater {
-        a.result.attributeUpdates[d.allocID].SetStop(structs.AllocClientStatusLost, structs.AllocClientStatusLost)
-    }
-
     // Create a structure for choosing names. Seed with the taken names which is
     // the union of untainted and migrating nodes (includes canaries)
     nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow))
@@ -423,7 +417,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
     // * The deployment is not paused or failed
     // * Not placing any canaries
     // * If there are any canaries that they have been promoted
-    // * There is no delayed stop_after_client_disconnect alloc
+    // * There is no delayed stop_after_client_disconnect alloc, which delays scheduling for the whole group
     var place []allocPlaceResult
     if len(lostLater) == 0 {
         place = a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow)
@@ -845,6 +839,17 @@ func (a *allocReconciler) computeUpdates(group *structs.TaskGroup, untainted all
 // handleDelayedReschedules creates batched followup evaluations with the WaitUntil field set
 // for allocations that are eligible to be rescheduled later
 func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
+    a.handleDelayedReschedulesImpl(rescheduleLater, all, tgName, true)
+}
+
+// handleDelayedLost creates batched followup evaluations with the WaitUntil field set for lost allocations
+func (a *allocReconciler) handleDelayedLost(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) {
+    a.handleDelayedReschedulesImpl(rescheduleLater, all, tgName, false)
+}
+
+// handleDelayedReschedulesImpl creates batched followup evaluations with the WaitUntil field set
+func (a *allocReconciler) handleDelayedReschedulesImpl(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string,
+    createUpdates bool) {
     if len(rescheduleLater) == 0 {
         return
     }
@@ -905,10 +910,12 @@ func (a *allocReconciler) handleDelayedRes
     }
 
     // Create in-place updates for every alloc ID that needs to be updated with its follow up eval ID
-    for allocID, evalID := range allocIDToFollowupEvalID {
-        existingAlloc := all[allocID]
-        updatedAlloc := existingAlloc.Copy()
-        updatedAlloc.FollowupEvalID = evalID
-        a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
+    if createUpdates {
+        for allocID, evalID := range allocIDToFollowupEvalID {
+            existingAlloc := all[allocID]
+            updatedAlloc := existingAlloc.Copy()
+            updatedAlloc.FollowupEvalID = evalID
+            a.result.attributeUpdates[updatedAlloc.ID] = updatedAlloc
+        }
     }
 }
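
As a rough mental model of the split introduced here (a sketch with assumed, simplified types, not the real reconciler structures): both paths batch a follow-up evaluation, but only the reschedule path writes an in-place update back to the plan, because an entry in `attributeUpdates` is submitted as an updated, still-running allocation.

package main

import "fmt"

// result is a stand-in for the reconciler's result; field names are invented
// for illustration.
type result struct {
    followupEvals  []string          // batched follow-up eval IDs to enqueue
    inPlaceUpdates map[string]string // alloc ID -> follow-up eval ID written back to the plan
}

func handleDelayed(r *result, allocID, evalID string, createUpdates bool) {
    r.followupEvals = append(r.followupEvals, evalID)
    if !createUpdates {
        // Lost allocs stop here: writing them into inPlaceUpdates would make
        // the planner treat them as still running and retry the original
        // stop_after_client_disconnect evaluation.
        return
    }
    r.inPlaceUpdates[allocID] = evalID
}

func main() {
    r := &result{inPlaceUpdates: map[string]string{}}
    handleDelayed(r, "alloc-rescheduled", "eval-1", true) // reschedule-later path
    handleDelayed(r, "alloc-lost", "eval-2", false)       // lost-later path
    fmt.Println(r.followupEvals, r.inPlaceUpdates)
}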
