From 66c59b064da59c965a5b10b7ffec2396b9e67bce Mon Sep 17 00:00:00 2001
From: Alex Dadgar
Date: Thu, 3 Aug 2017 17:42:14 -0700
Subject: [PATCH] Lost allocs replaced even if deployment failed

This PR allows the scheduler to replace lost allocations even if the job
has a failed or paused deployment. Previously, lost allocations were not
replaced while a deployment was paused or failed, which was confusing to
users.

Fixes https://github.com/hashicorp/nomad/issues/2958
---
 scheduler/reconcile.go      | 20 ++++++++++++++++++--
 scheduler/reconcile_test.go | 11 +++++++----
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go
index 4c8dc1282ce0..23313e5cf23b 100644
--- a/scheduler/reconcile.go
+++ b/scheduler/reconcile.go
@@ -365,13 +365,29 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 		dstate.DesiredTotal += len(place)
 	}
 
-	if !a.deploymentPaused && !a.deploymentFailed && !canaryState {
-		// Place all new allocations
+	// deploymentPlaceReady tracks whether the deployment is in a state where
+	// placements can be made without any other consideration.
+	deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState
+
+	if deploymentPlaceReady {
 		desiredChanges.Place += uint64(len(place))
 		for _, p := range place {
 			a.result.place = append(a.result.place, p)
 		}
+	} else if !deploymentPlaceReady && len(lost) != 0 {
+		// We are in a situation where we shouldn't be placing more than we
+		// need to, but we have lost allocations. It is a very confusing user
+		// experience if a node goes down and Nomad doesn't replace the lost
+		// allocations because the deployment is paused/failed, so we place
+		// only enough to recover the lost allocations.
+		allowed := helper.IntMin(len(lost), len(place))
+		desiredChanges.Place += uint64(allowed)
+		for _, p := range place[:allowed] {
+			a.result.place = append(a.result.place, p)
+		}
+	}
+
+	if deploymentPlaceReady {
 		// Do all destructive updates
 		min := helper.IntMin(len(destructive), limit)
 		limit -= min
diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go
index 85b823fd71de..c9a61164ef81 100644
--- a/scheduler/reconcile_test.go
+++ b/scheduler/reconcile_test.go
@@ -47,7 +47,7 @@ Update stanza Tests:
 √ Don't create a deployment if there are no changes
 √ Deployment created by all inplace updates
 √ Paused or failed deployment doesn't create any more canaries
-√ Paused or failed deployment doesn't do any placements
+√ Paused or failed deployment doesn't do any placements unless replacing lost allocs
 √ Paused or failed deployment doesn't do destructive updates
 √ Paused does do migrations
 √ Failed deployment doesn't do migrations
@@ -2538,8 +2538,9 @@ func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) {
 	assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
 }
 
-// Tests the reconciler handles a failed deployment and does no placements
-func TestReconciler_FailedDeployment_NoPlacements(t *testing.T) {
+// Tests the reconciler handles a failed deployment and only replaces lost
+// allocations
+func TestReconciler_FailedDeployment_PlacementLost(t *testing.T) {
 	job := mock.Job()
 	job.TaskGroups[0].Update = noCanaryUpdate
 
@@ -2602,18 +2603,20 @@
 	assertResults(t, r, &resultExpectation{
 		createDeployment:  nil,
 		deploymentUpdates: nil,
-		place:             0,
+		place:             1, // Only replace the lost node
 		inplace:           0,
 		stop:              2,
 		followupEvalWait:  0, // Since the deployment is failed, there should be no followup
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
+				Place:  1,
 				Stop:   2,
 				Ignore: 8,
 			},
 		},
 	})
 
+	assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place))
 	assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
 }
 
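
Note for reviewers: the gating rule this patch adds can be read in isolation from the scheduler's internals. Below is a minimal, self-contained Go sketch of the same idea, namely place everything when the deployment is healthy, otherwise clamp placements to the number of lost allocations. The placement struct and selectPlacements helper are hypothetical stand-ins for illustration only, not the reconciler's actual types or API.

package main

import "fmt"

// placement is a hypothetical stand-in for the reconciler's placement type.
type placement struct {
	name string
}

// selectPlacements mirrors the rule added in computeGroup: if the deployment is
// neither paused nor failed and we are not in a canary state, every requested
// placement goes through; otherwise only enough placements to replace lost
// allocations are allowed.
func selectPlacements(place []placement, lostCount int, paused, failed, canary bool) []placement {
	deploymentPlaceReady := !paused && !failed && !canary
	if deploymentPlaceReady {
		return place
	}

	// Deployment is paused/failed (or canarying): clamp to the lost count so a
	// down node is still recovered without otherwise advancing the deployment.
	allowed := lostCount
	if len(place) < allowed {
		allowed = len(place)
	}
	return place[:allowed]
}

func main() {
	place := []placement{{"web[0]"}, {"web[1]"}, {"web[2]"}}

	// Healthy deployment: all three requested placements are made.
	fmt.Println(len(selectPlacements(place, 0, false, false, false))) // 3

	// Failed deployment with one lost alloc: only the lost alloc is replaced.
	fmt.Println(len(selectPlacements(place, 1, false, true, false))) // 1
}

Run as written, the sketch prints 3 for the healthy deployment and 1 for the failed deployment with a single lost allocation, which lines up with the place: 1 expectation in the updated test.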