deploymentwatcher: reset progress deadline on promotion (#10042)
In a deployment with two groups (ex. A and B), if group A's canary becomes
healthy before group B's, the deadline for the overall deployment will be set
to that of group A. When the deployment is promoted, if group A is done it
will not contribute to the next deadline cutoff. Group B's old deadline will
be used instead, which will be in the past and immediately trigger a
deployment progress failure. Reset the progress deadline when the job is
promoted to avoid this bug, and to better conform with implicit user
expectations around how the progress deadline should interact with promotions.
tgross committed Feb 22, 2021
1 parent fcf81ab commit 174c206
Showing 6 changed files with 237 additions and 17 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -23,6 +23,7 @@ BUG FIXES:
 * consul/connect: Fixed a bug preventing more than one connect gateway per Nomad client [[GH-9849](https://github.com/hashicorp/nomad/pull/9849)]
 * consul/connect: Fixed a bug where connect sidecar services would be re-registered unnecessarily. [[GH-10059](https://github.com/hashicorp/nomad/pull/10059)]
 * consul/connect: Fixed a bug where the sidecar health checks would fail if `host_network` was defined. [[GH-9975](https://github.com/hashicorp/nomad/issues/9975)]
+* deployments: Fixed a bug where deployments with multiple task groups and manual promotion would fail if promoted after the progress deadline. [[GH-10042](https://github.com/hashicorp/nomad/issues/10042)]
 * drivers/docker: Fixed a bug preventing multiple ports to be mapped to the same container port [[GH-9951](https://github.com/hashicorp/nomad/issues/9951)]
 * driver/qemu: Fixed a bug where network namespaces were not supported for QEMU workloads [[GH-9861](https://github.com/hashicorp/nomad/pull/9861)]
 * nomad/structs: Fixed a bug where static ports with the same value but different `host_network` were invalid [[GH-9946](https://github.com/hashicorp/nomad/issues/9946)]
14 changes: 9 additions & 5 deletions nomad/deploymentwatcher/deployment_watcher.go
@@ -427,10 +427,13 @@ FAIL:
 			// This is the successful case, and we stop the loop
 			return
 		case <-deadlineTimer.C:
-			// We have hit the progress deadline so fail the deployment. We need
-			// to determine whether we should roll back the job by inspecting
-			// which allocs as part of the deployment are healthy and which
-			// aren't.
+			// We have hit the progress deadline, so fail the deployment
+			// unless we're waiting for manual promotion. We need to determine
+			// whether we should roll back the job by inspecting which allocs
+			// as part of the deployment are healthy and which aren't. The
+			// deadlineHit flag is never reset, so even in the case of a
+			// manual promotion, we'll describe any failure as a progress
+			// deadline failure at this point.
 			deadlineHit = true
 			fail, rback, err := w.shouldFail()
 			if err != nil {
Expand Down Expand Up @@ -475,6 +478,7 @@ FAIL:
 		// to avoid resetting, causing a deployment failure.
 		if !next.IsZero() {
 			deadlineTimer.Reset(time.Until(next))
+			w.logger.Trace("resetting deadline")
 		}
 	}

@@ -674,7 +678,7 @@ func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) {
 	}
 
 		// Unhealthy allocs and we need to autorevert
-		return true, true, nil
+		return fail, true, nil
 	}
 
 	return fail, false, nil
206 changes: 206 additions & 0 deletions nomad/deploymentwatcher/deployments_watcher_test.go
@@ -1334,6 +1334,212 @@ func TestDeploymentWatcher_PromotedCanary_UpdatedAllocs(t *testing.T) {
})
}

func TestDeploymentWatcher_ProgressDeadline_LatePromote(t *testing.T) {
t.Parallel()
require := require.New(t)
mtype := structs.MsgTypeTestSetup

w, m := defaultTestDeploymentWatcher(t)
w.SetEnabled(true, m.state)

m.On("UpdateDeploymentStatus", mocker.MatchedBy(func(args *structs.DeploymentStatusUpdateRequest) bool {
return true
})).Return(nil).Maybe()

progressTimeout := time.Millisecond * 1000
j := mock.Job()
j.TaskGroups[0].Name = "group1"
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.AutoRevert = false
j.TaskGroups[0].Update.ProgressDeadline = progressTimeout
j.TaskGroups = append(j.TaskGroups, j.TaskGroups[0].Copy())
j.TaskGroups[0].Name = "group2"

d := mock.Deployment()
d.JobID = j.ID
d.TaskGroups = map[string]*structs.DeploymentState{
"group1": {
ProgressDeadline: progressTimeout,
Promoted: false,
PlacedCanaries: []string{},
DesiredCanaries: 1,
DesiredTotal: 3,
PlacedAllocs: 0,
HealthyAllocs: 0,
UnhealthyAllocs: 0,
},
"group2": {
ProgressDeadline: progressTimeout,
Promoted: false,
PlacedCanaries: []string{},
DesiredCanaries: 1,
DesiredTotal: 1,
PlacedAllocs: 0,
HealthyAllocs: 0,
UnhealthyAllocs: 0,
},
}

require.NoError(m.state.UpsertJob(mtype, m.nextIndex(), j))
require.NoError(m.state.UpsertDeployment(m.nextIndex(), d))

// require that we get a call to UpsertDeploymentPromotion
matchConfig := &matchDeploymentPromoteRequestConfig{
Promotion: &structs.DeploymentPromoteRequest{
DeploymentID: d.ID,
All: true,
},
Eval: true,
}
matcher := matchDeploymentPromoteRequest(matchConfig)
m.On("UpdateDeploymentPromotion", mocker.MatchedBy(matcher)).Return(nil)
m1 := matchUpdateAllocDesiredTransitions([]string{d.ID})
m.On("UpdateAllocDesiredTransition", mocker.MatchedBy(m1)).Return(nil)

// create canaries

now := time.Now()

canary1 := mock.Alloc()
canary1.Job = j
canary1.DeploymentID = d.ID
canary1.TaskGroup = "group1"
canary1.DesiredStatus = structs.AllocDesiredStatusRun
canary1.ModifyTime = now.UnixNano()

canary2 := mock.Alloc()
canary2.Job = j
canary2.DeploymentID = d.ID
canary2.TaskGroup = "group2"
canary2.DesiredStatus = structs.AllocDesiredStatusRun
canary2.ModifyTime = now.UnixNano()

allocs := []*structs.Allocation{canary1, canary2}
err := m.state.UpsertAllocs(mtype, m.nextIndex(), allocs)
require.NoError(err)

// 2nd group's canary becomes healthy

now = time.Now()

canary2 = canary2.Copy()
canary2.ModifyTime = now.UnixNano()
canary2.DeploymentStatus = &structs.AllocDeploymentStatus{
Canary: true,
Healthy: helper.BoolToPtr(true),
Timestamp: now,
}

allocs = []*structs.Allocation{canary2}
err = m.state.UpdateAllocsFromClient(mtype, m.nextIndex(), allocs)
require.NoError(err)

// wait for long enough to ensure we read deployment update channel
// this sleep creates the race condition associated with #7058
time.Sleep(50 * time.Millisecond)

// 1st group's canary becomes healthy
now = time.Now()

canary1 = canary1.Copy()
canary1.ModifyTime = now.UnixNano()
canary1.DeploymentStatus = &structs.AllocDeploymentStatus{
Canary: true,
Healthy: helper.BoolToPtr(true),
Timestamp: now,
}

allocs = []*structs.Allocation{canary1}
err = m.state.UpdateAllocsFromClient(mtype, m.nextIndex(), allocs)
require.NoError(err)

// ensure progress_deadline has definitely expired
time.Sleep(progressTimeout)

// promote the deployment

req := &structs.DeploymentPromoteRequest{
DeploymentID: d.ID,
All: true,
}
err = w.PromoteDeployment(req, &structs.DeploymentUpdateResponse{})
require.NoError(err)

// wait for long enough to ensure we read deployment update channel
time.Sleep(50 * time.Millisecond)

// create new allocs for promoted deployment
// (these come from plan_apply, not a client update)
now = time.Now()

alloc1a := mock.Alloc()
alloc1a.Job = j
alloc1a.DeploymentID = d.ID
alloc1a.TaskGroup = "group1"
alloc1a.ClientStatus = structs.AllocClientStatusPending
alloc1a.DesiredStatus = structs.AllocDesiredStatusRun
alloc1a.ModifyTime = now.UnixNano()

alloc1b := mock.Alloc()
alloc1b.Job = j
alloc1b.DeploymentID = d.ID
alloc1b.TaskGroup = "group1"
alloc1b.ClientStatus = structs.AllocClientStatusPending
alloc1b.DesiredStatus = structs.AllocDesiredStatusRun
alloc1b.ModifyTime = now.UnixNano()

allocs = []*structs.Allocation{alloc1a, alloc1b}
err = m.state.UpsertAllocs(mtype, m.nextIndex(), allocs)
require.NoError(err)

// allocs become healthy

now = time.Now()

alloc1a = alloc1a.Copy()
alloc1a.ClientStatus = structs.AllocClientStatusRunning
alloc1a.ModifyTime = now.UnixNano()
alloc1a.DeploymentStatus = &structs.AllocDeploymentStatus{
Canary: false,
Healthy: helper.BoolToPtr(true),
Timestamp: now,
}

alloc1b = alloc1b.Copy()
alloc1b.ClientStatus = structs.AllocClientStatusRunning
alloc1b.ModifyTime = now.UnixNano()
alloc1b.DeploymentStatus = &structs.AllocDeploymentStatus{
Canary: false,
Healthy: helper.BoolToPtr(true),
Timestamp: now,
}

allocs = []*structs.Allocation{alloc1a, alloc1b}
err = m.state.UpdateAllocsFromClient(mtype, m.nextIndex(), allocs)
require.NoError(err)

// ensure any progress deadline has expired
time.Sleep(progressTimeout)

// without a scheduler running we'll never mark the deployment as
// successful, so test that healthy == desired and that we haven't failed
deployment, err := m.state.DeploymentByID(nil, d.ID)
require.NoError(err)
require.Equal(structs.DeploymentStatusRunning, deployment.Status)

group1 := deployment.TaskGroups["group1"]

require.Equal(group1.DesiredTotal, group1.HealthyAllocs, "not enough healthy")
require.Equal(group1.DesiredTotal, group1.PlacedAllocs, "not enough placed")
require.Equal(0, group1.UnhealthyAllocs)

group2 := deployment.TaskGroups["group2"]
require.Equal(group2.DesiredTotal, group2.HealthyAllocs, "not enough healthy")
require.Equal(group2.DesiredTotal, group2.PlacedAllocs, "not enough placed")
require.Equal(0, group2.UnhealthyAllocs)
}

// Test scenario where deployment initially has no progress deadline
// After the deployment is updated, a failed alloc's DesiredTransition should be set
func TestDeploymentWatcher_Watch_StartWithoutProgressDeadline(t *testing.T) {
4 changes: 4 additions & 0 deletions nomad/state/state_store.go
@@ -4046,6 +4046,10 @@ func (s *StateStore) UpdateDeploymentPromotion(msgType structs.MessageType, inde
 			continue
 		}
 
+		// reset the progress deadline
+		if status.ProgressDeadline > 0 && !status.RequireProgressBy.IsZero() {
+			status.RequireProgressBy = time.Now().Add(status.ProgressDeadline)
+		}
 		status.Promoted = true
 	}
 
8 changes: 5 additions & 3 deletions nomad/structs/structs.go
@@ -8776,11 +8776,13 @@ type DeploymentState struct {
 	AutoPromote bool
 
 	// ProgressDeadline is the deadline by which an allocation must transition
-	// to healthy before the deployment is considered failed.
+	// to healthy before the deployment is considered failed. This value is set
+	// by the jobspec `update.progress_deadline` field.
 	ProgressDeadline time.Duration
 
-	// RequireProgressBy is the time by which an allocation must transition
-	// to healthy before the deployment is considered failed.
+	// RequireProgressBy is the time by which an allocation must transition to
+	// healthy before the deployment is considered failed. This value is reset
+	// to "now" + ProgressDeadline when an allocation updates the deployment.
 	RequireProgressBy time.Time
 
 	// Promoted marks whether the canaries have been promoted
21 changes: 12 additions & 9 deletions website/content/docs/job-specification/update.mdx
@@ -83,20 +83,23 @@ a future release.
 - `progress_deadline` `(string: "10m")` - Specifies the deadline in which an
   allocation must be marked as healthy. The deadline begins when the first
   allocation for the deployment is created and is reset whenever an allocation
-  as part of the deployment transitions to a healthy state. If no allocation
-  transitions to the healthy state before the progress deadline, the deployment
-  is marked as failed. If the `progress_deadline` is set to `0`, the first
-  allocation to be marked as unhealthy causes the deployment to fail. This is
-  specified using a label suffix like "2m" or "1h".
+  as part of the deployment transitions to a healthy state or when a
+  deployment is manually promoted. If no allocation transitions to the healthy
+  state before the progress deadline, the deployment is marked as failed. If
+  the `progress_deadline` is set to `0`, the first allocation to be marked as
+  unhealthy causes the deployment to fail. This is specified using a label
+  suffix like "2m" or "1h".
 
 - `auto_revert` `(bool: false)` - Specifies if the job should auto-revert to the
   last stable job on deployment failure. A job is marked as stable if all the
   allocations as part of its deployment were marked healthy.
 
-- `auto_promote` `(bool: false)` - Specifies if the job should auto-promote to the
-  canary version when all canaries become healthy during a deployment. Defaults to
-  false which means canaries must be manually updated with the `nomad deployment promote`
-  command.
+- `auto_promote` `(bool: false)` - Specifies if the job should auto-promote to
+  the canary version when all canaries become healthy during a
+  deployment. Defaults to false which means canaries must be manually updated
+  with the `nomad deployment promote` command. If a job has multiple task
+  groups, all must be set to `auto_promote = true` in order for the deployment
+  to be promoted automatically.
 
 - `canary` `(int: 0)` - Specifies that changes to the job that would result in
   destructive updates should create the specified number of canaries without
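The documented behavior can be exercised with an `update` stanza like the following (an illustrative jobspec fragment, not part of this commit; job and group names are made up):

```hcl
job "web" {
  group "api" {
    count = 3

    update {
      max_parallel      = 2
      canary            = 1
      progress_deadline = "10m"
      auto_revert       = false
      # Leave auto_promote false to promote manually with:
      #   nomad deployment promote <deployment-id>
      auto_promote = false
    }
  }
}
```

With this commit's fix, running `nomad deployment promote` after the 10-minute deadline has lapsed no longer fails the deployment, because promotion resets each group's progress cutoff.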
