deploymentwatcher: reset progress deadline on promotion #10042

Merged · 4 commits · Feb 22, 2021
Changes from all commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -23,6 +23,7 @@ BUG FIXES:
* consul/connect: Fixed a bug preventing more than one connect gateway per Nomad client [[GH-9849](https://github.com/hashicorp/nomad/pull/9849)]
* consul/connect: Fixed a bug where connect sidecar services would be re-registered unnecessarily. [[GH-10059](https://github.com/hashicorp/nomad/pull/10059)]
* consul/connect: Fixed a bug where the sidecar health checks would fail if `host_network` was defined. [[GH-9975](https://github.com/hashicorp/nomad/issues/9975)]
+* deployments: Fixed a bug where deployments with multiple task groups and manual promotion would fail if promoted after the progress deadline. [[GH-10042](https://github.com/hashicorp/nomad/issues/10042)]
* drivers/docker: Fixed a bug preventing multiple ports to be mapped to the same container port [[GH-9951](https://github.com/hashicorp/nomad/issues/9951)]
* driver/qemu: Fixed a bug where network namespaces were not supported for QEMU workloads [[GH-9861](https://github.com/hashicorp/nomad/pull/9861)]
* nomad/structs: Fixed a bug where static ports with the same value but different `host_network` were invalid [[GH-9946](https://github.com/hashicorp/nomad/issues/9946)]
14 changes: 9 additions & 5 deletions nomad/deploymentwatcher/deployment_watcher.go
@@ -427,10 +427,13 @@ FAIL:
// This is the successful case, and we stop the loop
return
case <-deadlineTimer.C:
-	// We have hit the progress deadline so fail the deployment. We need
-	// to determine whether we should roll back the job by inspecting
-	// which allocs as part of the deployment are healthy and which
-	// aren't.
+	// We have hit the progress deadline, so fail the deployment
+	// unless we're waiting for manual promotion. We need to determine
+	// whether we should roll back the job by inspecting which allocs
+	// as part of the deployment are healthy and which aren't. The
+	// deadlineHit flag is never reset, so even in the case of a
+	// manual promotion, we'll describe any failure as a progress
+	// deadline failure at this point.
deadlineHit = true
fail, rback, err := w.shouldFail()
if err != nil {
@@ -475,6 +478,7 @@
// to avoid resetting, causing a deployment failure.
if !next.IsZero() {
deadlineTimer.Reset(time.Until(next))
+	w.logger.Trace("resetting deadline")
}
}

@@ -674,7 +678,7 @@ func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) {
}

// Unhealthy allocs and we need to autorevert
-	return true, true, nil
+	return fail, true, nil
}

return fail, false, nil
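To make the watcher's timer mechanics concrete, here is a minimal, self-contained sketch of the pattern the hunks above modify: a deadline timer that fails the deployment when it fires, and that is pushed out whenever an update carries a later `RequireProgressBy` time. The `watch` function and `updates` channel are illustrative stand-ins, not the actual `deploymentwatcher` API.

```go
package main

import (
	"fmt"
	"time"
)

// watch fails once the deadline timer fires before any progress update
// arrives. Each update carrying a non-zero "require progress by" time
// pushes the deadline out, which is what a manual promotion now does via
// the state store resetting RequireProgressBy.
func watch(updates <-chan time.Time, deadline time.Duration) {
	deadlineTimer := time.NewTimer(deadline)
	defer deadlineTimer.Stop()

	for {
		select {
		case <-deadlineTimer.C:
			fmt.Println("progress deadline hit: deployment would fail here")
			return
		case next := <-updates:
			// Same guard as the watcher: never reset on a zero time,
			// since time.Until(zero) is negative and would fire the
			// timer immediately, failing the deployment.
			if !next.IsZero() {
				deadlineTimer.Reset(time.Until(next))
				fmt.Println("deadline pushed out to", next.Format(time.RFC3339Nano))
			}
		}
	}
}

func main() {
	updates := make(chan time.Time)
	go func() {
		// Simulate a promotion arriving halfway through the deadline.
		time.Sleep(50 * time.Millisecond)
		updates <- time.Now().Add(200 * time.Millisecond)
	}()
	watch(updates, 100*time.Millisecond)
}
```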
206 changes: 206 additions & 0 deletions nomad/deploymentwatcher/deployments_watcher_test.go
@@ -1334,6 +1334,212 @@ func TestDeploymentWatcher_PromotedCanary_UpdatedAllocs(t *testing.T) {
})
}

func TestDeploymentWatcher_ProgressDeadline_LatePromote(t *testing.T) {
t.Parallel()
require := require.New(t)
mtype := structs.MsgTypeTestSetup

w, m := defaultTestDeploymentWatcher(t)
w.SetEnabled(true, m.state)

m.On("UpdateDeploymentStatus", mocker.MatchedBy(func(args *structs.DeploymentStatusUpdateRequest) bool {
return true
})).Return(nil).Maybe()

progressTimeout := time.Millisecond * 1000
j := mock.Job()
j.TaskGroups[0].Name = "group1"
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.AutoRevert = false
j.TaskGroups[0].Update.ProgressDeadline = progressTimeout
j.TaskGroups = append(j.TaskGroups, j.TaskGroups[0].Copy())
j.TaskGroups[1].Name = "group2"

d := mock.Deployment()
d.JobID = j.ID
d.TaskGroups = map[string]*structs.DeploymentState{
"group1": {
ProgressDeadline: progressTimeout,
Promoted: false,
PlacedCanaries: []string{},
DesiredCanaries: 1,
DesiredTotal: 3,
PlacedAllocs: 0,
HealthyAllocs: 0,
UnhealthyAllocs: 0,
},
"group2": {
ProgressDeadline: progressTimeout,
Promoted: false,
PlacedCanaries: []string{},
DesiredCanaries: 1,
DesiredTotal: 1,
PlacedAllocs: 0,
HealthyAllocs: 0,
UnhealthyAllocs: 0,
},
}

require.NoError(m.state.UpsertJob(mtype, m.nextIndex(), j))
require.NoError(m.state.UpsertDeployment(m.nextIndex(), d))

// require that we get a call to UpdateDeploymentPromotion
matchConfig := &matchDeploymentPromoteRequestConfig{
Promotion: &structs.DeploymentPromoteRequest{
DeploymentID: d.ID,
All: true,
},
Eval: true,
}
matcher := matchDeploymentPromoteRequest(matchConfig)
m.On("UpdateDeploymentPromotion", mocker.MatchedBy(matcher)).Return(nil)
m1 := matchUpdateAllocDesiredTransitions([]string{d.ID})
m.On("UpdateAllocDesiredTransition", mocker.MatchedBy(m1)).Return(nil)

// create canaries

now := time.Now()

canary1 := mock.Alloc()
canary1.Job = j
canary1.DeploymentID = d.ID
canary1.TaskGroup = "group1"
canary1.DesiredStatus = structs.AllocDesiredStatusRun
canary1.ModifyTime = now.UnixNano()

canary2 := mock.Alloc()
canary2.Job = j
canary2.DeploymentID = d.ID
canary2.TaskGroup = "group2"
canary2.DesiredStatus = structs.AllocDesiredStatusRun
canary2.ModifyTime = now.UnixNano()

allocs := []*structs.Allocation{canary1, canary2}
err := m.state.UpsertAllocs(mtype, m.nextIndex(), allocs)
require.NoError(err)

// 2nd group's canary becomes healthy

now = time.Now()

canary2 = canary2.Copy()
canary2.ModifyTime = now.UnixNano()
canary2.DeploymentStatus = &structs.AllocDeploymentStatus{
Canary: true,
Healthy: helper.BoolToPtr(true),
Timestamp: now,
}

allocs = []*structs.Allocation{canary2}
err = m.state.UpdateAllocsFromClient(mtype, m.nextIndex(), allocs)
require.NoError(err)

// wait for long enough to ensure we read deployment update channel
// this sleep creates the race condition associated with #7058
time.Sleep(50 * time.Millisecond)

// 1st group's canary becomes healthy
now = time.Now()

canary1 = canary1.Copy()
canary1.ModifyTime = now.UnixNano()
canary1.DeploymentStatus = &structs.AllocDeploymentStatus{
Canary: true,
Healthy: helper.BoolToPtr(true),
Timestamp: now,
}

allocs = []*structs.Allocation{canary1}
err = m.state.UpdateAllocsFromClient(mtype, m.nextIndex(), allocs)
require.NoError(err)

// ensure progress_deadline has definitely expired
time.Sleep(progressTimeout)

// promote the deployment

req := &structs.DeploymentPromoteRequest{
DeploymentID: d.ID,
All: true,
}
err = w.PromoteDeployment(req, &structs.DeploymentUpdateResponse{})
require.NoError(err)

// wait for long enough to ensure we read deployment update channel
time.Sleep(50 * time.Millisecond)

// create new allocs for promoted deployment
// (these come from plan_apply, not a client update)
now = time.Now()

alloc1a := mock.Alloc()
alloc1a.Job = j
alloc1a.DeploymentID = d.ID
alloc1a.TaskGroup = "group1"
alloc1a.ClientStatus = structs.AllocClientStatusPending
alloc1a.DesiredStatus = structs.AllocDesiredStatusRun
alloc1a.ModifyTime = now.UnixNano()

alloc1b := mock.Alloc()
alloc1b.Job = j
alloc1b.DeploymentID = d.ID
alloc1b.TaskGroup = "group1"
alloc1b.ClientStatus = structs.AllocClientStatusPending
alloc1b.DesiredStatus = structs.AllocDesiredStatusRun
alloc1b.ModifyTime = now.UnixNano()

allocs = []*structs.Allocation{alloc1a, alloc1b}
err = m.state.UpsertAllocs(mtype, m.nextIndex(), allocs)
require.NoError(err)

// allocs become healthy

now = time.Now()

alloc1a = alloc1a.Copy()
alloc1a.ClientStatus = structs.AllocClientStatusRunning
alloc1a.ModifyTime = now.UnixNano()
alloc1a.DeploymentStatus = &structs.AllocDeploymentStatus{
Canary: false,
Healthy: helper.BoolToPtr(true),
Timestamp: now,
}

alloc1b = alloc1b.Copy()
alloc1b.ClientStatus = structs.AllocClientStatusRunning
alloc1b.ModifyTime = now.UnixNano()
alloc1b.DeploymentStatus = &structs.AllocDeploymentStatus{
Canary: false,
Healthy: helper.BoolToPtr(true),
Timestamp: now,
}

allocs = []*structs.Allocation{alloc1a, alloc1b}
err = m.state.UpdateAllocsFromClient(mtype, m.nextIndex(), allocs)
require.NoError(err)

// ensure any progress deadline has expired
time.Sleep(progressTimeout)

// without a scheduler running we'll never mark the deployment as
// successful, so test that healthy == desired and that we haven't failed
deployment, err := m.state.DeploymentByID(nil, d.ID)
require.NoError(err)
require.Equal(structs.DeploymentStatusRunning, deployment.Status)

group1 := deployment.TaskGroups["group1"]

require.Equal(group1.DesiredTotal, group1.HealthyAllocs, "not enough healthy")
require.Equal(group1.DesiredTotal, group1.PlacedAllocs, "not enough placed")
require.Equal(0, group1.UnhealthyAllocs)

group2 := deployment.TaskGroups["group2"]
require.Equal(group2.DesiredTotal, group2.HealthyAllocs, "not enough healthy")
require.Equal(group2.DesiredTotal, group2.PlacedAllocs, "not enough placed")
require.Equal(0, group2.UnhealthyAllocs)
}

// Test scenario where deployment initially has no progress deadline
// After the deployment is updated, a failed alloc's DesiredTransition should be set
func TestDeploymentWatcher_Watch_StartWithoutProgressDeadline(t *testing.T) {
4 changes: 4 additions & 0 deletions nomad/state/state_store.go
@@ -4046,6 +4046,10 @@ func (s *StateStore) UpdateDeploymentPromotion(msgType structs.MessageType, inde
continue
}

+	// reset the progress deadline
+	if status.ProgressDeadline > 0 && !status.RequireProgressBy.IsZero() {
+		status.RequireProgressBy = time.Now().Add(status.ProgressDeadline)
+	}
status.Promoted = true
}

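For illustration, this sketch shows the effect of the guard added above when a deployment is promoted after its deadline has already passed. The `deploymentState` type is a pared-down, hypothetical stand-in for `structs.DeploymentState`, keeping only the two fields the promotion path touches.

```go
package main

import (
	"fmt"
	"time"
)

// deploymentState stands in for structs.DeploymentState; only the two
// fields touched by the promotion path are kept.
type deploymentState struct {
	ProgressDeadline  time.Duration
	RequireProgressBy time.Time
}

func main() {
	status := &deploymentState{
		ProgressDeadline: 10 * time.Minute,
		// A deadline that expired a minute ago, as with a late manual
		// promotion.
		RequireProgressBy: time.Now().Add(-1 * time.Minute),
	}

	// Same logic as the change above: only reset when a progress deadline
	// is configured and a previous deadline was actually set.
	if status.ProgressDeadline > 0 && !status.RequireProgressBy.IsZero() {
		status.RequireProgressBy = time.Now().Add(status.ProgressDeadline)
	}

	fmt.Println("require progress by:", status.RequireProgressBy)
}
```

With the guard in place, a late promotion yields a fresh `ProgressDeadline` window measured from the moment of promotion, instead of leaving the deployment stuck past an expired deadline.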
8 changes: 5 additions & 3 deletions nomad/structs/structs.go
@@ -8776,11 +8776,13 @@ type DeploymentState struct {
AutoPromote bool

// ProgressDeadline is the deadline by which an allocation must transition
-	// to healthy before the deployment is considered failed.
+	// to healthy before the deployment is considered failed. This value is set
+	// by the jobspec `update.progress_deadline` field.
ProgressDeadline time.Duration

-	// RequireProgressBy is the time by which an allocation must transition
-	// to healthy before the deployment is considered failed.
+	// RequireProgressBy is the time by which an allocation must transition to
+	// healthy before the deployment is considered failed. This value is reset
+	// to "now" + ProgressDeadline when an allocation updates the deployment.
RequireProgressBy time.Time

// Promoted marks whether the canaries have been promoted
21 changes: 12 additions & 9 deletions website/content/docs/job-specification/update.mdx
@@ -83,20 +83,23 @@ a future release.
- `progress_deadline` `(string: "10m")` - Specifies the deadline in which an
allocation must be marked as healthy. The deadline begins when the first
allocation for the deployment is created and is reset whenever an allocation
-  as part of the deployment transitions to a healthy state. If no allocation
-  transitions to the healthy state before the progress deadline, the deployment
-  is marked as failed. If the `progress_deadline` is set to `0`, the first
-  allocation to be marked as unhealthy causes the deployment to fail. This is
-  specified using a label suffix like "2m" or "1h".
+  as part of the deployment transitions to a healthy state or when a
+  deployment is manually promoted. If no allocation transitions to the healthy
+  state before the progress deadline, the deployment is marked as failed. If
+  the `progress_deadline` is set to `0`, the first allocation to be marked as
+  unhealthy causes the deployment to fail. This is specified using a label
+  suffix like "2m" or "1h".

- `auto_revert` `(bool: false)` - Specifies if the job should auto-revert to the
last stable job on deployment failure. A job is marked as stable if all the
allocations as part of its deployment were marked healthy.

-- `auto_promote` `(bool: false)` - Specifies if the job should auto-promote to the
-  canary version when all canaries become healthy during a deployment. Defaults to
-  false which means canaries must be manually updated with the `nomad deployment promote`
-  command.
+- `auto_promote` `(bool: false)` - Specifies if the job should auto-promote to
+  the canary version when all canaries become healthy during a
+  deployment. Defaults to false which means canaries must be manually promoted
+  with the `nomad deployment promote` command. If a job has multiple task
+  groups, all must be set to `auto_promote = true` in order for the deployment
+  to be promoted automatically.
> **Member Author:** Note for reviewers: this isn't a behavior change, just something in the vicinity of this problem that wasn't actually documented.


- `canary` `(int: 0)` - Specifies that changes to the job that would result in
destructive updates should create the specified number of canaries without