Skip to content

Commit

Permalink
fix auto promote of canary task groups when deploying them alongside …
Browse files Browse the repository at this point in the history
…rolling deploy taskgroups that do not use the canary deployment system
  • Loading branch information
kainoaseto committed Jan 19, 2022
1 parent a0c0b80 commit b8db8f3
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 18 deletions.
9 changes: 8 additions & 1 deletion nomad/deploymentwatcher/deployment_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,9 +283,16 @@ func (w *deploymentWatcher) autoPromoteDeployment(allocs []*structs.AllocListStu
return nil
}

// AutoPromote iff every task group is marked auto_promote and is healthy. The whole
// AutoPromote iff every task group with canaries is marked auto_promote and is healthy. The whole
// job version has been incremented, so we promote together. See also AutoRevert
for _, dstate := range d.TaskGroups {

// Skip the auto-promote canary check for task groups that have no canaries;
// otherwise auto-promote hangs on deployments that mix canary and non-canary task groups.
if dstate.DesiredCanaries < 1 {
continue
}

if !dstate.AutoPromote || dstate.DesiredCanaries != len(dstate.PlacedCanaries) {
return nil
}
Expand Down
66 changes: 49 additions & 17 deletions nomad/deploymentwatcher/deployments_watcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -535,30 +535,40 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) {
w, m := defaultTestDeploymentWatcher(t)
now := time.Now()

// Create 1 UpdateStrategy, 1 job (1 TaskGroup), 2 canaries, and 1 deployment
upd := structs.DefaultUpdateStrategy.Copy()
upd.AutoPromote = true
upd.MaxParallel = 2
upd.Canary = 2
upd.ProgressDeadline = 5 * time.Second
// Create 2 UpdateStrategies (canary and rolling), 1 job (2 TaskGroups), 2 canaries, and 1 deployment
canaryUpd := structs.DefaultUpdateStrategy.Copy()
canaryUpd.AutoPromote = true
canaryUpd.MaxParallel = 2
canaryUpd.Canary = 2
canaryUpd.ProgressDeadline = 5 * time.Second

j := mock.Job()
j.TaskGroups[0].Update = upd
rollingUpd := structs.DefaultUpdateStrategy.Copy()
rollingUpd.ProgressDeadline = 5 * time.Second

j := mock.MultiTaskGroupJob()
j.TaskGroups[0].Update = canaryUpd
j.TaskGroups[1].Update = rollingUpd

d := mock.Deployment()
d.JobID = j.ID
// This is created in scheduler.computeGroup at runtime, where properties from the
// UpdateStrategy are copied in
d.TaskGroups = map[string]*structs.DeploymentState{
"web": {
AutoPromote: upd.AutoPromote,
AutoRevert: upd.AutoRevert,
ProgressDeadline: upd.ProgressDeadline,
AutoPromote: canaryUpd.AutoPromote,
AutoRevert: canaryUpd.AutoRevert,
ProgressDeadline: canaryUpd.ProgressDeadline,
DesiredTotal: 2,
},
"api": {
AutoPromote: rollingUpd.AutoPromote,
AutoRevert: rollingUpd.AutoRevert,
ProgressDeadline: rollingUpd.ProgressDeadline,
DesiredTotal: 2,
},
}

alloc := func() *structs.Allocation {
canaryAlloc := func() *structs.Allocation {
a := mock.Alloc()
a.DeploymentID = d.ID
a.CreateTime = now.UnixNano()
Expand All @@ -569,14 +579,36 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) {
return a
}

a := alloc()
b := alloc()
// rollingAlloc builds a mock allocation for the "api" task group that is
// attached to deployment d and explicitly marked as a non-canary.
rollingAlloc := func() *structs.Allocation {
a := mock.Alloc()
a.DeploymentID = d.ID
a.CreateTime = now.UnixNano()
a.ModifyTime = now.UnixNano()
a.TaskGroup = "api"
// Move the resource entries stored under the "web" key to "api" so they
// line up with the task group name set above.
a.AllocatedResources.Tasks["api"] = a.AllocatedResources.Tasks["web"].Copy()
delete(a.AllocatedResources.Tasks, "web")
a.TaskResources["api"] = a.TaskResources["web"].Copy()
delete(a.TaskResources, "web")
// Canary is false: this alloc stands in for a rolling (non-canary) placement.
a.DeploymentStatus = &structs.AllocDeploymentStatus{
Canary: false,
}
return a
}

// Web taskgroup (0)
a := canaryAlloc()
b := canaryAlloc()

// Api taskgroup (1)
c := rollingAlloc()
e := rollingAlloc()

d.TaskGroups[a.TaskGroup].PlacedCanaries = []string{a.ID, b.ID}
d.TaskGroups[a.TaskGroup].DesiredCanaries = 2
d.TaskGroups[c.TaskGroup].PlacedAllocs = 2
require.NoError(t, m.state.UpsertJob(structs.MsgTypeTestSetup, m.nextIndex(), j), "UpsertJob")
require.NoError(t, m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
require.NoError(t, m.state.UpsertAllocs(structs.MsgTypeTestSetup, m.nextIndex(), []*structs.Allocation{a, b}), "UpsertAllocs")
require.NoError(t, m.state.UpsertAllocs(structs.MsgTypeTestSetup, m.nextIndex(), []*structs.Allocation{a, b, c, e}), "UpsertAllocs")

// =============================================================
// Support method calls
Expand All @@ -595,7 +627,7 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) {

matchConfig1 := &matchDeploymentAllocHealthRequestConfig{
DeploymentID: d.ID,
Healthy: []string{a.ID, b.ID},
Healthy: []string{a.ID, b.ID, c.ID, e.ID},
Eval: true,
}
matcher1 := matchDeploymentAllocHealthRequest(matchConfig1)
Expand Down Expand Up @@ -629,7 +661,7 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) {
// Mark the canary allocs (a, b) and the rolling allocs (c, e) healthy
req := &structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
HealthyAllocationIDs: []string{a.ID, b.ID},
HealthyAllocationIDs: []string{a.ID, b.ID, c.ID, e.ID},
}
var resp structs.DeploymentUpdateResponse
// Calls w.raft.UpdateDeploymentAllocHealth, which is implemented by StateStore in
Expand Down
82 changes: 82 additions & 0 deletions nomad/mock/mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,88 @@ func Job() *structs.Job {
return job
}

func MultiTaskGroupJob() *structs.Job {
job := Job()
apiTaskGroup := &structs.TaskGroup{
Name: "api",
Count: 10,
EphemeralDisk: &structs.EphemeralDisk{
SizeMB: 150,
},
RestartPolicy: &structs.RestartPolicy{
Attempts: 3,
Interval: 10 * time.Minute,