From 0f61a8d034a7125cacf38c74309132a996ec10d8 Mon Sep 17 00:00:00 2001 From: kainoaseto Date: Tue, 18 Jan 2022 16:40:44 -0800 Subject: [PATCH] fix auto promote of canary task groups when deploying them alongside rolling deploy taskgroups that do not use the canary deployment system --- nomad/deploymentwatcher/deployment_watcher.go | 9 +- .../deployments_watcher_test.go | 66 +++++++++++---- nomad/mock/mock.go | 84 ++++++++++++++++++- 3 files changed, 140 insertions(+), 19 deletions(-) diff --git a/nomad/deploymentwatcher/deployment_watcher.go b/nomad/deploymentwatcher/deployment_watcher.go index f12357d15514..bb7bc1f52584 100644 --- a/nomad/deploymentwatcher/deployment_watcher.go +++ b/nomad/deploymentwatcher/deployment_watcher.go @@ -283,9 +283,16 @@ func (w *deploymentWatcher) autoPromoteDeployment(allocs []*structs.AllocListStu return nil } - // AutoPromote iff every task group is marked auto_promote and is healthy. The whole + // AutoPromote iff every task group with canaries is marked auto_promote and is healthy. The whole // job version has been incremented, so we promote together. 
See also AutoRevert for _, dstate := range d.TaskGroups { + + // skip auto promote canary validation if the task group has no canaries + // to prevent auto promote hanging on mixed canary/non-canary taskgroup deploys + if dstate.DesiredCanaries < 1 { + continue + } + if !dstate.AutoPromote || dstate.DesiredCanaries != len(dstate.PlacedCanaries) { return nil } diff --git a/nomad/deploymentwatcher/deployments_watcher_test.go b/nomad/deploymentwatcher/deployments_watcher_test.go index 64fc6a724a4c..0770320245f6 100644 --- a/nomad/deploymentwatcher/deployments_watcher_test.go +++ b/nomad/deploymentwatcher/deployments_watcher_test.go @@ -535,15 +535,19 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) { w, m := defaultTestDeploymentWatcher(t) now := time.Now() - // Create 1 UpdateStrategy, 1 job (1 TaskGroup), 2 canaries, and 1 deployment - upd := structs.DefaultUpdateStrategy.Copy() - upd.AutoPromote = true - upd.MaxParallel = 2 - upd.Canary = 2 - upd.ProgressDeadline = 5 * time.Second + // Create 2 UpdateStrategies (canary and rolling), 1 job (2 TaskGroups), 2 canaries, and 1 deployment + canaryUpd := structs.DefaultUpdateStrategy.Copy() + canaryUpd.AutoPromote = true + canaryUpd.MaxParallel = 2 + canaryUpd.Canary = 2 + canaryUpd.ProgressDeadline = 5 * time.Second - j := mock.Job() - j.TaskGroups[0].Update = upd + rollingUpd := structs.DefaultUpdateStrategy.Copy() + rollingUpd.ProgressDeadline = 5 * time.Second + + j := mock.MultiTaskGroupJob() + j.TaskGroups[0].Update = canaryUpd + j.TaskGroups[1].Update = rollingUpd d := mock.Deployment() d.JobID = j.ID @@ -551,14 +555,20 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) { // UpdateStrategy are copied in d.TaskGroups = map[string]*structs.DeploymentState{ "web": { - AutoPromote: upd.AutoPromote, - AutoRevert: upd.AutoRevert, - ProgressDeadline: upd.ProgressDeadline, + AutoPromote: canaryUpd.AutoPromote, + AutoRevert: canaryUpd.AutoRevert, + ProgressDeadline: canaryUpd.ProgressDeadline, DesiredTotal: 2, }, + "api": { 
AutoPromote: rollingUpd.AutoPromote, + AutoRevert: rollingUpd.AutoRevert, + ProgressDeadline: rollingUpd.ProgressDeadline, + DesiredTotal: 2, + }, } - alloc := func() *structs.Allocation { + canaryAlloc := func() *structs.Allocation { a := mock.Alloc() a.DeploymentID = d.ID a.CreateTime = now.UnixNano() @@ -569,14 +579,36 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) { return a } - a := alloc() - b := alloc() + rollingAlloc := func() *structs.Allocation { + a := mock.Alloc() + a.DeploymentID = d.ID + a.CreateTime = now.UnixNano() + a.ModifyTime = now.UnixNano() + a.TaskGroup = "api" + a.AllocatedResources.Tasks["api"] = a.AllocatedResources.Tasks["web"].Copy() + delete(a.AllocatedResources.Tasks, "web") + a.TaskResources["api"] = a.TaskResources["web"].Copy() + delete(a.TaskResources, "web") + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Canary: false, + } + return a + } + + // Web taskgroup (0) + a := canaryAlloc() + b := canaryAlloc() + + // Api taskgroup (1) + c := rollingAlloc() + e := rollingAlloc() d.TaskGroups[a.TaskGroup].PlacedCanaries = []string{a.ID, b.ID} d.TaskGroups[a.TaskGroup].DesiredCanaries = 2 + d.TaskGroups[c.TaskGroup].PlacedAllocs = 2 require.NoError(t, m.state.UpsertJob(structs.MsgTypeTestSetup, m.nextIndex(), j), "UpsertJob") require.NoError(t, m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment") - require.NoError(t, m.state.UpsertAllocs(structs.MsgTypeTestSetup, m.nextIndex(), []*structs.Allocation{a, b}), "UpsertAllocs") + require.NoError(t, m.state.UpsertAllocs(structs.MsgTypeTestSetup, m.nextIndex(), []*structs.Allocation{a, b, c, e}), "UpsertAllocs") // ============================================================= // Support method calls @@ -595,7 +627,7 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) { matchConfig1 := &matchDeploymentAllocHealthRequestConfig{ DeploymentID: d.ID, - Healthy: []string{a.ID, b.ID}, + Healthy: []string{a.ID, b.ID, c.ID, e.ID}, Eval: true, } matcher1 := 
matchDeploymentAllocHealthRequest(matchConfig1) @@ -629,7 +661,7 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) { // Mark the canaries healthy req := &structs.DeploymentAllocHealthRequest{ DeploymentID: d.ID, - HealthyAllocationIDs: []string{a.ID, b.ID}, + HealthyAllocationIDs: []string{a.ID, b.ID, c.ID, e.ID}, } var resp structs.DeploymentUpdateResponse // Calls w.raft.UpdateDeploymentAllocHealth, which is implemented by StateStore in diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 95886654624c..a2cac7266167 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -337,8 +337,90 @@ func Job() *structs.Job { return job } +func MultiTaskGroupJob() *structs.Job { + job := Job() + apiTaskGroup := &structs.TaskGroup{ + Name: "api", + Count: 10, + EphemeralDisk: &structs.EphemeralDisk{ + SizeMB: 150, + }, + RestartPolicy: &structs.RestartPolicy{ + Attempts: 3, + Interval: 10 * time.Minute, + Delay: 1 * time.Minute, + Mode: structs.RestartPolicyModeDelay, + }, + ReschedulePolicy: &structs.ReschedulePolicy{ + Attempts: 2, + Interval: 10 * time.Minute, + Delay: 5 * time.Second, + DelayFunction: "constant", + }, + Migrate: structs.DefaultMigrateStrategy(), + Networks: []*structs.NetworkResource{ + { + Mode: "host", + DynamicPorts: []structs.Port{ + {Label: "http"}, + {Label: "admin"}, + }, + }, + }, + Tasks: []*structs.Task{ + { + Name: "api", + Driver: "exec", + Config: map[string]interface{}{ + "command": "/bin/date", + }, + Env: map[string]string{ + "FOO": "bar", + }, + Services: []*structs.Service{ + { + Name: "${TASK}-backend", + PortLabel: "http", + Tags: []string{"pci:${meta.pci-dss}", "datacenter:${node.datacenter}"}, + Checks: []*structs.ServiceCheck{ + { + Name: "check-table", + Type: structs.ServiceCheckScript, + Command: "/usr/local/check-table-${meta.database}", + Args: []string{"${meta.version}"}, + Interval: 30 * time.Second, + Timeout: 5 * time.Second, + }, + }, + }, + { + Name: "${TASK}-admin", + PortLabel: "admin", + }, + }, + 
LogConfig: structs.DefaultLogConfig(), + Resources: &structs.Resources{ + CPU: 500, + MemoryMB: 256, + }, + Meta: map[string]string{ + "foo": "bar", + }, + }, + }, + Meta: map[string]string{ + "elb_check_type": "http", + "elb_check_interval": "30s", + "elb_check_min": "3", + }, + } + job.TaskGroups = append(job.TaskGroups, apiTaskGroup) + job.Canonicalize() + return job +} + func LifecycleSideTask(resources structs.Resources, i int) *structs.Task { - return &structs.Task{ + return &structs.Task{ Name: fmt.Sprintf("side-%d", i), Driver: "exec", Config: map[string]interface{}{