diff --git a/.changelog/11878.txt b/.changelog/11878.txt new file mode 100644 index 000000000000..7023c3c3a73e --- /dev/null +++ b/.changelog/11878.txt @@ -0,0 +1,3 @@ +```release-note:bug +core: Fixed auto-promotion of canaries in jobs with at least one task group without canaries. +``` diff --git a/nomad/deploymentwatcher/deployment_watcher.go b/nomad/deploymentwatcher/deployment_watcher.go index f12357d15514..bb7bc1f52584 100644 --- a/nomad/deploymentwatcher/deployment_watcher.go +++ b/nomad/deploymentwatcher/deployment_watcher.go @@ -283,9 +283,16 @@ func (w *deploymentWatcher) autoPromoteDeployment(allocs []*structs.AllocListStu return nil } - // AutoPromote iff every task group is marked auto_promote and is healthy. The whole + // AutoPromote iff every task group with canaries is marked auto_promote and is healthy. The whole // job version has been incremented, so we promote together. See also AutoRevert for _, dstate := range d.TaskGroups { + + // skip auto promote canary validation if the task group has no canaries + // to prevent auto promote hanging on mixed canary/non-canary taskgroup deploys + if dstate.DesiredCanaries < 1 { + continue + } + if !dstate.AutoPromote || dstate.DesiredCanaries != len(dstate.PlacedCanaries) { return nil } diff --git a/nomad/deploymentwatcher/deployments_watcher_test.go b/nomad/deploymentwatcher/deployments_watcher_test.go index 64fc6a724a4c..50835b74cc56 100644 --- a/nomad/deploymentwatcher/deployments_watcher_test.go +++ b/nomad/deploymentwatcher/deployments_watcher_test.go @@ -535,15 +535,19 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) { w, m := defaultTestDeploymentWatcher(t) now := time.Now() - // Create 1 UpdateStrategy, 1 job (1 TaskGroup), 2 canaries, and 1 deployment - upd := structs.DefaultUpdateStrategy.Copy() - upd.AutoPromote = true - upd.MaxParallel = 2 - upd.Canary = 2 - upd.ProgressDeadline = 5 * time.Second + // Create 1 UpdateStrategy, 1 job (2 TaskGroups), 2 canaries, and 1 deployment + canaryUpd := structs.DefaultUpdateStrategy.Copy() + canaryUpd.AutoPromote = true + canaryUpd.MaxParallel = 2 + canaryUpd.Canary = 2 + canaryUpd.ProgressDeadline = 5 * time.Second - j := mock.Job() - j.TaskGroups[0].Update = upd + rollingUpd := structs.DefaultUpdateStrategy.Copy() + rollingUpd.ProgressDeadline = 5 * time.Second + + j := mock.MultiTaskGroupJob() + j.TaskGroups[0].Update = canaryUpd + j.TaskGroups[1].Update = rollingUpd d := mock.Deployment() d.JobID = j.ID @@ -551,14 +555,20 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) { // UpdateStrategy are copied in d.TaskGroups = map[string]*structs.DeploymentState{ "web": { - AutoPromote: upd.AutoPromote, - AutoRevert: upd.AutoRevert, - ProgressDeadline: upd.ProgressDeadline, + AutoPromote: canaryUpd.AutoPromote, + AutoRevert: canaryUpd.AutoRevert, + ProgressDeadline: canaryUpd.ProgressDeadline, + DesiredTotal: 2, + }, + "api": { + AutoPromote: rollingUpd.AutoPromote, + AutoRevert: rollingUpd.AutoRevert, + ProgressDeadline: rollingUpd.ProgressDeadline, DesiredTotal: 2, }, } - alloc := func() *structs.Allocation { + canaryAlloc := func() *structs.Allocation { a := mock.Alloc() a.DeploymentID = d.ID a.CreateTime = now.UnixNano() @@ -569,14 +579,36 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) { return a } - a := alloc() - b := alloc() + rollingAlloc := func() *structs.Allocation { + a := mock.Alloc() + a.DeploymentID = d.ID + a.CreateTime = now.UnixNano() + a.ModifyTime = now.UnixNano() + a.TaskGroup = "api" + a.AllocatedResources.Tasks["api"] = a.AllocatedResources.Tasks["web"].Copy() + delete(a.AllocatedResources.Tasks, "web") + a.TaskResources["api"] = a.TaskResources["web"].Copy() + delete(a.TaskResources, "web") + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Canary: false, + } + return a + } - d.TaskGroups[a.TaskGroup].PlacedCanaries = []string{a.ID, b.ID} - d.TaskGroups[a.TaskGroup].DesiredCanaries = 2 + // Web taskgroup (0) + ca1 := canaryAlloc() + ca2 := canaryAlloc() + + // Api taskgroup (1) + ra1 := rollingAlloc() + ra2 := rollingAlloc() + + d.TaskGroups[ca1.TaskGroup].PlacedCanaries = []string{ca1.ID, ca2.ID} + d.TaskGroups[ca1.TaskGroup].DesiredCanaries = 2 + d.TaskGroups[ra1.TaskGroup].PlacedAllocs = 2 require.NoError(t, m.state.UpsertJob(structs.MsgTypeTestSetup, m.nextIndex(), j), "UpsertJob") require.NoError(t, m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment") - require.NoError(t, m.state.UpsertAllocs(structs.MsgTypeTestSetup, m.nextIndex(), []*structs.Allocation{a, b}), "UpsertAllocs") + require.NoError(t, m.state.UpsertAllocs(structs.MsgTypeTestSetup, m.nextIndex(), []*structs.Allocation{ca1, ca2, ra1, ra2}), "UpsertAllocs") // ============================================================= // Support method calls @@ -595,7 +627,7 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) { matchConfig1 := &matchDeploymentAllocHealthRequestConfig{ DeploymentID: d.ID, - Healthy: []string{a.ID, b.ID}, + Healthy: []string{ca1.ID, ca2.ID, ra1.ID, ra2.ID}, Eval: true, } matcher1 := matchDeploymentAllocHealthRequest(matchConfig1) @@ -629,7 +661,7 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) { // Mark the canaries healthy req := &structs.DeploymentAllocHealthRequest{ DeploymentID: d.ID, - HealthyAllocationIDs: []string{a.ID, b.ID}, + HealthyAllocationIDs: []string{ca1.ID, ca2.ID, ra1.ID, ra2.ID}, } var resp structs.DeploymentUpdateResponse // Calls w.raft.UpdateDeploymentAllocHealth, which is implemented by StateStore in @@ -654,12 +686,12 @@ func TestWatcher_AutoPromoteDeployment(t *testing.T) { require.Equal(t, "running", d.Status) require.True(t, d.TaskGroups["web"].Promoted) - a1, _ := m.state.AllocByID(ws, a.ID) + a1, _ := m.state.AllocByID(ws, ca1.ID) require.False(t, a1.DeploymentStatus.Canary) require.Equal(t, "pending", a1.ClientStatus) require.Equal(t, "run", a1.DesiredStatus) - b1, _ := m.state.AllocByID(ws, b.ID) + b1, _ := m.state.AllocByID(ws, ca2.ID) require.False(t, b1.DeploymentStatus.Canary) } diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 95886654624c..beda95a7cf16 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -337,6 +337,88 @@ func Job() *structs.Job { return job } +func MultiTaskGroupJob() *structs.Job { + job := Job() + apiTaskGroup := &structs.TaskGroup{ + Name: "api", + Count: 10, + EphemeralDisk: &structs.EphemeralDisk{ + SizeMB: 150, + }, + RestartPolicy: &structs.RestartPolicy{ + Attempts: 3, + Interval: 10 * time.Minute, + Delay: 1 * time.Minute, + Mode: structs.RestartPolicyModeDelay, + }, + ReschedulePolicy: &structs.ReschedulePolicy{ + Attempts: 2, + Interval: 10 * time.Minute, + Delay: 5 * time.Second, + DelayFunction: "constant", + }, + Migrate: structs.DefaultMigrateStrategy(), + Networks: []*structs.NetworkResource{ + { + Mode: "host", + DynamicPorts: []structs.Port{ + {Label: "http"}, + {Label: "admin"}, + }, + }, + }, + Tasks: []*structs.Task{ + { + Name: "api", + Driver: "exec", + Config: map[string]interface{}{ + "command": "/bin/date", + }, + Env: map[string]string{ + "FOO": "bar", + }, + Services: []*structs.Service{ + { + Name: "${TASK}-backend", + PortLabel: "http", + Tags: []string{"pci:${meta.pci-dss}", "datacenter:${node.datacenter}"}, + Checks: []*structs.ServiceCheck{ + { + Name: "check-table", + Type: structs.ServiceCheckScript, + Command: "/usr/local/check-table-${meta.database}", + Args: []string{"${meta.version}"}, + Interval: 30 * time.Second, + Timeout: 5 * time.Second, + }, + }, + }, + { + Name: "${TASK}-admin", + PortLabel: "admin", + }, + }, + LogConfig: structs.DefaultLogConfig(), + Resources: &structs.Resources{ + CPU: 500, + MemoryMB: 256, + }, + Meta: map[string]string{ + "foo": "bar", + }, + }, + }, + Meta: map[string]string{ + "elb_check_type": "http", + "elb_check_interval": "30s", + "elb_check_min": "3", + }, + } + job.TaskGroups = append(job.TaskGroups, apiTaskGroup) + job.Canonicalize() + return job +} + func LifecycleSideTask(resources structs.Resources, i int) *structs.Task { return &structs.Task{ Name: fmt.Sprintf("side-%d", i),