From 492d62d3df296b5e869dab100d9552cf192f515f Mon Sep 17 00:00:00 2001 From: Drew Bailey Date: Tue, 12 Jan 2021 13:03:48 -0500 Subject: [PATCH] ignore poststop task in alloc health tracker (#9548), fixes #9361 * investigating where to ignore poststop task in alloc health tracker * ignore poststop when setting latest start time for allocation * clean up logic * lifecycle: isolate mocks for poststop deployment test * lifecycle: update comments in tracker Co-authored-by: Jasmine Dahilig --- client/allochealth/tracker.go | 20 +++- client/allochealth/tracker_test.go | 40 +++++++ nomad/mock/mock.go | 180 +++++++++++++++++++++++++++++ 3 files changed, 234 insertions(+), 6 deletions(-) diff --git a/client/allochealth/tracker.go b/client/allochealth/tracker.go index fd124b656ff9..c61f25664559 100644 --- a/client/allochealth/tracker.go +++ b/client/allochealth/tracker.go @@ -66,9 +66,9 @@ type Tracker struct { // not needed allocStopped chan struct{} - // lifecycleTasks is a set of tasks with lifecycle hook set and may - // terminate without affecting alloc health - lifecycleTasks map[string]bool + // lifecycleTasks is a map of ephemeral tasks and their lifecycle hooks. + // These tasks may terminate without affecting alloc health + lifecycleTasks map[string]string // l is used to lock shared fields listed below l sync.Mutex @@ -110,7 +110,7 @@ func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.A consulClient: consulClient, checkLookupInterval: consulCheckLookupInterval, logger: logger, - lifecycleTasks: map[string]bool{}, + lifecycleTasks: map[string]string{}, } t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks)) @@ -118,7 +118,7 @@ func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.A t.taskHealth[task.Name] = &taskHealthState{task: task} if task.Lifecycle != nil && !task.Lifecycle.Sidecar { - t.lifecycleTasks[task.Name] = true + t.lifecycleTasks[task.Name] = task.Lifecycle.Hook } for _, s := range task.Services { @@ -277,8 +277,15 @@ func (t *Tracker) watchTaskEvents() { // Detect if the alloc is unhealthy or if all tasks have started yet latestStartTime := time.Time{} for taskName, state := range alloc.TaskStates { + // If the task is a poststop task we do not want to evaluate it + // since it will remain pending until the main task has finished + // or exited. + if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststop { + continue + } + // One of the tasks has failed so we can exit watching - if state.Failed || (!state.FinishedAt.IsZero() && !t.lifecycleTasks[taskName]) { + if state.Failed || !state.FinishedAt.IsZero() { t.setTaskHealth(false, true) return } @@ -299,6 +306,7 @@ func (t *Tracker) watchTaskEvents() { t.l.Lock() t.allocFailed = true t.l.Unlock() + t.setTaskHealth(false, true) return } diff --git a/client/allochealth/tracker_test.go b/client/allochealth/tracker_test.go index 979af227b34d..6f8c702ef015 100644 --- a/client/allochealth/tracker_test.go +++ b/client/allochealth/tracker_test.go @@ -89,6 +89,46 @@ func TestTracker_Checks_Healthy(t *testing.T) { } } +func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) { + t.Parallel() + + alloc := mock.LifecycleAllocWithPoststopDeploy() + alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up + + // Synthesize running alloc and tasks + alloc.ClientStatus = structs.AllocClientStatusRunning + alloc.TaskStates = map[string]*structs.TaskState{ + "web": { + State: structs.TaskStateRunning, + StartedAt: time.Now(), + }, + "post": { + State: structs.TaskStatePending, + }, + } + + logger := testlog.HCLogger(t) + b := cstructs.NewAllocBroadcaster(logger) + defer b.Close() + + consul := consul.NewMockConsulServiceClient(t, logger) + ctx, cancelFn := context.WithCancel(context.Background()) + defer cancelFn() + + checkInterval := 10 * time.Millisecond + tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, + time.Millisecond, true) + tracker.checkLookupInterval = checkInterval + tracker.Start() + + select { + case <-time.After(4 * checkInterval): + require.Fail(t, "timed out while waiting for health") + case h := <-tracker.HealthyCh(): + require.True(t, h) + } +} + func TestTracker_Checks_Unhealthy(t *testing.T) { t.Parallel() diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 0273eeeead5f..0125827ad627 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -529,6 +529,186 @@ func LifecycleAlloc() *structs.Allocation { return alloc } +func LifecycleJobWithPoststopDeploy() *structs.Job { + job := &structs.Job{ + Region: "global", + ID: fmt.Sprintf("mock-service-%s", uuid.Generate()), + Name: "my-job", + Namespace: structs.DefaultNamespace, + Type: structs.JobTypeBatch, + Priority: 50, + AllAtOnce: false, + Datacenters: []string{"dc1"}, + Constraints: []*structs.Constraint{ + { + LTarget: "${attr.kernel.name}", + RTarget: "linux", + Operand: "=", + }, + }, + TaskGroups: []*structs.TaskGroup{ + { + Name: "web", + Count: 1, + Migrate: structs.DefaultMigrateStrategy(), + RestartPolicy: &structs.RestartPolicy{ + Attempts: 0, + Interval: 10 * time.Minute, + Delay: 1 * time.Minute, + Mode: structs.RestartPolicyModeFail, + }, + Tasks: []*structs.Task{ + { + Name: "web", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "1s", + }, + LogConfig: structs.DefaultLogConfig(), + Resources: &structs.Resources{ + CPU: 1000, + MemoryMB: 256, + }, + }, + { + Name: "side", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "1s", + }, + Lifecycle: &structs.TaskLifecycleConfig{ + Hook: structs.TaskLifecycleHookPrestart, + Sidecar: true, + }, + LogConfig: structs.DefaultLogConfig(), + Resources: &structs.Resources{ + CPU: 1000, + MemoryMB: 256, + }, + }, + { + Name: "post", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "1s", + }, + Lifecycle: &structs.TaskLifecycleConfig{ + Hook: structs.TaskLifecycleHookPoststop, + }, + LogConfig: structs.DefaultLogConfig(), + Resources: &structs.Resources{ + CPU: 1000, + MemoryMB: 256, + }, + }, + { + Name: "init", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "1s", + }, + Lifecycle: &structs.TaskLifecycleConfig{ + Hook: structs.TaskLifecycleHookPrestart, + Sidecar: false, + }, + LogConfig: structs.DefaultLogConfig(), + Resources: &structs.Resources{ + CPU: 1000, + MemoryMB: 256, + }, + }, + }, + }, + }, + Meta: map[string]string{ + "owner": "armon", + }, + Status: structs.JobStatusPending, + Version: 0, + CreateIndex: 42, + ModifyIndex: 99, + JobModifyIndex: 99, + } + job.Canonicalize() + return job +} + +func LifecycleAllocWithPoststopDeploy() *structs.Allocation { + alloc := &structs.Allocation{ + ID: uuid.Generate(), + EvalID: uuid.Generate(), + NodeID: "12345678-abcd-efab-cdef-123456789abc", + Namespace: structs.DefaultNamespace, + TaskGroup: "web", + + // TODO Remove once clientv2 gets merged + Resources: &structs.Resources{ + CPU: 500, + MemoryMB: 256, + }, + TaskResources: map[string]*structs.Resources{ + "web": { + CPU: 1000, + MemoryMB: 256, + }, + "init": { + CPU: 1000, + MemoryMB: 256, + }, + "side": { + CPU: 1000, + MemoryMB: 256, + }, + "post": { + CPU: 1000, + MemoryMB: 256, + }, + }, + + AllocatedResources: &structs.AllocatedResources{ + Tasks: map[string]*structs.AllocatedTaskResources{ + "web": { + Cpu: structs.AllocatedCpuResources{ + CpuShares: 1000, + }, + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 256, + }, + }, + "init": { + Cpu: structs.AllocatedCpuResources{ + CpuShares: 1000, + }, + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 256, + }, + }, + "side": { + Cpu: structs.AllocatedCpuResources{ + CpuShares: 1000, + }, + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 256, + }, + }, + "post": { + Cpu: structs.AllocatedCpuResources{ + CpuShares: 1000, + }, + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 256, + }, + }, + }, + }, + Job: LifecycleJobWithPoststopDeploy(), + DesiredStatus: structs.AllocDesiredStatusRun, + ClientStatus: structs.AllocClientStatusPending, + } + alloc.JobID = alloc.Job.ID + return alloc +} + func MaxParallelJob() *structs.Job { update := *structs.DefaultUpdateStrategy update.MaxParallel = 0