Skip to content

Commit

Permalink
Fix health checking for ephemeral poststart tasks (#11945)
Browse files Browse the repository at this point in the history
Update the logic in the Nomad client's alloc health tracker which
erroneously marks existing healthy allocations with dead poststart ephemeral
tasks as unhealthy even if they were already successful during a previous
deployment.
  • Loading branch information
beautifulentropy committed Feb 2, 2022
1 parent c613dc5 commit 37c14b2
Show file tree
Hide file tree
Showing 5 changed files with 233 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .changelog/11945.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
Fixed a bug where successful poststart tasks were marked as unhealthy
```
6 changes: 6 additions & 0 deletions client/allochealth/tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,12 @@ func (t *Tracker) watchTaskEvents() {
continue
}

// If this is a poststart task which has already succeeded, we
// should skip evaluation.
if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststart && state.Successful() {
continue
}

// One of the tasks has failed so we can exit watching
if state.Failed || (!state.FinishedAt.IsZero() && t.lifecycleTasks[taskName] != structs.TaskLifecycleHookPrestart) {
t.setTaskHealth(false, true)
Expand Down
41 changes: 41 additions & 0 deletions client/allochealth/tracker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,47 @@ func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) {
}
}

func TestTracker_Succeeded_PostStart_Healthy(t *testing.T) {
t.Parallel()

alloc := mock.LifecycleAllocWithPoststartDeploy()
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = time.Millisecond * 1
// Synthesize running alloc and tasks
alloc.ClientStatus = structs.AllocClientStatusRunning
alloc.TaskStates = map[string]*structs.TaskState{
"web": {
State: structs.TaskStateRunning,
StartedAt: time.Now(),
},
"post": {
State: structs.TaskStateDead,
StartedAt: time.Now(),
FinishedAt: time.Now().Add(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime / 2),
},
}

logger := testlog.HCLogger(t)
b := cstructs.NewAllocBroadcaster(logger)
defer b.Close()

consul := consul.NewMockConsulServiceClient(t, logger)
ctx, cancelFn := context.WithCancel(context.Background())
defer cancelFn()

checkInterval := 10 * time.Millisecond
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime, true)
tracker.checkLookupInterval = checkInterval
tracker.Start()

select {
case <-time.After(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime * 2):
require.Fail(t, "timed out while waiting for health")
case h := <-tracker.HealthyCh():
require.True(t, h)
}
}

func TestTracker_Checks_Unhealthy(t *testing.T) {
t.Parallel()

Expand Down
180 changes: 180 additions & 0 deletions nomad/mock/mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,110 @@ func LifecycleJobWithPoststopDeploy() *structs.Job {
return job
}

func LifecycleJobWithPoststartDeploy() *structs.Job {
job := &structs.Job{
Region: "global",
ID: fmt.Sprintf("mock-service-%s", uuid.Generate()),
Name: "my-job",
Namespace: structs.DefaultNamespace,
Type: structs.JobTypeBatch,
Priority: 50,
AllAtOnce: false,
Datacenters: []string{"dc1"},
Constraints: []*structs.Constraint{
{
LTarget: "${attr.kernel.name}",
RTarget: "linux",
Operand: "=",
},
},
TaskGroups: []*structs.TaskGroup{
{
Name: "web",
Count: 1,
Migrate: structs.DefaultMigrateStrategy(),
RestartPolicy: &structs.RestartPolicy{
Attempts: 0,
Interval: 10 * time.Minute,
Delay: 1 * time.Minute,
Mode: structs.RestartPolicyModeFail,
},
Tasks: []*structs.Task{
{
Name: "web",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
{
Name: "side",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
Lifecycle: &structs.TaskLifecycleConfig{
Hook: structs.TaskLifecycleHookPrestart,
Sidecar: true,
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
{
Name: "post",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
Lifecycle: &structs.TaskLifecycleConfig{
Hook: structs.TaskLifecycleHookPoststart,
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
{
Name: "init",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
Lifecycle: &structs.TaskLifecycleConfig{
Hook: structs.TaskLifecycleHookPrestart,
Sidecar: false,
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
},
},
},
Meta: map[string]string{
"owner": "armon",
},
Status: structs.JobStatusPending,
Version: 0,
CreateIndex: 42,
ModifyIndex: 99,
JobModifyIndex: 99,
}
job.Canonicalize()
return job
}

func LifecycleAllocWithPoststopDeploy() *structs.Allocation {
alloc := &structs.Allocation{
ID: uuid.Generate(),
Expand Down Expand Up @@ -841,6 +945,82 @@ func LifecycleAllocWithPoststopDeploy() *structs.Allocation {
return alloc
}

func LifecycleAllocWithPoststartDeploy() *structs.Allocation {
alloc := &structs.Allocation{
ID: uuid.Generate(),
EvalID: uuid.Generate(),
NodeID: "12345678-abcd-efab-cdef-123456789xyz",
Namespace: structs.DefaultNamespace,
TaskGroup: "web",

// TODO Remove once clientv2 gets merged
Resources: &structs.Resources{
CPU: 500,
MemoryMB: 256,
},
TaskResources: map[string]*structs.Resources{
"web": {
CPU: 1000,
MemoryMB: 256,
},
"init": {
CPU: 1000,
MemoryMB: 256,
},
"side": {
CPU: 1000,
MemoryMB: 256,
},
"post": {
CPU: 1000,
MemoryMB: 256,
},
},

AllocatedResources: &structs.AllocatedResources{
Tasks: map[string]*structs.AllocatedTaskResources{
"web": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
"init": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
"side": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
"post": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
},
},
Job: LifecycleJobWithPoststartDeploy(),
DesiredStatus: structs.AllocDesiredStatusRun,
ClientStatus: structs.AllocClientStatusPending,
}
alloc.JobID = alloc.Job.ID
return alloc
}

func MaxParallelJob() *structs.Job {
update := *structs.DefaultUpdateStrategy
update.MaxParallel = 0
Expand Down
6 changes: 3 additions & 3 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -7769,9 +7769,9 @@ func (ts *TaskState) Copy() *TaskState {
return newTS
}

// Successful returns whether a task finished successfully. This doesn't really
// have meaning on a non-batch allocation because a service and system
// allocation should not finish.
// Successful returns whether a task finished successfully. Only meaningful for
// for batch allocations or ephemeral (non-sidecar) lifecycle tasks part of a
// service or system allocation.
func (ts *TaskState) Successful() bool {
return ts.State == TaskStateDead && !ts.Failed
}
Expand Down

0 comments on commit 37c14b2

Please sign in to comment.