Skip to content

Commit

Permalink
ignore poststop task in alloc health tracker (#9548), fixes #9361
Browse files Browse the repository at this point in the history
* investigating where to ignore poststop task in alloc health tracker

* ignore poststop when setting latest start time for allocation

* clean up logic

* lifecycle: isolate mocks for poststop deployment test

* lifecycle: update comments in tracker

Co-authored-by: Jasmine Dahilig <jasmine@dahilig.com>
  • Loading branch information
drewbailey and jazzyfresh committed Jan 12, 2021
1 parent 0514e8d commit 492d62d
Show file tree
Hide file tree
Showing 3 changed files with 234 additions and 6 deletions.
20 changes: 14 additions & 6 deletions client/allochealth/tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,9 @@ type Tracker struct {
// not needed
allocStopped chan struct{}

// lifecycleTasks is a set of tasks with lifecycle hook set and may
// terminate without affecting alloc health
lifecycleTasks map[string]bool
// lifecycleTasks is a map of ephemeral tasks and their lifecycle hooks.
// These tasks may terminate without affecting alloc health
lifecycleTasks map[string]string

// l is used to lock shared fields listed below
l sync.Mutex
Expand Down Expand Up @@ -110,15 +110,15 @@ func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.A
consulClient: consulClient,
checkLookupInterval: consulCheckLookupInterval,
logger: logger,
lifecycleTasks: map[string]bool{},
lifecycleTasks: map[string]string{},
}

t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks))
for _, task := range t.tg.Tasks {
t.taskHealth[task.Name] = &taskHealthState{task: task}

if task.Lifecycle != nil && !task.Lifecycle.Sidecar {
t.lifecycleTasks[task.Name] = true
t.lifecycleTasks[task.Name] = task.Lifecycle.Hook
}

for _, s := range task.Services {
Expand Down Expand Up @@ -277,8 +277,15 @@ func (t *Tracker) watchTaskEvents() {
// Detect if the alloc is unhealthy or if all tasks have started yet
latestStartTime := time.Time{}
for taskName, state := range alloc.TaskStates {
// If the task is a poststop task we do not want to evaluate it
// since it will remain pending until the main task has finished
// or exited.
if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststop {
continue
}

// One of the tasks has failed so we can exit watching
if state.Failed || (!state.FinishedAt.IsZero() && !t.lifecycleTasks[taskName]) {
if state.Failed || !state.FinishedAt.IsZero() {
t.setTaskHealth(false, true)
return
}
Expand All @@ -299,6 +306,7 @@ func (t *Tracker) watchTaskEvents() {
t.l.Lock()
t.allocFailed = true
t.l.Unlock()

t.setTaskHealth(false, true)
return
}
Expand Down
40 changes: 40 additions & 0 deletions client/allochealth/tracker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,46 @@ func TestTracker_Checks_Healthy(t *testing.T) {
}
}

func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) {
t.Parallel()

alloc := mock.LifecycleAllocWithPoststopDeploy()
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up

// Synthesize running alloc and tasks
alloc.ClientStatus = structs.AllocClientStatusRunning
alloc.TaskStates = map[string]*structs.TaskState{
"web": {
State: structs.TaskStateRunning,
StartedAt: time.Now(),
},
"post": {
State: structs.TaskStatePending,
},
}

logger := testlog.HCLogger(t)
b := cstructs.NewAllocBroadcaster(logger)
defer b.Close()

consul := consul.NewMockConsulServiceClient(t, logger)
ctx, cancelFn := context.WithCancel(context.Background())
defer cancelFn()

checkInterval := 10 * time.Millisecond
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
time.Millisecond, true)
tracker.checkLookupInterval = checkInterval
tracker.Start()

select {
case <-time.After(4 * checkInterval):
require.Fail(t, "timed out while waiting for health")
case h := <-tracker.HealthyCh():
require.True(t, h)
}
}

func TestTracker_Checks_Unhealthy(t *testing.T) {
t.Parallel()

Expand Down
180 changes: 180 additions & 0 deletions nomad/mock/mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,186 @@ func LifecycleAlloc() *structs.Allocation {
return alloc
}

func LifecycleJobWithPoststopDeploy() *structs.Job {
job := &structs.Job{
Region: "global",
ID: fmt.Sprintf("mock-service-%s", uuid.Generate()),
Name: "my-job",
Namespace: structs.DefaultNamespace,
Type: structs.JobTypeBatch,
Priority: 50,
AllAtOnce: false,
Datacenters: []string{"dc1"},
Constraints: []*structs.Constraint{
{
LTarget: "${attr.kernel.name}",
RTarget: "linux",
Operand: "=",
},
},
TaskGroups: []*structs.TaskGroup{
{
Name: "web",
Count: 1,
Migrate: structs.DefaultMigrateStrategy(),
RestartPolicy: &structs.RestartPolicy{
Attempts: 0,
Interval: 10 * time.Minute,
Delay: 1 * time.Minute,
Mode: structs.RestartPolicyModeFail,
},
Tasks: []*structs.Task{
{
Name: "web",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
{
Name: "side",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
Lifecycle: &structs.TaskLifecycleConfig{
Hook: structs.TaskLifecycleHookPrestart,
Sidecar: true,
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
{
Name: "post",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
Lifecycle: &structs.TaskLifecycleConfig{
Hook: structs.TaskLifecycleHookPoststop,
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
{
Name: "init",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
Lifecycle: &structs.TaskLifecycleConfig{
Hook: structs.TaskLifecycleHookPrestart,
Sidecar: false,
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
},
},
},
Meta: map[string]string{
"owner": "armon",
},
Status: structs.JobStatusPending,
Version: 0,
CreateIndex: 42,
ModifyIndex: 99,
JobModifyIndex: 99,
}
job.Canonicalize()
return job
}

func LifecycleAllocWithPoststopDeploy() *structs.Allocation {
alloc := &structs.Allocation{
ID: uuid.Generate(),
EvalID: uuid.Generate(),
NodeID: "12345678-abcd-efab-cdef-123456789abc",
Namespace: structs.DefaultNamespace,
TaskGroup: "web",

// TODO Remove once clientv2 gets merged
Resources: &structs.Resources{
CPU: 500,
MemoryMB: 256,
},
TaskResources: map[string]*structs.Resources{
"web": {
CPU: 1000,
MemoryMB: 256,
},
"init": {
CPU: 1000,
MemoryMB: 256,
},
"side": {
CPU: 1000,
MemoryMB: 256,
},
"post": {
CPU: 1000,
MemoryMB: 256,
},
},

AllocatedResources: &structs.AllocatedResources{
Tasks: map[string]*structs.AllocatedTaskResources{
"web": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
"init": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
"side": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
"post": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
},
},
Job: LifecycleJobWithPoststopDeploy(),
DesiredStatus: structs.AllocDesiredStatusRun,
ClientStatus: structs.AllocClientStatusPending,
}
alloc.JobID = alloc.Job.ID
return alloc
}

func MaxParallelJob() *structs.Job {
update := *structs.DefaultUpdateStrategy
update.MaxParallel = 0
Expand Down

0 comments on commit 492d62d

Please sign in to comment.