Merge pull request #12616 from hashicorp/backport/fix-post-start-tasks/optionally-just-pup

This pull request was automerged via backport-assistant
hc-github-team-nomad-core committed Apr 19, 2022
2 parents 58655b5 + c346943 commit ef77d35
Showing 5 changed files with 233 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .changelog/11945.txt
@@ -0,0 +1,3 @@
```release-note:bug
Fixed a bug where successful poststart tasks were marked as unhealthy
```
6 changes: 6 additions & 0 deletions client/allochealth/tracker.go
@@ -284,6 +284,12 @@ func (t *Tracker) watchTaskEvents() {
continue
}

// If this is a poststart task which has already succeeded, we
// should skip evaluation.
if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststart && state.Successful() {
continue
}

// One of the tasks has failed so we can exit watching
if state.Failed || (!state.FinishedAt.IsZero() && t.lifecycleTasks[taskName] != structs.TaskLifecycleHookPrestart) {
t.setTaskHealth(false, true)
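As a reading aid, here is a condensed, self-contained sketch of the per-task decision this hunk now makes. It is an illustration only, not the real method: `watchTaskEvents` also manages timers, Consul checks, and deployment state that are omitted here, and the `shouldMarkUnhealthy` helper below is hypothetical.

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

// shouldMarkUnhealthy is a hypothetical helper that condenses the per-task
// logic shown in the hunk above. hook is the task's lifecycle hook ("" for a
// main task, mirroring a miss in the tracker's lifecycleTasks map).
func shouldMarkUnhealthy(hook string, state *structs.TaskState) bool {
	// New in this change: a poststart task that already exited cleanly is
	// skipped rather than being treated as an unexpectedly finished task.
	if hook == structs.TaskLifecycleHookPoststart && state.Successful() {
		return false
	}
	// Pre-existing rule: a failed task, or any finished task that is not a
	// prestart hook, makes the allocation unhealthy.
	return state.Failed || (!state.FinishedAt.IsZero() && hook != structs.TaskLifecycleHookPrestart)
}

func main() {
	done := &structs.TaskState{State: structs.TaskStateDead, FinishedAt: time.Now()}
	fmt.Println(shouldMarkUnhealthy(structs.TaskLifecycleHookPoststart, done)) // false: clean poststart exit
	fmt.Println(shouldMarkUnhealthy("", done))                                 // true: a finished main task is unexpected
}
```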
41 changes: 41 additions & 0 deletions client/allochealth/tracker_test.go
@@ -129,6 +129,47 @@ func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) {
}
}

func TestTracker_Succeeded_PostStart_Healthy(t *testing.T) {
t.Parallel()

alloc := mock.LifecycleAllocWithPoststartDeploy()
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = time.Millisecond * 1
// Synthesize running alloc and tasks
alloc.ClientStatus = structs.AllocClientStatusRunning
alloc.TaskStates = map[string]*structs.TaskState{
"web": {
State: structs.TaskStateRunning,
StartedAt: time.Now(),
},
"post": {
State: structs.TaskStateDead,
StartedAt: time.Now(),
FinishedAt: time.Now().Add(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime / 2),
},
}

logger := testlog.HCLogger(t)
b := cstructs.NewAllocBroadcaster(logger)
defer b.Close()

consul := consul.NewMockConsulServiceClient(t, logger)
ctx, cancelFn := context.WithCancel(context.Background())
defer cancelFn()

checkInterval := 10 * time.Millisecond
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime, true)
tracker.checkLookupInterval = checkInterval
tracker.Start()

select {
case <-time.After(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime * 2):
require.Fail(t, "timed out while waiting for health")
case h := <-tracker.HealthyCh():
require.True(t, h)
}
}

func TestTracker_Checks_Unhealthy(t *testing.T) {
t.Parallel()

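The new `TestTracker_Succeeded_PostStart_Healthy` test can be exercised on its own with standard Go tooling from the repository root, e.g. `go test -run TestTracker_Succeeded_PostStart_Healthy ./client/allochealth/` (test name and package path as they appear in this diff).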
180 changes: 180 additions & 0 deletions nomad/mock/mock.go
@@ -765,6 +765,110 @@ func LifecycleJobWithPoststopDeploy() *structs.Job {
return job
}

func LifecycleJobWithPoststartDeploy() *structs.Job {
job := &structs.Job{
Region: "global",
ID: fmt.Sprintf("mock-service-%s", uuid.Generate()),
Name: "my-job",
Namespace: structs.DefaultNamespace,
Type: structs.JobTypeBatch,
Priority: 50,
AllAtOnce: false,
Datacenters: []string{"dc1"},
Constraints: []*structs.Constraint{
{
LTarget: "${attr.kernel.name}",
RTarget: "linux",
Operand: "=",
},
},
TaskGroups: []*structs.TaskGroup{
{
Name: "web",
Count: 1,
Migrate: structs.DefaultMigrateStrategy(),
RestartPolicy: &structs.RestartPolicy{
Attempts: 0,
Interval: 10 * time.Minute,
Delay: 1 * time.Minute,
Mode: structs.RestartPolicyModeFail,
},
Tasks: []*structs.Task{
{
Name: "web",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
{
Name: "side",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
Lifecycle: &structs.TaskLifecycleConfig{
Hook: structs.TaskLifecycleHookPrestart,
Sidecar: true,
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
{
Name: "post",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
Lifecycle: &structs.TaskLifecycleConfig{
Hook: structs.TaskLifecycleHookPoststart,
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
{
Name: "init",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
Lifecycle: &structs.TaskLifecycleConfig{
Hook: structs.TaskLifecycleHookPrestart,
Sidecar: false,
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
},
},
},
Meta: map[string]string{
"owner": "armon",
},
Status: structs.JobStatusPending,
Version: 0,
CreateIndex: 42,
ModifyIndex: 99,
JobModifyIndex: 99,
}
job.Canonicalize()
return job
}

func LifecycleAllocWithPoststopDeploy() *structs.Allocation {
alloc := &structs.Allocation{
ID: uuid.Generate(),
@@ -841,6 +945,82 @@ func LifecycleAllocWithPoststopDeploy() *structs.Allocation {
return alloc
}

func LifecycleAllocWithPoststartDeploy() *structs.Allocation {
alloc := &structs.Allocation{
ID: uuid.Generate(),
EvalID: uuid.Generate(),
NodeID: "12345678-abcd-efab-cdef-123456789xyz",
Namespace: structs.DefaultNamespace,
TaskGroup: "web",

// TODO Remove once clientv2 gets merged
Resources: &structs.Resources{
CPU: 500,
MemoryMB: 256,
},
TaskResources: map[string]*structs.Resources{
"web": {
CPU: 1000,
MemoryMB: 256,
},
"init": {
CPU: 1000,
MemoryMB: 256,
},
"side": {
CPU: 1000,
MemoryMB: 256,
},
"post": {
CPU: 1000,
MemoryMB: 256,
},
},

AllocatedResources: &structs.AllocatedResources{
Tasks: map[string]*structs.AllocatedTaskResources{
"web": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
"init": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
"side": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
"post": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
},
},
Job: LifecycleJobWithPoststartDeploy(),
DesiredStatus: structs.AllocDesiredStatusRun,
ClientStatus: structs.AllocClientStatusPending,
}
alloc.JobID = alloc.Job.ID
return alloc
}

func MaxParallelJob() *structs.Job {
update := *structs.DefaultUpdateStrategy
update.MaxParallel = 0
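The check added in tracker.go consults a task-name-to-lifecycle-hook map (`t.lifecycleTasks`); how that map is populated is not part of this diff. As a rough sketch, assuming only the task fields visible in the mocks above, such a map could be derived from a task group like this (the helper name is made up for illustration):

```go
// lifecycleHooksByTask is a hypothetical helper: it collects each task's
// lifecycle hook so that a lookup such as
// lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststart has something
// to consult. Tasks without a Lifecycle block (the main "web" task above) are
// simply absent from the map.
func lifecycleHooksByTask(tg *structs.TaskGroup) map[string]string {
	hooks := make(map[string]string, len(tg.Tasks))
	for _, task := range tg.Tasks {
		if task.Lifecycle != nil {
			hooks[task.Name] = task.Lifecycle.Hook
		}
	}
	return hooks
}
```

For `LifecycleJobWithPoststartDeploy` above this would map `side` and `init` to the prestart hook and `post` to the poststart hook, with `web` absent.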
6 changes: 3 additions & 3 deletions nomad/structs/structs.go
@@ -7769,9 +7769,9 @@ func (ts *TaskState) Copy() *TaskState {
return newTS
}

// Successful returns whether a task finished successfully. This doesn't really
// have meaning on a non-batch allocation because a service and system
// allocation should not finish.
// Successful returns whether a task finished successfully. Only meaningful for
// batch allocations or ephemeral (non-sidecar) lifecycle tasks that are part of
// a service or system allocation.
func (ts *TaskState) Successful() bool {
return ts.State == TaskStateDead && !ts.Failed
}
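Per the method body, a task counts as successful once it is dead and not marked failed; the check added in tracker.go relies on exactly this. A small illustration (values invented for the example, not taken from this diff):

```go
package main

import (
	"fmt"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	// Dead and not failed: Successful() reports true, so the new tracker
	// check skips this poststart task instead of marking the alloc unhealthy.
	ok := &structs.TaskState{State: structs.TaskStateDead}
	fmt.Println(ok.Successful()) // true

	// Dead but failed: Successful() reports false, and the existing failure
	// branch in watchTaskEvents still sets task health to false.
	bad := &structs.TaskState{State: structs.TaskStateDead, Failed: true}
	fmt.Println(bad.Successful()) // false
}
```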
