diff --git a/client/allochealth/tracker.go b/client/allochealth/tracker.go index fe5aac69f360..fd124b656ff9 100644 --- a/client/allochealth/tracker.go +++ b/client/allochealth/tracker.go @@ -210,7 +210,8 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) { } // setCheckHealth is used to mark the checks as either healthy or unhealthy. -func (t *Tracker) setCheckHealth(healthy bool) { +// returns true if health is propagated and no more health monitoring is needed +func (t *Tracker) setCheckHealth(healthy bool) bool { t.l.Lock() defer t.l.Unlock() @@ -220,7 +221,7 @@ func (t *Tracker) setCheckHealth(healthy bool) { // Only signal if we are healthy and so is the tasks if !t.checksHealthy { - return + return false } select { @@ -230,6 +231,7 @@ func (t *Tracker) setCheckHealth(healthy bool) { // Shutdown the tracker t.cancelFn() + return true } // markAllocStopped is used to mark the allocation as having stopped. @@ -379,7 +381,12 @@ OUTER: allocReg = newAllocReg } case <-healthyTimer.C: - t.setCheckHealth(true) + if t.setCheckHealth(true) { + // final health set and propagated + return + } + // tasks are unhealthy, reset and wait until all is healthy + primed = false } if allocReg == nil { diff --git a/client/allochealth/tracker_test.go b/client/allochealth/tracker_test.go index 6e50ef636468..979af227b34d 100644 --- a/client/allochealth/tracker_test.go +++ b/client/allochealth/tracker_test.go @@ -227,3 +227,112 @@ func TestTracker_Healthy_IfBothTasksAndConsulChecksAreHealthy(t *testing.T) { require.Fail(t, "expected a health status") } } + +// TestTracker_Checks_Healthy_Before_TaskHealth asserts that we mark an alloc +// healthy, if the checks pass before task health pass +func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) { + t.Parallel() + + alloc := mock.Alloc() + alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up + task := alloc.Job.TaskGroups[0].Tasks[0] + + // new task starting unhealthy, without services + task2 := task.Copy() + task2.Name = task2.Name + "2" + task2.Services = nil + alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2) + + // Synthesize running alloc and tasks + alloc.ClientStatus = structs.AllocClientStatusRunning + alloc.TaskStates = map[string]*structs.TaskState{ + task.Name: { + State: structs.TaskStateRunning, + StartedAt: time.Now(), + }, + task2.Name: { + State: structs.TaskStatePending, + }, + } + + // Make Consul response + check := &consulapi.AgentCheck{ + Name: task.Services[0].Checks[0].Name, + Status: consulapi.HealthPassing, + } + taskRegs := map[string]*agentconsul.ServiceRegistrations{ + task.Name: { + Services: map[string]*agentconsul.ServiceRegistration{ + task.Services[0].Name: { + Service: &consulapi.AgentService{ + ID: "foo", + Service: task.Services[0].Name, + }, + Checks: []*consulapi.AgentCheck{check}, + }, + }, + }, + } + + logger := testlog.HCLogger(t) + b := cstructs.NewAllocBroadcaster(logger) + defer b.Close() + + // Don't reply on the first call + var called uint64 + consul := consul.NewMockConsulServiceClient(t, logger) + consul.AllocRegistrationsFn = func(string) (*agentconsul.AllocRegistration, error) { + if atomic.AddUint64(&called, 1) == 1 { + return nil, nil + } + + reg := &agentconsul.AllocRegistration{ + Tasks: taskRegs, + } + + return reg, nil + } + + ctx, cancelFn := context.WithCancel(context.Background()) + defer cancelFn() + + checkInterval := 10 * time.Millisecond + tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, + time.Millisecond, true) + tracker.checkLookupInterval = checkInterval + tracker.Start() + + // assert that we don't get marked healthy + select { + case <-time.After(4 * checkInterval): + // still unhealthy, good + case h := <-tracker.HealthyCh(): + require.Fail(t, "unexpected health event", h) + } + require.False(t, tracker.tasksHealthy) + require.False(t, tracker.checksHealthy) + + // now set task to healthy + runningAlloc := alloc.Copy() + runningAlloc.TaskStates = map[string]*structs.TaskState{ + task.Name: { + State: structs.TaskStateRunning, + StartedAt: time.Now(), + }, + task2.Name: { + State: structs.TaskStateRunning, + StartedAt: time.Now(), + }, + } + err := b.Send(runningAlloc) + require.NoError(t, err) + + // eventually, it is marked as healthy + select { + case <-time.After(4 * checkInterval): + require.Fail(t, "timed out while waiting for health") + case h := <-tracker.HealthyCh(): + require.True(t, h) + } + +}