Allocs are healthy if service checks get healthy before task health #7944

Merged · merged 2 commits on May 13, 2020
13 changes: 10 additions & 3 deletions client/allochealth/tracker.go
@@ -210,7 +210,8 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) {
 }
 
 // setCheckHealth is used to mark the checks as either healthy or unhealthy.
-func (t *Tracker) setCheckHealth(healthy bool) {
+// returns true if health is propagated and no more health monitoring is needed
+func (t *Tracker) setCheckHealth(healthy bool) bool {
 	t.l.Lock()
 	defer t.l.Unlock()
 
@@ -220,7 +221,7 @@ func (t *Tracker) setCheckHealth(healthy bool) {
 
 	// Only signal if we are healthy and so is the tasks
 	if !t.checksHealthy {
-		return
+		return false
 	}
 
 	select {
@@ -230,6 +231,7 @@ func (t *Tracker) setCheckHealth(healthy bool) {
 
 	// Shutdown the tracker
 	t.cancelFn()
+	return true
 }
 
 // markAllocStopped is used to mark the allocation as having stopped.
@@ -379,7 +381,12 @@ OUTER:
 				allocReg = newAllocReg
 			}
 		case <-healthyTimer.C:
-			t.setCheckHealth(true)
+			if t.setCheckHealth(true) {
+				// final health set and propagated
+				return
Member

The docstring for this function says this is a "long lived goroutine". If we return here, how does the tracker detect service check failures that happen later on?

Contributor Author

"long lived goroutine" is a bit misnomer - it runs until the alloc is marked healthy or terminally unhealthy. Once an alloc is marked healthy, it never marks it unhealthy.

The goroutine exiting already happens today: setCheckHealth and setTaskHealth call t.cancelFn(), which causes both watchTaskEvents and watchConsulEvents to exit, as they handle t.ctx.Done().
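
A minimal sketch of that shutdown pattern, using hypothetical names rather than the actual tracker code: marking final health cancels a shared context, and every watcher goroutine exits when it observes ctx.Done().

package main

import (
	"context"
	"fmt"
	"time"
)

type tracker struct {
	ctx      context.Context
	cancelFn context.CancelFunc
	healthy  chan bool
}

// setHealth propagates the final health status, then shuts the tracker down.
func (t *tracker) setHealth(healthy bool) {
	select {
	case t.healthy <- healthy:
	default:
	}
	t.cancelFn() // all watchers observe ctx.Done() and return
}

// watch stands in for watchers like watchTaskEvents / watchConsulEvents.
func (t *tracker) watch(name string) {
	for {
		select {
		case <-t.ctx.Done():
			fmt.Println(name, "exiting")
			return
		case <-time.After(10 * time.Millisecond):
			// poll task state or service checks here
		}
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	t := &tracker{ctx: ctx, cancelFn: cancel, healthy: make(chan bool, 1)}
	go t.watch("watchTaskEvents")
	go t.watch("watchConsulEvents")
	t.setHealth(true)
	time.Sleep(50 * time.Millisecond) // give watchers time to exit
	fmt.Println("final health:", <-t.healthy)
}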

Member

Ah, gotcha.

+			}
+			// tasks are unhealthy, reset and wait until all is healthy
+			primed = false
 		}
 
 		if allocReg == nil {
109 changes: 109 additions & 0 deletions client/allochealth/tracker_test.go
@@ -227,3 +227,112 @@ func TestTracker_Healthy_IfBothTasksAndConsulChecksAreHealthy(t *testing.T) {
require.Fail(t, "expected a health status")
}
}

// TestTracker_Checks_Healthy_Before_TaskHealth asserts that we mark an alloc
// healthy, if the checks pass before task health pass
func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) {
t.Parallel()

alloc := mock.Alloc()
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
task := alloc.Job.TaskGroups[0].Tasks[0]

// new task starting unhealthy, without services
task2 := task.Copy()
task2.Name = task2.Name + "2"
task2.Services = nil
alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)

// Synthesize running alloc and tasks
alloc.ClientStatus = structs.AllocClientStatusRunning
alloc.TaskStates = map[string]*structs.TaskState{
task.Name: {
State: structs.TaskStateRunning,
StartedAt: time.Now(),
},
task2.Name: {
State: structs.TaskStatePending,
},
}

// Make Consul response
check := &consulapi.AgentCheck{
Name: task.Services[0].Checks[0].Name,
Status: consulapi.HealthPassing,
}
taskRegs := map[string]*agentconsul.ServiceRegistrations{
task.Name: {
Services: map[string]*agentconsul.ServiceRegistration{
task.Services[0].Name: {
Service: &consulapi.AgentService{
ID: "foo",
Service: task.Services[0].Name,
},
Checks: []*consulapi.AgentCheck{check},
},
},
},
}

logger := testlog.HCLogger(t)
b := cstructs.NewAllocBroadcaster(logger)
defer b.Close()

// Don't reply on the first call
var called uint64
consul := consul.NewMockConsulServiceClient(t, logger)
consul.AllocRegistrationsFn = func(string) (*agentconsul.AllocRegistration, error) {
if atomic.AddUint64(&called, 1) == 1 {
return nil, nil
}

reg := &agentconsul.AllocRegistration{
Tasks: taskRegs,
}

return reg, nil
}

ctx, cancelFn := context.WithCancel(context.Background())
defer cancelFn()

checkInterval := 10 * time.Millisecond
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
time.Millisecond, true)
tracker.checkLookupInterval = checkInterval
tracker.Start()

// assert that we don't get marked healthy
select {
case <-time.After(4 * checkInterval):
// still unhealthy, good
case h := <-tracker.HealthyCh():
require.Fail(t, "unexpected health event", h)
}
require.False(t, tracker.tasksHealthy)
require.False(t, tracker.checksHealthy)

// now set task to healthy
runningAlloc := alloc.Copy()
runningAlloc.TaskStates = map[string]*structs.TaskState{
task.Name: {
State: structs.TaskStateRunning,
StartedAt: time.Now(),
},
task2.Name: {
State: structs.TaskStateRunning,
StartedAt: time.Now(),
},
}
err := b.Send(runningAlloc)
require.NoError(t, err)

// eventually, it is marked as healthy
select {
case <-time.After(4 * checkInterval):
require.Fail(t, "timed out while waiting for health")
case h := <-tracker.HealthyCh():
require.True(t, h)
}

}
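
For reviewers who want to exercise just this test locally, the standard Go tooling should work from the repository root (assuming a checked-out tree and a working Go toolchain):

go test -run TestTracker_Checks_Healthy_Before_TaskHealth ./client/allochealth/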