Skip to content

Commit

Permalink
[no ci] wip debugging unhealthy checks test
Browse files Browse the repository at this point in the history
  • Loading branch information
shoenig committed Jul 12, 2022
1 parent 0f10573 commit 18b4561
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 26 deletions.
47 changes: 28 additions & 19 deletions client/allochealth/tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
"gophers.dev/pkgs/netlog"
)

const (
Expand Down Expand Up @@ -155,10 +156,10 @@ func NewTracker(
func countChecks(services []*structs.Service) (consul, nomad int) {
for _, service := range services {
switch service.Provider {
case structs.ServiceProviderConsul:
consul += len(service.Checks)
case structs.ServiceProviderNomad:
nomad += len(service.Checks)
default:
consul += len(service.Checks)
}
}
return
Expand Down Expand Up @@ -217,6 +218,8 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) {
t.lock.Lock()
defer t.lock.Unlock()

netlog.Yellow("T setTaskHealth healthy: %t, terminal: %t", healthy, terminal)

t.tasksHealthy = healthy

// if unhealthy, force waiting for new checks health status
Expand All @@ -228,13 +231,17 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) {
// If we are marked healthy but we also require Consul checks to be healthy
// and they are not yet, return, unless the task is terminal.
usesConsulChecks := t.useChecks && t.consulCheckCount > 0
netlog.Yellow("uses consul checks: %t, use checks: %t ccount: %d", usesConsulChecks, t.useChecks, t.consulCheckCount)
if !terminal && healthy && usesConsulChecks && !t.checksHealthy {
return
}

netlog.Yellow("should not get here")

// If we are marked healthy but also require Nomad checks to be healthy and
// they are not yet, return, unless the task is terminal.
usesNomadChecks := t.useChecks && t.nomadCheckCount > 0
netlog.Yellow("uses nomad checks: %t", usesNomadChecks)
if !terminal && healthy && usesNomadChecks && !t.checksHealthy {
return
}
Expand Down Expand Up @@ -291,14 +298,7 @@ func (t *Tracker) watchTaskEvents() {
alloc := t.alloc
allStartedTime := time.Time{}

// todo(shoenig): refactor with healthyFuture
healthyTimer := time.NewTimer(0)
if !healthyTimer.Stop() {
select {
case <-healthyTimer.C:
default:
}
}
waiter := newHealthyFuture()

for {
// If the alloc is being stopped by the server just exit
Expand All @@ -315,7 +315,6 @@ func (t *Tracker) watchTaskEvents() {
//TODO(schmichael) for now skip unknown tasks as
//they're task group services which don't currently
//support checks anyway
//TODO(shoenig) is ^ still true?
if v, ok := t.taskHealth[task]; ok {
v.state = state
}
Expand Down Expand Up @@ -370,17 +369,12 @@ func (t *Tracker) watchTaskEvents() {
t.setTaskHealth(false, false)

// Avoid the timer from firing at the old start time
if !healthyTimer.Stop() {
select {
case <-healthyTimer.C:
default:
}
}
waiter.disable()

// Set the timer since all tasks are started
if !latestStartTime.IsZero() {
allStartedTime = latestStartTime
healthyTimer.Reset(t.minHealthyTime)
waiter.wait(t.minHealthyTime)
}
}

Expand All @@ -392,7 +386,8 @@ func (t *Tracker) watchTaskEvents() {
return
}
alloc = newAlloc
case <-healthyTimer.C:
case <-waiter.C():
netlog.Yellow("watchTaskEvents, healthy timer trigger, manual setTaskHealth(true, false)")
t.setTaskHealth(true, false)
}
}
Expand Down Expand Up @@ -428,6 +423,7 @@ func (h *healthyFuture) disable() {
func (h *healthyFuture) wait(dur time.Duration) {
// must ensure timer is stopped
// https://pkg.go.dev/time#Timer.Reset
netlog.Blue("future wait: %v", dur)
h.disable()
h.timer.Reset(dur)
}
Expand All @@ -448,6 +444,9 @@ func (t *Tracker) watchConsulEvents() {
checkTicker := time.NewTicker(t.checkLookupInterval)
defer checkTicker.Stop()

netlog.Blue("t.checkLookupInterval: %v", t.checkLookupInterval)
netlog.Blue("minHealthyTime: %v", t.minHealthyTime)

// waiter is used to fire when the checks have been healthy for the MinHealthyTime
waiter := newHealthyFuture()

Expand All @@ -466,11 +465,13 @@ OUTER:

// we are shutting down
case <-t.ctx.Done():
netlog.Yellow("CE shutdown")
return

// it is time to check the checks
case <-checkTicker.C:
newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID)
netlog.Yellow("CE time to check checks %s %v, %v", t.alloc.ID, newAllocReg, err)
if err != nil {
if !consulChecksErr {
consulChecksErr = true
Expand All @@ -484,6 +485,7 @@ OUTER:

// enough time has passed with healthy checks
case <-waiter.C():
netlog.Yellow("CE set check health true")
if t.setCheckHealth(true) {
// final health set and propagated
return
Expand Down Expand Up @@ -517,27 +519,34 @@ OUTER:
for _, sreg := range treg.Services {
for _, check := range sreg.Checks {
onUpdate := sreg.CheckOnUpdate[check.CheckID]
netlog.Yellow("check: %s, status: %v, onUpdate: %s", check.Name, check.Status, onUpdate)
switch check.Status {
case api.HealthPassing:
continue
case api.HealthWarning:
netlog.Yellow("warn A")
if onUpdate == structs.OnUpdateIgnoreWarn || onUpdate == structs.OnUpdateIgnore {
netlog.Yellow("warn B")
continue
}
netlog.Yellow("warn C")
case api.HealthCritical:
if onUpdate == structs.OnUpdateIgnore {
continue
}
default:
}

netlog.Yellow("passed false")
passed = false
t.setCheckHealth(false)
break CHECKS
}
}
}

netlog.Yellow("passed is %t", passed)

if !passed {
// Reset the timer since we have transitioned back to unhealthy
if primed {
Expand Down
10 changes: 6 additions & 4 deletions client/allocrunner/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/require"
"gophers.dev/pkgs/netlog"
)

// destroy does a blocking destroy on an alloc runner
Expand Down Expand Up @@ -1037,6 +1038,7 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
// Only return the check as healthy after a duration
consulClient := conf.Consul.(*regMock.ServiceRegistrationHandler)
consulClient.AllocRegistrationsFn = func(allocID string) (*serviceregistration.AllocRegistration, error) {
netlog.Blue("serving alloc regs fn %s", allocID)
return &serviceregistration.AllocRegistration{
Tasks: map[string]*serviceregistration.ServiceRegistrations{
task.Name: {
Expand Down Expand Up @@ -1076,10 +1078,10 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {

// Assert that we have an event explaining why we are unhealthy.
require.Len(t, lastUpdate.TaskStates, 1)
state := lastUpdate.TaskStates[task.Name]
require.NotNil(t, state)
require.NotEmpty(t, state.Events)
last := state.Events[len(state.Events)-1]
taskState := lastUpdate.TaskStates[task.Name]
require.NotNil(t, taskState)
require.NotEmpty(t, taskState.Events)
last := taskState.Events[len(taskState.Events)-1]
require.Equal(t, allochealth.AllocHealthEventSource, last.Type)
require.Contains(t, last.Message, "by healthy_deadline")
}
Expand Down
7 changes: 4 additions & 3 deletions nomad/structs/services_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1847,7 +1847,7 @@ func TestService_validateNomadService(t *testing.T) {
Provider: "nomad",
},
inputErr: &multierror.Error{},
expectedOutputErrors: []error{},
expectedOutputErrors: nil,
name: "valid service",
},
{
Expand All @@ -1859,13 +1859,14 @@ func TestService_validateNomadService(t *testing.T) {
Checks: []*ServiceCheck{{
Name: "webapp",
Type: ServiceCheckHTTP,
Method: "GET",
Path: "/health",
Interval: 3 * time.Second,
Timeout: 1 * time.Second,
}},
},
inputErr: &multierror.Error{},
expectedOutputErrors: []error{},
expectedOutputErrors: nil,
name: "valid service with checks",
},
{
Expand Down Expand Up @@ -1903,7 +1904,7 @@ func TestService_validateNomadService(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
tc.inputService.validateNomadService(tc.inputErr)
require.ElementsMatch(t, tc.expectedOutputErrors, tc.inputErr.Errors)
must.Eq(t, tc.expectedOutputErrors, tc.inputErr.Errors)
})
}
}

0 comments on commit 18b4561

Please sign in to comment.