Skip to content

Commit

Permalink
consul: correctly understand missing consul checks as unhealthy
Browse files Browse the repository at this point in the history
This PR fixes a bug where Nomad assumed any registered Checks would exist
in the service registration coming back from Consul. In some cases, the
Consul may be slow in processing the check registration, and the response
object would not contain checks. Nomad would then scan the empty response
looking for Checks with failing health status, finding none, and then
marking a task/alloc as healthy.

In reality, we must always use Nomad's view of what checks should exist as
the source of truth, and compare that with the response Consul gives us,
making sure they match, before scanning the Consul response for failing
check statuses.

Fixes #15536
  • Loading branch information
shoenig committed Jan 19, 2023
1 parent 294da1b commit dcc19d4
Show file tree
Hide file tree
Showing 5 changed files with 464 additions and 41 deletions.
3 changes: 3 additions & 0 deletions .changelog/15822.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
consul: correctly interpret missing consul checks as unhealthy
```
89 changes: 61 additions & 28 deletions client/allochealth/tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/hashicorp/consul/api"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-set"
"github.com/hashicorp/nomad/client/serviceregistration"
"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
cstructs "github.com/hashicorp/nomad/client/structs"
Expand Down Expand Up @@ -257,10 +258,9 @@ func (t *Tracker) setTaskHealth(healthy, terminal bool) {
// setCheckHealth is used to mark the checks as either healthy or unhealthy.
// returns true if health is propagated and no more health monitoring is needed
//
// todo: this is currently being shared by watchConsulEvents and watchNomadEvents,
//
// and must be split up if/when we support registering services (and thus checks)
// of different providers.
// todo: this is currently being shared by watchConsulEvents and watchNomadEvents
// and must be split up if/when we support registering services (and thus checks)
// of different providers.
func (t *Tracker) setCheckHealth(healthy bool) bool {
t.lock.Lock()
defer t.lock.Unlock()
Expand Down Expand Up @@ -437,6 +437,7 @@ func (h *healthyFuture) C() <-chan time.Time {
//
// Does not watch Nomad service checks; see watchNomadEvents for those.
func (t *Tracker) watchConsulEvents() {

// checkTicker is the ticker that triggers us to look at the checks in Consul
checkTicker := time.NewTicker(t.checkLookupInterval)
defer checkTicker.Stop()
Expand Down Expand Up @@ -502,30 +503,9 @@ OUTER:
// Detect if all the checks are passing
passed := true

CHECKS:
for _, treg := range allocReg.Tasks {
for _, sreg := range treg.Services {
for _, check := range sreg.Checks {
onUpdate := sreg.CheckOnUpdate[check.CheckID]
switch check.Status {
case api.HealthPassing:
continue
case api.HealthWarning:
if onUpdate == structs.OnUpdateIgnoreWarn || onUpdate == structs.OnUpdateIgnore {
continue
}
case api.HealthCritical:
if onUpdate == structs.OnUpdateIgnore {
continue
}
default:
}

passed = false
t.setCheckHealth(false)
break CHECKS
}
}
// scan for missing or unhealthy consul checks
if !evaluateConsulChecks(t.tg, allocReg) {
passed = false
}

if !passed {
Expand All @@ -543,6 +523,59 @@ OUTER:
}
}

func evaluateConsulChecks(tg *structs.TaskGroup, registrations *serviceregistration.AllocRegistration) bool {
// First, identify any case where a check definition is missing or outdated
// on the Consul side. Note that because check names are not unique, we must
// also keep track of the counts on each side and make sure those also match.
services := tg.ConsulServices()
expChecks := set.New[string](10)
expCount := 0
regChecks := set.New[string](10)
regCount := 0
for _, service := range services {
for _, check := range service.Checks {
expChecks.Insert(check.Name)
expCount++
}
}
for _, task := range registrations.Tasks {
for _, service := range task.Services {
for _, check := range service.Checks {
regChecks.Insert(check.Name)
regCount++
}
}
}
if expCount != regCount {
return false
}
if !expChecks.Equal(regChecks) {
return false
}

// Now we can simply scan the status of each Check reported by Consul.
for _, task := range registrations.Tasks {
for _, service := range task.Services {
for _, check := range service.Checks {
onUpdate := service.CheckOnUpdate[check.CheckID]
switch check.Status {
case api.HealthWarning:
if onUpdate != structs.OnUpdateIgnoreWarn && onUpdate != structs.OnUpdateIgnore {
return false
}
case api.HealthCritical:
if onUpdate != structs.OnUpdateIgnore {
return false
}
}
}
}
}

// All checks are present and healthy.
return true
}

// watchNomadEvents is a watcher for the health of the allocation's Nomad checks.
// If all checks report healthy the watcher will exit after the MinHealthyTime has
// been reached, otherwise the watcher will continue to check unhealthy checks until
Expand Down
Loading

0 comments on commit dcc19d4

Please sign in to comment.