Skip to content

Commit

Permalink
Merge pull request #2984 from hashicorp/b-tags
Browse files Browse the repository at this point in the history
Fix alloc health with checks using interpolation
  • Loading branch information
dadgar committed Aug 10, 2017
2 parents 582addd + 4d323e1 commit f49c026
Show file tree
Hide file tree
Showing 8 changed files with 440 additions and 87 deletions.
43 changes: 25 additions & 18 deletions client/alloc_runner_health_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"time"

"github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
)
Expand Down Expand Up @@ -105,7 +106,7 @@ func (r *AllocRunner) watchHealth(ctx context.Context) {
// Store whether the last consul checks call was successful or not
consulChecksErr := false

var checks []*api.AgentCheck
var allocReg *consul.AllocRegistration
first := true
OUTER:
for {
Expand All @@ -121,15 +122,15 @@ OUTER:
alloc = newAlloc
r.logger.Printf("[TRACE] client.alloc_watcher: new alloc version for %q", alloc.ID)
case <-checkCh:
newChecks, err := r.consulClient.Checks(alloc)
newAllocReg, err := r.consulClient.AllocRegistrations(alloc.ID)
if err != nil {
if !consulChecksErr {
consulChecksErr = true
r.logger.Printf("[WARN] client.alloc_watcher: failed to lookup consul checks for allocation %q: %v", alloc.ID, err)
r.logger.Printf("[WARN] client.alloc_watcher: failed to lookup consul registrations for allocation %q: %v", alloc.ID, err)
}
} else {
consulChecksErr = false
checks = newChecks
allocReg = newAllocReg
}
case <-deadline.C:
// We have exceeded our deadline without being healthy.
Expand Down Expand Up @@ -174,23 +175,29 @@ OUTER:
}

// If we should have checks and they aren't all healthy continue
if len(checks) != desiredChecks {
r.logger.Printf("[TRACE] client.alloc_watcher: continuing since all checks (want %d; got %d) haven't been registered for alloc %q", desiredChecks, len(checks), alloc.ID)
cancelHealthyTimer()
continue OUTER
}

// Check if all the checks are passing
for _, check := range checks {
if check.Status != api.HealthPassing {
r.logger.Printf("[TRACE] client.alloc_watcher: continuing since check %q isn't passing for alloc %q", check.CheckID, alloc.ID)
latestChecksHealthy = time.Time{}
if desiredChecks > 0 {
if allocReg.NumChecks() != desiredChecks {
r.logger.Printf("[TRACE] client.alloc_watcher: continuing since all checks (want %d; got %d) haven't been registered for alloc %q", desiredChecks, allocReg.NumChecks(), alloc.ID)
cancelHealthyTimer()
continue OUTER
}
}
if latestChecksHealthy.IsZero() {
latestChecksHealthy = time.Now()

// Check if all the checks are passing
for _, treg := range allocReg.Tasks {
for _, sreg := range treg.Services {
for _, check := range sreg.Checks {
if check.Status != api.HealthPassing {
r.logger.Printf("[TRACE] client.alloc_watcher: continuing since check %q isn't passing for alloc %q", check.CheckID, alloc.ID)
latestChecksHealthy = time.Time{}
cancelHealthyTimer()
continue OUTER
}
}
}
}
if latestChecksHealthy.IsZero() {
latestChecksHealthy = time.Now()
}
}

// Determine if the allocation is healthy
Expand Down
27 changes: 24 additions & 3 deletions client/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/boltdb/bolt"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
Expand Down Expand Up @@ -254,12 +255,32 @@ func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) {

// Only return the check as healthy after a duration
trigger := time.After(500 * time.Millisecond)
ar.consulClient.(*mockConsulServiceClient).checksFn = func(a *structs.Allocation) ([]*api.AgentCheck, error) {
ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
select {
case <-trigger:
return []*api.AgentCheck{checkHealthy}, nil
return &consul.AllocRegistration{
Tasks: map[string]*consul.TaskRegistration{
task.Name: {
Services: map[string]*consul.ServiceRegistration{
"123": {
Checks: []*api.AgentCheck{checkHealthy},
},
},
},
},
}, nil
default:
return []*api.AgentCheck{checkUnhealthy}, nil
return &consul.AllocRegistration{
Tasks: map[string]*consul.TaskRegistration{
task.Name: {
Services: map[string]*consul.ServiceRegistration{
"123": {
Checks: []*api.AgentCheck{checkUnhealthy},
},
},
},
},
}, nil
}
}

Expand Down
4 changes: 2 additions & 2 deletions client/consul.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
package client

import (
"github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/client/driver"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/nomad/structs"
)

Expand All @@ -13,5 +13,5 @@ type ConsulServiceAPI interface {
RegisterTask(allocID string, task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error
RemoveTask(allocID string, task *structs.Task)
UpdateTask(allocID string, existing, newTask *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error
Checks(alloc *structs.Allocation) ([]*api.AgentCheck, error)
AllocRegistrations(allocID string) (*consul.AllocRegistration, error)
}
19 changes: 10 additions & 9 deletions client/consul_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ import (
"sync"
"testing"

"github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/client/driver"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/nomad/structs"
)

Expand All @@ -24,7 +24,7 @@ type mockConsulOp struct {
}

func newMockConsulOp(op, allocID string, task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) mockConsulOp {
if op != "add" && op != "remove" && op != "update" && op != "checks" {
if op != "add" && op != "remove" && op != "update" && op != "alloc_registrations" {
panic(fmt.Errorf("invalid consul op: %s", op))
}
return mockConsulOp{
Expand All @@ -44,8 +44,9 @@ type mockConsulServiceClient struct {

logger *log.Logger

// checksFn allows injecting return values for the Checks function.
checksFn func(*structs.Allocation) ([]*api.AgentCheck, error)
// allocRegistrationsFn allows injecting return values for the
// AllocRegistrations function.
allocRegistrationsFn func(allocID string) (*consul.AllocRegistration, error)
}

func newMockConsulServiceClient() *mockConsulServiceClient {
Expand Down Expand Up @@ -82,14 +83,14 @@ func (m *mockConsulServiceClient) RemoveTask(allocID string, task *structs.Task)
m.ops = append(m.ops, newMockConsulOp("remove", allocID, task, nil, nil))
}

func (m *mockConsulServiceClient) Checks(alloc *structs.Allocation) ([]*api.AgentCheck, error) {
func (m *mockConsulServiceClient) AllocRegistrations(allocID string) (*consul.AllocRegistration, error) {
m.mu.Lock()
defer m.mu.Unlock()
m.logger.Printf("[TEST] mock_consul: Checks(%q)", alloc.ID)
m.ops = append(m.ops, newMockConsulOp("checks", alloc.ID, nil, nil, nil))
m.logger.Printf("[TEST] mock_consul: AllocRegistrations(%q)", allocID)
m.ops = append(m.ops, newMockConsulOp("alloc_registrations", allocID, nil, nil, nil))

if m.checksFn != nil {
return m.checksFn(alloc)
if m.allocRegistrationsFn != nil {
return m.allocRegistrationsFn(allocID)
}

return nil, nil
Expand Down
Loading

0 comments on commit f49c026

Please sign in to comment.