Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix alloc health with checks using interpolation #2984

Merged
merged 2 commits into from
Aug 10, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 25 additions & 18 deletions client/alloc_runner_health_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"time"

"github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
)
Expand Down Expand Up @@ -105,7 +106,7 @@ func (r *AllocRunner) watchHealth(ctx context.Context) {
// Store whether the last consul checks call was successful or not
consulChecksErr := false

var checks []*api.AgentCheck
var allocReg *consul.AllocRegistration
first := true
OUTER:
for {
Expand All @@ -121,15 +122,15 @@ OUTER:
alloc = newAlloc
r.logger.Printf("[TRACE] client.alloc_watcher: new alloc version for %q", alloc.ID)
case <-checkCh:
newChecks, err := r.consulClient.Checks(alloc)
newAllocReg, err := r.consulClient.AllocRegistrations(alloc.ID)
if err != nil {
if !consulChecksErr {
consulChecksErr = true
r.logger.Printf("[WARN] client.alloc_watcher: failed to lookup consul checks for allocation %q: %v", alloc.ID, err)
r.logger.Printf("[WARN] client.alloc_watcher: failed to lookup consul registrations for allocation %q: %v", alloc.ID, err)
}
} else {
consulChecksErr = false
checks = newChecks
allocReg = newAllocReg
}
case <-deadline.C:
// We have exceeded our deadline without being healthy.
Expand Down Expand Up @@ -174,23 +175,29 @@ OUTER:
}

// If we should have checks and they aren't all healthy continue
if len(checks) != desiredChecks {
r.logger.Printf("[TRACE] client.alloc_watcher: continuing since all checks (want %d; got %d) haven't been registered for alloc %q", desiredChecks, len(checks), alloc.ID)
cancelHealthyTimer()
continue OUTER
}

// Check if all the checks are passing
for _, check := range checks {
if check.Status != api.HealthPassing {
r.logger.Printf("[TRACE] client.alloc_watcher: continuing since check %q isn't passing for alloc %q", check.CheckID, alloc.ID)
latestChecksHealthy = time.Time{}
if desiredChecks > 0 {
if allocReg.NumChecks() != desiredChecks {
r.logger.Printf("[TRACE] client.alloc_watcher: continuing since all checks (want %d; got %d) haven't been registered for alloc %q", desiredChecks, allocReg.NumChecks(), alloc.ID)
cancelHealthyTimer()
continue OUTER
}
}
if latestChecksHealthy.IsZero() {
latestChecksHealthy = time.Now()

// Check if all the checks are passing
for _, treg := range allocReg.Tasks {
for _, sreg := range treg.Services {
for _, check := range sreg.Checks {
if check.Status != api.HealthPassing {
r.logger.Printf("[TRACE] client.alloc_watcher: continuing since check %q isn't passing for alloc %q", check.CheckID, alloc.ID)
latestChecksHealthy = time.Time{}
cancelHealthyTimer()
continue OUTER
}
}
}
}
if latestChecksHealthy.IsZero() {
latestChecksHealthy = time.Now()
}
}

// Determine if the allocation is healthy
Expand Down
27 changes: 24 additions & 3 deletions client/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/boltdb/bolt"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
Expand Down Expand Up @@ -254,12 +255,32 @@ func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) {

// Only return the check as healthy after a duration
trigger := time.After(500 * time.Millisecond)
ar.consulClient.(*mockConsulServiceClient).checksFn = func(a *structs.Allocation) ([]*api.AgentCheck, error) {
ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
select {
case <-trigger:
return []*api.AgentCheck{checkHealthy}, nil
return &consul.AllocRegistration{
Tasks: map[string]*consul.TaskRegistration{
task.Name: {
Services: map[string]*consul.ServiceRegistration{
"123": {
Checks: []*api.AgentCheck{checkHealthy},
},
},
},
},
}, nil
default:
return []*api.AgentCheck{checkUnhealthy}, nil
return &consul.AllocRegistration{
Tasks: map[string]*consul.TaskRegistration{
task.Name: {
Services: map[string]*consul.ServiceRegistration{
"123": {
Checks: []*api.AgentCheck{checkUnhealthy},
},
},
},
},
}, nil
}
}

Expand Down
4 changes: 2 additions & 2 deletions client/consul.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
package client

import (
"github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/client/driver"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/nomad/structs"
)

Expand All @@ -13,5 +13,5 @@ type ConsulServiceAPI interface {
RegisterTask(allocID string, task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error
RemoveTask(allocID string, task *structs.Task)
UpdateTask(allocID string, existing, newTask *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error
Checks(alloc *structs.Allocation) ([]*api.AgentCheck, error)
AllocRegistrations(allocID string) (*consul.AllocRegistration, error)
}
19 changes: 10 additions & 9 deletions client/consul_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ import (
"sync"
"testing"

"github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/client/driver"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/nomad/structs"
)

Expand All @@ -24,7 +24,7 @@ type mockConsulOp struct {
}

func newMockConsulOp(op, allocID string, task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) mockConsulOp {
if op != "add" && op != "remove" && op != "update" && op != "checks" {
if op != "add" && op != "remove" && op != "update" && op != "alloc_registrations" {
panic(fmt.Errorf("invalid consul op: %s", op))
}
return mockConsulOp{
Expand All @@ -44,8 +44,9 @@ type mockConsulServiceClient struct {

logger *log.Logger

// checksFn allows injecting return values for the Checks function.
checksFn func(*structs.Allocation) ([]*api.AgentCheck, error)
// allocRegistrationsFn allows injecting return values for the
// AllocRegistrations function.
allocRegistrationsFn func(allocID string) (*consul.AllocRegistration, error)
}

func newMockConsulServiceClient() *mockConsulServiceClient {
Expand Down Expand Up @@ -82,14 +83,14 @@ func (m *mockConsulServiceClient) RemoveTask(allocID string, task *structs.Task)
m.ops = append(m.ops, newMockConsulOp("remove", allocID, task, nil, nil))
}

func (m *mockConsulServiceClient) Checks(alloc *structs.Allocation) ([]*api.AgentCheck, error) {
func (m *mockConsulServiceClient) AllocRegistrations(allocID string) (*consul.AllocRegistration, error) {
m.mu.Lock()
defer m.mu.Unlock()
m.logger.Printf("[TEST] mock_consul: Checks(%q)", alloc.ID)
m.ops = append(m.ops, newMockConsulOp("checks", alloc.ID, nil, nil, nil))
m.logger.Printf("[TEST] mock_consul: AllocRegistrations(%q)", allocID)
m.ops = append(m.ops, newMockConsulOp("alloc_registrations", allocID, nil, nil, nil))

if m.checksFn != nil {
return m.checksFn(alloc)
if m.allocRegistrationsFn != nil {
return m.allocRegistrationsFn(allocID)
}

return nil, nil
Expand Down
Loading